1. Load Data
2. EDA
3. Data Cleaning
4. Feature Engineering (sqrt, log, new columns)
5. Split X and y
6. Train-Test Split
7. Encoding (Ordinal / OneHot)
8. Scaling (for Linear Regression)
9. Train Model
10. Predict
11. Evaluate Errors

In [2]:
import warnings
# Silences every warning category for the rest of the session.
# NOTE(review): this also hides deprecation/FutureWarning messages raised by
# the libraries used below, so API problems will not surface while it is active.
warnings.filterwarnings('ignore')

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.preprocessing import StandardScaler,RobustScaler,minmax_scale
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder,LabelEncoder,OrdinalEncoder
from sklearn.metrics import mean_squared_error, r2_score

In [4]:
# Load the raw house-price CSV into `df`; the path is relative to the
# notebook's working directory.
df = pd.read_csv('bengaluru_house_prices.csv')

In [5]:
# Column dtypes and non-null counts of the frame as loaded.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [6]:
# Number of missing values in each column.
df.isna().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [7]:
# 'society' is missing for a large share of rows (see the null counts above),
# so the column is dropped instead of imputed. errors='ignore' keeps the cell
# re-runnable after the column is already gone.
df = df.drop(columns=['society'], errors='ignore')

# Impute the numeric gaps with the column median. The result is assigned back
# to the column: calling fillna(inplace=True) on `df[col]` is chained
# assignment and does not update `df` under pandas Copy-on-Write.
df['balcony'] = df['balcony'].fillna(df['balcony'].median())
df['bath'] = df['bath'].fillna(df['bath'].median())

In [8]:
# Pull the leading integer out of the free-text 'size' column into a numeric
# 'bhk' column. The pattern is a raw string so that `\d` is not treated as an
# (invalid) string escape, and expand=False yields a Series rather than a
# one-column DataFrame. Rows with a missing 'size' become NaN.
df['bhk'] = df['size'].str.extract(r'(\d+)', expand=False).astype(float)
df = df.drop(columns=['size'])

In [9]:
def convert_sqft_to_num(x):
    """Convert one raw ``total_sqft`` entry to a number of square feet.

    Accepted input shapes:

    * a range such as ``'2100 - 2850'`` -> the midpoint of the two bounds;
    * a number followed by an area unit (e.g. ``'34.46Sq. Meter'``) -> the
      number multiplied by the unit's factor from the table below;
    * a plain numeric string -> ``float(x)``;
    * a non-string value -> ``float(x)`` when that is possible.

    Returns:
        The value as a ``float``, or ``None`` when the entry cannot be parsed.
        The function never raises on bad input, so it is safe to pass to
        ``Series.apply``.
    """
    # (unit label, multiplier to square feet); checked in this order.
    unit_to_sqft = (
        ('Sq. Meter', 10.764),
        ('Sq. Yards', 9.0),
        ('Acres', 43560),
        ('Guntha', 1089),
        ('Cents', 435.6),
        ('Grounds', 2400),
        ('Perch', 272.25),
    )

    # Non-string input (already numeric, None, NaN, ...) has no text to parse.
    if not isinstance(x, str):
        try:
            return float(x)
        except (TypeError, ValueError):
            return None

    # 1. Ranges (e.g. '2100 - 2850'): use the midpoint.
    tokens = x.split('-')
    if len(tokens) == 2:
        try:
            return (float(tokens[0]) + float(tokens[1])) / 2
        except ValueError:
            # Not a numeric range (e.g. a negative number or '1e-3');
            # fall through to the plain-number / unit handling below.
            pass

    # 2. Plain numbers and numbers carrying a unit suffix.
    try:
        for unit, factor in unit_to_sqft:
            if unit in x:
                return float(x.replace(unit, '')) * factor
        # No recognised unit: treat the text as a bare number.
        return float(x)
    except ValueError:
        return None

# Replace the raw text in 'total_sqft' with its numeric square-foot value.
parsed_sqft = df['total_sqft'].apply(convert_sqft_to_num)
df['total_sqft'] = parsed_sqft

In [43]:
# Column dtypes and non-null counts of the current frame.
# NOTE(review): the saved output lists 'price_per_sqft', which is only created
# in a cell further down, so it was captured out of order; a fresh
# Restart & Run All will print a different summary here.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13303 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   area_type       13303 non-null  object 
 1   availability    13303 non-null  object 
 2   location        13303 non-null  object 
 3   total_sqft      13303 non-null  float64
 4   bath            13303 non-null  float64
 5   balcony         13303 non-null  float64
 6   price           13303 non-null  float64
 7   bhk             13303 non-null  float64
 8   price_per_sqft  13303 non-null  float64
dtypes: float64(6), object(3)
memory usage: 1.0+ MB


In [42]:
# Treat +/-inf as missing, discard every row with a missing value, and keep
# only rows whose area is strictly positive.
df = (
    df.replace([np.inf, -np.inf], np.nan)
    .dropna()
    .loc[lambda frame: frame['total_sqft'] > 0]
)

In [22]:
# NOTE(review): the 100000 factor presumably converts 'price' from lakhs to
# rupees before dividing by area; confirm against the dataset description.
df['price_per_sqft'] = df['price'].mul(100000).div(df['total_sqft'])

In [44]:
# Target is 'price'; the features are every other column except
# 'price_per_sqft', which is computed from the target.
y = df['price']
x = df.drop(columns=['price', 'price_per_sqft'])

In [45]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
split_parts = train_test_split(x, y, random_state=42, test_size=0.2)
x_train, x_test, y_train, y_test = split_parts

In [46]:
categorical_cols = ['area_type', 'availability', 'location']

# Learn the category vocabulary from the training split only, so nothing about
# the held-out rows leaks into preprocessing. A category that appears only in
# the test split is encoded as an all-zero row (handle_unknown='ignore'),
# i.e. it collapses onto the dropped baseline category.
ohe = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
ohe.fit(x_train[categorical_cols])

x_train_cat = ohe.transform(x_train[categorical_cols])
x_test_cat = ohe.transform(x_test[categorical_cols])

# Numeric features are whatever remains once the categorical columns are removed.
x_train_num = x_train.drop(categorical_cols, axis=1).values
x_test_num = x_test.drop(categorical_cols, axis=1).values

# NOTE: this rebinds x_train / x_test from DataFrames to plain arrays, so this
# cell cannot be re-run without re-running the train/test split first.
x_train = np.hstack([x_train_num, x_train_cat])
x_test = np.hstack([x_test_num, x_test_cat])

In [None]:
# Note: the categorical columns (including location) are one-hot encoded with OneHotEncoder in the cell above, not with OrdinalEncoder.

In [None]:
# Note: np.hstack is still required, because the one-hot matrix is a separate array that has to be joined to the numeric features.

In [47]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training numerics only, then apply that same min/max
# mapping to both splits.
scaler = MinMaxScaler()
scaler.fit(x_train_num)

x_train_num_scaled = scaler.transform(x_train_num)
x_test_num_scaled = scaler.transform(x_test_num)

# The one-hot columns are appended as-is, to the right of the scaled numerics.
x_train_scaled = np.concatenate([x_train_num_scaled, x_train_cat], axis=1)
x_test_scaled = np.concatenate([x_test_num_scaled, x_test_cat], axis=1)

In [48]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the scaled-numeric + one-hot training matrix.
# fit() returns the estimator itself, so construction and fitting are chained.
model = LinearRegression().fit(x_train_scaled, y_train)
model

In [49]:
# Predict the target for the held-out rows, using the same scaled/encoded
# feature layout the model was trained on.
y_pred = model.predict(x_test_scaled)

In [51]:
from sklearn.metrics import mean_squared_error, r2_score

# R^2 and RMSE on the held-out split; RMSE is in the same units as the target.
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

rmse, r2


(113.05964655800017, 0.4810968307853839)