# Bengaluru Real Estate Price prediction model 

In [26]:
# importing libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression


## Loading the dataset and investigating data

In [27]:
# Loading dataset
dt = pd.read_csv('Bengaluru_House_Data.csv')

Displaying the dataset:

In [28]:
dt

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.00
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.00
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.00
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.00
...,...,...,...,...,...,...,...,...,...
13315,Built-up Area,Ready To Move,Whitefield,5 Bedroom,ArsiaEx,3453,4.0,0.0,231.00
13316,Super built-up Area,Ready To Move,Richards Town,4 BHK,,3600,5.0,,400.00
13317,Built-up Area,Ready To Move,Raja Rajeshwari Nagar,2 BHK,Mahla T,1141,2.0,1.0,60.00
13318,Super built-up Area,18-Jun,Padmanabhanagar,4 BHK,SollyCl,4689,4.0,1.0,488.00


Exploring the dataset:

In [29]:
dt.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13320 entries, 0 to 13319
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   area_type     13320 non-null  object 
 1   availability  13320 non-null  object 
 2   location      13319 non-null  object 
 3   size          13304 non-null  object 
 4   society       7818 non-null   object 
 5   total_sqft    13320 non-null  object 
 6   bath          13247 non-null  float64
 7   balcony       12711 non-null  float64
 8   price         13320 non-null  float64
dtypes: float64(3), object(6)
memory usage: 936.7+ KB


In [30]:
dt.isnull().sum()

area_type          0
availability       0
location           1
size              16
society         5502
total_sqft         0
bath              73
balcony          609
price              0
dtype: int64

In [31]:
dt.describe()

Unnamed: 0,bath,balcony,price
count,13247.0,12711.0,13320.0
mean,2.69261,1.584376,112.565627
std,1.341458,0.817263,148.971674
min,1.0,0.0,8.0
25%,2.0,1.0,50.0
50%,2.0,2.0,72.0
75%,3.0,2.0,120.0
max,40.0,3.0,3600.0


## Preprocessing

Dropping unnecessary columns:

In [32]:
dt = dt.drop(['society','availability','location','balcony'],axis=1)

In [33]:
dt.isnull().sum()

area_type      0
size          16
total_sqft     0
bath          73
price          0
dtype: int64

In [34]:
dt = dt.dropna()

In [35]:
dt.isnull().sum()

area_type     0
size          0
total_sqft    0
bath          0
price         0
dtype: int64

In [36]:
dt['BHK'] = dt['size'].apply(lambda x: int(x.split(' ')[0]))

In [37]:
def isfloat(x):
    try:
        float(x)
    except:
        return False
    return True

dt[~dt['total_sqft'].apply(isfloat)]

Unnamed: 0,area_type,size,total_sqft,bath,price,BHK
30,Super built-up Area,4 BHK,2100 - 2850,4.0,186.000,4
122,Super built-up Area,4 BHK,3067 - 8156,4.0,477.000,4
137,Super built-up Area,2 BHK,1042 - 1105,2.0,54.005,2
165,Super built-up Area,2 BHK,1145 - 1340,2.0,43.490,2
188,Super built-up Area,2 BHK,1015 - 1540,2.0,56.800,2
...,...,...,...,...,...,...
12975,Super built-up Area,2 BHK,850 - 1060,2.0,38.190,2
12990,Super built-up Area,3 BHK,1804 - 2273,3.0,122.000,3
13059,Super built-up Area,2 BHK,1200 - 1470,2.0,72.760,2
13265,Super built-up Area,2 BHK,1133 - 1384,2.0,59.135,2


In [38]:
sum(~dt['total_sqft'].apply(isfloat))

190

Convert Total SQFT area into a float value:

In [39]:
def convert_sqft_to_num(x):
    token=x.split('-')
    if len(token)==2:
        return (float(token[0])+float(token[1]))/2
    try:
        return float(x)
    except:
        return None
    
dt['total_sqft'] = dt['total_sqft'].apply(convert_sqft_to_num)

In [40]:
sum(~dt['total_sqft'].apply(isfloat))

0

Converting string data into integer labels:

In [41]:
dt['area_type'].unique()

array(['Super built-up  Area', 'Plot  Area', 'Built-up  Area',
       'Carpet  Area'], dtype=object)

In [42]:
sz = LabelEncoder()
dt['area_type'] = sz.fit_transform(dt['area_type'])

Adding a new column of 'Price Per Square Feet vs Count'

In [43]:
dt["price_per_sqft"] = dt["price"]*100000/dt["total_sqft"]
dt.head()

Unnamed: 0,area_type,size,total_sqft,bath,price,BHK,price_per_sqft
0,3,2 BHK,1056.0,2.0,39.07,2,3699.810606
1,2,4 Bedroom,2600.0,5.0,120.0,4,4615.384615
2,0,3 BHK,1440.0,2.0,62.0,3,4305.555556
3,3,3 BHK,1521.0,3.0,95.0,3,6245.890861
4,3,2 BHK,1200.0,2.0,51.0,2,4250.0


In [44]:
dt = dt.dropna()

In [45]:
dt.isnull().sum()

area_type         0
size              0
total_sqft        0
bath              0
price             0
BHK               0
price_per_sqft    0
dtype: int64

Splitting the data:

In [46]:
Y = dt['price']
X = dt.drop(['size','price'],axis=1)
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

## Training Model and obtaining predictions

In [47]:
# Decision Tree Regressor

dct = DecisionTreeRegressor()
dct.fit(x_train,y_train)
y_pred = dct.predict(x_test)
print("Training Score: ",dct.score(x_train,y_train))
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)
r2 = r2_score(y_test, y_pred)
print("R-squared (R2):", r2)

Training Score:  1.0
Mean Squared Error (MSE): 1259.7329230026505
Root Mean Squared Error (RMSE): 35.49271647821072
Mean Absolute Error (MAE): 4.282048466489965
R-squared (R2): 0.9389693534937478


In [48]:
# K Nearest Neighbours
knn = KNeighborsRegressor()
knn.fit(x_train,y_train)
y_pred = knn.predict(x_test)
print("Training Score: ",knn.score(x_train,y_train))
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)
r2 = r2_score(y_test, y_pred)
print("R-squared (R2):", r2)

Training Score:  0.9333357967762292
Mean Squared Error (MSE): 2504.2623019905336
Root Mean Squared Error (RMSE): 50.042604868157426
Mean Absolute Error (MAE): 5.365817872018175
R-squared (R2): 0.8786752774965809


In [49]:
# Random Forest Regressor
rf = RandomForestRegressor()
rf.fit(x_train,y_train)
y_pred = rf.predict(x_test)
print("Training Score: ",rf.score(x_train,y_train))
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)
r2 = r2_score(y_test, y_pred)
print("R-squared (R2):", r2)

Training Score:  0.9956630167236984
Mean Squared Error (MSE): 982.2633000211691
Root Mean Squared Error (RMSE): 31.34108007106917
Mean Absolute Error (MAE): 2.641172074971602
R-squared (R2): 0.9524120048424498


In [50]:
# Linear Regression
lr = LinearRegression()
lr.fit(x_train,y_train)
y_pred = lr.predict(x_test)
print("Training Score: ",lr.score(x_train,y_train))
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)
rmse = np.sqrt(mse)
print("Root Mean Squared Error (RMSE):", rmse)
mae = mean_absolute_error(y_test, y_pred)
print("Mean Absolute Error (MAE):", mae)
r2 = r2_score(y_test, y_pred)
print("R-squared (R2):", r2)

Training Score:  0.38173566606136444
Mean Squared Error (MSE): 11566.885400391955
Root Mean Squared Error (RMSE): 107.54945560248994
Mean Absolute Error (MAE): 46.97371937302007
R-squared (R2): 0.43961574619562016
