# House Price Prediction Dataset

In [1]:
#importing the necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load dataset
data = pd.read_csv('bengaluru_house_prices.csv')

In [3]:
#shape
data.shape

(13320, 9)

In [4]:
data.head()

Unnamed: 0,area_type,availability,location,size,society,total_sqft,bath,balcony,price
0,Super built-up Area,19-Dec,Electronic City Phase II,2 BHK,Coomee,1056,2.0,1.0,39.07
1,Plot Area,Ready To Move,Chikka Tirupathi,4 Bedroom,Theanmp,2600,5.0,3.0,120.0
2,Built-up Area,Ready To Move,Uttarahalli,3 BHK,,1440,2.0,3.0,62.0
3,Super built-up Area,Ready To Move,Lingadheeranahalli,3 BHK,Soiewre,1521,3.0,1.0,95.0
4,Super built-up Area,Ready To Move,Kothanur,2 BHK,,1200,2.0,1.0,51.0


In [5]:
# split into input (X) and output (y) variables
price = data['price']
price = pd.DataFrame(price)
print(price)

        price
0       39.07
1      120.00
2       62.00
3       95.00
4       51.00
...       ...
13315  231.00
13316  400.00
13317   60.00
13318  488.00
13319   17.00

[13320 rows x 1 columns]


In [6]:
# Feature scaling using NORMALIZATION

from sklearn.preprocessing import MinMaxScaler
sc = MinMaxScaler(feature_range = (0.9,1))
scaled_data = sc.fit_transform(price)
print(scaled_data)

[[0.90086498]
 [0.90311804]
 [0.90150334]
 ...
 [0.90144766]
 [0.91336303]
 [0.90025056]]


In [7]:
# Creating a data structure with 7 timesteps and 1 output
X = []
y = []
for i in range(7,4746):
    X.append(scaled_data[i-7:i,0])
    y.append(scaled_data[i,0])
    
X,y = np.array(X),np.array(y)

In [8]:
# split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

# Root Mean Squared Error

In [9]:
def rmse(actual, pred):
  return np.sqrt(mean_squared_error(actual, pred))

# Mean absolute Percentage Error

In [10]:
def mape(actual, pred): 
    actual, pred = np.array(actual), np.array(pred)
    return np.mean(np.abs((actual - pred) / actual)) * 100

# ANN

In [11]:
# define the keras model
model = Sequential()
model.add(Dense(20, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(10, activation='relu', kernel_initializer='he_normal'))
model.add(Dense(1, activation='linear'))

In [12]:
# compile the keras model
model.compile(loss='mean_absolute_percentage_error', optimizer='adam')

In [13]:
# fit the keras model on the dataset
model.fit(X_train, y_train, epochs=100, batch_size=32, verbose=2)

Epoch 1/100
104/104 - 1s - 11ms/step - loss: 103.9171
Epoch 2/100
104/104 - 0s - 1ms/step - loss: 0.4309
Epoch 3/100
104/104 - 0s - 1ms/step - loss: 0.3113
Epoch 4/100
104/104 - 0s - 1ms/step - loss: 0.2979
Epoch 5/100
104/104 - 0s - 1ms/step - loss: 0.3347
Epoch 6/100
104/104 - 0s - 1ms/step - loss: 0.3244
Epoch 7/100
104/104 - 0s - 1ms/step - loss: 0.2909
Epoch 8/100
104/104 - 0s - 1ms/step - loss: 0.3123
Epoch 9/100
104/104 - 0s - 891us/step - loss: 0.2903
Epoch 10/100
104/104 - 0s - 1ms/step - loss: 0.3049
Epoch 11/100
104/104 - 0s - 1ms/step - loss: 0.3230
Epoch 12/100
104/104 - 0s - 1ms/step - loss: 0.3122
Epoch 13/100
104/104 - 0s - 1ms/step - loss: 0.3124
Epoch 14/100
104/104 - 0s - 1ms/step - loss: 0.3328
Epoch 15/100
104/104 - 0s - 2ms/step - loss: 0.2899
Epoch 16/100
104/104 - 0s - 924us/step - loss: 0.3117
Epoch 17/100
104/104 - 0s - 1ms/step - loss: 0.2949
Epoch 18/100
104/104 - 0s - 1ms/step - loss: 0.3044
Epoch 19/100
104/104 - 0s - 994us/step - loss: 0.3123
Epoch 20/100

<keras.src.callbacks.history.History at 0x1ff5f6921e0>

In [14]:
ANN_rmse = rmse(y_test, model.predict(X_test))
print('RMSE: %.3f' % ANN_rmse)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
RMSE: 0.005


In [15]:
ANN_rmse = rmse(y_test, model.predict(X_test))
print('RMSE: ',ANN_rmse)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 759us/step
RMSE:  0.004634041492037808


In [16]:
ANN_mape = mape(y_test,model.predict(X_test))

print("MAPE =",ANN_mape)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 763us/step
MAPE = 0.2948810372950225


# LINEAR REGRESSION

In [17]:
lr = LinearRegression()
lr.fit(X_train, y_train)

In [18]:
lr_rmse = rmse(y_test, lr.predict(X_test))
print("Root Mean Square Error=",lr_rmse)

Root Mean Square Error= 0.003581470568102824


In [19]:
lr_mape=mape(y_test,lr.predict(X_test))
print("Mean Absolute Percentage error=",lr_mape)

Mean Absolute Percentage error= 0.21633186686492253


# DECISION TREE

In [20]:
dt = DecisionTreeRegressor(min_samples_leaf=1000,min_weight_fraction_leaf=0.5)
dt.fit(X_train,y_train)

In [21]:
dt_rmse = rmse(y_test, dt.predict(X_test))
print("Root Mean Square Error=",dt_rmse)

Root Mean Square Error= 0.003573854567054637


In [22]:
dt_mape=mape(y_test,dt.predict(X_test))
print("Mean Absolute Percentage error=",dt_mape)

Mean Absolute Percentage error= 0.21529937882644937


# RANDOM FOREST

In [23]:
rfr = RandomForestRegressor()
rfr.fit(X_train,y_train)

In [24]:
rfr_rmse = rmse(y_test , rfr.predict(X_test))
print("Root Mean Square Error=",rfr_rmse)

Root Mean Square Error= 0.003898586593483645


In [25]:
rfr_mape = mape(y_test , rfr.predict(X_test))
print("Mean Absolute Percentage error =" , rfr_mape)

Mean Absolute Percentage error = 0.2479031494330995


# LSTM

In [26]:
#Before feeding into lstm we must convert dataset into 3d 
# reshape input to be [samples, time steps, features] which is required for LSTM
X_train =X_train.reshape(X_train.shape[0],X_train.shape[1] , 1)
X_test = X_test.reshape(X_test.shape[0],X_test.shape[1] , 1)

(X_train.shape, X_test.shape)

((3317, 7, 1), (1422, 7, 1))

In [27]:
from tensorflow.keras.layers import LSTM
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

In [28]:
model=Sequential()
model.add(LSTM(50,return_sequences=True,input_shape=(7,1)))
model.add(LSTM(50,return_sequences=True))
model.add(LSTM(50))
model.add(Dense(1))

model.compile(loss='mean_squared_error',optimizer='adam')

In [29]:
model.summary()

In [30]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=20)

Epoch 1/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - loss: 0.2303 - val_loss: 1.7012e-05
Epoch 2/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.5883e-05 - val_loss: 1.4059e-05
Epoch 3/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.4658e-05 - val_loss: 1.7363e-05
Epoch 4/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.5027e-05 - val_loss: 1.4548e-05
Epoch 5/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.5485e-05 - val_loss: 1.4332e-05
Epoch 6/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.8110e-05 - val_loss: 1.4201e-05
Epoch 7/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 1.9310e-05 - val_loss: 1.3610e-05
Epoch 8/20
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - loss: 1.4534e-05 - 

<keras.src.callbacks.history.History at 0x1ff67e796d0>

In [31]:
train_predict=model.predict(X_train)
test_predict=model.predict(X_test)

[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step
[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [32]:
# We have scaled it, so we need to reverse scale it to find the o/p
train_predict=sc.inverse_transform(train_predict)
test_predict=sc.inverse_transform(test_predict)

In [33]:
#rmse
import math
from sklearn.metrics import mean_squared_error
math.sqrt(mean_squared_error(y_train,train_predict))

110.03249606414036

In [34]:
math.sqrt(mean_squared_error(y_test,test_predict))

107.53268781582622

In [35]:
lstm_rmse = rmse(y_test, model.predict(X_test))
print('RMSE: %.3f' % lstm_rmse)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
RMSE: 0.004


In [36]:
lstm_mape = mape(y_test,model.predict(X_test))

print("MAPE: " , lstm_mape)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
MAPE:  0.21062841147370787


In [37]:
lstm_rmse = rmse(y_test, model.predict(X_test))
print('RMSE: ',lstm_rmse)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
RMSE:  0.0036884285065135946


# Graph

In [38]:
import plotly.express as px

In [39]:
algo_name = ('ANN', 'LR', 'DT', 'RF','LSTM')
rmse_list = (ANN_rmse, lr_rmse, dt_rmse, rfr_rmse, lstm_rmse)
mape_list = (ANN_mape, lr_mape, dt_mape, rfr_mape, lstm_mape)

In [40]:
fig = px.bar(x=algo_name,y=rmse_list, title = 'RMSE Plot', color=algo_name, height=500, width= 700)
fig.update_layout(xaxis_title="Algorithms Applied", yaxis_title="RMSE")
fig.show()

In [41]:
fig1 = px.bar(x=algo_name,y=mape_list, title = 'MAPE Plot', color=algo_name, height=500, width= 700)
fig1.update_layout(xaxis_title="Algorithms Applied", yaxis_title="MAPE")
fig1.show()