In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import matplotlib
# Default size (width, height) applied to every figure created in this notebook;
# matplotlib interprets figsize values in inches.
matplotlib.rcParams["figure.figsize"] = (20,10)

# Model Building

In [181]:
# Load the dataset and discard the 'Unnamed: 0' column
# (presumably the index that was written out when the CSV was saved -- confirm).
df = pd.read_csv("Dataset after outlier removal.csv").drop(columns=['Unnamed: 0'])
df

Unnamed: 0,location,total_sqft,bath,price,size
0,1st Block Jayanagar,2850.0,4.0,428.00,4
1,1st Block Jayanagar,1630.0,3.0,194.00,3
2,1st Block Jayanagar,1200.0,6.0,125.00,6
3,1st Block Jayanagar,1875.0,2.0,235.00,3
4,1st Block Jayanagar,2400.0,4.0,450.00,4
...,...,...,...,...,...
8690,Yeshwanthpur,1195.0,2.0,100.00,2
8691,Yeshwanthpur,2500.0,5.0,185.00,6
8692,Yeshwanthpur,1160.0,2.0,64.08,2
8693,Yeshwanthpur,1855.0,3.0,135.00,3


Use one-hot encoding to convert the categorical column (the location column) into numeric indicator columns

In [182]:
# One indicator column per distinct value found in 'location'.
location_series = df['location']
dummies = pd.get_dummies(location_series)
dummies.head(10)

Unnamed: 0,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,7th Phase JP Nagar,8th Phase JP Nagar,9th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Drop one column to avoid dummy variable trap and concatenate with the existing dataframe

In [183]:
# 'Other' becomes the reference category: its indicator column is left out so the
# remaining indicators are not perfectly collinear (the dummy variable trap).
location_dummies = dummies.drop(columns=['Other'])
df2 = pd.concat([df, location_dummies], axis=1)

In [184]:
# Display the combined frame: the original columns followed by the location indicator columns.
df2

Unnamed: 0,location,total_sqft,bath,price,size,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,1st Block Jayanagar,2850.0,4.0,428.00,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1st Block Jayanagar,1630.0,3.0,194.00,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1st Block Jayanagar,1200.0,6.0,125.00,6,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1st Block Jayanagar,1875.0,2.0,235.00,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1st Block Jayanagar,2400.0,4.0,450.00,4,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8690,Yeshwanthpur,1195.0,2.0,100.00,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8691,Yeshwanthpur,2500.0,5.0,185.00,6,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8692,Yeshwanthpur,1160.0,2.0,64.08,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8693,Yeshwanthpur,1855.0,3.0,135.00,3,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [185]:
# The indicator columns now encode the location, so the text column is removed.
df3 = df2.drop(columns=['location'])
df3.head()

Unnamed: 0,total_sqft,bath,price,size,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,428.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,194.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,6.0,125.0,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1875.0,2.0,235.0,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2400.0,4.0,450.0,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Dependent variable is price and rest of the columns are independent variables. So, separate dependent variable and independent variables

In [186]:
# Feature matrix: every column of df3 except the target 'price'.
x = df3.drop(columns=['price'])
x.head(3)

Unnamed: 0,total_sqft,bath,size,1st Block Jayanagar,1st Phase JP Nagar,2nd Phase Judicial Layout,2nd Stage Nagarbhavi,5th Block Hbr Layout,5th Phase JP Nagar,6th Phase JP Nagar,...,Vijayanagar,Vishveshwarya Layout,Vishwapriya Layout,Vittasandra,Whitefield,Yelachenahalli,Yelahanka,Yelahanka New Town,Yelenahalli,Yeshwanthpur
0,2850.0,4.0,4,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1630.0,3.0,3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1200.0,6.0,6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [187]:
# Target vector: the 'price' column on its own.
y = df3.loc[:, 'price']
y.head(3)

0    428.0
1    194.0
2    125.0
Name: price, dtype: float64

Split the data into training set and test set

In [188]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=99,
)

From here, we will apply various regression models and see which model works better...

# Multiple Linear Regression

In [189]:
from sklearn.linear_model import LinearRegression
# fit() returns the estimator itself, so construction and training can be chained.
lr = LinearRegression().fit(x_train, y_train)
# Score of the fitted model on the held-out split.
lr.score(x_test, y_test)

0.8027306233425024

Apply cross validation (5 random 80/20 shuffle splits via ShuffleSplit, rather than strict K-fold) to evaluate the model performance for different training and test data points...

In [207]:
from sklearn.model_selection import ShuffleSplit, cross_val_score
# Five random splits, each holding out 20% of the rows; seeded so the splits are repeatable.
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=99)
# A fresh, unfitted LinearRegression is trained and scored once per split.
a = cross_val_score(LinearRegression(), x, y, cv=cv)
print(a)
score1 = a.mean()
print("Mean accuracy: ", score1)

[0.80273062 0.80912049 0.78476255 0.77274002 0.74625023]
Mean accuracy:  0.7831207817025754


# Decision Tree Regression

In [219]:
from sklearn.tree import DecisionTreeRegressor
# Seeded tree trained on the training split (fit() returns the estimator itself).
dt = DecisionTreeRegressor(random_state=99).fit(x_train, y_train)
# Score of the fitted tree on the held-out split.
dt.score(x_test, y_test)

0.584054963961299

Apply cross validation (5 random 80/20 shuffle splits via ShuffleSplit, rather than strict K-fold) to evaluate the model performance for different training and test data points...

In [214]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
# Five random splits, each holding out 20% of the rows; seeded so the splits are repeatable.
cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=99)
# Seed the tree too (same seed as the hold-out DecisionTreeRegressor earlier in the notebook):
# an unseeded tree breaks ties between equally good splits at random, so the
# scores would change on every run even though the data splits are fixed.
b=cross_val_score(DecisionTreeRegressor(random_state=99),x,y,cv=cv)
score2=np.mean(b)
print(b)
print("Mean accuracy: ",score2)

[0.57582385 0.64920525 0.68289483 0.63504724 0.49098799]
Mean accuracy:  0.606791830503653


# Random Forest Regression

In [206]:
from sklearn.ensemble import RandomForestRegressor
# Ten-tree forest with a fixed seed, trained on the training split
# (fit() returns the estimator itself).
regressor = RandomForestRegressor(random_state=5, n_estimators=10).fit(x_train, y_train)
# Score of the fitted forest on the held-out split.
regressor.score(x_test, y_test)

0.7371776902553668

Apply cross validation (5 random 80/20 shuffle splits via ShuffleSplit, rather than strict K-fold) to evaluate the model performance for different training and test data points...

In [215]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
# Five random splits, each holding out 20% of the rows; seeded so the splits are repeatable.
cv=ShuffleSplit(n_splits=5,test_size=0.2,random_state=99)
# Seed the forest (same seed as the hold-out RandomForestRegressor earlier in the notebook):
# bootstrap sampling makes an unseeded forest give different scores on every
# run even though the data splits are fixed.
# n_estimators is intentionally left at the library default, as in the original cell.
c=cross_val_score(RandomForestRegressor(random_state=5),x,y,cv=cv)
score3=np.mean(c)
print(c)
print("Mean accuracy: ",score3)

[0.73618105 0.79444832 0.79382543 0.77574799 0.64245299]
Mean accuracy:  0.7485311540979105


In [220]:
# Collect the mean cross-validation score of each model into one comparison table.
model_names = [
    'Multiple Linear Regression',
    'Decision Tree Regression',
    'Random Forest Regression',
]
mean_scores = [score1, score2, score3]
accuracy_dict = {'Model': model_names, 'Accuracy': mean_scores}
model_accuracies = pd.DataFrame(accuracy_dict)
model_accuracies

Unnamed: 0,Model,Accuracy
0,Multiple Linear Regression,0.783121
1,Decision Tree Regression,0.606792
2,Random Forest Regression,0.748531


From the above table, we see that the Multiple Linear Regression model performs best on this dataset, with a mean cross-validated R² score of 0.7831 (the value `score()` reports for a regressor; it is not a classification accuracy), followed by the Random Forest Regression model with a mean R² score of 0.7485. So, we will use the multiple linear regression model for prediction

# Prediction function

In [228]:
# Column order of the feature matrix; predict() below writes total_sqft, bath and size
# into positions 0, 1 and 2 and looks the location up by column name.
x.columns

Index(['total_sqft', 'bath', 'size', '1st Block Jayanagar',
       '1st Phase JP Nagar', '2nd Phase Judicial Layout',
       '2nd Stage Nagarbhavi', '5th Block Hbr Layout', '5th Phase JP Nagar',
       '6th Phase JP Nagar',
       ...
       'Vijayanagar', 'Vishveshwarya Layout', 'Vishwapriya Layout',
       'Vittasandra', 'Whitefield', 'Yelachenahalli', 'Yelahanka',
       'Yelahanka New Town', 'Yelenahalli', 'Yeshwanthpur'],
      dtype='object', length=243)

In [244]:
def predict(location,sqft,bath,rooms):
    """Predict a price with the fitted linear regression model ``lr``.

    Builds one feature row laid out like the columns of ``x``: positions 0, 1
    and 2 hold the numeric features and the one-hot position of ``location``
    is set to 1.

    Parameters
    ----------
    location : str
        Name of a location column of ``x`` (exact, case-sensitive match).
        A name with no matching location column -- including 'Other', whose
        indicator column was dropped as the reference category -- leaves all
        location positions at 0, i.e. the row is scored as the reference
        category instead of raising IndexError.
    sqft : number
        Total square footage (column 0, ``total_sqft``).
    bath : number
        Number of bathrooms (column 1, ``bath``).
    rooms : number
        Number of bedrooms (column 2, ``size``).

    Returns
    -------
    The model's prediction for that single row.
    """
    # The first three columns are the numeric features; only the columns after
    # them are location indicators, so the lookup is restricted to those. This
    # prevents a location string such as 'bath' from overwriting a numeric slot.
    n_numeric = 3

    # Start with every column at 0, then fill in the numeric features.
    q = np.zeros(len(x.columns))
    q[0] = sqft
    q[1] = bath
    q[2] = rooms

    # Positions (relative to the location columns) whose name equals `location`.
    matches = np.where(x.columns[n_numeric:] == location)[0]
    if matches.size:
        q[n_numeric + matches[0]] = 1

    # lr.predict() returns one value per input row; a single row was passed,
    # so index [0] extracts that one prediction.
    return lr.predict([q])[0]

In [257]:
# 1000 sqft, 2 bathrooms, 2 bedrooms in Yelahanka New Town.
predict('Yelahanka New Town',1000,2,2)

69.43690944179089

In [259]:
# Same location with a larger home: 1500 sqft, 3 bathrooms, 3 bedrooms.
predict('Yelahanka New Town',1500,3,3)

120.9671003981537

In [261]:
# 1800 sqft, 5 bathrooms, 4 bedrooms in Indira Nagar.
predict('Indira Nagar',1800,5,4)

260.2006125223611

Store the fitted model (the whole estimator object, not just its weights) in a pickle file

In [263]:
import pickle
# Serialize the fitted linear regression estimator to disk in binary mode.
with open('model_weights.pickle', mode='wb') as model_file:
    pickle.dump(lr, model_file)

Store the feature column names (total_sqft, bath, size and every location name, all lower-cased) in json format so that they can be used while creating the flask server

In [266]:
import json
# Every feature column name, lower-cased, stored under the single key 'column names'.
lowercase_names = [name.lower() for name in x.columns]
columns = {'column names': lowercase_names}
with open('column_names.json', 'w') as columns_file:
    json.dump(columns, columns_file)