In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import numpy as np
import pandas as pd

In [4]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/Mumbai_house_rent_99acers/Mumbai_99acers_cleaned.csv')
logdf = pd.read_csv('/content/drive/MyDrive/Datasets/Mumbai_house_rent_99acers/Mumbai_99acers_logtransformed.csv')

In [5]:
logdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12797 entries, 0 to 12796
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   society        12797 non-null  object 
 1   type           12797 non-null  object 
 2   location       12797 non-null  object 
 3   bedrooms       12797 non-null  int64  
 4   built-up area  12797 non-null  float64
 5   furnishing     12797 non-null  object 
 6   age            12797 non-null  object 
 7   floor          12797 non-null  int64  
 8   total floors   12797 non-null  int64  
 9   monthly rent   12797 non-null  float64
dtypes: float64(2), int64(3), object(5)
memory usage: 999.9+ KB


# Machine Learning Model Building.
<a href = '#top'>Back on Top</a>

## OneHotEncoding Text features

In [10]:
temp = pd.get_dummies(df, drop_first=True)
corr = temp.corr()['monthly rent'].sort_values()

logtemp = pd.get_dummies(logdf, drop_first=True)
logcorr = logtemp.corr()['monthly rent'].sort_values()

In [12]:
print('without log transform negative correlated:\n')
print(corr.head(10))
print('*'*100)
print('with log transform negative correlated:\n')
print(logcorr.head(10))

without log transform negative correlated:

furnishing_Unfurnished                    -0.361680
location_Dombivli (East)                  -0.175266
society_Lodha Lakeshore Greens            -0.143397
location_other location                   -0.125308
location_Kharghar                         -0.108375
location_Sector 17 Ulwe                   -0.102996
location_Kasar Vadavali                   -0.094774
society_Chatrapati Shivaji Raje Complex   -0.093411
location_Ekta Nagar                       -0.093411
society_other society                     -0.093317
Name: monthly rent, dtype: float64
****************************************************************************************************
with log transform negative correlated:

furnishing_Unfurnished                    -0.456255
location_Dombivli (East)                  -0.297393
society_Lodha Lakeshore Greens            -0.281717
location_Sector 17 Ulwe                   -0.200992
location_other location                   -0.163988

In [13]:
print('without log transform positive correlated:\n')
print(corr.tail(11))
print('*'*100)
print('with log transform positive correlated:\n')
print(logcorr.tail(11))

without log transform positive correlated:

location_Bandra (West)      0.169606
society_Oberoi Esquire      0.172564
furnishing_Semifurnished    0.191221
location_Khar West          0.203989
society_Crescent Bay        0.238524
location_Parel              0.240461
floor                       0.379980
total floors                0.383646
bedrooms                    0.553519
built-up area               0.615666
monthly rent                1.000000
Name: monthly rent, dtype: float64
****************************************************************************************************
with log transform positive correlated:

location_Goregaon (East)    0.154638
location_Bandra (West)      0.168216
location_Khar West          0.168600
society_Crescent Bay        0.176711
location_Parel              0.190611
furnishing_Semifurnished    0.225378
floor                       0.350188
total floors                0.385382
bedrooms                    0.524892
built-up area               0.553981
mo

## Preparing Training and Testing Datasets

In [57]:
from sklearn.model_selection import train_test_split

x = temp.drop('monthly rent', axis = 1)
y = temp['monthly rent']

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((10237, 355), (2560, 355), (10237,), (2560,))

In [58]:
logx = logtemp.drop('monthly rent', axis = 1)
logy = logtemp['monthly rent']

logx_train, logx_test, logy_train, logy_test = train_test_split(logx, logy, test_size=0.2, random_state=42)
logx_train.shape, logx_test.shape, logy_train.shape, logy_test.shape

((10237, 355), (2560, 355), (10237,), (2560,))

## Scaling Datasets to convert them into same range.

In [59]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
x_train = scaler.transform(X_train)
x_test = scaler.transform(X_test)

In [60]:
logscaler = StandardScaler()
logscaler.fit(logx_train)
logx_train = logscaler.transform(logx_train)
logx_test = logscaler.transform(logx_test)

## LinearRegression model with standard scaling.

In [61]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.metrics import r2_score, mean_squared_error

linear_model = LinearRegression()
linear_model.fit(x_train,y_train)
print('r2 score:',r2_score(y_test, linear_model.predict(x_test)))
print('rmse:',mean_squared_error(y_test, linear_model.predict(x_test))**.5)

r2 score: 0.8850250933959646
rmse: 12192.729520008921


In [98]:
linear_model = LinearRegression()
linear_model.fit(logx_train,logy_train)
print('r2 score:',r2_score(logy_test, linear_model.predict(logx_test)))
print('rmse:',mean_squared_error(logy_test, linear_model.predict(logx_test))**.5)

r2 score: 0.5302628523443393
rmse: 0.5055615321062806


## Lets store it in a DataFrame. so that we can compare different model's scores in last.

In [63]:
Final_Scores = pd.DataFrame( index = ['With Training Data', 'With Test Data'])

In [99]:
Final_Scores['Linear Regression %'] = [r2_score(logy_train, linear_model.predict(logx_train))*100, 
                                       r2_score(logy_test, linear_model.predict(logx_test))*100]

##Ridge

In [65]:
ridge = Ridge()
ridge.fit(x_train,y_train)
print('r2 score:',r2_score(y_test, ridge.predict(x_test)))
print('rmse:',mean_squared_error(y_test, ridge.predict(x_test))**.5)

r2 score: 0.8850479799093492
rmse: 12191.515938003859


In [66]:
ridge = Ridge()
ridge.fit(logx_train,logy_train)
print('r2 score:',r2_score(logy_test, ridge.predict(logx_test)))
print('rmse:',mean_squared_error(logy_test, ridge.predict(logx_test))**.5)

r2 score: 0.9272344512327572
rmse: 0.19898002111917532


In [96]:
Final_Scores['Ridge Regression %'] = [r2_score(logy_train, ridge.predict(logx_train))*100, 
                                       r2_score(logy_test, ridge.predict(logx_test))*100]

ValueError: ignored

## Lasso

In [68]:
lasso = Lasso()
lasso.fit(x_train,y_train)
print('r2 score:',r2_score(y_test, lasso.predict(x_test)))
print('rmse:',mean_squared_error(y_test, lasso.predict(x_test))**.5)

r2 score: 0.8851027112279908
rmse: 12188.613260929851


  positive)


In [69]:
lasso = Lasso()
lasso.fit(logx_train,logy_train)
print('r2 score:',r2_score(logy_test, lasso.predict(logx_test)))
print('rmse:',mean_squared_error(logy_test, lasso.predict(logx_test))**.5)

r2 score: -7.359715042265513e-05
rmse: 0.7376707257923452


In [95]:
Final_Scores['Lasso Regression %'] = [r2_score(logy_train, lasso.predict(logx_train))*100, 
                                       r2_score(logy_test, lasso.predict(logx_test))*100]

ValueError: ignored

##LabelEncoder

In [71]:
df1 = df.copy()
logdf1 = logdf.copy()

logdf1.head(3)

Unnamed: 0,society,type,location,bedrooms,built-up area,furnishing,age,floor,total floors,monthly rent
0,other society,Residential Apartment,Bandra (West),2,800.0,Furnished,5 to 10 years old,2,6,11.350407
1,other society,Residential Apartment,Khar West,2,1050.0,Furnished,1 to 5 years old,8,12,11.608236
2,Gajra Bhoomi Symphony,Residential Apartment,Sector-20 Koparkhairane,2,1050.0,Unfurnished,1 to 5 years old,14,17,10.373491


In [72]:
from sklearn.preprocessing import LabelEncoder

society_encoder = LabelEncoder().fit(df1['society'])
df1['society'] = society_encoder.transform(df1['society'])

type_encoder = LabelEncoder().fit(df1['type'])
df1['type'] = type_encoder.transform(df1['type'])

furnishing_encoder = LabelEncoder().fit(df1['furnishing'])
df1['furnishing'] = furnishing_encoder.transform(df1['furnishing'])

location_encoder = LabelEncoder().fit(df1['location'])
df1['location'] = location_encoder.transform(df1['location'])

age_encoder = LabelEncoder().fit(df1['age'])
df1['age'] = age_encoder.transform(df1['age'])

In [73]:
#type_encoder.classes_

In [74]:
df1

Unnamed: 0,society,type,location,bedrooms,built-up area,furnishing,age,floor,total floors,monthly rent
0,176,1,8,2,800.0,0,3,2,6,85000
1,176,1,63,2,1050.0,0,1,8,12,110000
2,36,1,128,2,1050.0,3,1,14,17,32000
3,20,1,88,3,2275.0,2,1,40,41,150000
4,8,1,150,1,550.0,2,3,4,7,27000
...,...,...,...,...,...,...,...,...,...,...
12792,176,1,94,3,2529.0,0,1,12,32,175000
12793,150,1,83,3,1382.0,2,0,27,40,48000
12794,176,1,165,3,1750.0,3,2,9,16,200000
12795,176,1,109,2,950.0,2,3,8,10,25500


In [75]:
from sklearn.preprocessing import LabelEncoder

society_encoder = LabelEncoder().fit(logdf1['society'])
logdf1['society'] = society_encoder.transform(logdf1['society'])

type_encoder = LabelEncoder().fit(logdf1['type'])
logdf1['type'] = type_encoder.transform(logdf1['type'])

furnishing_encoder = LabelEncoder().fit(logdf1['furnishing'])
logdf1['furnishing'] = furnishing_encoder.transform(logdf1['furnishing'])

location_encoder = LabelEncoder().fit(logdf1['location'])
logdf1['location'] = location_encoder.transform(logdf1['location'])

age_encoder = LabelEncoder().fit(logdf1['age'])
logdf1['age'] = age_encoder.transform(logdf1['age'])

In [76]:
logdf1

Unnamed: 0,society,type,location,bedrooms,built-up area,furnishing,age,floor,total floors,monthly rent
0,176,1,8,2,800.0,0,3,2,6,11.350407
1,176,1,63,2,1050.0,0,1,8,12,11.608236
2,36,1,128,2,1050.0,3,1,14,17,10.373491
3,20,1,88,3,2275.0,2,1,40,41,11.918391
4,8,1,150,1,550.0,2,3,4,7,10.203592
...,...,...,...,...,...,...,...,...,...,...
12792,176,1,94,3,2529.0,0,1,12,32,12.072541
12793,150,1,83,3,1382.0,2,0,27,40,10.778956
12794,176,1,165,3,1750.0,3,2,9,16,12.206073
12795,176,1,109,2,950.0,2,3,8,10,10.146434


##  Decision Tree Regressor model

In [77]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(df1.drop('monthly rent', axis = 1), df1['monthly rent'], test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((10237, 9), (2560, 9), (10237,), (2560,))

In [78]:
logx = logdf1.drop('monthly rent', axis = 1)
logy = logdf1['monthly rent']

logx_train, logx_test, logy_train, logy_test = train_test_split(logx, logy, test_size=0.2, random_state=42)
logx_train.shape, logx_test.shape, logy_train.shape, logy_test.shape

((10237, 9), (2560, 9), (10237,), (2560,))

In [81]:
from sklearn.tree import DecisionTreeRegressor

dt_model = DecisionTreeRegressor()
dt_model.fit(x_train, y_train)

print('r2 score:',r2_score(y_test, dt_model.predict(x_test)))
print('rmse:',mean_squared_error(y_test, dt_model.predict(x_test))**.5)

r2 score: 0.6579468328444391
rmse: 21030.336169188467


In [82]:
dt_model = DecisionTreeRegressor()
dt_model.fit(logx_train,logy_train)
print('r2 score:',r2_score(logy_test, dt_model.predict(logx_test)))
print('rmse:',mean_squared_error(logy_test, dt_model.predict(logx_test))**.5)

r2 score: 0.7561082684828557
rmse: 0.36428820235775905


In [94]:
Final_Scores['Decision tree Regression %'] = [r2_score(logy_train, dt_model.predict(logx_train))*100, 
                                       r2_score(logy_test, dt_model.predict(logx_test))*100]

## Random Forest Regressor model

In [85]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor()
rf_model.fit(x_train, y_train)

print('r2 score:',r2_score(y_test, rf_model.predict(x_test)))
print('rmse:',mean_squared_error(y_test, rf_model.predict(x_test))**.5)

r2 score: 0.8389953294627807
rmse: 14428.415627211989


In [89]:
rf_model = RandomForestRegressor()
rf_model.fit(logx_train,logy_train)
print('r2 score:',r2_score(logy_test, rf_model.predict(logx_test)))
print('rmse:',mean_squared_error(logy_test, rf_model.predict(logx_test))**.5)

r2 score: 0.8566913739905627
rmse: 0.279243282332012


In [106]:
Final_Scores['Random Forest Regression %'] = [r2_score(logy_train, rf_model.predict(logx_train))*100, 
                                       r2_score(logy_test, rf_model.predict(logx_test))*100]

# XGBoost

In [101]:
from xgboost import XGBRegressor

xgb = XGBRegressor()
xgb.fit(x_train, y_train)

print('r2 score:',r2_score(y_test, xgb.predict(x_test)))
print('rmse:',mean_squared_error(y_test, xgb.predict(x_test))**.5)

r2 score: 0.7731322486321677
rmse: 17127.177166286638


In [102]:
xgb = XGBRegressor()
xgb.fit(logx_train,logy_train)
print('r2 score:',r2_score(logy_test, xgb.predict(logx_test)))
print('rmse:',mean_squared_error(logy_test, xgb.predict(logx_test))**.5)

r2 score: 0.7945377409766139
rmse: 0.33435867654605317


In [105]:
Final_Scores['XGBoost Regression %'] = [r2_score(logy_train, xgb.predict(logx_train))*100, 
                                       r2_score(logy_test, xgb.predict(logx_test))*100]

#Final_Scores 

In [107]:
Final_Scores

Unnamed: 0,Linear Regression %,Ridge Regression %,Lasso Regression %,Decision tree Regression %,Random Forest Regression %,XGBoost Regression %
With Training Data,53.625912,-189.507577,-189.510983,99.884222,97.805231,80.429553
With Test Data,53.026285,-188.784523,-188.78794,75.610827,85.669137,79.453774
