In [1]:
# Importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
# Loading the data and creating DataFrame
df = pd.read_csv('./data/data_eda.csv')

In [3]:
# Checking the data
df.head()

Unnamed: 0,log_price,price,review_scores_rating,has_availability,property_type,instant_bookable,number_of_reviews_ltm,availability_60,review_scores_checkin,room_type,...,beds,host_response_time,host_is_superhost,bedrooms,latitude,minimum_nights,reviews_per_month,availability_90,bathrooms_text,availability_365
0,6.150603,469.0,100.0,True,14,False,0,0,10.0,0,...,7.0,1,False,5.0,43.6459,28,0.1,0,11,0
1,4.543295,94.0,97.0,True,21,True,0,60,10.0,2,...,1.0,0,False,1.0,43.6408,180,1.19,90,5,365
2,4.276666,72.0,95.0,True,36,True,0,60,10.0,2,...,1.0,1,False,1.0,43.69805,28,1.64,90,4,365
3,4.60517,100.0,93.0,True,14,False,2,0,10.0,0,...,2.0,4,False,2.0,43.63539,30,0.86,8,2,283
4,4.532599,93.0,99.0,True,24,False,1,58,10.0,2,...,2.0,3,False,2.0,43.74922,2,0.6,88,3,363


### In our dataframe I have both price and log_price. I will use both of them separately for modelling and compare the results.

### I will drop log_price first and do modelling on the remaining data

In [4]:
# Dropping the log_price
dfp = df.drop(['log_price'], axis = 1)
dfp.head()

Unnamed: 0,price,review_scores_rating,has_availability,property_type,instant_bookable,number_of_reviews_ltm,availability_60,review_scores_checkin,room_type,neighbourhood_cleansed,...,beds,host_response_time,host_is_superhost,bedrooms,latitude,minimum_nights,reviews_per_month,availability_90,bathrooms_text,availability_365
0,469.0,100.0,True,14,False,0,0,10.0,0,71,...,7.0,1,False,5.0,43.6459,28,0.1,0,11,0
1,94.0,97.0,True,21,True,0,60,10.0,2,122,...,1.0,0,False,1.0,43.6408,180,1.19,90,5,365
2,72.0,95.0,True,36,True,0,60,10.0,2,15,...,1.0,1,False,1.0,43.69805,28,1.64,90,4,365
3,100.0,93.0,True,14,False,2,0,10.0,0,109,...,2.0,4,False,2.0,43.63539,30,0.86,8,2,283
4,93.0,99.0,True,24,False,1,58,10.0,2,128,...,2.0,3,False,2.0,43.74922,2,0.6,88,3,363


In [5]:
# Split data into X and y
X = dfp.drop('price', axis = 1)
y = dfp['price']

In [6]:
# Checking X 
X

Unnamed: 0,review_scores_rating,has_availability,property_type,instant_bookable,number_of_reviews_ltm,availability_60,review_scores_checkin,room_type,neighbourhood_cleansed,accommodates,...,beds,host_response_time,host_is_superhost,bedrooms,latitude,minimum_nights,reviews_per_month,availability_90,bathrooms_text,availability_365
0,100.0,True,14,False,0,0,10.0,0,71,10,...,7.0,1,False,5.0,43.64590,28,0.10,0,11,0
1,97.0,True,21,True,0,60,10.0,2,122,2,...,1.0,0,False,1.0,43.64080,180,1.19,90,5,365
2,95.0,True,36,True,0,60,10.0,2,15,3,...,1.0,1,False,1.0,43.69805,28,1.64,90,4,365
3,93.0,True,14,False,2,0,10.0,0,109,5,...,2.0,4,False,2.0,43.63539,30,0.86,8,2,283
4,99.0,True,24,False,1,58,10.0,2,128,4,...,2.0,3,False,2.0,43.74922,2,0.60,88,3,363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10794,80.0,True,8,True,1,51,10.0,0,87,2,...,1.0,4,False,1.0,43.63780,1,1.00,81,7,81
10795,80.0,True,5,False,1,50,10.0,0,62,4,...,2.0,4,False,2.0,43.65734,2,1.00,70,2,70
10796,100.0,True,28,False,1,60,10.0,2,122,2,...,1.0,4,False,1.0,43.64015,1,1.00,89,4,89
10797,100.0,True,8,True,1,35,10.0,0,77,3,...,2.0,4,False,1.0,43.62431,2,1.00,65,2,65


In [7]:
# Checking y
y

0        469.0
1         94.0
2         72.0
3        100.0
4         93.0
         ...  
10794    114.0
10795     71.0
10796    195.0
10797    128.0
10798     88.0
Name: price, Length: 10799, dtype: float64

In [8]:
# Importing the train- test model from sklearn library
from sklearn.model_selection import train_test_split

In [9]:
# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [10]:
# Checking the length of different test and train sets
len(X_train),len(y_train),len(X_test),len(y_test)

(8639, 8639, 2160, 2160)

I am going to use 5 different machine learning models:

1. Linear Regression
2. Ridge Regression
3. Lasso Regression
4. K-Nearest Neighbors
5. Random Forest

### Using Linear Regression Model

In [11]:
# Importing LinearRegression model from sklearn library
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [12]:
# Fitting the model
lr.fit(X_train, y_train)

LinearRegression()

In [13]:
# Training Model Score
lr.score(X_train,y_train)

0.05906887048413623

In [14]:
#Making the predictions
y_preds = lr.predict(X_test)

In [15]:
# Making Predictions
y_preds

array([105.32905577, 161.15083412, -19.10102274, ..., 161.56875696,
       132.74998292,  79.35397132])

In [16]:
# Test Model Score
lr.score(X_test, y_test)

0.07407009899022432

In [17]:
# Importing mean_absolute_errora and mean_squared_error
from sklearn.metrics import mean_absolute_error, mean_squared_error
print(mean_absolute_error(y_test, y_preds))

75.12224073479874


In [18]:
# Calculating mean_squared_error
print(mean_squared_error(y_test, y_preds))

133812.21780540876


In [19]:
# Calculating mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, y_preds))
rmse

365.8035235005381

### Using Ridge Regression

In [20]:
# Importing the model
from sklearn.linear_model import Ridge
ri = Ridge()

In [21]:
# Fitting the model on training data
ri.fit(X_train, y_train)

Ridge()

In [22]:
# Finding Train model score
ri.score(X_train, y_train)

0.059062454469824055

In [23]:
# Making Predictions
yr_preds = ri.predict(X_test)

In [24]:
# Getting Predictions
yr_preds

array([105.3593108 , 161.11960548, -17.67035725, ..., 161.04968418,
       131.98675141,  80.72775815])

In [25]:
# Finding test model score
ri.score(X_test, y_test)

0.07386987328329353

In [26]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yr_preds))

74.98185731548095


In [27]:
# Calculating Mean Squared error
print(mean_squared_error(y_test, yr_preds))

133841.15373876275


In [28]:
# Calculating Root Mean Squared error
print(np.sqrt(mean_squared_error(y_test, yr_preds)))

365.84307255811575


### Using Lasso Regression

In [29]:
# Importing the model
from sklearn.linear_model import Lasso
la=Lasso()

In [30]:
# Fitting the model on training data
la.fit(X_train, y_train)

Lasso()

In [31]:
# Finding Train model score
la.score(X_train, y_train)

0.056442561277262104

In [32]:
# Making Predictions
yl_preds = la.predict(X_test)

In [33]:
# Getting Predictions
yl_preds

array([106.416984  , 153.53244181,  12.47678376, ..., 154.71535875,
       118.97319688, 103.93322531])

In [34]:
# Finding test model score
la.score(X_test, y_test)

0.06514240980300035

In [35]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yl_preds))

74.35390914983695


In [36]:
# Calculating Mean Squared  error
print(mean_squared_error(y_test, yl_preds))

135102.41686768873


In [37]:
# Calculating Root Mean squared error
print(np.sqrt(mean_squared_error(y_test, yl_preds)))

367.5628066979693


### Using K-Nearest Neighbors

In [38]:
# Importing the model
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

In [39]:
# Fitting the model on training data
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [40]:
# Finding Train model score
knn.score(X_train, y_train)

0.17857068557334177

In [41]:
# Making Predictions
yk_preds = knn.predict(X_test)

In [42]:
# Getting Predictions
yk_preds

array([110.2,  80.8, 137.4, ..., 153.6, 104. , 101.4])

In [43]:
# Finding test model score
knn.score(X_test, y_test)

0.037820311791262995

In [44]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yk_preds))

87.39407407407408


In [45]:
# Calculating Mean Squared error
print(mean_squared_error(y_test, yk_preds))

139050.91288888888


In [46]:
# Calculating Root Mean Squared error
print(np.sqrt(mean_squared_error(y_test, yk_preds)))

372.89531089689086


### Using Random Forest

In [47]:
# Importing the model
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()

In [48]:
# Fitting the model on training data
rf.fit(X_train, y_train)

RandomForestRegressor()

In [49]:
# Finding Train model score
rf.score(X_train, y_train)

0.8707496775180534

In [50]:
# Making Predictions
yrf_preds = rf.predict(X_test)

In [51]:
yrf_preds

array([102.32, 140.03,  35.06, ..., 151.62, 152.13,  86.31])

In [52]:
# Finding test model score
rf.score(X_test, y_test)

0.5193558390467832

In [53]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yrf_preds))

65.02390277777778


In [54]:
# Calculating Mean Squared error
print(mean_squared_error(y_test, yrf_preds))

69461.04784199076


In [55]:
# Calculating Root Mean Squared error
print(np.sqrt(mean_squared_error(y_test, yrf_preds)))

263.5546391964876


### I will drop price column first and do modelling on the remaining data

In [56]:
# Checking the original data
df.head()

Unnamed: 0,log_price,price,review_scores_rating,has_availability,property_type,instant_bookable,number_of_reviews_ltm,availability_60,review_scores_checkin,room_type,...,beds,host_response_time,host_is_superhost,bedrooms,latitude,minimum_nights,reviews_per_month,availability_90,bathrooms_text,availability_365
0,6.150603,469.0,100.0,True,14,False,0,0,10.0,0,...,7.0,1,False,5.0,43.6459,28,0.1,0,11,0
1,4.543295,94.0,97.0,True,21,True,0,60,10.0,2,...,1.0,0,False,1.0,43.6408,180,1.19,90,5,365
2,4.276666,72.0,95.0,True,36,True,0,60,10.0,2,...,1.0,1,False,1.0,43.69805,28,1.64,90,4,365
3,4.60517,100.0,93.0,True,14,False,2,0,10.0,0,...,2.0,4,False,2.0,43.63539,30,0.86,8,2,283
4,4.532599,93.0,99.0,True,24,False,1,58,10.0,2,...,2.0,3,False,2.0,43.74922,2,0.6,88,3,363


In [57]:
# Dropping the price column
dfp = df.drop(['price'], axis = 1)
dfp.head()

Unnamed: 0,log_price,review_scores_rating,has_availability,property_type,instant_bookable,number_of_reviews_ltm,availability_60,review_scores_checkin,room_type,neighbourhood_cleansed,...,beds,host_response_time,host_is_superhost,bedrooms,latitude,minimum_nights,reviews_per_month,availability_90,bathrooms_text,availability_365
0,6.150603,100.0,True,14,False,0,0,10.0,0,71,...,7.0,1,False,5.0,43.6459,28,0.1,0,11,0
1,4.543295,97.0,True,21,True,0,60,10.0,2,122,...,1.0,0,False,1.0,43.6408,180,1.19,90,5,365
2,4.276666,95.0,True,36,True,0,60,10.0,2,15,...,1.0,1,False,1.0,43.69805,28,1.64,90,4,365
3,4.60517,93.0,True,14,False,2,0,10.0,0,109,...,2.0,4,False,2.0,43.63539,30,0.86,8,2,283
4,4.532599,99.0,True,24,False,1,58,10.0,2,128,...,2.0,3,False,2.0,43.74922,2,0.6,88,3,363


In [58]:
# Split data into X and y
X = dfp.drop('log_price', axis = 1)
y = dfp['log_price']

In [59]:
# Checking X
X

Unnamed: 0,review_scores_rating,has_availability,property_type,instant_bookable,number_of_reviews_ltm,availability_60,review_scores_checkin,room_type,neighbourhood_cleansed,accommodates,...,beds,host_response_time,host_is_superhost,bedrooms,latitude,minimum_nights,reviews_per_month,availability_90,bathrooms_text,availability_365
0,100.0,True,14,False,0,0,10.0,0,71,10,...,7.0,1,False,5.0,43.64590,28,0.10,0,11,0
1,97.0,True,21,True,0,60,10.0,2,122,2,...,1.0,0,False,1.0,43.64080,180,1.19,90,5,365
2,95.0,True,36,True,0,60,10.0,2,15,3,...,1.0,1,False,1.0,43.69805,28,1.64,90,4,365
3,93.0,True,14,False,2,0,10.0,0,109,5,...,2.0,4,False,2.0,43.63539,30,0.86,8,2,283
4,99.0,True,24,False,1,58,10.0,2,128,4,...,2.0,3,False,2.0,43.74922,2,0.60,88,3,363
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10794,80.0,True,8,True,1,51,10.0,0,87,2,...,1.0,4,False,1.0,43.63780,1,1.00,81,7,81
10795,80.0,True,5,False,1,50,10.0,0,62,4,...,2.0,4,False,2.0,43.65734,2,1.00,70,2,70
10796,100.0,True,28,False,1,60,10.0,2,122,2,...,1.0,4,False,1.0,43.64015,1,1.00,89,4,89
10797,100.0,True,8,True,1,35,10.0,0,77,3,...,2.0,4,False,1.0,43.62431,2,1.00,65,2,65


In [60]:
# Checking y
y

0        6.150603
1        4.543295
2        4.276666
3        4.605170
4        4.532599
           ...   
10794    4.736198
10795    4.262680
10796    5.273000
10797    4.852030
10798    4.477337
Name: log_price, Length: 10799, dtype: float64

In [61]:
# Split into train and test data
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [62]:
# Checking the length of different test and train sets
len(X_train),len(y_train),len(X_test),len(y_test)

(8639, 8639, 2160, 2160)

### Using Linear Regression Model

In [63]:
# Importing LinearRegression model from sklearn library
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [64]:
# Fitting the model
lr.fit(X_train, y_train)

LinearRegression()

In [65]:
# Training Model Score
lr.score(X_train,y_train)

0.5229223248379324

In [66]:
#Making the predictions
y_preds = lr.predict(X_test)

In [67]:
y_preds

array([4.14223251, 5.73497392, 3.6785728 , ..., 3.615739  , 5.04075913,
       3.98066422])

In [68]:
# Test Model Score
lr.score(X_test, y_test)

0.5461744930742458

In [69]:
# Calculating Mean Absolute Error
print(mean_absolute_error(y_test, y_preds))

0.3438097139934095


In [70]:
# Calculating mean_squared_error
print(mean_squared_error(y_test, y_preds))

0.21150367292695735


In [71]:
# Calculating Root Mean Squared Error
print(np.sqrt(mean_squared_error(y_test, y_preds)))

0.4598952847409477


### Using Ridge Regression

In [72]:
# Importing the model
from sklearn.linear_model import Ridge
ri = Ridge()

In [73]:
# Fitting the model on training data
ri.fit(X_train, y_train)

Ridge()

In [74]:
# Finding Train model score
ri.score(X_train, y_train)

0.5228500647960767

In [75]:
# Making Predictions
yr_preds = ri.predict(X_test)

In [76]:
yr_preds

array([4.14795212, 5.73253621, 3.69308479, ..., 3.6258681 , 5.03912891,
       3.97773939])

In [77]:
# Finding test model score
ri.score(X_test, y_test)

0.5460559268701968

In [78]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yr_preds))

0.3437183087823847


In [79]:
# Calculating Mean Squared error
print(mean_squared_error(y_test, yr_preds))

0.21155893026101788


In [80]:
# Calculating Root Mean Squared error
print(np.sqrt(mean_squared_error(y_test, yr_preds)))

0.45995535681304756


### Using Lasso Regression

In [81]:
# Importing the model
from sklearn.linear_model import Lasso
la=Lasso()

In [82]:
# Fitting the model on training data
la.fit(X_train, y_train)

Lasso()

In [83]:
# Finding Train model score
la.score(X_train, y_train)

0.23619865291517295

In [84]:
# Making Predictions
yl_preds = la.predict(X_test)

In [85]:
# Getting Predictions
yl_preds

array([4.24582918, 4.67472089, 4.51267536, ..., 3.63919522, 4.72688874,
       4.52205469])

In [86]:
# Finding test model score
la.score(X_test, y_test)

0.23577730415399833

In [87]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yl_preds))

0.4511403543559708


In [88]:
# Calculating Mean Squared  error
print(mean_squared_error(y_test, yl_preds))

0.3561631169664819


In [89]:
# Calculating Root Mean squared error
print(np.sqrt(mean_squared_error(y_test, yl_preds)))

0.5967940322812234


### Using K-Nearest Neighbors

In [90]:
# Importing the model
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()

In [91]:
# Fitting the model on training data
knn.fit(X_train, y_train)

KNeighborsRegressor()

In [92]:
# Finding Train model score
knn.score(X_train, y_train)

0.5764916972293442

In [93]:
# Making Predictions
yk_preds = knn.predict(X_test)

In [94]:
yk_preds

array([4.55630313, 4.42783351, 4.50965797, ..., 2.70311879, 5.11797232,
       5.14281995])

In [95]:
# Finding test model score
knn.score(X_test, y_test)

0.3659891466558134

In [96]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yk_preds))

0.40164658397123


In [97]:
# Calculating Mean Squared error
print(mean_squared_error(y_test, yk_preds))

0.29547837684625333


In [98]:
# Calculating Root Mean Squared error
print(np.sqrt(mean_squared_error(y_test, yk_preds)))

0.5435792277545688


### Using Random Forest

In [99]:
# Importing the model
from sklearn.ensemble import RandomForestRegressor
rf=RandomForestRegressor()

In [100]:
# Fitting the model on training data
rf.fit(X_train, y_train)

RandomForestRegressor()

In [101]:
# Finding Train model score
rf.score(X_train, y_train)

0.9467118063979025

In [102]:
# Making Predictions
yrf_preds = rf.predict(X_test)
yrf_preds

array([4.24264455, 5.53152316, 3.80445024, ..., 2.71574102, 5.35096334,
       4.02541607])

In [103]:
# Finding test model score
rf.score(X_test, y_test)

0.6224484049259307

In [104]:
# Calculating Mean absolute error
print(mean_absolute_error(y_test, yrf_preds))

0.30393747867313975


In [105]:
# Calculating Mean Squared error
print(mean_squared_error(y_test, yrf_preds))

0.17595650279450037


In [106]:
# Calculating Root Mean Squared error
print(np.sqrt(mean_squared_error(y_test, yrf_preds)))

0.4194716948668889
