# Problem Statement:-
- We are given the listings of one of the most popular cities in central Europe: Amsterdam.
- We have to build a machine learning model that will automatically predict the price for lodging or homestays.

In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')

In [2]:
data = pd.read_csv('airbnb_listing_train.csv')
data.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,11602914,Luxury home 100m2+garden in center!,3123809,Marjolein,,De Pijp - Rivierenbuurt,52.35368,4.90177,Entire home/apt,3,13,15-02-2020,0.27,1,0,220
1,13289321,East is where your home is!,10259430,Ana,,Oud-Oost,52.35728,4.92052,Entire home/apt,4,14,19-06-2019,0.29,1,0,110
2,40779315,Amsterdam Central - LUXURY house in BUSY STREET,224969266,Mark,,Centrum-West,52.37582,4.8985,Entire home/apt,2,9,16-03-2020,1.65,1,7,100
3,7820311,great appartment down town,693472,Ayden,,Westerpark,52.37792,4.873,Entire home/apt,3,42,17-02-2020,0.72,1,0,130
4,27346603,Private room with canal view in peaceful area,41888346,Alissa,,Westerpark,52.38051,4.87493,Private room,2,89,26-02-2020,4.02,1,24,90


In [3]:
data.shape

(12901, 16)

# Observation:-
- There are 12901 rows and 16 columns in dataset.
- Each row represent the info of individual person.

In [4]:
# Checking for null values in dataset.
data.isnull().sum()

id                                    0
name                                 22
host_id                               0
host_name                             8
neighbourhood_group               12901
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
minimum_nights                        0
number_of_reviews                     0
last_review                        1596
reviews_per_month                  1596
calculated_host_listings_count        0
availability_365                      0
price                                 0
dtype: int64

In [5]:
# Checking the unique value in dataset
data.nunique()

id                                12901
name                              12653
host_id                           11601
host_name                          4536
neighbourhood_group                   0
neighbourhood                        22
latitude                           5227
longitude                          7887
room_type                             4
minimum_nights                       56
number_of_reviews                   362
last_review                        1495
reviews_per_month                   610
calculated_host_listings_count       26
availability_365                    366
price                               400
dtype: int64

# dropping unusual columns from dataset.

In [6]:
col_drop_list = ['id','name','host_id','host_name','neighbourhood_group','latitude','longitude']

In [7]:
data.drop(col_drop_list,1,inplace=True)

In [8]:
data.isnull().sum()

neighbourhood                        0
room_type                            0
minimum_nights                       0
number_of_reviews                    0
last_review                       1596
reviews_per_month                 1596
calculated_host_listings_count       0
availability_365                     0
price                                0
dtype: int64

In [9]:
# Checking % of null value in dataset
data.isnull().sum()/len(data) * 100

neighbourhood                      0.000000
room_type                          0.000000
minimum_nights                     0.000000
number_of_reviews                  0.000000
last_review                       12.371134
reviews_per_month                 12.371134
calculated_host_listings_count     0.000000
availability_365                   0.000000
price                              0.000000
dtype: float64

In [10]:
data.dtypes

neighbourhood                      object
room_type                          object
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
price                               int64
dtype: object

In [11]:
# data['last_review'] = pd.to_datetime(data['last_review'],format="%m/%d/%Y")

In [12]:
data.head()

Unnamed: 0,neighbourhood,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,price
0,De Pijp - Rivierenbuurt,Entire home/apt,3,13,15-02-2020,0.27,1,0,220
1,Oud-Oost,Entire home/apt,4,14,19-06-2019,0.29,1,0,110
2,Centrum-West,Entire home/apt,2,9,16-03-2020,1.65,1,7,100
3,Westerpark,Entire home/apt,3,42,17-02-2020,0.72,1,0,130
4,Westerpark,Private room,2,89,26-02-2020,4.02,1,24,90


# Missing value treatment 

In [13]:
# missing value treatment in reviews_per_month and last_review
from sklearn.impute import SimpleImputer
si_num = SimpleImputer(strategy='mean')
si_cat = SimpleImputer(strategy='most_frequent')
si_num,si_cat

(SimpleImputer(), SimpleImputer(strategy='most_frequent'))

In [14]:
si_num

In [15]:
si_cat

In [16]:
data['last_review']=si_cat.fit_transform(data[['last_review']])

In [17]:
data['reviews_per_month'] = si_num.fit_transform(data[['reviews_per_month']])

In [18]:
data.isnull().sum()

neighbourhood                     0
room_type                         0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
price                             0
dtype: int64

All the null values have been fixed.

In [19]:
data.nunique()

neighbourhood                       22
room_type                            4
minimum_nights                      56
number_of_reviews                  362
last_review                       1495
reviews_per_month                  611
calculated_host_listings_count      26
availability_365                   366
price                              400
dtype: int64

In [20]:
data.dtypes

neighbourhood                      object
room_type                          object
minimum_nights                      int64
number_of_reviews                   int64
last_review                        object
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
price                               int64
dtype: object

In [21]:
# Chaning datatypes of last_review column to datetime
data['last_review'] = pd.to_datetime(data['last_review'])

In [22]:
# Extracting month and year from last_review column
data['year'] = data['last_review'].dt.year
data['month'] = data['last_review'].dt.month

Since, we have extract year and month from last_review column so it has no use for us.

In [23]:
# droping last_review column
data.drop('last_review',1,inplace=True)

In [24]:
# Outliers treatement 
def remove_outiler(data,col):
    q3 = data[col].quantile(0.75)
    q1 = data[col].quantile(0.25)
    iqr = q3-q1
    uw = q3+1.5*iqr
    lw = q1-1.5*iqr
    data[col] = np.where(data[col]>uw,data[col].quantile(0.85),data[col])
    data[col] = np.where(data[col]<lw,data[col].quantile(0.85),data[col])   
    
    
    

# Removing outliers from different columns

In [25]:
remove_outiler(data,'minimum_nights')

In [26]:
remove_outiler(data,'number_of_reviews')
remove_outiler(data,'reviews_per_month')
remove_outiler(data,'calculated_host_listings_count')
remove_outiler(data,'availability_365')
remove_outiler(data,'price')

In [27]:
data.skew()

minimum_nights                    0.151528
number_of_reviews                 0.949930
reviews_per_month                 0.447669
calculated_host_listings_count    1.507583
availability_365                  1.183311
price                             0.616179
year                             -1.225742
month                            -0.117989
dtype: float64

All the outliers from the data have been fixed.

In [28]:
data.describe()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price,year,month
count,12901.0,12901.0,12901.0,12901.0,12901.0,12901.0,12901.0,12901.0
mean,2.485311,14.402294,0.521705,1.199054,44.415007,145.525541,2018.884505,6.329044
std,0.99247,14.657009,0.360886,0.399304,69.29924,61.845705,1.289069,3.31113
min,1.0,0.0,0.01,1.0,0.0,6.0,2012.0,1.0
25%,2.0,2.0,0.2,1.0,0.0,99.0,2018.0,3.0
50%,2.0,9.0,0.47,1.0,0.0,135.0,2019.0,7.0
75%,3.0,24.0,0.748501,1.0,87.0,190.0,2020.0,9.0
max,4.0,57.0,1.57,2.0,217.0,325.0,2020.0,12.0


#  Applying Feature Scaling

In [29]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
mms

In [30]:
col_for_std =['minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']

In [31]:
data[col_for_std]=mms.fit_transform(data[col_for_std])

In [32]:
data.head()

Unnamed: 0,neighbourhood,room_type,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price,year,month
0,De Pijp - Rivierenbuurt,Entire home/apt,0.666667,0.22807,0.166667,0.0,0.0,220.0,2020,2
1,Oud-Oost,Entire home/apt,1.0,0.245614,0.179487,0.0,0.0,110.0,2019,6
2,Centrum-West,Entire home/apt,0.333333,0.157895,0.653846,0.0,0.032258,100.0,2020,3
3,Westerpark,Entire home/apt,0.666667,0.736842,0.455128,0.0,0.0,130.0,2020,2
4,Westerpark,Private room,0.333333,0.684211,0.653846,0.0,0.110599,90.0,2020,2


# Applying One hot encoding

In [33]:
data_neighbour = pd.get_dummies(data['neighbourhood'])

In [34]:
data_room_type = pd.get_dummies(data['room_type'])

In [35]:
data_neighbour

Unnamed: 0,Bijlmer-Centrum,Bijlmer-Oost,Bos en Lommer,Buitenveldert - Zuidas,Centrum-Oost,Centrum-West,De Aker - Nieuw Sloten,De Baarsjes - Oud-West,De Pijp - Rivierenbuurt,Gaasperdam - Driemond,...,Noord-Oost,Noord-West,Oostelijk Havengebied - Indische Buurt,Osdorp,Oud-Noord,Oud-Oost,Slotervaart,Watergraafsmeer,Westerpark,Zuid
0,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12896,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
12897,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12898,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
12899,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
data_room_type

Unnamed: 0,Entire home/apt,Hotel room,Private room,Shared room
0,1,0,0,0
1,1,0,0,0
2,1,0,0,0
3,1,0,0,0
4,0,0,1,0
...,...,...,...,...
12896,1,0,0,0
12897,1,0,0,0
12898,1,0,0,0
12899,1,0,0,0


In [37]:
# merging these two in data
data = pd.concat([data,data_room_type,data_neighbour],axis=1)

In [38]:
# droping neighbourhood and room_type from data
data.drop(['neighbourhood','room_type'],1,inplace=True)

In [39]:
data.head()

Unnamed: 0,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,price,year,month,Entire home/apt,Hotel room,...,Noord-Oost,Noord-West,Oostelijk Havengebied - Indische Buurt,Osdorp,Oud-Noord,Oud-Oost,Slotervaart,Watergraafsmeer,Westerpark,Zuid
0,0.666667,0.22807,0.166667,0.0,0.0,220.0,2020,2,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,0.245614,0.179487,0.0,0.0,110.0,2019,6,1,0,...,0,0,0,0,0,1,0,0,0,0
2,0.333333,0.157895,0.653846,0.0,0.032258,100.0,2020,3,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0.666667,0.736842,0.455128,0.0,0.0,130.0,2020,2,1,0,...,0,0,0,0,0,0,0,0,1,0
4,0.333333,0.684211,0.653846,0.0,0.110599,90.0,2020,2,0,0,...,0,0,0,0,0,0,0,0,1,0


#### All the preprocessing is done for our dataset.

# Separating dataset into train and test set.

In [40]:
x = data.drop('price',1)
y = data['price']

# Spliting dataset into train and test set.

In [41]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

# Applying linear Regression to dataset

In [42]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr

In [43]:
lr.fit(x_train,y_train)

# Predicting value on x_test

In [44]:
y_pred = lr.predict(x_test)
y_pred

array([ 68.75390625, 138.29296875, 132.7890625 , ..., 125.71875   ,
       139.8515625 , 157.12890625])

In [45]:
# value of m 
m = lr.coef_
m

array([ 3.30556873e+00, -1.13953553e+01, -1.47649297e+01,  6.87346954e-01,
        3.58725878e+01,  7.10507551e+00,  6.07096316e-01, -2.41217965e+13,
       -2.41217965e+13, -2.41217965e+13, -2.41217965e+13, -1.40371634e+10,
       -1.40371634e+10, -1.40371634e+10, -1.40371633e+10, -1.40371633e+10,
       -1.40371633e+10, -1.40371633e+10, -1.40371633e+10, -1.40371633e+10,
       -1.40371634e+10, -1.40371634e+10, -1.40371633e+10, -1.40371633e+10,
       -1.40371633e+10, -1.40371633e+10, -1.40371633e+10, -1.40371633e+10,
       -1.40371633e+10, -1.40371633e+10, -1.40371633e+10, -1.40371633e+10,
       -1.40371633e+10])

In [46]:
# value of c
c = lr.intercept_
c

24135833642672.613

In [47]:
# Checking accuracy of model
from sklearn.metrics import r2_score
r2_score(y_test,y_pred)

0.25744452705485243

In [48]:
from sklearn.model_selection import cross_val_score
cross_val_score(lr,x,y,cv=15).mean()

0.2546798616577801

In [50]:
# data.head()