## Data Preparation

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# enable viewing all columns regardless how many
pd.set_option("display.max_columns", None)

In [4]:
# load the dataset and create a data frame
dataset = pd.read_csv("AB_NYC_2019.csv")
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [7]:
# The size of the dataset
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

Number of rows:  48895
Number of columns:  16


In [5]:
# make all column names and values of categorical columns uniform in lowercase and replace spaces with underscores
df.columns = df.columns.str.lower().str.replace(" ", "_")

categorical_columns = list(df.dtypes[df.dtypes == "object"].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(" ", "_")

In [6]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,clean_&_quiet_apt_home_by_the_park,2787,john,brooklyn,kensington,40.64749,-73.97237,private_room,149,1,9,2018-10-19,0.21,6,365
1,2595,skylit_midtown_castle,2845,jennifer,manhattan,midtown,40.75362,-73.98377,entire_home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,the_village_of_harlem....new_york_!,4632,elisabeth,manhattan,harlem,40.80902,-73.9419,private_room,150,3,0,,,1,365
3,3831,cozy_entire_floor_of_brownstone,4869,lisaroxanne,brooklyn,clinton_hill,40.68514,-73.95976,entire_home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,entire_apt:_spacious_studio/loft_by_central_park,7192,laura,manhattan,east_harlem,40.79851,-73.94399,entire_home/apt,80,10,9,2018-11-19,0.1,1,0


In [31]:
# Data Frames's columns
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [73]:
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In the whole dataset, <b>name, host_name, last_review and reviews_per_month</b> have missing values

In [37]:
df_subset= df[['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']]
df_subset.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,manhattan,private_room,40.80902,-73.9419,150,3,0,,1,365
3,brooklyn,entire_home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,manhattan,entire_home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [38]:
df_subset.neighbourhood_group.value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

The most frequent neighbourhood group is <b>manhattan</b>

## Setting up the Validation Framework

In [47]:
# import train_test_split
from sklearn.model_selection import train_test_split

In [48]:
# split the dataframe into full train and test sets
df_full_train, df_test = train_test_split(df_subset, test_size=0.2, random_state=42)

In [49]:
# check the lengths of both full_train and test sets
len(df_full_train), len(df_test)

(39116, 9779)

In [56]:
# check the number of columns of of each set
print("Number fo columns of full train set: ", df_full_train.shape[1])
print("Number fo columns of test set: ", df_test.shape[1])

Number fo columns of full train set:  10
Number fo columns of test set:  10


In [57]:
#now split the full train into tran and validation sets
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [58]:
# check the lengths of train, validation and test sets
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [61]:
# check the number of columns of train, validation and test sets
print("Number fo columns of train set: ", df_train.shape[1])
print("Number fo columns of validation set: ", df_val.shape[1])
print("Number fo columns of test set: ", df_test.shape[1])

Number fo columns of train set:  10
Number fo columns of validation set:  10
Number fo columns of test set:  10


In [65]:
# reset index of al sets
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [66]:
# create y variables from all sets
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [67]:
# delete the target variable from all sets so that it is not accidentally used as X variable
del df_train["price"]
del df_val["price"]
del df_test["price"]

## Exploratory Data Analysis

- Check for missing values
- Look at the target variable(price)
- Look at numerical and categorical variables

In [71]:
# reset the index of the full dataset
df_full_train = df_full_train.reset_index(drop=True)

In [72]:
# check for null values in the dataset
df_full_train.isnull().sum()

neighbourhood_group                  0
room_type                            0
latitude                             0
longitude                            0
price                                0
minimum_nights                       0
number_of_reviews                    0
reviews_per_month                 8106
calculated_host_listings_count       0
availability_365                     0
dtype: int64

In this subset, only <b>reviews_per_month</b> have null values

In [74]:
# mean price
df_full_train.price.mean()

153.75158502914408

In [75]:
# highest price
df_full_train.price.max()

10000

In [76]:
# check for the features' dtypes
df_full_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [77]:
df_full_train.columns

Index(['neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [78]:
# extract numerical features
numerical = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365']

In [83]:
# extract categorical features
categorical = [ 'neighbourhood_group', 'room_type']

In [85]:
# check the number of unique values per each categorical feature
df_full_train[categorical].nunique()

neighbourhood_group    5
room_type              3
dtype: int64

## Feature Importance: Correlation Matrix

<b>Correlation</b> calculates feature importance for numerical variables with relation to the target variable <b>price</b>

In [86]:
# all numerical variables
df_full_train[numerical]

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.71577,-73.95530,3,11,0.87,1,1
1,40.84917,-73.94048,2,2,0.16,1,0
2,40.68993,-73.95947,2,0,,2,0
3,40.68427,-73.93118,3,87,4.91,1,267
4,40.74705,-73.89564,5,13,0.25,1,0
...,...,...,...,...,...,...,...
39111,40.84650,-73.94319,1,0,,1,0
39112,40.73957,-74.00082,2,4,1.90,1,76
39113,40.78318,-73.97372,30,1,0.34,5,261
39114,40.77508,-73.97990,2,11,0.13,1,2


In [94]:
# check for correlation of the numerical variables with the target variable
df_full_train[numerical].corrwith(df_full_train.price)

latitude                          0.035015
longitude                        -0.149080
minimum_nights                    0.042740
number_of_reviews                -0.048926
reviews_per_month                -0.031568
calculated_host_listings_count    0.055336
availability_365                  0.080562
dtype: float64

Price and availability_365 pair have the highest correlation

In [95]:
# check for the order of importance using abs()
df_full_train[numerical].corrwith(df_full_train.price).abs()

latitude                          0.035015
longitude                         0.149080
minimum_nights                    0.042740
number_of_reviews                 0.048926
reviews_per_month                 0.031568
calculated_host_listings_count    0.055336
availability_365                  0.080562
dtype: float64

In the correlation results above, availability_365, calculated_host_listings_count, minimum_nights and latitude are positive meaning that the bigger they are, the more the price                    