## Data Preparation

In [1]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# enable viewing all columns regardless how many
pd.set_option("display.max_columns", None)

In [3]:
# load the dataset and create a data frame
dataset = pd.read_csv("AB_NYC_2019.csv")
df = pd.DataFrame(dataset)
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# The size of the dataset
print("Number of rows: ", df.shape[0])
print("Number of columns: ", df.shape[1])

Number of rows:  48895
Number of columns:  16


In [5]:
# make all column names and values of categorical columns uniform in lowercase and replace spaces with underscores
df.columns = df.columns.str.lower().str.replace(" ", "_")

categorical_columns = list(df.dtypes[df.dtypes == "object"].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(" ", "_")

In [6]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,clean_&_quiet_apt_home_by_the_park,2787,john,brooklyn,kensington,40.64749,-73.97237,private_room,149,1,9,2018-10-19,0.21,6,365
1,2595,skylit_midtown_castle,2845,jennifer,manhattan,midtown,40.75362,-73.98377,entire_home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,the_village_of_harlem....new_york_!,4632,elisabeth,manhattan,harlem,40.80902,-73.9419,private_room,150,3,0,,,1,365
3,3831,cozy_entire_floor_of_brownstone,4869,lisaroxanne,brooklyn,clinton_hill,40.68514,-73.95976,entire_home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,entire_apt:_spacious_studio/loft_by_central_park,7192,laura,manhattan,east_harlem,40.79851,-73.94399,entire_home/apt,80,10,9,2018-11-19,0.1,1,0


In [7]:
# Data Frames's columns
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

In [8]:
# check for missing values
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In the whole dataset, <b>name, host_name, last_review and reviews_per_month</b> have missing values

In [9]:
# Extract a subset
df_subset= df[['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']]
df_subset.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,manhattan,private_room,40.80902,-73.9419,150,3,0,,1,365
3,brooklyn,entire_home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,manhattan,entire_home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [10]:
# calculate mean of price
mean = df_subset["price"].mean()
mean

152.7206871868289

### Make Price Binary Classes

Convert price from numerical values to binary values

In [11]:
# create two classes based on price
df_subset["above_average"] = np.where(df_subset["price"] >= mean, 1,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset["above_average"] = np.where(df_subset["price"] >= mean, 1,0)


In [12]:
# above_average feature values
df_subset["above_average"].head()

0    0
1    1
2    0
3    0
4    0
Name: above_average, dtype: int32

In [13]:
# confirm that above average is in the dataframe
df_subset.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365,0
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355,1
2,manhattan,private_room,40.80902,-73.9419,150,3,0,,1,365,0
3,brooklyn,entire_home/apt,40.68514,-73.95976,89,1,270,4.64,1,194,0
4,manhattan,entire_home/apt,40.79851,-73.94399,80,10,9,0.1,1,0,0


In [14]:
# drop the price column
df_subset = df_subset.drop("price", axis=1)

In [18]:
# check for a neighbohood group that is more frequent than others
df_subset.neighbourhood_group.value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

In [19]:
# you can also use the mode() method to check for a more frequent neighborhood group than others
df_subset.neighbourhood_group.mode()

0    manhattan
Name: neighbourhood_group, dtype: object

The most frequent neighbourhood group is <b>manhattan</b>

## Setting up the Validation Framework

In [20]:
# import train_test_split
from sklearn.model_selection import train_test_split

In [21]:
# split the dataframe into full train and test sets
df_full_train, df_test = train_test_split(df_subset, test_size=0.2, random_state=42)

In [22]:
# check the lengths of both full_train and test sets
len(df_full_train), len(df_test)

(39116, 9779)

In [23]:
# check the number of columns of of each set
print("Number fo columns of full train set: ", df_full_train.shape[1])
print("Number fo columns of test set: ", df_test.shape[1])

Number fo columns of full train set:  10
Number fo columns of test set:  10


In [24]:
#now split the full train into tran and validation sets
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [25]:
# check the lengths of train, validation and test sets
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [26]:
# check the number of columns of train, validation and test sets
print("Number fo columns of train set: ", df_train.shape[1])
print("Number fo columns of validation set: ", df_val.shape[1])
print("Number fo columns of test set: ", df_test.shape[1])

Number fo columns of train set:  10
Number fo columns of validation set:  10
Number fo columns of test set:  10


In [27]:
# reset index of all sets
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [28]:
# create y variables from all sets
y_train = df_train.above_average.values
y_val = df_val.above_average.values
y_test = df_test.above_average.values

In [29]:
# delete the target variable from all sets so that it is not accidentally used as X variable
del df_train["above_average"]
del df_val["above_average"]
del df_test["above_average"]

## Exploratory Data Analysis

- Check for missing values
- Look at the target variable(price)
- Look at numerical and categorical variables

In [30]:
# reset the index of the full dataset
df_full_train = df_full_train.reset_index(drop=True)

In [31]:
# check for null values in the dataset
df_full_train.isnull().sum()

neighbourhood_group                  0
room_type                            0
latitude                             0
longitude                            0
minimum_nights                       0
number_of_reviews                    0
reviews_per_month                 8106
calculated_host_listings_count       0
availability_365                     0
above_average                        0
dtype: int64

In this subset, only <b>reviews_per_month</b> have null values

In [33]:
# check for the features' dtypes
df_full_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
above_average                       int32
dtype: object

In [34]:
df_full_train.columns

Index(['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365', 'above_average'],
      dtype='object')

In [35]:
# extract numerical features
numerical = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count',
       'availability_365']

In [36]:
# extract categorical features
categorical = [ 'neighbourhood_group', 'room_type']

In [37]:
# check the number of unique values per each categorical feature
df_full_train[categorical].nunique()

neighbourhood_group    5
room_type              3
dtype: int64

## Feature Importance: Correlation Matrix

<b>Correlation</b> calculates feature importance for numerical variables with relation to the target variable <b>above_average</b>

In [39]:
# all numerical variables
df_full_train[numerical].head()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,40.71577,-73.9553,3,11,0.87,1,1
1,40.84917,-73.94048,2,2,0.16,1,0
2,40.68993,-73.95947,2,0,,2,0
3,40.68427,-73.93118,3,87,4.91,1,267
4,40.74705,-73.89564,5,13,0.25,1,0


In [40]:
# check for correlation of the numerical variables with the target variable
df_full_train[numerical].corrwith(df_full_train.above_average)

latitude                          0.056349
longitude                        -0.267426
minimum_nights                    0.031670
number_of_reviews                -0.054054
reviews_per_month                -0.032818
calculated_host_listings_count    0.171498
availability_365                  0.102663
dtype: float64

In [44]:
# calculate the correlation of the numerical features
df_full_train[numerical].corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080704,0.025497,-0.011836,-0.004397,0.020299,-0.008341
longitude,0.080704,1.0,-0.063498,0.05757,0.14122,-0.115289,0.082994
minimum_nights,0.025497,-0.063498,1.0,-0.07786,-0.116444,0.121748,0.140596
number_of_reviews,-0.011836,0.05757,-0.07786,1.0,0.544277,-0.072603,0.175428
reviews_per_month,-0.004397,0.14122,-0.116444,0.544277,1.0,-0.008412,0.188701
calculated_host_listings_count,0.020299,-0.115289,0.121748,-0.072603,-0.008412,1.0,0.223328
availability_365,-0.008341,0.082994,0.140596,0.175428,0.188701,0.223328,1.0


In the correlation results above, availability_365, calculated_host_listings_count, minimum_nights and latitude are positive meaning that the bigger they are, the more the price                    