# In-depth analysis: Prediction of booking scores 

Now, using all the features possible, we will try to inference relations between them to used for the prediction of booking scores.

In [1]:
import gzip
import json
import csv
import pandas as pd
import numpy as np
from scipy import stats
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
listing = pd.read_csv('../Data/raw/listings.csv.gz', 
                      compression='gzip',
                      error_bad_lines=False, 
                      low_memory=False)

In [4]:
listing.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       ...
       'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
      dtype='object', length=106)

Seaborn package offers the function `pairplot` that allows to create scatterplots of all the variables used as input. If you enter the name of your dataset, you get the visual relation between all the variables and you can start your analysis quickly. As we saw before, this dataset contains 106 columns in different formats. A lot of them neither categorical or numerical. Thus, we select carefully the columns that could it makes sense for the next analysis. How? Using as reference the previos analysis.

In this section we are using df_dataset that select only a couple of columns:

In [5]:
list_columns = ['id',
                'host_id',
                'host_since', 
                'host_response_time', 
                'host_response_rate', 
                'host_is_superhost',
                'neighbourhood_cleansed', 
                'room_type',
                'property_type', 
                'accommodates', 
                'bathrooms', 
                'bedrooms', 
                'beds', 
                'bed_type', 
                'amenities', 
                'price', 
                'extra_people', 
                'minimum_nights',
                'maximum_nights',
                'alendar_updated',
                'has_availability',
                'availability_30',
                'availability_60',
                'availability_90',
                'availability_365',
                'number_of_reviews',
                'number_of_reviews_ltm',
                'first_review',
                'last_review',
                'review_scores_rating',
                'review_scores_accuracy',
                'review_scores_cleanliness',
                'review_scores_checkin',
                'review_scores_communication',
                'review_scores_location',
                'review_scores_value',
                'calculated_host_listings_count',
                'calculated_host_listings_count_entire_homes',
                'calculated_host_listings_count_private_rooms',
                'calculated_host_listings_count_shared_rooms',
                'reviews_per_month']

In [6]:
df_dataset = listing.loc[:, list_columns].reindex()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [7]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_is_superhost,neighbourhood_cleansed,room_type,property_type,accommodates,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2818,3159,2008-09-24,within an hour,100%,t,Oostelijk Havengebied - Indische Buurt,Private room,Apartment,2,...,10.0,10.0,10.0,9.0,10.0,1,0,1,0,2.13
1,20168,59484,2009-12-02,within an hour,100%,f,Centrum-Oost,Private room,Townhouse,2,...,10.0,10.0,10.0,10.0,9.0,2,0,2,0,2.57
2,25428,56142,2009-11-20,within an hour,100%,f,Centrum-West,Entire home/apt,Apartment,3,...,10.0,10.0,10.0,10.0,10.0,2,2,0,0,0.13
3,27886,97647,2010-03-23,within an hour,100%,t,Centrum-West,Private room,Houseboat,2,...,10.0,10.0,10.0,10.0,10.0,1,0,1,0,2.14
4,28871,124245,2010-05-13,within an hour,100%,t,Centrum-West,Private room,Apartment,2,...,10.0,10.0,10.0,10.0,10.0,3,0,3,0,2.81


Transforming non-numerical `host_response_rate` to numerical:

In [8]:
def str_rate2int(rate):
    if type(rate) is str:
        return float(rate.replace("%", ""))
    else:
        return rate 

In [9]:
df_dataset['host_response_rate_float'] = df_dataset.host_response_rate.apply(str_rate2int)

In [10]:
def str2boolean(row):
    if row == 't':
        return 1
    elif row == 'f':
        return 0
    else:
        return np.nan

In [11]:
df_dataset['superhost'] = df_dataset.host_is_superhost.apply(str2boolean)

In [12]:
def price2float(string_price):
    return float(string_price.split('.')[0].replace('$', '').replace(',', ''))

In [13]:
df_dataset['price_float'] = df_dataset.price.apply(price2float)

In [14]:
df_dataset.drop(columns=['host_response_rate', 'host_is_superhost', 'price'], inplace=True)

In [15]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,host_response_time,neighbourhood_cleansed,room_type,property_type,accommodates,bathrooms,bedrooms,...,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_response_rate_float,superhost,price_float
0,2818,3159,2008-09-24,within an hour,Oostelijk Havengebied - Indische Buurt,Private room,Apartment,2,1.5,1.0,...,9.0,10.0,1,0,1,0,2.13,100.0,1.0,59.0
1,20168,59484,2009-12-02,within an hour,Centrum-Oost,Private room,Townhouse,2,1.0,1.0,...,10.0,9.0,2,0,2,0,2.57,100.0,0.0,80.0
2,25428,56142,2009-11-20,within an hour,Centrum-West,Entire home/apt,Apartment,3,1.0,1.0,...,10.0,10.0,2,2,0,0,0.13,100.0,0.0,125.0
3,27886,97647,2010-03-23,within an hour,Centrum-West,Private room,Houseboat,2,1.0,1.0,...,10.0,10.0,1,0,1,0,2.14,100.0,1.0,155.0
4,28871,124245,2010-05-13,within an hour,Centrum-West,Private room,Apartment,2,1.0,1.0,...,10.0,10.0,3,0,3,0,2.81,100.0,1.0,75.0


In [16]:
df_dataset.dropna(subset=['host_response_time'], inplace=True)

In [17]:
df_dataset['host_response_time'].unique()

array(['within an hour', 'within a day', 'within a few hours',
       'a few days or more'], dtype=object)

In [18]:
lb_make = LabelEncoder()
df_dataset['host_response_time_code'] = lb_make.fit_transform(df_dataset['host_response_time'])

In [19]:
df_dataset.drop(columns=['host_response_time'], inplace=True)

In [20]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,neighbourhood_cleansed,room_type,property_type,accommodates,bathrooms,bedrooms,beds,...,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_response_rate_float,superhost,price_float,host_response_time_code
0,2818,3159,2008-09-24,Oostelijk Havengebied - Indische Buurt,Private room,Apartment,2,1.5,1.0,2.0,...,10.0,1,0,1,0,2.13,100.0,1.0,59.0,3
1,20168,59484,2009-12-02,Centrum-Oost,Private room,Townhouse,2,1.0,1.0,1.0,...,9.0,2,0,2,0,2.57,100.0,0.0,80.0,3
2,25428,56142,2009-11-20,Centrum-West,Entire home/apt,Apartment,3,1.0,1.0,1.0,...,10.0,2,2,0,0,0.13,100.0,0.0,125.0,3
3,27886,97647,2010-03-23,Centrum-West,Private room,Houseboat,2,1.0,1.0,1.0,...,10.0,1,0,1,0,2.14,100.0,1.0,155.0,3
4,28871,124245,2010-05-13,Centrum-West,Private room,Apartment,2,1.0,1.0,1.0,...,10.0,3,0,3,0,2.81,100.0,1.0,75.0,3


In [21]:
df_dataset['neighbourhood_cleansed_code'] = lb_make.fit_transform(df_dataset['neighbourhood_cleansed'])

In [22]:
df_dataset['room_type_code'] = lb_make.fit_transform(df_dataset['room_type'])

In [23]:
df_dataset['property_type_code'] = lb_make.fit_transform(df_dataset['property_type'])

In [24]:
df_dataset.drop(columns=['neighbourhood_cleansed', 'room_type', 'property_type'], inplace=True)

In [25]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,accommodates,bathrooms,bedrooms,beds,bed_type,amenities,extra_people,...,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_response_rate_float,superhost,price_float,host_response_time_code,neighbourhood_cleansed_code,room_type_code,property_type_code
0,2818,3159,2008-09-24,2,1.5,1.0,2.0,Real Bed,"{Internet,Wifi,""Paid parking off premises"",""Bu...",$20.00,...,1,0,2.13,100.0,1.0,59.0,3,14,2,1
1,20168,59484,2009-12-02,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,""Paid parking off premises"",...",$0.00,...,2,0,2.57,100.0,0.0,80.0,3,4,2,29
2,25428,56142,2009-11-20,3,1.0,1.0,1.0,Real Bed,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",$10.00,...,0,0,0.13,100.0,0.0,125.0,3,5,0,1
3,27886,97647,2010-03-23,2,1.0,1.0,1.0,Real Bed,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",$0.00,...,1,0,2.14,100.0,1.0,155.0,3,5,2,21
4,28871,124245,2010-05-13,2,1.0,1.0,1.0,Real Bed,"{Internet,Wifi,""Pets live on this property"",Ca...",$0.00,...,3,0,2.81,100.0,1.0,75.0,3,5,2,1


In [None]:
# change bed_type and extra_people