# In-depth analysis: Prediction of booking scores 

Now, using all the features possible, we will try to inference relations between them to used for the prediction of booking scores.

In [1]:
import gzip
import json
import csv
import pandas as pd
import numpy as np
from scipy import stats
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
from sklearn.preprocessing import LabelEncoder

In [3]:
listing = pd.read_csv('../Data/raw/listings.csv.gz', 
                      compression='gzip',
                      error_bad_lines=False, 
                      low_memory=False)

In [4]:
listing.columns

Index(['id', 'listing_url', 'scrape_id', 'last_scraped', 'name', 'summary',
       'space', 'description', 'experiences_offered', 'neighborhood_overview',
       ...
       'instant_bookable', 'is_business_travel_ready', 'cancellation_policy',
       'require_guest_profile_picture', 'require_guest_phone_verification',
       'calculated_host_listings_count',
       'calculated_host_listings_count_entire_homes',
       'calculated_host_listings_count_private_rooms',
       'calculated_host_listings_count_shared_rooms', 'reviews_per_month'],
      dtype='object', length=106)

Seaborn package offers the function `pairplot` that allows to create scatterplots of all the variables used as input. If you enter the name of your dataset, you get the visual relation between all the variables and you can start your analysis quickly. As we saw before, this dataset contains 106 columns in different formats. A lot of them neither categorical or numerical. Thus, we select carefully the columns that could it makes sense for the next analysis. How? Using as reference the previos analysis.

In this section we are using df_dataset that select only a couple of columns:

In [5]:
list_columns = ['id',
                'host_id',
                'host_since', 
                'host_response_time', 
                'host_response_rate', 
                'host_is_superhost',
                'neighbourhood_cleansed', 
                'room_type',
                'property_type', 
                'accommodates', 
                'bathrooms', 
                'bedrooms', 
                'beds', 
                'bed_type', 
                'amenities', 
                'price', 
                'extra_people', 
                'minimum_nights',
                'maximum_nights',
                'alendar_updated',
                'has_availability',
                'availability_30',
                'availability_60',
                'availability_90',
                'availability_365',
                'number_of_reviews',
                'number_of_reviews_ltm',
                'first_review',
                'last_review',
                'review_scores_rating',
                'review_scores_accuracy',
                'review_scores_cleanliness',
                'review_scores_checkin',
                'review_scores_communication',
                'review_scores_location',
                'review_scores_value',
                'calculated_host_listings_count',
                'calculated_host_listings_count_entire_homes',
                'calculated_host_listings_count_private_rooms',
                'calculated_host_listings_count_shared_rooms',
                'reviews_per_month']

In [6]:
df_dataset = listing.loc[:, list_columns].reindex()

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [7]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,host_response_time,host_response_rate,host_is_superhost,neighbourhood_cleansed,room_type,property_type,accommodates,...,review_scores_cleanliness,review_scores_checkin,review_scores_communication,review_scores_location,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month
0,2818,3159,2008-09-24,within an hour,100%,t,Oostelijk Havengebied - Indische Buurt,Private room,Apartment,2,...,10.0,10.0,10.0,9.0,10.0,1,0,1,0,2.13
1,20168,59484,2009-12-02,within an hour,100%,f,Centrum-Oost,Private room,Townhouse,2,...,10.0,10.0,10.0,10.0,9.0,2,0,2,0,2.57
2,25428,56142,2009-11-20,within an hour,100%,f,Centrum-West,Entire home/apt,Apartment,3,...,10.0,10.0,10.0,10.0,10.0,2,2,0,0,0.13
3,27886,97647,2010-03-23,within an hour,100%,t,Centrum-West,Private room,Houseboat,2,...,10.0,10.0,10.0,10.0,10.0,1,0,1,0,2.14
4,28871,124245,2010-05-13,within an hour,100%,t,Centrum-West,Private room,Apartment,2,...,10.0,10.0,10.0,10.0,10.0,3,0,3,0,2.81


Transforming non-numerical `host_response_rate` to numerical:

In [8]:
def str_rate2int(rate):
    if type(rate) is str:
        return float(rate.replace("%", ""))
    else:
        return rate 

In [9]:
df_dataset['host_response_rate_float'] = df_dataset.host_response_rate.apply(str_rate2int)

In [10]:
def str2boolean(row):
    if row == 't':
        return 1
    elif row == 'f':
        return 0
    else:
        return np.nan

In [11]:
df_dataset['superhost'] = df_dataset.host_is_superhost.apply(str2boolean)

In [12]:
def price2float(string_price):
    return float(string_price.split('.')[0].replace('$', '').replace(',', ''))

In [13]:
df_dataset['price_float'] = df_dataset.price.apply(price2float)

In [14]:
df_dataset['extra_people_float'] = df_dataset.extra_people.apply(price2float)

In [15]:
df_dataset.drop(columns=['host_response_rate', 'host_is_superhost', 'price', 'extra_people'], inplace=True)

In [16]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,host_response_time,neighbourhood_cleansed,room_type,property_type,accommodates,bathrooms,bedrooms,...,review_scores_value,calculated_host_listings_count,calculated_host_listings_count_entire_homes,calculated_host_listings_count_private_rooms,calculated_host_listings_count_shared_rooms,reviews_per_month,host_response_rate_float,superhost,price_float,extra_people_float
0,2818,3159,2008-09-24,within an hour,Oostelijk Havengebied - Indische Buurt,Private room,Apartment,2,1.5,1.0,...,10.0,1,0,1,0,2.13,100.0,1.0,59.0,20.0
1,20168,59484,2009-12-02,within an hour,Centrum-Oost,Private room,Townhouse,2,1.0,1.0,...,9.0,2,0,2,0,2.57,100.0,0.0,80.0,0.0
2,25428,56142,2009-11-20,within an hour,Centrum-West,Entire home/apt,Apartment,3,1.0,1.0,...,10.0,2,2,0,0,0.13,100.0,0.0,125.0,10.0
3,27886,97647,2010-03-23,within an hour,Centrum-West,Private room,Houseboat,2,1.0,1.0,...,10.0,1,0,1,0,2.14,100.0,1.0,155.0,0.0
4,28871,124245,2010-05-13,within an hour,Centrum-West,Private room,Apartment,2,1.0,1.0,...,10.0,3,0,3,0,2.81,100.0,1.0,75.0,0.0


In [17]:
df_dataset.dropna(subset=['host_response_time'], inplace=True)

In [18]:
df_dataset['host_response_time'].unique()

array(['within an hour', 'within a day', 'within a few hours',
       'a few days or more'], dtype=object)

#### Label Encoding

Encode target labels with value between 0 and n_classes-1. Label encoding is applying to different columns. To avoid redundate code, the function `label_encoding` is built:

In [19]:
def label_encoding(df, target_column, replace_column=False):
    """
    This method receive a dataframe and a string as the name of the column encoded. 
    It returns a column into the dataframe. The new column could replace the original 
    turning replace_column to True.
    """
    lb_make = LabelEncoder()
    encoded_name = lb_make.fit_transform(df[target_column])
    if replace_column:
        df.drop(columns=[target_column], inplace=True)
        
    return encoded_name

In [20]:
df_dataset['host_response_time_encode'] = label_encoding(df_dataset, 
                                                         'host_response_time', 
                                                         replace_column=True)

In [21]:
df_dataset['neighbourhood_cleansed_encode'] = label_encoding(df_dataset, 
                                                             'neighbourhood_cleansed', 
                                                             replace_column=True)

In [22]:
df_dataset['room_type_encode'] = label_encoding(df_dataset, 
                                                'room_type', 
                                                replace_column=True)

In [23]:
df_dataset['property_type_encode'] = label_encoding(df_dataset, 
                                                    'property_type', 
                                                    replace_column=True)

In [24]:
df_dataset['bed_type_encode'] = label_encoding(df_dataset, 
                                               'bed_type', 
                                               replace_column=True)

In [25]:
df_dataset.head()

Unnamed: 0,id,host_id,host_since,accommodates,bathrooms,bedrooms,beds,amenities,minimum_nights,maximum_nights,...,reviews_per_month,host_response_rate_float,superhost,price_float,extra_people_float,host_response_time_encode,neighbourhood_cleansed_encode,room_type_encode,property_type_encode,bed_type_encode
0,2818,3159,2008-09-24,2,1.5,1.0,2.0,"{Internet,Wifi,""Paid parking off premises"",""Bu...",3,15,...,2.13,100.0,1.0,59.0,20.0,3,14,2,1,4
1,20168,59484,2009-12-02,2,1.0,1.0,1.0,"{TV,Internet,Wifi,""Paid parking off premises"",...",1,1000,...,2.57,100.0,0.0,80.0,0.0,3,4,2,29,4
2,25428,56142,2009-11-20,3,1.0,1.0,1.0,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",14,60,...,0.13,100.0,0.0,125.0,10.0,3,5,0,1,4
3,27886,97647,2010-03-23,2,1.0,1.0,1.0,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",2,730,...,2.14,100.0,1.0,155.0,0.0,3,5,2,21,4
4,28871,124245,2010-05-13,2,1.0,1.0,1.0,"{Internet,Wifi,""Pets live on this property"",Ca...",2,1825,...,2.81,100.0,1.0,75.0,0.0,3,5,2,1,4


In the notebook `analysis_neighborhoods`, we got the main amenities on properties. Using them, the following categories have been created:

1. **Safety**: "Smoke detector", "Carbon monoxide detector", "Fire extinguisher", "First aid kit"
2. **Entertainment/work**: Wifi, TV, Cable TV", "Laptop friendly workspace"
3. **Personal care**: Essentials, Shampoo, "Hair dryer" 
4. **Check-in**: In 2 subcategories
    - Host-check-in: Heading, "Host greets you"
    - Self-check-in: "24-hour check-in", "Self check-in"
5. **Kitchen**: Refrigerator, Microwave, "Coffee maker", Dishwasher, "Dishes and silverware", Oven, Kitchen, "Cooking basics" 
6. **Comfort**: Washer, Stove, Dryer, Iron, Hangers, "Hot water", "Bed linens", Air conditioning, ""Extra pillows and blankets", "Indoor fireplace", Breakfast, Bathtub,  "High chair"
7. **Family/kid**: "Family/kid friendly", "Garden or backyard", "Children’s books and toys", "Pets allowed", Crib
8. **Safety entrance**: "Private entrance", "Buzzer/wireless intercom", "Safety card", "Lock on bedroom door", "Well-lit path to entrance"
9. **Parking**: In 3 subcategories
    - "Paid parking off premises"
    - "Free parking on premises"
    - "Paid parking on premises"

In [26]:
safety = ['Smoke detector', 'Carbon monoxide detector', 'Fire extinguisher', 'First aid kit']
entertainment_work = ['Wifi', 'TV', 'Cable TV', 'Laptop friendly workspace']
personal_care = ['Essentials', 'Shampoo', 'Hair dryer']
host_check_in = ['Heading', 'Host greets you']
self_check_in = ['24-hour check-in', 'Self check-in']
kitchen = ['Refrigerator', 'Microwave', 'Coffee maker', 
           'Dishwasher', 'Dishes and silverware', 'Oven', 
           'Kitchen', 'Cooking basics']
comfort = ['Washer', 'Stove', 'Dryer', 'Iron', 'Hangers', 
           'Hot water', "Bed linens", 'Air conditioning', 
           'Extra pillows and blankets', 'Indoor fireplace', 
           'Breakfast', 'Bathtub', 'High chair']
family_kid = ['Family/kid friendly', 'Garden or backyard', 'Children’s books and toys', 'Pets allowed', 'Crib']
safety_entrance = ['Private entrance', 'Buzzer/wireless intercom', 
                   'Safety card', 'Lock on bedroom door', 
                   'Well-lit path to entrance']
paid_parking_off = ['Paid parking off premises']
paid_parking_on = ['Paid parking on premises']
free_parking_on = ['Free parking on premises']

In [27]:
safety_col = []
entertainment_work_col = []
personal_care_col = []
host_check_in_col = []
self_check_in_col = []
kitchen_col = []
comfort_col = []
family_kid_col = []
safety_entrance_col = []
paid_parking_off_col = []
paid_parking_on_col = []
free_parking_on_col = []

In [28]:
def amenities(df, column):
    for j in range(df.shape[0]):
        s = df[column][j].replace('{', '').replace('}', '').replace('"', '').split(',')
        safe = 0
        en = 0
        care = 0
        host = 0
        self = 0
        kitch = 0
        com = 0
        fam = 0
        entrance = 0
        paid_off = 0
        paid_on = 0
        free_on = 0
        for i in s:
            if i in safety:
                safe += 1
            if i in entertainment_work:
                en += 1
            if i in personal_care:
                care += 1
            if i in host_check_in:
                host += 1
            if i in self_check_in:
                self += 1
            if i in kitchen:
                kitch += 1
            if i in comfort:
                com += 1
            if i in family_kid:
                fam += 1
            if i in safety_entrance:
                entrance += 1
            if i in paid_parking_off:
                paid_off += 1
            if i in paid_parking_on:
                paid_on += 1
            if i in free_parking_on:
                free_on += 1
        
        safety_col.append(safe)
        entertainment_work_col.append(en)
        personal_care_col.append(care)
        host_check_in_col.append(host)
        self_check_in_col.append(self)
        kitchen_col.append(kitch)
        comfort_col.append(com)
        family_kid_col.append(fam)
        safety_entrance_col.append(entrance)
        paid_parking_off_col.append(paid_off)
        paid_parking_on_col.append(paid_on)
        free_parking_on_col.append(free_on)
        
    df['safety'] = safety_col
    df['entertainment_work'] = entertainment_work_col
    df['personal_care'] = personal_care_col
    df['host_check_in'] = host_check_in_col
    df['self_check_in'] =  self_check_in_col
    df['kitchen'] = kitchen_col
    df['comfort'] = comfort_col
    df['family_kid'] = family_kid_col
    df['safety_entrance'] = safety_entrance_col
    df['paid_parking_off'] = paid_parking_off_col
    df['paid_parking_on'] = paid_parking_on_col
    df['free_parking_on'] = free_parking_on_col
        
    
    
    return df

In [29]:
df_dataset.reset_index(inplace=True)

In [30]:
df_training = amenities(df_dataset, 'amenities')

In [31]:
df_training.head()

Unnamed: 0,index,id,host_id,host_since,accommodates,bathrooms,bedrooms,beds,amenities,minimum_nights,...,personal_care,host_check_in,self_check_in,kitchen,comfort,family_kid,safety_entrance,paid_parking_off,paid_parking_on,free_parking_on
0,0,2818,3159,2008-09-24,2,1.5,1.0,2.0,"{Internet,Wifi,""Paid parking off premises"",""Bu...",3,...,3,1,1,0,6,1,4,1,1,0
1,1,20168,59484,2009-12-02,2,1.0,1.0,1.0,"{TV,Internet,Wifi,""Paid parking off premises"",...",1,...,2,1,0,1,3,0,0,1,0,0
2,2,25428,56142,2009-11-20,3,1.0,1.0,1.0,"{TV,""Cable TV"",Internet,Wifi,Kitchen,Elevator,...",14,...,3,0,0,1,8,1,2,0,0,0
3,3,27886,97647,2010-03-23,2,1.0,1.0,1.0,"{TV,Internet,Wifi,Breakfast,Heating,""Smoke det...",2,...,3,0,2,0,3,0,1,0,0,0
4,4,28871,124245,2010-05-13,2,1.0,1.0,1.0,"{Internet,Wifi,""Pets live on this property"",Ca...",2,...,3,1,0,1,4,0,2,0,0,0
