In [13]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [14]:
DATASET_PATH = "data/train_airbnb_berlin.csv"

In [15]:
# Read the raw data
raw_data = pd.read_csv(DATASET_PATH)
raw_data.head(10)

Unnamed: 0,Listing ID,Listing Name,Host ID,Host Name,Host Since,Host Response Time,Host Response Rate,Is Superhost,neighbourhood,Neighborhood Group,...,Overall Rating,Accuracy Rating,Cleanliness Rating,Checkin Rating,Communication Rating,Location Rating,Value Rating,Instant Bookable,Business Travel Ready,Price
0,19665213.0,*,156079597.0,Maximilian,2016-01-20,,,f,Prenzlauer Berg,Pankow,...,100.0,10.0,10.0,10.0,10.0,9.0,10.0,t,f,26.0
1,6436842.0,*,5302290.0,Dulie,2013-04-07,,,f,Pankow,Pankow,...,90.0,9.0,9.0,10.0,10.0,9.0,10.0,f,f,41.0
2,10559468.0,*,59151456.0,Geank,2016-02-07,,,f,Prenzlauer Berg,Pankow,...,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,f,50.0
3,27215482.0,*,193452785.0,Alix,2018-06-26,,,f,Friedrichshain,Friedrichshain-Kreuzberg,...,100.0,10.0,10.0,10.0,10.0,10.0,9.0,f,f,50.0
4,27287546.0,*,205870244.0,Lurina,2013-05-16,within a few hours,92%,t,Prenzlauer Berg,Pankow,...,,,,,,,,t,f,55.0
5,26590915.0,*,90250336.0,Zan,2016-08-22,within an hour,100%,t,Mariendorf,Tempelhof - SchÃ¶neberg,...,99.0,10.0,9.0,10.0,10.0,10.0,10.0,t,f,39.0
6,32996974.0,*,5947005.0,Meler,2013-03-13,within a day,80%,f,Wilmersdorf,Charlottenburg-Wilm.,...,97.0,10.0,10.0,10.0,10.0,10.0,10.0,f,f,94.0
7,17364275.0,*,19435959.0,Alise,2015-04-30,within an hour,100%,f,Kreuzberg,Friedrichshain-Kreuzberg,...,94.0,9.0,9.0,10.0,10.0,10.0,10.0,t,f,73.0
8,23775462.0,*,151814038.0,Ologfi,2017-09-11,,,f,Kreuzberg,Friedrichshain-Kreuzberg,...,93.0,9.0,10.0,9.0,9.0,10.0,9.0,f,f,100.0
9,13483316.0,*,72266241.0,Aselgiiud,2016-05-06,within an hour,100%,f,Moabit,Mitte,...,100.0,10.0,9.0,10.0,10.0,9.0,10.0,f,f,50.0


In [16]:
# Keep only the useful features
columns_to_keep = [
    "Host Since",
    "Host Response Time",
    "Host Response Rate",
    "Is Superhost",
    "neighbourhood",
    "Neighborhood Group",
    "Postal Code",
    "Property Type",
    "Room Type",
    "Accomodates",
    "Bathrooms",
    "Bedrooms",
    "Beds",
    "Guests Included",
    "Min Nights",
    "Reviews",
    "Overall Rating",
    "Accuracy Rating",
    "Cleanliness Rating",
    "Checkin Rating",
    "Communication Rating",
    "Location Rating",
    "Value Rating",
    "Value Rating",
    "Instant Bookable",
    "Business Travel Ready",
    "Price",
]

kept_data = raw_data[columns_to_keep]
kept_data.columns

Index(['Host Since', 'Host Response Time', 'Host Response Rate',
       'Is Superhost', 'neighbourhood', 'Neighborhood Group', 'Postal Code',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Guests Included', 'Min Nights', 'Reviews', 'Overall Rating',
       'Accuracy Rating', 'Cleanliness Rating', 'Checkin Rating',
       'Communication Rating', 'Location Rating', 'Value Rating',
       'Value Rating', 'Instant Bookable', 'Business Travel Ready', 'Price'],
      dtype='object')

In [17]:
# One hot encoding for categorical features

def to_one_hot(data:pd.DataFrame, columns:list) -> pd.DataFrame:
    """
    Convert the given columns to one hot encoding
    The original columns are dropped and the new ones should be named according to the category they represent

    :param data: The data to convert
    :param columns: The columns to convert

    :return: The converted data
    """
    for column in columns:
        one_hot = pd.get_dummies(data[column], prefix=column)
        data = data.drop(column, axis=1)
        data = data.join(one_hot)
    return data

categorical_columns = ["Host Response Time", "Property Type", "Room Type"]

one_hot_data = to_one_hot(kept_data, categorical_columns)
one_hot_data.columns

Index(['Host Since', 'Host Response Rate', 'Is Superhost', 'neighbourhood',
       'Neighborhood Group', 'Postal Code', 'Accomodates', 'Bathrooms',
       'Bedrooms', 'Beds', 'Guests Included', 'Min Nights', 'Reviews',
       'Overall Rating', 'Accuracy Rating', 'Cleanliness Rating',
       'Checkin Rating', 'Communication Rating', 'Location Rating',
       'Value Rating', 'Value Rating', 'Instant Bookable',
       'Business Travel Ready', 'Price',
       'Host Response Time_a few days or more',
       'Host Response Time_within a day',
       'Host Response Time_within a few hours',
       'Host Response Time_within an hour', 'Property Type_*',
       'Property Type_Apartment', 'Property Type_Bed and breakfast',
       'Property Type_Boat', 'Property Type_Boutique hotel',
       'Property Type_Bungalow', 'Property Type_Condominium',
       'Property Type_Guest suite', 'Property Type_Guesthouse',
       'Property Type_Hostel', 'Property Type_Hotel', 'Property Type_House',
       'Prope