# Drive Connection

In [1]:
from google.colab import drive
drive.mount('/content/drive')
import os
os.chdir('/content/drive/My Drive/dphi airbnb')

Mounted at /content/drive


# Basic EDA

In [2]:
# get latest version of sklearn to use handle_unknown param in OrdinalEncoder

!pip install -U scikit-learn
import sklearn
sklearn.__version__

Collecting scikit-learn
  Downloading scikit_learn-0.24.2-cp37-cp37m-manylinux2010_x86_64.whl (22.3 MB)
[K     |████████████████████████████████| 22.3 MB 3.7 MB/s 
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.2.post1
    Uninstalling scikit-learn-0.22.2.post1:
      Successfully uninstalled scikit-learn-0.22.2.post1
Successfully installed scikit-learn-0.24.2 threadpoolctl-2.2.0


'0.24.2'

In [3]:
# explore dataset

import numpy as np
import pandas as pd

train = pd.read_csv('data/train_airbnb_berlin.csv')
train.head()

Unnamed: 0,Listing ID,Listing Name,Host ID,Host Name,Host Since,Host Response Time,Host Response Rate,Is Superhost,neighbourhood,Neighborhood Group,City,Postal Code,Country Code,Country,Latitude,Longitude,Is Exact Location,Property Type,Room Type,Accomodates,Bathrooms,Bedrooms,Beds,Square Feet,Guests Included,Min Nights,Reviews,First Review,Last Review,Overall Rating,Accuracy Rating,Cleanliness Rating,Checkin Rating,Communication Rating,Location Rating,Value Rating,Instant Bookable,Business Travel Ready,Price
0,19665213.0,*,156079597.0,Maximilian,2016-01-20,,,f,Prenzlauer Berg,Pankow,Berlin,10437.0,DE,Germany,52.54652,13.41792,t,Apartment,Private room,2,1.0,1.0,1.0,,1,2,6,2017-07-07,2017-08-08,100.0,10.0,10.0,10.0,10.0,9.0,10.0,t,f,26.0
1,6436842.0,*,5302290.0,Dulie,2013-04-07,,,f,Pankow,Pankow,Berlin,13187.0,DE,Germany,52.56512,13.42214,t,Apartment,Entire home/apt,2,1.0,2.0,2.0,,2,7,6,2015-05-26,2019-04-30,90.0,9.0,9.0,10.0,10.0,9.0,10.0,f,f,41.0
2,10559468.0,*,59151456.0,Geank,2016-02-07,,,f,Prenzlauer Berg,Pankow,Berlin,10439.0,DE,Germany,52.54741,13.42521,t,Apartment,Entire home/apt,3,1.0,1.0,2.0,,1,1,2,2016-04-19,2016-07-04,100.0,10.0,10.0,10.0,10.0,10.0,10.0,f,f,50.0
3,27215482.0,*,193452785.0,Alix,2018-06-26,,,f,Friedrichshain,Friedrichshain-Kreuzberg,Berlin,10245.0,DE,Germany,52.50958,13.45144,t,Apartment,Private room,2,1.0,1.0,1.0,,1,2,4,2018-07-31,2018-08-12,100.0,10.0,10.0,10.0,10.0,10.0,9.0,f,f,50.0
4,27287546.0,*,205870244.0,Lurina,2013-05-16,within a few hours,92%,t,Prenzlauer Berg,Pankow,Berlin,10405.0,DE,Germany,52.52995,13.41558,t,Apartment,Private room,3,1.0,1.0,2.0,,1,6,0,,,,,,,,,,t,f,55.0


In [4]:
train.shape

(15692, 39)

In [6]:
# remove rows missing target value

train.dropna(subset=['Price'], inplace=True)
train.shape

(15683, 39)

In [7]:
# replace junk values with NaN

for col in train:
  train[col] = train[col].replace('*',np.NaN)

In [8]:
# explore distinct values for each column

for col in train:
  print(col, train[col].unique(), '\n')

Listing ID [19665213.  6436842. 10559468. ... 12246808. 10886411. 32774348.] 

Listing Name [nan] 

Host ID [1.56079597e+08 5.30229000e+06 5.91514560e+07 ... 8.99723060e+07
 1.01089520e+07 1.48795651e+08] 

Host Name ['Maximilian' 'Dulie' 'Geank' ... 'Ada' 'Lanna' 'OloreA'] 

Host Since ['2016-01-20' '2013-04-07' '2016-02-07' ... '2010-09-05' '2011-09-20'
 '2011-12-08'] 

Host Response Time [nan 'within a few hours' 'within an hour' 'within a day'
 'a few days or more'] 

Host Response Rate [nan '92%' '100%' '80%' '71%' '83%' '98%' '78%' '89%' '50%' '90%' '70%'
 '20%' '97%' '91%' '88%' '75%' '93%' '0%' '67%' '33%' '40%' '58%' '86%'
 '60%' '95%' '94%' '96%' '82%' '63%' '99%' '87%' '25%' '30%'] 

Is Superhost ['f' 't' nan] 

neighbourhood ['Prenzlauer Berg' 'Pankow' 'Friedrichshain' 'Mariendorf' 'Wilmersdorf'
 'Kreuzberg' 'Moabit' 'NeukÃ¶lln' 'Alt-Treptow' 'SchÃ¶neberg'
 'Reinickendorf' 'Mitte' 'Karlshorst' 'WeiÃ\x9fensee' 'Friedenau'
 'Hansaviertel' 'Wedding' 'Charlottenburg' 'Steglitz'

# Clean and Prepare Data

In [9]:
'''
remove unwanted and uncorrelated columns, i.e.,
Listing ID, Listing Name, Host ID, Host Name,
City, Country Code, Country, Business Travel Ready
'''

features = train[[
              'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'Postal Code',
       'Latitude', 'Longitude', 'Is Exact Location',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Square Feet', 'Guests Included', 'Min Nights', 'Reviews',
       'First Review', 'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable',
       'Price'    
]].copy()

In [10]:
# check feature datatypes

features.dtypes

Host Since               object
Host Response Time       object
Host Response Rate       object
Is Superhost             object
neighbourhood            object
Neighborhood Group       object
Postal Code              object
Latitude                float64
Longitude               float64
Is Exact Location        object
Property Type            object
Room Type                object
Accomodates              object
Bathrooms                object
Bedrooms                 object
Beds                     object
Square Feet             float64
Guests Included          object
Min Nights               object
Reviews                   int64
First Review             object
Last Review              object
Overall Rating          float64
Accuracy Rating         float64
Cleanliness Rating      float64
Checkin Rating          float64
Communication Rating    float64
Location Rating         float64
Value Rating            float64
Instant Bookable         object
Price                   float64
dtype: o

In [12]:
# use placeholder value for NaN in numerical columns with dtype=object

features['Accomodates'].fillna('-1', inplace=True)
features['Bathrooms'].fillna('-1', inplace=True)
features['Bedrooms'].fillna('-1', inplace=True)
features['Beds'].fillna('-1', inplace=True)
features['Guests Included'].fillna('-1', inplace=True)
features['Min Nights'].fillna('-1', inplace=True)

In [13]:
# convert above columns to numerical

features['Accomodates'] = features['Accomodates'].astype('int64')
features['Bathrooms'] = features['Bathrooms'].astype('float64')
features['Bedrooms'] = features['Bedrooms'].astype('float64')
features['Beds'] = features['Beds'].astype('float64')
features['Guests Included'] = features['Guests Included'].astype('int64')
features['Min Nights'] = features['Min Nights'].astype('int64')

In [14]:
# replace placeholder with NaN

features['Accomodates'] = features['Accomodates'].replace(-1, np.NaN)
features['Bathrooms'] = features['Bathrooms'].replace(-1, np.NaN)
features['Bedrooms'] = features['Bedrooms'].replace(-1, np.NaN)
features['Beds'] = features['Beds'].replace(-1, np.NaN)
features['Guests Included'] = features['Guests Included'].replace(-1, np.NaN)
features['Min Nights'] = features['Min Nights'].replace(-1, np.NaN)

In [15]:
numerical_cols = [
                  'Accomodates', 'Bathrooms', 'Bedrooms',
                  'Beds', 'Guests Included', 'Min Nights',
                  'Square Feet', 'Overall Rating', 'Accuracy Rating',
                  'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
                  'Location Rating', 'Value Rating'
]

In [16]:
categorical_cols = [
                     'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'Postal Code',
       'Is Exact Location', 'Property Type', 'Room Type', 'Accomodates',
       'Bathrooms', 'Bedrooms', 'Beds', 'Guests Included', 'Min Nights',
       'First Review', 'Last Review', 'Instant Bookable'
]

# Define Preprocessing Steps

In [17]:
# use KNNImputer for numerical missing values
# use mode value for categorical values aong with OrdinalEncoder

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder

numerical_transformer = KNNImputer()

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encode', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=99999))
])

# define preprocessing steps

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

In [18]:
all_features = [
            'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'Postal Code',
       'Latitude', 'Longitude', 'Is Exact Location',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Square Feet', 'Guests Included', 'Min Nights', 'Reviews',
       'First Review', 'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable'    
]

# Split Data

In [19]:
# split data into train and test sets

from sklearn.model_selection import train_test_split
x = features[all_features]
y =  features['Price']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1)#, random_state=0)

In [20]:
!pip install catboost

Collecting catboost
  Downloading catboost-0.26.1-cp37-none-manylinux1_x86_64.whl (67.4 MB)
[K     |████████████████████████████████| 67.4 MB 30 kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26.1


# Build and Train Pipeline

In [21]:
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor

# Build pipeline and fit data

my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', CatBoostRegressor())
                             ])

 
my_pipeline.fit(x_train, y_train)

Learning rate set to 0.062523
0:	learn: 47.9290186	total: 62.3ms	remaining: 1m 2s
1:	learn: 47.0091161	total: 68.1ms	remaining: 34s
2:	learn: 46.2165498	total: 73.8ms	remaining: 24.5s
3:	learn: 45.4944632	total: 80.2ms	remaining: 20s
4:	learn: 44.8136325	total: 85.6ms	remaining: 17s
5:	learn: 44.2268821	total: 91.3ms	remaining: 15.1s
6:	learn: 43.6739599	total: 96.7ms	remaining: 13.7s
7:	learn: 43.1781212	total: 102ms	remaining: 12.6s
8:	learn: 42.7244214	total: 108ms	remaining: 11.9s
9:	learn: 42.3209220	total: 113ms	remaining: 11.2s
10:	learn: 41.9692712	total: 119ms	remaining: 10.7s
11:	learn: 41.6410052	total: 125ms	remaining: 10.3s
12:	learn: 41.3161179	total: 132ms	remaining: 10.1s
13:	learn: 41.0368307	total: 140ms	remaining: 9.87s
14:	learn: 40.7504884	total: 148ms	remaining: 9.71s
15:	learn: 40.4850914	total: 153ms	remaining: 9.44s
16:	learn: 40.2305929	total: 159ms	remaining: 9.18s
17:	learn: 40.0085975	total: 166ms	remaining: 9.05s
18:	learn: 39.8117475	total: 173ms	remainin

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num', KNNImputer(),
                                                  ['Accomodates', 'Bathrooms',
                                                   'Bedrooms', 'Beds',
                                                   'Guests Included',
                                                   'Min Nights', 'Square Feet',
                                                   'Overall Rating',
                                                   'Accuracy Rating',
                                                   'Cleanliness Rating',
                                                   'Checkin Rating',
                                                   'Communication Rating',
                                                   'Location Rating',
                                                   'Value Rating']),
                                                 ('cat',
                                            

# Evaluate Model

In [22]:
# get RMSE for all train data

preds = my_pipeline.predict(x)

score = mean_squared_error(y, preds, squared=False)
print(score)

26.75302308675863


In [23]:
# get accuracy scores

my_pipeline.score(x_train, y_train), my_pipeline.score(x_test, y_test)

(0.7269664827744199, 0.44965782115396147)

# Generate Predictions

In [24]:
# repeat all train set steps

test = pd.read_csv('data/test_airbnb_berlin.csv')

In [25]:
for col in test:
  test[col] = test[col].replace('*',np.NaN)

In [26]:
features = test[[
              'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'Postal Code',
       'Latitude', 'Longitude', 'Is Exact Location',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Square Feet', 'Guests Included', 'Min Nights', 'Reviews',
       'First Review', 'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable',    
]].copy()

In [27]:
features['Accomodates'].fillna('-1', inplace=True)
features['Bathrooms'].fillna('-1', inplace=True)
features['Bedrooms'].fillna('-1', inplace=True)
features['Beds'].fillna('-1', inplace=True)
features['Guests Included'].fillna('-1', inplace=True)
features['Min Nights'].fillna('-1', inplace=True)

In [28]:
features['Accomodates'] = features['Accomodates'].astype('int64')
features['Bathrooms'] = features['Bathrooms'].astype('float64')
features['Bedrooms'] = features['Bedrooms'].astype('float64')
features['Beds'] = features['Beds'].astype('float64')
features['Guests Included'] = features['Guests Included'].astype('int64')
features['Min Nights'] = features['Min Nights'].astype('int64')

In [29]:
features['Accomodates'] = features['Accomodates'].replace(-1, np.NaN)
features['Bathrooms'] = features['Bathrooms'].replace(-1, np.NaN)
features['Bedrooms'] = features['Bedrooms'].replace(-1, np.NaN)
features['Beds'] = features['Beds'].replace(-1, np.NaN)
features['Guests Included'] = features['Guests Included'].replace(-1, np.NaN)
features['Min Nights'] = features['Min Nights'].replace(-1, np.NaN)

In [30]:
# store predictions as dataframe

submission = pd.DataFrame(my_pipeline.predict(features[all_features]))
submission.index = test.index
submission.columns = ['prediction']
submission.head()

Unnamed: 0,prediction
0,75.051376
1,91.343948
2,36.492088
3,93.764422
4,38.094615


In [31]:
# save as csv

submission.to_csv('submission.csv', index=False)