# Airbnb Berlin Price Prediction

This is a Kaggle competition and can be found [here](https://www.kaggle.com/datasets/gauravduttakiit/airbnb-berlin-price-prediction)

In [97]:
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from joblib import dump
import pickle

# Define functions

In [98]:
def print_best_model_metrics(gs, X, y):
    """"
    gs: fitted GridSearch object
    X: DataFrame with features
    y: actual target
    """
    print(f"Best parameters:\n{gs.best_params_}")
    print(f"\nBest score: {gs.best_score_:.3f}")
    print(f"RMSE: {np.sqrt(-1*gs.best_score_):.3f}")
    score = gs.score(X, y)
    print(f"\nneg_mean_squared_error on the full train set: {score:.3f}")
    print(f"RMSE on the full train set: {np.sqrt(-1*score):.3f}")
    y_pred = gs.predict(X)
    print(f"\nMean squared error = {mean_squared_error(y, y_pred, squared=False):.2f}")
    print(f"Root Mean Square Percentage Error: {RMSPE(y, y_pred):.2f}")

# Load data

In [99]:
# Load the dataset
def load_ds(path: Path, filename: str) -> pd.DataFrame:
    """Read the dataset csv file as a pandas dataframe."""
    return pd.read_csv(path / filename)

# Load dataset
dataset_path = Path().absolute() / "data"
filename = "X_y_train.csv"
X_y_train = load_ds(dataset_path, filename)

print(f"Shape: {X_y_train.shape}")

Shape: (12546, 39)


In [100]:
X_y_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12546 entries, 0 to 12545
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Listing ID             12545 non-null  float64
 1   Listing Name           12505 non-null  object 
 2   Host ID                12546 non-null  float64
 3   Host Name              12529 non-null  object 
 4   Host Since             12530 non-null  object 
 5   Host Response Time     6917 non-null   object 
 6   Host Response Rate     6917 non-null   object 
 7   Is Superhost           12528 non-null  object 
 8   neighbourhood          12546 non-null  object 
 9   Neighborhood Group     12546 non-null  object 
 10  City                   12545 non-null  object 
 11  Postal Code            12369 non-null  object 
 12  Country Code           12546 non-null  object 
 13  Country                12546 non-null  object 
 14  Latitude               12546 non-null  float64
 15  Lo

In [101]:
y_train = X_y_train.loc[:, "Price"].copy()
X_train = X_y_train.drop(columns="Price")

In [102]:
X_train.columns

Index(['Listing ID', 'Listing Name', 'Host ID', 'Host Name', 'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'City', 'Postal Code',
       'Country Code', 'Country', 'Latitude', 'Longitude', 'Is Exact Location',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Square Feet', 'Guests Included', 'Min Nights', 'Reviews',
       'First Review', 'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable',
       'Business Travel Ready'],
      dtype='object')

# Preprocessing pipeline

In [103]:
# TransformerMixin: add method ".fit_transform()"
# BaseEstimator: add methods ".get_params()" and ".set_params()"
# We need 3 methods:
# 1) .fit()
# 2) .transform()
# 3) .fit_transform() (provided by "TransformerMixin")
class CombinedAttributesAdder(BaseEstimator, TransformerMixin):
    # avoid "*args" or "**kargs" in "__init__"
    def __init__(self):
        pass

    # fit is needed later for the pipilene
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):

        # Host Since
        date2 = pd.to_datetime(X["Host Since"], format="%Y-%m-%d")
        X["year"] = date2.dt.year

        # Is Superhost: ok

        # Property Type
        X.loc[X["Property Type"] == "*", "Property Type"] = np.nan

        # Room Type: ok

        # Accomodates
        X.loc[X.Accomodates == "*", "Accomodates"] = np.nan
        X["Accomodates"] = X.Accomodates.astype("float")

        # Bathrooms
        X.loc[X.Bathrooms == "*", "Bathrooms"] = np.nan
        X["Bathrooms"] = X.Bathrooms.astype("float")

        # Bedrooms
        X.loc[X.Bedrooms == "*", "Bedrooms"] = np.nan
        X["Bedrooms"] = X.Bedrooms.astype("float")

        # Beds
        X.loc[X.Beds == "*", "Beds"] = np.nan
        X["Beds"] = X.Beds.astype("float")

        # Min Nights
        X.loc[X["Min Nights"] == "*", "Min Nights"] = np.nan
        X["Min Nights"] = X["Min Nights"].astype("float")

        # Instant Bookable: ok

        # Drop unused columns
        cols_to_drop = [
            "Listing ID", "Listing Name", "Host ID", "Host Name", "Host Since",
            "Host Response Time", "Host Response Rate",
            "neighbourhood", "Neighborhood Group", "City", "Postal Code",
            "Country Code", "Country", "Latitude", "Longitude", "Is Exact Location",
            "Square Feet", "Guests Included", "Reviews",
            "First Review", "Last Review", "Overall Rating", "Accuracy Rating",
            "Cleanliness Rating", "Checkin Rating", "Communication Rating",
            "Location Rating", "Value Rating",
            "Business Travel Ready"]
        X_out = X.copy()
        X_out.drop(columns=cols_to_drop, inplace=True)

        return X_out # X
     

## FIN QUI

In [104]:
caa = CombinedAttributesAdder()
X_tmp = X_train.copy()
new_x = caa.fit_transform(X_train)
# new_x.info()
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12546 entries, 0 to 12545
Data columns (total 39 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Listing ID             12545 non-null  float64
 1   Listing Name           12505 non-null  object 
 2   Host ID                12546 non-null  float64
 3   Host Name              12529 non-null  object 
 4   Host Since             12530 non-null  object 
 5   Host Response Time     6917 non-null   object 
 6   Host Response Rate     6917 non-null   object 
 7   Is Superhost           12528 non-null  object 
 8   neighbourhood          12546 non-null  object 
 9   Neighborhood Group     12546 non-null  object 
 10  City                   12545 non-null  object 
 11  Postal Code            12369 non-null  object 
 12  Country Code           12546 non-null  object 
 13  Country                12546 non-null  object 
 14  Latitude               12546 non-null  float64
 15  Lo

In [105]:
(new_x.isna().sum() / new_x.shape[0]).reset_index()

Unnamed: 0,index,0
0,Is Superhost,0.001435
1,Property Type,0.004862
2,Room Type,0.0
3,Accomodates,0.001435
4,Bathrooms,0.001594
5,Bedrooms,0.000877
6,Beds,0.000956
7,Min Nights,0.015623
8,Instant Bookable,0.0
9,year,0.001275


In [106]:
# Numerical pipeline
#
# All (except the last) estimators must be transformers (i.e., they
# must have a ".fit_transform()" method).
num_pipeline = Pipeline([
    # replace NA with mean
    ('imputer', SimpleImputer(strategy='mean')),
    # standardize the variables: z = (x - mean) / SD
    ('std_scaler', StandardScaler())])

In [107]:
# Categorical pipeline
#
# All (except the last) estimators must be transformers (i.e., they
# must have a ".fit_transform()" method).
cat_pipeline = Pipeline([
    # replace NA with mode
    ('imputer', SimpleImputer(strategy='most_frequent')),
    # apply "OneHotEncoder()"
    ('one_hot', OneHotEncoder(drop='if_binary'))])

In [108]:
list_num_attribs = ["Accomodates", "Bathrooms", "Bedrooms", "Beds",
                    "Min Nights", "year"]
list_cat_attribs = ["Is Superhost", "Property Type", "Room Type", "Instant Bookable"]

In [109]:
# ColumnTransformer requires tuples with:
# - a name
# - a transformer
# - a list of names (or indices) of columns to which the transformer is applied

cols_transformer = ColumnTransformer([
    # apply "num_pipeline" to numerical columns
    ('num', num_pipeline, list_num_attribs),
    # apply "cat_pipeline" to categorical columns
    ('cat', cat_pipeline, list_cat_attribs)])

In [110]:
full_pipeline = Pipeline([
    # transform/add columns
    ('attribs_adder', CombinedAttributesAdder()),
    # Transform numerical and categorical attributes
    ("cols_transformer", cols_transformer)])

In [111]:
full_pipeline_tmp = Pipeline([
    # transform/add columns
    ('attribs_adder', CombinedAttributesAdder()),
    # Transform numerical and categorical attributes
    ("cols_transformer", cols_transformer)])

X_tmp = X_train.copy()
# new_x = full_pipeline_tmp.fit(X_tmp)
new_x = full_pipeline_tmp.fit_transform(X_tmp)
new_x.shape

(12546, 26)

In [112]:
X_train.columns

Index(['Listing ID', 'Listing Name', 'Host ID', 'Host Name', 'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'City', 'Postal Code',
       'Country Code', 'Country', 'Latitude', 'Longitude', 'Is Exact Location',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Square Feet', 'Guests Included', 'Min Nights', 'Reviews',
       'First Review', 'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable',
       'Business Travel Ready', 'year'],
      dtype='object')

# Random Forest

In [113]:
rf = Pipeline([
    # Pre-processing pipeline
    ("preparation", full_pipeline),
    # Random forest
    # n_estimators=100, max_depth=None, min_samples_leaf=1
    ("rf", RandomForestRegressor(max_depth= 20, min_samples_leaf=30, random_state=123))])

In [114]:
rf.fit(X_train, y_train)

In [115]:
X_train.columns

Index(['Listing ID', 'Listing Name', 'Host ID', 'Host Name', 'Host Since',
       'Host Response Time', 'Host Response Rate', 'Is Superhost',
       'neighbourhood', 'Neighborhood Group', 'City', 'Postal Code',
       'Country Code', 'Country', 'Latitude', 'Longitude', 'Is Exact Location',
       'Property Type', 'Room Type', 'Accomodates', 'Bathrooms', 'Bedrooms',
       'Beds', 'Square Feet', 'Guests Included', 'Min Nights', 'Reviews',
       'First Review', 'Last Review', 'Overall Rating', 'Accuracy Rating',
       'Cleanliness Rating', 'Checkin Rating', 'Communication Rating',
       'Location Rating', 'Value Rating', 'Instant Bookable',
       'Business Travel Ready', 'year'],
      dtype='object')

In [116]:
rf.score(X_train, y_train)

0.39761279487367285

In [117]:
y_pred_rf = rf.predict(X_train)

In [118]:
mean_squared_error(y_train, y_pred_rf, squared=False)

37.967344834962724

In [119]:
np.sqrt(np.sum((y_train -y_pred_rf)**2) / len(y_train))

37.967344834962724

# Test set

In [132]:
# Load dataset
dataset_path = Path().absolute() / "data"
filename = "X_y_test.csv"
X_y_test = load_ds(dataset_path, filename)

print(f"Shape: {X_y_test.shape}")

# X_test = X_y_test.drop("Sales") # include "Sales", CombinedAttributesAdder() drops it
X_test = X_y_test.drop(["Sales"], axis=1)
y_test = X_y_test.loc[:, "Sales"].copy()

print(f"shape X_train: {X_test.shape}")
print(f"shape y_train: {y_test.shape}")

Shape: (99476, 18)
shape X_train: (99476, 17)
shape y_train: (99476,)


In [133]:
def metric(preds, actuals):
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

In [134]:
y_pred = rf.predict(X_test)

In [135]:
metric(y_pred, y_test)

17.23081421221812

In [136]:
mean_squared_error(y_test, y_pred, squared=False)

1076.2233186348526

In [137]:
RMSPE(y_test, y_pred)

0.0296900958415978

# Save model

In [138]:
# open a file, where you ant to store the data
file = open('models/random_forest_final.pkl', 'wb')

# dump information to that file
pickle.dump(rf, file)

# close the file
file.close()