# Prototype

In [1]:
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
from scipy.stats import iqr

import seaborn as sns
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV

from sklearn.metrics import mean_absolute_error

from models import KNNGeoDataImputer, AirBnBDataCleaner

# Data Cleaning

In [2]:
# Read the raw Airbnb listings and normalise the headers to snake_case
# (spaces -> underscores, lower-cased) so columns can be accessed as attributes.
data = pd.read_csv('./data/Airbnb_Open_Data.csv', low_memory=False)
data.columns = [column.replace(' ', '_').lower() for column in data.columns]
data.head()

Unnamed: 0,id,name,host_id,host_identity_verified,host_name,neighbourhood_group,neighbourhood,lat,long,country,...,service_fee,minimum_nights,number_of_reviews,last_review,reviews_per_month,review_rate_number,calculated_host_listings_count,availability_365,house_rules,license
0,1001254,Clean & quiet apt home by the park,80014485718,unconfirmed,Madaline,Brooklyn,Kensington,40.64749,-73.97237,United States,...,$193,10.0,9.0,10/19/2021,0.21,4.0,6.0,286.0,Clean up and treat the home the way you'd like...,
1,1002102,Skylit Midtown Castle,52335172823,verified,Jenna,Manhattan,Midtown,40.75362,-73.98377,United States,...,$28,30.0,45.0,5/21/2022,0.38,4.0,2.0,228.0,Pet friendly but please confirm with me if the...,
2,1002403,THE VILLAGE OF HARLEM....NEW YORK !,78829239556,,Elise,Manhattan,Harlem,40.80902,-73.9419,United States,...,$124,3.0,0.0,,,5.0,1.0,352.0,"I encourage you to use my kitchen, cooking and...",
3,1002755,,85098326012,unconfirmed,Garry,Brooklyn,Clinton Hill,40.68514,-73.95976,United States,...,$74,30.0,270.0,7/5/2019,4.64,4.0,1.0,322.0,,
4,1003689,Entire Apt: Spacious Studio/Loft by central park,92037596077,verified,Lyndon,Manhattan,East Harlem,40.79851,-73.94399,United States,...,$41,10.0,9.0,11/19/2018,0.1,3.0,1.0,289.0,"Please no smoking in the house, porch or on th...",


In [3]:
def currency_str_to_float(in_str):
    """Convert a US-formatted currency string such as ``'$1,060 '`` to a float.

    The dollar sign and the thousands separators (commas) are stripped before
    parsing, so ``'$1,060'`` becomes ``1060.0`` (it must not be read as
    ``1.06``).  Surrounding whitespace is tolerated by ``float``.

    Parameters
    ----------
    in_str : str or number
        Amount to convert.  Values that are not strings (already numeric,
        including NaN) are passed straight to ``float``.

    Returns
    -------
    float
        The parsed amount.

    Raises
    ------
    ValueError
        If the text left after removing ``$`` and ``,`` is not a valid number.
    """
    if not isinstance(in_str, str):
        return float(in_str)
    # ',' is a thousands separator in this format, not a decimal mark.
    return float(in_str.replace('$', '').replace(',', ''))

## Imputing Geographic Information using `Lat` & `Long`

In [4]:
# Categorical location columns handled together with the coordinates below.
geographic_cat_info = ['neighbourhood', 'country', 'country_code']
# Coordinate columns of each listing.
geographic_coordinates = ['lat', 'long']

In [5]:
# Listings for which every location label and both coordinates are present,
# restricted to exactly those columns.
geo_columns = geographic_cat_info + geographic_coordinates
geo_data = data.dropna(subset=geo_columns)[geo_columns]

In [7]:
# Fit one k-nearest-neighbours classifier per geographic label so that a
# missing label can be predicted from a listing's (lat, long) coordinates.
geo_models = {}
for geo_info in geographic_cat_info:
    label_and_coords = [geo_info] + geographic_coordinates
    # Unique (label, lat, long) observations ordered by label.  Selecting the
    # columns directly gives the same rows as a pivot_table(aggfunc=list)
    # followed by explode(), but keeps the coordinates as floats instead of
    # object-typed cells and avoids building per-label Python lists.
    train = (
        data
        .dropna(subset=label_and_coords)
        .loc[:, label_and_coords]
        # mergesort is stable: rows keep their file order within each label
        .sort_values(geo_info, kind='mergesort')
        .drop_duplicates()
    )
    # 3 neighbours; weights='distance' weights each vote by inverse distance.
    curr_model = KNeighborsClassifier(3, weights='distance')
    curr_model.fit(train[geographic_coordinates].values, train[geo_info].values)
    geo_models[geo_info] = curr_model

In [8]:
# Numeric predictor columns.
numerical_features = [
    'construction_year',
    'service_fee',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'review_rate_number',
    'availability_365',
    'calculated_host_listings_count',
]

# Column the models are trained to predict.
target_feature = ['price']

# Categorical predictor columns.
categorical_features = [
    'host_identity_verified',
    'neighbourhood',
    'country',
    'instant_bookable',
    'cancellation_policy',
    'room_type',
    'has_rules',
]

# Drop exact duplicate rows and rows without a price, then fill the two
# columns that get a default when missing.
missing_value_defaults = {'reviews_per_month': 0, 'service_fee': '$0'}
clean_data = data.drop_duplicates().dropna(subset=['price']).fillna(missing_value_defaults)

# Derive the `has_rules` flag (rules text present and not the '#NAME?'
# placeholder) and parse both currency columns into floats.
clean_data = clean_data.assign(
    has_rules=clean_data.house_rules.notna() & (clean_data.house_rules != '#NAME?'),
    price=clean_data.price.apply(currency_str_to_float),
    service_fee=clean_data.service_fee.apply(currency_str_to_float),
)

# Fill missing geographic labels with the prediction of the matching model.
# Rows without both coordinates cannot be predicted and are left as NaN.
has_coordinates = clean_data[geographic_coordinates].notna().all(axis=1)
for geo_info, model in geo_models.items():
    to_impute = clean_data[geo_info].isna() & has_coordinates
    # predict() raises on an empty input, so skip labels with nothing to fill.
    if to_impute.any():
        clean_data.loc[to_impute, geo_info] = model.predict(
            clean_data.loc[to_impute, geographic_coordinates].values
        )


# Upper bound for `minimum_nights`: third quartile plus 1.5 times the IQR.
iqr_minimum_nights = iqr(clean_data.minimum_nights.dropna())
third_quartile = clean_data.minimum_nights.quantile(.75)
max_minimum_nights = third_quartile + (1.5 * iqr_minimum_nights)

# Row filters, named so each rule can be inspected separately.
valid_nights = (clean_data.minimum_nights > 0) & (clean_data.minimum_nights <= max_minimum_nights)
valid_price = clean_data.price >= 50
valid_availability = (clean_data.availability_365 > 0) & (clean_data.availability_365 <= 365)

# Keep only the modelling columns and drop rows with any remaining NaN.
model_columns = numerical_features + categorical_features + target_feature
clean_data = clean_data.loc[valid_nights & valid_price & valid_availability, model_columns].dropna()

In [None]:
# Hold out 20% of the cleaned listings for testing; fixed seed for a repeatable split.
features = clean_data.drop(columns=target_feature)
target = clean_data[target_feature]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

In [None]:
# Scale numeric columns to [0, 1] and one-hot encode categorical ones.
# Dense output is requested through `sparse_threshold=0` on the
# ColumnTransformer rather than `OneHotEncoder(sparse=False)`: that keyword
# was renamed to `sparse_output` in scikit-learn 1.2 and later removed, while
# `sparse_threshold` is accepted by both old and new releases.
col_transformer = ColumnTransformer(
    [
        ('numerical_features', MinMaxScaler(), numerical_features),
        ('hot_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    sparse_threshold=0,
)
col_transformer.fit(X_train)

# Model Selection

In [None]:
# Candidate regressors compared with default hyper-parameters.
# Estimators with a random component are seeded so that the comparison is
# reproducible across kernel restarts.
initialized_models = {
    'LinearRegression': LinearRegression(),
    'DecisionTreeRegressor': DecisionTreeRegressor(random_state=42),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'SVR': SVR(),
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'AdaBoostRegressor': AdaBoostRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'MLPRegressor': MLPRegressor(random_state=42),
}

In [None]:
# The fitted transformer produces the same matrix for every model, so
# transform the training features once instead of once per estimator.
X_train_transformed = col_transformer.transform(X_train)
# Pass a 1-D target: several estimators emit a DataConversionWarning when
# given a column vector.
y_train_1d = y_train.values.ravel()

# Cross-validate each candidate, scored by negative mean absolute error.
cross_validation_model_selection = {
    model_name: cross_validate(
        model,
        X_train_transformed,
        y_train_1d,
        scoring='neg_mean_absolute_error',
    )
    for model_name, model in tqdm(initialized_models.items(), total=len(initialized_models))
}
cross_validation_model_selection

In [None]:
# One row per model, one column per key returned by cross_validate.
models_as_columns = pd.DataFrame(cross_validation_model_selection)
result_df = models_as_columns.transpose()
result_df

In [None]:
# Mean and standard deviation of the per-fold MAE for every model.
agg_func_metrics = ['mean', 'std']
pivot_results = (
    result_df
    # one row per (model, fold) score
    .explode('test_score')
    # explode() leaves the scores in an object-dtype column; cast them so
    # that mean/std are computed as ordinary numeric aggregations
    .astype({'test_score': float})
    # scores are negative MAE; flip the sign to report the MAE itself
    .assign(test_score=lambda x: -x.test_score)
    .rename_axis('method')
    .groupby('method')['test_score']
    .agg(agg_func_metrics)
)

# Lowest mean error first; ties broken by the smaller spread.
pivot_results.sort_values(by=agg_func_metrics, ascending=True)

# Model Tuning

In [None]:
# Hyper-parameter grid for the random forest.
param_dict = {
    'n_estimators': [20, 50, 100],
    'max_depth': [None, 5, 15],
    'min_samples_split': [2, 3],
    'min_samples_leaf': [1, 2],
    # max_leaf_nodes must be None or >= 2; a value of 1 is rejected by
    # scikit-learn, so every candidate using it would fail to fit and be
    # scored as NaN.
    'max_leaf_nodes': [None, 3],
}
# Exhaustive search scored by negative mean absolute error; the forest is
# seeded so that repeated runs give the same ranking.
rf_gs = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_dict,
    scoring='neg_mean_absolute_error',
)
rf_gs.fit(col_transformer.transform(X_train), y_train.values.ravel())


In [None]:
# Rank the candidates: highest mean score first, smaller fold-to-fold
# standard deviation breaking ties.
ranking_columns = ['mean_test_score', 'std_test_score']
grid_search_df = pd.DataFrame(rf_gs.cv_results_)
grid_search_df = grid_search_df.sort_values(by=ranking_columns, ascending=[False, True])
grid_search_df.head()

In [None]:
# Parameters of the top-ranked candidate (first row of the sorted results).
best_params = grid_search_df.iloc[0].params
best_params

In [None]:
# Persist the full ranked grid-search table and, separately, the parameters
# of its first (best-ranked) row as JSON.
top_candidate = grid_search_df.iloc[0]
grid_search_df.to_pickle('./resources/random_forest_gs.pkl')
with open('resources/random_forest_best_params.json', 'w') as bp:
    json.dump(top_candidate.params, bp)