In [30]:
import pandas as pd
import numpy as np

# Test Splitting
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV

# Metrics
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score

In [16]:
# Render every float shown by pandas with exactly four decimal places.
pd.set_option("display.float_format", "{:.4f}".format)

In [17]:
# Read the three pre-encoded variants of the dataset. Each file carries an
# extra "Unnamed: 0" column (presumably the index written when the CSV was
# saved - TODO confirm) which is not a feature, so it is dropped right away.
df_bin = pd.read_csv("../data/df_bin.csv").drop(columns="Unnamed: 0")
df_onehot = pd.read_csv("../data/df_onehot.csv").drop(columns="Unnamed: 0")
df_label = pd.read_csv("../data/df_label.csv").drop(columns="Unnamed: 0")

In [18]:
# Dataset name -> DataFrame. The frames differ only in how the neighbourhood
# is encoded; insertion order is the order in which they are evaluated.
datasets = {
    "df_bin": df_bin,
    "df_onehot": df_onehot,
    "df_label": df_label,
}

In [19]:
# Candidate regressors compared on every encoding of the dataset.
# Estimators that accept a seed use random_state=49 so reruns give the same fits.
# NOTE(review): no feature-scaling step is visible in this notebook, while SVR,
# SGDRegressor and MLPRegressor are sensitive to feature scale - confirm the
# CSVs are already scaled, otherwise consider a StandardScaler pipeline.
lr = LinearRegression()
rfr = RandomForestRegressor(
    n_estimators=500,
    max_leaf_nodes=100,
    n_jobs=-1,
    random_state=49,
)
hgbr = HistGradientBoostingRegressor(random_state=49)
svr = SVR(kernel="poly", degree=4)
sgdr = SGDRegressor(max_iter=1000, tol=1e-3, random_state=49)
mlpr = MLPRegressor(
    hidden_layer_sizes=(50, 3),
    max_iter=2000,
    tol=0.1,
    random_state=49,
)

# Display name -> estimator; insertion order is the order in which they are fitted.
models = {
    "Linear Regression": lr,
    "Random Forest Regressor": rfr,
    "HistGradientBoosting": hgbr,
    "SVR": svr,
    "SGDR": sgdr,
    "MLP": mlpr,
}

In [20]:
# Name of the column the models are trained to predict; every other column
# of a dataset is used as an input feature.
target_feature: str = "price"

In [21]:
# Accumulator for the metrics of every run, filled by the evaluation loop as
# {dataset name: {model name: {"r2": ..., "rmse": ..., "mae": ...}}}.
all_results = dict()

In [22]:
def save_results(model_name, y_pred, y_test):
    """Compute R2, RMSE and MAE for one model's predictions.

    Parameters
    ----------
    model_name : key under which the scores are returned.
    y_pred : values predicted by the model.
    y_test : true target values the predictions are compared against.

    Returns
    -------
    dict
        ``{model_name: {"r2": ..., "rmse": ..., "mae": ...}}``.
        R2 is better the closer it is to 1; RMSE and MAE are better when lower.
    """
    scores = {
        "r2": r2_score(y_test, y_pred),
        "rmse": root_mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
    }
    return {model_name: scores}
    
    

In [23]:
# Fit every candidate model on every encoding of the dataset and store the
# test-set metrics in all_results as {dataset name: {model name: metrics}}.

for df_name, df in datasets.items():

    # --- Data split -------------------------------------------------------
    # The target column is separated from the features; the fixed seed gives
    # a reproducible 75/25 train/test partition for each dataset.
    y = df[target_feature]
    X = df.drop(columns=target_feature)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=70
    )

    # --- Models -----------------------------------------------------------
    # Metrics of every model for the current dataset, keyed by model name.
    models_results = {}
    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        models_results.update(save_results(model_name, y_pred, y_test))

    # --- Results ----------------------------------------------------------
    all_results[df_name] = models_results



In [24]:
# Inspect the columns of the label-encoded dataset. It is referenced by name
# instead of through `df`, the variable left over from the evaluation loop,
# so the cell does not depend on which dataset the loop happened to visit last.
df_label.columns

Index(['accommodates', 'bathrooms', 'bedrooms', 'beds', 'price',
       'minimum_nights_avg_ntm', 'availability_60', 'availability_90',
       'availability_365', 'number_of_reviews_ltm', 'review_scores_value',
       'reviews_per_month', 'Entire home/apt', 'Private room', 'Shared room',
       'has_parking', 'has_pool', 'has_washer', 'has_dishwasher',
       'has_ceiling_fan', 'has_long_term', 'has_bbq_grill', 'has_outdoor',
       'has_jacuzzi', 'has_bathtub', 'has_ac', 'has_seaview',
       'is_bathroom_shared', 'contains_lux_description', 'geo_cluster'],
      dtype='object')

In [25]:
##############
# Comparison #
##############

# One frame per encoding (rows = models, columns = r2 / rmse / mae), each
# tagged with the encoding it came from. All frames are built first and
# concatenated in a single call instead of growing a frame inside a loop.
df_results = pd.concat(
    [
        pd.DataFrame(results).T.assign(df_encoding=df_name)
        for df_name, results in all_results.items()
    ]
)
df_results.index.name = "models"

# Display the scores indexed by (encoding, model), best R2 first.
(
    df_results
    .set_index("df_encoding", append=True)
    .reorder_levels(["df_encoding", "models"])
    .sort_values(by="r2", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,r2,rmse,mae
df_encoding,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
df_onehot,HistGradientBoosting,0.6403,152.3454,110.4024
df_bin,HistGradientBoosting,0.6375,152.9323,111.254
df_label,HistGradientBoosting,0.6362,153.2021,111.2976
df_onehot,MLP,0.6083,158.974,116.9867
df_bin,MLP,0.5707,166.4257,124.894
df_onehot,Random Forest Regressor,0.5706,166.4462,123.4229
df_bin,Random Forest Regressor,0.5614,168.2204,125.4848
df_label,Random Forest Regressor,0.5596,168.5671,125.319
df_label,MLP,0.5475,170.8618,126.5828
df_onehot,Linear Regression,0.5369,172.8487,132.2036


### GridSearch and Cross-Validation

In [27]:
# Train/test split of the one-hot encoded dataset, the encoding with the
# highest R2 in the comparison table.
# NOTE(review): test_size and random_state match the comparison loop, so this
# is the same test partition that was already used to pick the model and the
# encoding; scores reported on it may be somewhat optimistic.

y = df_onehot[target_feature]
X = df_onehot.drop(columns=target_feature)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=70
)

In [None]:
# Hyper-parameter grid explored for the best-performing model
# (2 * 3 * 3 * 3 * 3 = 162 combinations, each fitted once per CV fold).
grid_param = {
    "loss": ["poisson", "squared_error"],
    "max_iter": [150, 200, 300],
    "learning_rate": [0.04, 0.05, 0.06],
    "max_leaf_nodes": [71, 81, 91],
    "max_features": [0.15, 0.2, 0.25],
}

# Best model from the comparison, seeded for reproducible fits
model = HistGradientBoostingRegressor(random_state=19)

# Exhaustive search over the grid with 3-fold cross-validation, scored by R2
grid_search = GridSearchCV(model, grid_param, cv=3, scoring="r2")

# Fit
grid_search.fit(X_train, y_train)

# With the default refit=True, GridSearchCV has already retrained the best
# parameter combination on the whole training set, so best_estimator_ is used
# as is; fitting it a second time would only repeat that training run.
best_model_and_param = grid_search.best_estimator_

In [34]:
# Show the hyper-parameter combination selected by the grid search
# (the one with the best mean cross-validated R2 score).
print(grid_search.best_params_)

{'learning_rate': 0.04, 'loss': 'poisson', 'max_features': 0.2, 'max_iter': 300, 'max_leaf_nodes': 81}


In [32]:
# Score the tuned model on the held-out test set.
y_pred = best_model_and_param.predict(X_test)

# R2: closer to 1 is better; MAPE, MAE and RMSE: lower is better.
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = root_mean_squared_error(y_test, y_pred)

# Single-column summary table, one row per metric.
metrics = [r2, mape, mae, rmse]
results = pd.DataFrame({"Results": metrics}, index=["R2", "MAPE", "MAE", "RMSE"])

results

Unnamed: 0,Results
R2,0.6507
MAPE,0.3579
MAE,107.9837
RMSE,150.119
