In [204]:
import pandas as pd
import numpy as np

# Test Splitting
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Metrics
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [205]:
# Render every float in displayed DataFrames with four decimal places.
pd.set_option("display.float_format", "{:.4f}".format)

In [206]:
# Load the three datasets, one CSV file each. Every file carries an
# "Unnamed: 0" column that is not used, so it is dropped right after reading
# (presumably a row index written out when the file was saved - TODO confirm).
df_bin = pd.read_csv("Data/df_bin.csv").drop(columns="Unnamed: 0")
df_onehot = pd.read_csv("Data/df_onehot.csv").drop(columns="Unnamed: 0")
df_label = pd.read_csv("Data/df_label.csv").drop(columns="Unnamed: 0")

In [207]:
# Map each dataset's name to its DataFrame. The three frames hold the same
# data with a different encoding of the neighbourhood.
datasets = {
    "df_bin": df_bin,
    "df_onehot": df_onehot,
    "df_label": df_label,
}

In [208]:
# Candidate regressors. Estimators that accept a seed are given
# random_state=49 for reproducibility.
lr = LinearRegression()
rfr = RandomForestRegressor(
    n_estimators=500,
    max_leaf_nodes=100,
    n_jobs=-1,
    random_state=49,
)
svr = SVR(kernel="poly", degree=4)
sgdr = SGDRegressor(max_iter=1000, tol=1e-3, random_state=49)
mlpr = MLPRegressor(
    hidden_layer_sizes=(50, 3),
    max_iter=2000,
    tol=0.1,
    random_state=49,
)
hgbr = HistGradientBoostingRegressor(random_state=49)

# Estimators that take part in the comparison, keyed by display name.
# Only HistGradientBoosting is active; uncomment an entry to include it.
models = {
    # "Linear Regression": lr,
    # "Random Forest Regressor": rfr,
    "HistGradientBoosting": hgbr,
    # "SVR": svr,
    # "SGDR": sgdr,
    # "MLP": mlpr,
}

In [209]:
# Name of the column the models predict: the comparison loop removes it from
# the feature matrix and uses it as the target.
target_feature: str = "price"

In [210]:
# Collects the metrics of every evaluation, nested as
# {dataset name: {model name: {"r2": ..., "rmse": ..., "mae": ...}}}.
all_results: dict = {}

In [211]:
def save_results(model_name, y_pred, y_test):
    """Score one model's predictions against the true target values.

    Parameters
    ----------
    model_name
        Label used as the single key of the returned dictionary.
    y_pred
        Predictions produced by the model.
    y_test
        True target values the predictions are compared with.

    Returns
    -------
    dict
        ``{model_name: {"r2": ..., "rmse": ..., "mae": ...}}``. Despite its
        name the function stores nothing itself; the caller merges the
        returned mapping into its own results.
    """
    scores = {
        "r2": r2_score(y_test, y_pred),                   # closer to 1 is better
        "rmse": root_mean_squared_error(y_test, y_pred),  # lower is better
        "mae": mean_absolute_error(y_test, y_pred),       # lower is better
    }
    return {model_name: scores}
    
    

In [212]:
# Evaluate every configured model on every dataset and collect the metrics in
# `all_results` as {dataset name: {model name: metrics}}.

for df_name, df in datasets.items():

    # --- Data split ---------------------------------------------------------
    # Separate the target column from the predictors.
    y = df[target_feature]
    X = df.drop(columns=target_feature)

    # Hold out 25% of the rows for testing; the fixed seed keeps the split
    # reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=70
    )

    # --- Models -------------------------------------------------------------
    # Metrics of each model for the current dataset, keyed by model name.
    models_results = {}
    for model_name, model in models.items():
        # fit() hands back the fitted estimator, so train and predict chain.
        y_pred = model.fit(X_train, y_train).predict(X_test)
        models_results.update(save_results(model_name, y_pred, y_test))

    # --- Results ------------------------------------------------------------
    all_results[df_name] = models_results



In [213]:
# Inspect the columns of the label-encoded dataset by name. `df` is only the
# loop variable left over from the comparison loop (which ends on this same
# frame, the last entry of `datasets`), so relying on it would silently show a
# different dataset if the loop or the dictionary order ever changed.
df_label.columns

Index(['accommodates', 'bathrooms', 'bedrooms', 'beds',
       'minimum_nights_avg_ntm', 'availability_30', 'availability_60',
       'availability_90', 'availability_365', 'number_of_reviews',
       'number_of_reviews_ltm', 'number_of_reviews_l30d',
       'review_scores_location', 'reviews_per_month',
       'is_score_location_empty', 'is_entire_home', 'is_bathroom_shared',
       'has_parking', 'has_pool', 'has_washer', 'has_dishwasher',
       'has_ceiling_fan', 'has_long_term', 'has_bbq_grill', 'has_self_checkin',
       'has_ac', 'has_bathtub', 'has_seaview', 'cluster_zone_beach_access',
       'price'],
      dtype='object')

In [214]:
##############
# Comparison #
##############

# One metrics frame per dataset (rows = models, columns = metric names), each
# tagged with the dataset it came from. The frames are collected in a list and
# concatenated once, instead of growing a DataFrame inside the loop from an
# empty starting frame.
per_dataset_frames = [
    pd.DataFrame(results).T.assign(df_encoding=df_name)
    for df_name, results in all_results.items()
]

df_results = pd.concat(per_dataset_frames)
df_results.index.name = "models"

# Show every (dataset, model) pair ranked by R2, best first. `df_results`
# itself keeps "df_encoding" as a regular column.
(
    df_results
    .set_index("df_encoding", append=True)
    .reorder_levels(["df_encoding", "models"])
    .sort_values(by="r2", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,mae,r2,rmse
df_encoding,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
df_onehot,HistGradientBoosting,158.0802,0.6336,238.4229
df_label,HistGradientBoosting,158.5594,0.6294,239.7734
df_bin,HistGradientBoosting,160.3171,0.6268,240.6224


# Baseline sweep with lazypredict's LazyRegressor on the one-hot dataset.
# NOTE(review): these imports would normally live in the notebook's import
# cell; they are kept here so this cell still runs on its own.
import lazypredict

from lazypredict.Supervised import LazyRegressor

# This split (50% test, seed 123) differs from the one used in the comparison
# loop (25% test, seed 70), so the scores of the two are not directly
# comparable.
X = df_onehot.drop(target_feature, axis = 1)
y = df_onehot[target_feature]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.5,random_state =123)


reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
# The result is stored as `lazy_models` so it does not overwrite `models`, the
# dictionary of estimators the comparison loop iterates over; reusing that
# name would break the loop if it were run again after this cell.
lazy_models, predictions = reg.fit(X_train, X_test, y_train, y_test)

print(lazy_models)