In [6]:
import pandas as pd
import numpy as np

# Test Splitting
from sklearn.model_selection import train_test_split

# Models
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import HistGradientBoostingRegressor

# Metrics
from sklearn.metrics import root_mean_squared_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score

In [7]:
# Show floats with four decimal places whenever pandas objects are displayed
pd.set_option("display.float_format", "{:.4f}".format)

In [8]:
# Read the three differently-encoded versions of the dataset.
# Each CSV carries a leftover "Unnamed: 0" column that is of no use here,
# so it is dropped as part of loading.
df_bin = pd.read_csv("Data/df_bin.csv").drop(columns="Unnamed: 0")
df_onehot = pd.read_csv("Data/df_onehot.csv").drop(columns="Unnamed: 0")
df_label = pd.read_csv("Data/df_label.csv").drop(columns="Unnamed: 0")

In [9]:
# Map each dataset name to its DataFrame; the datasets use different
# encodings for the neighbourhood feature.
datasets = dict(
    df_bin=df_bin,
    df_onehot=df_onehot,
    df_label=df_label,
)

In [10]:
# Initialize the candidate regressors.
# Every estimator with a stochastic component gets a fixed seed so that a
# Restart & Run All reproduces the same scores.
lr = LinearRegression()
rfr = RandomForestRegressor(n_estimators=500, max_leaf_nodes=100, n_jobs=-1, random_state=0)
svr = SVR(kernel="poly", degree=4)
# SGD shuffles the training data after each epoch by default, so it is seeded too.
sgdr = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0)
mlpr = MLPRegressor(random_state=1, max_iter=2000, tol=0.1)
# random_state pins the train/validation split used by the automatic early
# stopping (enabled by default when there are more than 10,000 training samples).
hgbr = HistGradientBoostingRegressor(random_state=0)

# Models that take part in the comparison, keyed by display name.
# SVR and SGDR are instantiated above but currently left out of the run.
models = {
    "Linear Regression": lr,
    "Random Forest Regressor": rfr,
    "HistGradientBoosting": hgbr,
    #"SVR": svr,
    #"SGDR": sgdr,
    "MLP": mlpr,
}

In [11]:
# Name of the column to predict: it is used as `y` and dropped from the features `X`
target_feature = "price"

In [12]:
# Collects the scores of every run:
# {dataset name: {model name: {"r2": ..., "rmse": ..., "mae": ...}}}
all_results = {}

In [13]:
def save_results(model_name, y_pred, y_test):
    """Score one model's predictions with R2, RMSE and MAE.

    Nothing is stored by this function; the caller collects the returned
    dictionary.

    Parameters
    ----------
    model_name : label used as the key of the returned dictionary.
    y_pred : values predicted by the model.
    y_test : true target values for the same samples.

    Returns
    -------
    dict
        ``{model_name: {"r2": ..., "rmse": ..., "mae": ...}}``.
        R2 is better the closer it is to 1; RMSE and MAE are better the
        lower they are.
    """
    scores = {
        "r2": r2_score(y_test, y_pred),
        "rmse": root_mean_squared_error(y_test, y_pred),
        "mae": mean_absolute_error(y_test, y_pred),
    }
    return {model_name: scores}
    
    

In [14]:
# Fit and score every model on every dataset.
# Outcome: all_results[<dataset name>][<model name>] holds that run's metrics.

for df_name, df in datasets.items():

    # --- Data split: separate features from the target, hold out 25% for testing ---
    X = df.drop(columns=target_feature)
    y = df[target_feature]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=70
    )

    # --- Models: train each one on this dataset and score it on the test set ---
    models_results = {}
    for model_name, model in models.items():
        # fit() returns the estimator itself, so training and prediction are chained
        y_pred = model.fit(X_train, y_train).predict(X_test)
        models_results.update(save_results(model_name, y_pred, y_test))

    # --- Results: file this dataset's scores under its name ---
    all_results[df_name] = models_results



In [15]:
# Columns of the label-encoded dataset. It is referenced by name rather than
# through the `df` variable left over from the training loop, so this cell
# does not depend on which dataset that loop happened to visit last.
df_label.columns

Index(['neighbourhood_cleansed', 'accommodates', 'bathrooms', 'bedrooms',
       'beds', 'number_of_reviews', 'number_of_reviews_ltm',
       'number_of_reviews_l30d', 'reviews_per_month', 'is_entire_home',
       'is_bathroom_shared', 'has_parking', 'has_pool', 'has_washer',
       'has_gym', 'has_dishwasher', 'has_ac', 'has_bathtub', 'has_seaview',
       'cluster_zone_beach_access', 'price'],
      dtype='object')

In [16]:
##############
# Comparison #
##############

# One frame per dataset encoding: rows are models, columns are the metrics
# (r2, rmse, mae), plus a column naming the encoding they were measured on.
# The frames are collected first and concatenated once, instead of growing
# a DataFrame inside the loop.
per_encoding_frames = [
    pd.DataFrame(results).T.assign(df_encoding=df_name)
    for df_name, results in all_results.items()
]

df_results = pd.concat(per_encoding_frames)
df_results.index.name = "models"

# Displayed table: indexed by (encoding, model), best R2 first.
(
    df_results
    .set_index("df_encoding", append=True)
    .reorder_levels(["df_encoding", "models"])
    .sort_values(by="r2", ascending=False)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,r2,rmse,mae
df_encoding,models,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
df_onehot,HistGradientBoosting,0.5011,209.5862,143.2653
df_label,HistGradientBoosting,0.5002,209.7842,144.163
df_bin,HistGradientBoosting,0.4997,209.8929,143.8282
df_onehot,MLP,0.4926,211.3714,145.7219
df_bin,MLP,0.4878,212.3632,146.6082
df_onehot,Random Forest Regressor,0.4643,217.179,151.2243
df_label,MLP,0.4599,218.0838,153.2898
df_bin,Random Forest Regressor,0.4575,218.5584,152.4191
df_label,Random Forest Regressor,0.4573,218.5996,152.7355
df_onehot,Linear Regression,0.4257,224.8702,159.3796


import lazypredict

from lazypredict.Supervised import LazyRegressor

# Baseline sweep with LazyRegressor on the one-hot encoded dataset.
# NOTE(review): this uses a 50/50 split with random_state=123, unlike the
# 75/25 split (random_state=70) of the model comparison, so the two sets of
# scores are not directly comparable - confirm this is intended.
X = df_onehot.drop(target_feature, axis=1)
y = df_onehot[target_feature]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.5, random_state=123)

reg = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None)
# The results get their own names: binding them to `models` would replace the
# dictionary of estimators that the training loop iterates over, and
# re-running that loop afterwards would fail.
lazy_models, lazy_predictions = reg.fit(X_train, X_test, y_train, y_test)

# Bare last expression so the notebook renders the score table richly
lazy_models