In [1]:
import warnings
warnings.filterwarnings('ignore')

import importlib.util
from sklearn.linear_model import Ridge, RidgeCV, BayesianRidge
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression

from sklearn.model_selection import cross_val_score, RandomizedSearchCV, RepeatedKFold, GridSearchCV

from sklearn import metrics
from sklearn.dummy import DummyRegressor

from sklearn.ensemble import RandomForestRegressor

import xgboost as xg


import numpy as np
import pandas as pd
import pickle
from collections import Counter

import altair as alt
from scipy.stats import skew
from sklearn.feature_selection import SelectFromModel


# Set Up

Top 50 Features for one run of the Random Forest Regressor

In [2]:
top50 = ['HOUROFDAY', 'Age_of_ride_years', 'Age_of_ride_days', 'TL_rank',
       'Ride_duration_min', "Ride_name_walt disney's carousel of progress",
       'MKEMHCLOSE_HOUR', 'Temperature (C)', 'MKOPEN_HOUR', 'Wind Angle',
       'CapacityLostWGT_MK', 'Wind Speed', 'WDW_TICKET_SEASON_none',
       'Ride_name_prince charming regal carrousel', 'Cloud Height',
       'CapacityLost_EP', 'CapacityLostWGT_EP', 'WDWMINTEMP', 'WDWMAXTEMP',
       'WDWMEANTEMP', 'WEATHER_WDWLOW', 'Park_area_fantasyland',
       'CapacityLost_MK',
       'Ride_name_tomorrowland transit authority peoplemover', 'HOLIDAYPX',
       'DAYOFYEAR', 'MKeventN_dah', 'MKEMHMORN', 'WEATHER_WDWHIGH',
       'MKCLOSE_HOUR', 'DAYOFWEEK', 'Visibility Distance (M)',
       'MKFIREN_happy hallowishes fireworks', 'MKHOURSEMH',
       'Park_area_tomorrowland', 'YEAR', 'Ride_name_jungle cruise',
       'MKFIREN_happily ever after', 'MKEMHOPEN_HOUR', 'MKHOURSEMHYEST',
       'MKHOURS', 'Height_req_inches', 'inSession', 'MKHOURSEMHTOM',
       'new_case', 'Ride_type_dark', 'MKFIRET1_HOUR',
       'Cloud Quality Code_passed all quality control checks, data originate from an ncei data source',
       'MKEMHEVE', 'Ride_type_small_drops']

Time columns that need to be converted to integer hour:

In [3]:
parse_times = ["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
               "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM",
               "MKCLOSETOM","EPOPEN", "EPCLOSE", "EPEMHOPEN",
               "EPEMHCLOSE", "EPOPENYEST", "EPCLOSEYEST",
               "EPOPENTOM", "EPCLOSETOM", "HSOPEN", "HSCLOSE",
               "HSEMHOPEN", "HSEMHCLOSE", "HSOPENYEST", "HSCLOSEYEST",
               "HSOPENTOM", "HSCLOSETOM", "AKOPEN", "AKCLOSE",
               "AKEMHOPEN", "AKOPENYEST", "AKCLOSEYEST","AKEMHCLOSE",
               "AKOPENTOM", "AKCLOSETOM", "MKPRDDT1", "MKPRDDT2",
               "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2",
               "EPFIRET1", "EPFIRET2", "HSPRDDT1", "HSFIRET1",
               "HSFIRET2", "HSSHWNT1", "HSSHWNT2", "AKPRDDT1",
               "AKPRDDT2", "AKSHWNT1", "AKSHWNT2"]

# Load posted wait time datasets

In [4]:
# specify the module that needs to be
# imported relative to the path of the
# module
spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", "../src/data/loadTrainTestData.py")

# creates a new module based on spec
loadTrainPosted = importlib.util.module_from_spec(spec)

# executes the module in its own namespace
# when a module is imported or reloaded.
spec.loader.exec_module(loadTrainPosted)

X_train, X_test, y_train, y_test = loadTrainPosted.loadTrainTestPostedWaitTimes()

### Convert key data points from date to integer

In [5]:
X_train["MONTHOFYEAR"] = X_train["date"].dt.month.astype("Int8")
X_train["YEAR"] = X_train["date"].dt.year.astype("Int16")
X_train["DAYOFYEAR"] = X_train["date"].dt.dayofyear.astype("Int16")
X_train["HOUROFDAY"] = X_train["datetime"].dt.hour.astype("Int8")

X_test["MONTHOFYEAR"] = X_test["date"].dt.month.astype("Int8")
X_test["YEAR"] = X_test["date"].dt.year.astype("Int16")
X_test["DAYOFYEAR"] = X_test["date"].dt.dayofyear.astype("Int16")
X_test["HOUROFDAY"] = X_test["datetime"].dt.hour.astype("Int8")

### Sort by datetime before imputation (keeping y-values associated)

In [6]:
train = pd.concat([X_train, y_train], axis=1).sort_values(['datetime'])
test = pd.concat([X_test, y_test], axis=1).sort_values(['datetime'])

In [7]:
X_train_impute = train.drop(columns=["POSTED_WAIT"])
y_train = train["POSTED_WAIT"]

X_test_impute = test.drop(columns=["POSTED_WAIT"])
y_test = test["POSTED_WAIT"]

In [8]:
# del train, test

### Many open/close times, parade times, etc. are in HH:MM format. 

Convert to integer hour & fill nulls with 99.

This means that particulate event does not exist for that day. (e.g. Magic Kingdom doesn't have a second parade)

In [9]:
for col in parse_times:
    X_train_impute[col] =  X_train_impute[col].fillna("99")
    X_train_impute[f"{col}_HOUR"] = X_train_impute[col].apply(lambda x: x[:2] if x[0]!=0 else x[:1]).astype(int).astype("Int8")
    X_train_impute.drop(columns = col, inplace=True)

In [10]:
for col in parse_times:
    X_test_impute[col] =  X_test_impute[col].fillna("99")
    X_test_impute[f"{col}_HOUR"] = X_test_impute[col].apply(lambda x: x[:2] if x[0]!=0 else x[:1]).astype(int).astype("Int8")
    X_test_impute.drop(columns = col, inplace=True)

### Data Imputation

First impute by backfilling values. For any remaining nulls, impute the median.

In [11]:
for col in X_train_impute.columns:
    nulls = X_train_impute[col].isnull().sum()
    
    if nulls>0:
        print(col)
        X_train_impute[col].fillna(method ='bfill', inplace=True)
    
        if X_train_impute[col].isnull().sum()>0:
            X_train_impute[col].fillna(X_train_impute[col].median(), inplace=True)

WDWMAXTEMP
WDWMINTEMP
WDWMEANTEMP
inSession
inSession_Enrollment
inSession_wdw
inSession_dlr
inSession_sqrt_WDW
inSession_sqrt_DLR
inSession_California
inSession_DC
inSession_Central_FL
inSession_Drive1_FL
inSession_Drive2_FL
inSession_Drive_CA
inSession_Florida
inSession_Mardi_Gras
inSession_Midwest
inSession_NY_NJ
inSession_NY_NJ_PA
inSession_New_England
inSession_New_Jersey
inSession_Nothwest
INSESSION_PLANES
inSession_SoCal
inSession_Southwest


In [12]:
for col in X_test_impute.columns:
    nulls = X_test_impute[col].isnull().sum()
    
    if nulls>0:
        print(col)
        X_test_impute[col]= X_test_impute[col].fillna(method ='bfill')
        
        if X_test_impute[col].isnull().sum()>0:
            X_test_impute[col].fillna(X_test_impute[col].median(), inplace=True)

WDWMAXTEMP
WDWMINTEMP
WDWMEANTEMP
inSession
inSession_Enrollment
inSession_wdw
inSession_dlr
inSession_sqrt_WDW
inSession_sqrt_DLR
inSession_California
inSession_DC
inSession_Central_FL
inSession_Drive1_FL
inSession_Drive2_FL
inSession_Drive_CA
inSession_Florida
inSession_Mardi_Gras
inSession_Midwest
inSession_NY_NJ
inSession_NY_NJ_PA
inSession_New_England
inSession_New_Jersey
inSession_Nothwest
INSESSION_PLANES
inSession_SoCal
inSession_Southwest


### Drop Datetime columns - no longer needed

In [13]:
X_train_encoded = X_train_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])
X_test_encoded = X_test_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])

In [14]:
del X_train_impute, X_test_impute

### Use variance threshold to drop more boolean columns.

These columns are more than 99.9% similar.

In [15]:
X_dtype = X_train_encoded.select_dtypes(include=['bool']).reset_index(drop=True)

var_thr = VarianceThreshold(threshold=0.001)  # Removing both constant and quasi-constant
var_thr.fit(X_dtype)

concol = [column for column in X_dtype.columns
          if column not in X_dtype.columns[var_thr.get_support()]]


del var_thr, X_dtype

if "Weather Type" in concol:
    concol.remove("Weather Type")

print(f"DROPPING BOOL: ", concol)
X_train_encoded.drop(concol, axis=1, inplace=True)
X_test_encoded.drop(concol, axis=1, inplace=True)

DROPPING BOOL:  ['HOLIDAYN_ash|val', 'HOLIDAYN_chv|pas', 'HOLIDAYN_cmd|han', 'HOLIDAYN_col|suk', 'HOLIDAYN_hal|nvd', 'HOLIDAYN_njc|vet', 'MKeventN_dah|emm', 'HSeventN_wdwsotf', "HSFIREN_disney's hollywood studios special july 4th fireworks presentation", "HSFIREN_new year's eve fireworks", 'Wind Speed Quality_a', 'Wind Speed Quality_p', 'Wind Speed Quality_passed gross limits check if element is present', 'Cloud Quality Code_erroneous, data originate from an ncei data source', 'Cloud Determination Code_statistically derived', 'Visibiliy Quality Code_p', 'Visibility Variability Code_variable', 'Temperature Quality Code_suspect, data originate from an ncei data source']


### Data Scaling & Transformation

Log Transform skewed numeric columns & then apply StandardScaler to scale numeric columns.

In [16]:
scaler = MinMaxScaler()


X_dtype_train = X_train_encoded.select_dtypes(include=[np.number]).reset_index(drop=True)
num_cols = list(X_dtype_train.columns)

X_dtype_test = X_test_encoded.select_dtypes(include=[np.number]).reset_index(drop=True)

In [17]:
skewed_cols = []
for col in X_dtype_train.columns:
    skewness = abs(skew(list(X_dtype_train[col])))
    if skewness > 1:
        print(col)
        X_dtype_train[f"log_{col}"] = np.log(X_dtype_train[col].astype(float).apply(lambda x: x+5))
        X_dtype_test[f"log_{col}"] = np.log(X_dtype_test[col].astype(float).apply(lambda x: x+5))                                         
        skewed_cols.append(col)

X_dtype_train.drop(columns=skewed_cols, inplace=True)
X_dtype_test.drop(columns=skewed_cols, inplace=True)

Ride_duration_min
HOLIDAYPX
HOLIDAYM
WDWMAXTEMP
inSession_SoCal
CapacityLost_HS
CapacityLost_AK
CapacityLostWGT_HS
CapacityLostWGT_AK
MKPRDDAY
MKFIREWK
EPFIREWK
new_case
Wind Angle
Wind Speed
Cloud Height
Visibility Distance (M)
MKOPEN_HOUR
MKCLOSE_HOUR
MKOPENYEST_HOUR
MKCLOSEYEST_HOUR
MKOPENTOM_HOUR
MKCLOSETOM_HOUR
EPOPEN_HOUR
EPCLOSE_HOUR
EPEMHOPEN_HOUR
EPEMHCLOSE_HOUR
EPOPENYEST_HOUR
EPCLOSEYEST_HOUR
EPOPENTOM_HOUR
EPCLOSETOM_HOUR
HSOPEN_HOUR
HSCLOSE_HOUR
HSEMHOPEN_HOUR
HSEMHCLOSE_HOUR
HSOPENYEST_HOUR
HSCLOSEYEST_HOUR
HSOPENTOM_HOUR
HSCLOSETOM_HOUR
AKOPEN_HOUR
AKCLOSE_HOUR
AKOPENYEST_HOUR
AKCLOSEYEST_HOUR
AKOPENTOM_HOUR
AKCLOSETOM_HOUR
MKPRDDT1_HOUR
MKPRDDT2_HOUR
MKFIRET1_HOUR
MKFIRET2_HOUR
EPFIRET1_HOUR
EPFIRET2_HOUR
HSFIRET1_HOUR
HSFIRET2_HOUR
HSSHWNT1_HOUR


In [18]:
scaler = MinMaxScaler()
X_train_norm = scaler.fit_transform(X_dtype_train)
X_test_norm = scaler.transform(X_dtype_test)

X_train_encoded[num_cols] = X_train_norm
X_test_encoded[num_cols] = X_test_norm

In [None]:
del X_train_norm, X_test_norm, X_dtype_train, X_dtype_test

# Dummy Regression - Baseline

In [16]:
lm_dummy_mean = DummyRegressor(strategy = 'mean').fit(X_train_encoded, y_train)
y_predict_dummy_mean = lm_dummy_mean.predict(X_test_encoded)

In [17]:
print("DUMMY REGRESSOR - MEAN")

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_predict_dummy_mean))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_predict_dummy_mean))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_predict_dummy_mean)))

DUMMY REGRESSOR - MEAN
Mean Absolute Error (MAE): 162.50162595558666
Mean Squared Error (MSE): 84570.05314900333
Root Mean Squared Error (RMSE): 290.8093071911615


In [18]:
lm_dummy_median = DummyRegressor(strategy = 'median').fit(X_train_encoded, y_train)
y_predict_dummy_median = lm_dummy_median.predict(X_test_encoded)

In [19]:
print("DUMMY REGRESSOR - MEDIAN")

print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, y_predict_dummy_median))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, y_predict_dummy_median))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, y_predict_dummy_median)))

DUMMY REGRESSOR - MEDIAN
Mean Absolute Error (MAE): 106.54793720058697
Mean Squared Error (MSE): 91049.30566297466
Root Mean Squared Error (RMSE): 301.7437748537236


# Ridge Regression Grid Search on Alpha

In [34]:
linRidge = RidgeCV(alphas=[1e-1, 1, 10], scoring='neg_mean_absolute_error').fit(X_train_encoded, y_train)

In [37]:
predLinRidge = linRidge.predict(X_test_encoded)

In [39]:
print("RIDGE REGRESSION GRID SEARCH: ")
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, predLinRidge))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, predLinRidge))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, predLinRidge)))

RIDGE REGRESSION GRID SEARCH: 
Mean Absolute Error (MAE): 141.44605525583378
Mean Squared Error (MSE): 69929.3386858671
Root Mean Squared Error (RMSE): 264.4415600579211


In [None]:
del linRidge

# Random Forest Regression

Doing grid search manually because GCP throws TerminatedWorkerError and local throws memory error with GridSearchCV:

In [None]:
important_features = []
results = {}
for n_est in (10, 50, 100, 500):
    for max_depth in [10, 50, 100]:
        print(f"STARTING {n_est}, {max_depth}")
        rfc = RandomForestRegressor(n_estimators=n_est, max_depth=max_depth, random_state=0, n_jobs=-1)
        rfc.fit(X_train_encoded, y_train)
        
        pred = rfc.predict(X_test_encoded)
        
        mae = metrics.mean_absolute_error(y_test, pred)
        mse = metrics.mean_squared_error(y_test, pred)
        rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
        r2 = metrics.r2_score(y_test, pred)
        
        results[f"{n_est}_{max_depth}"] = {"mae": mae, "mse": mse, "rmse":rmse, "r2":r2}
        
        importances = rfc.feature_importances_
        important_feat = np.argsort(importances)[-50:]
        
        selected_feat= list(X_train.columns[important_feat])
        print(selected_feat)
        important_features.extend(selected_feat)
    
print(results)

STARTING 10, 10
{'10_10': {'mae': 82.09963841166338, 'mse': 42038.141302517135, 'rmse': 205.03204945207258, 'r2': 0.5029192990046908}}
Index(['Park_area_frontierland', 'Wind Speed Quality_a',
       'MKPRDNN_main street electrical parade', 'inSession', 'HSOPENTOM',
       'DAYOFYEAR', 'WDWeventN_probowl', 'HOLIDAYN_mgs',
       'AKSHWNN_rivers of light', 'EPeventN_ephol', 'EPeventN_epfw',
       'Ride_type_small_drops', 'Ride_type_slow', 'HSevent',
       'CapacityLostWGT_EP', 'EPEMHEVE', 'Park_area_liberty square',
       'WDWeventN_wdwdd|wdwhol', 'inSession_NY_NJ', 'Wind Speed Quality_p',
       'Age_of_ride_years', 'Ride_name_dumbo the flying elephant', 'MKEMHMORN',
       'WDWSEASON_thanksgiving', 'EPCLOSEYEST', 'WDWSEASON_christmas',
       'EPEMHOPEN', 'MKCLOSETOM', 'MKHOURSYEST', 'Age_of_ride_days',
       'MKeventN_mvmcp', 'INSESSION_PLANES', 'MKFIREN_happily ever after',
       'MKOPENYEST', 'MKeventN_mnsshp', 'HOLIDAYN_lab',
       'WDWSEASON_september low', 'WDWSEASON_memori

In [None]:
Counter(important_features)

----

These parameters seem to have the best balance of simplicity and low error.

##### ```n_estimators=50``` ```max_depth=50```

In [58]:
# create regressor object
rf = RandomForestRegressor(n_estimators = 50, max_depth=50, n_jobs=-1, random_state = 0)
 
# fit the regressor with x and y data
rf.fit(X_train_encoded, y_train)

# with open('random_forest100_pkl', 'wb') as files:
#     pickle.dump(regressor, files)

#########
# OR 
#########

# load saved model
# with open('random_forest100_pkl' , 'rb') as f:
#     rf = pickle.load(f)

RandomForestRegressor(n_jobs=-1, random_state=0)

In [59]:
pred = rf.predict(X_test_encoded)

In [60]:
print("RANDOM FOREST n_estimators = 100 : ") 
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, pred)))

RANDOM FOREST n_estimators = 100 : 
Mean Absolute Error (MAE): 32.44205144366285
Mean Squared Error (MSE): 14614.88512405278
Root Mean Squared Error (RMSE): 120.89203912604329


In [61]:
importances = rf.feature_importances_
#
# Sort the feature importance in descending order
#
sorted_indices = np.argsort(importances)[::-1]

In [None]:
sorted_indices

In [62]:
X_train_encoded.columns[sorted_indices[:50]]

Index(['CapacityLostWGT_EP', 'Age_of_ride_days', 'Ride_duration_min',
       'Age_of_ride_years', 'Cloud Height',
       'Ride_name_walt disney's carousel of progress', 'CapacityLostWGT_AK',
       'CapacityLost_AK', 'EPOPENYEST_HOUR', 'EPOPEN_HOUR', 'AKHOURSTOM',
       'EPCLOSE_HOUR', 'WDW_TICKET_SEASON_none', 'EPEMHOPEN_HOUR',
       'Ride_name_prince charming regal carrousel', 'WEATHER_WDWHIGH',
       'AKHOURSYEST', 'YEAR', 'Weather Type', 'HOLIDAYPX',
       'Park_area_fantasyland', 'HSHOURSTOM', 'AKHOURS',
       'Ride_name_tomorrowland transit authority peoplemover',
       'Visibility Distance (M)', 'DAYOFWEEK', 'MKEMHMORN', 'MKeventN_dah',
       'HSHOURSYEST', 'EPCLOSEYEST_HOUR', 'TA_Stars', 'EPEMHCLOSE_HOUR',
       'MKFIREN_happy hallowishes fireworks', 'Park_area_tomorrowland',
       'inSession_New_Jersey', 'MONTHOFYEAR', 'Ride_name_jungle cruise',
       'MKFIREN_happily ever after', 'inSession_Nothwest', 'HSHOURSEMHYEST',
       'CapacityLostWGT_HS', 'Height_req_inches

Sample Predictions vs Actual from test set:

In [None]:
print("(PREDICTED, ACTUAL)")
for x in zip(np.round(pred[2000:2020], 0), y_test[2000:2020].values):
    print(x)

# Random Forest Grid Search Tuning

In [None]:
kbest = SelectKBest(score_func=f_regression, k=100)
kbest.fit(X_train_encoded, y_train)
kbest_features = X_train_encoded.columns[kbest.get_support()]

In [None]:
kbest_features

In [None]:
rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=25, max_features=1, n_jobs=-1, random_state=0)
# rf = RandomForestRegressor(n_estimators=500, n_jobs=-1, min_samples_leaf=5, random_state=0)
rf.fit(X_train_encoded[top50], y_train)

# create an iterator object with write permission - model.pkl
with open('random_forest500_top50', 'wb') as files:
    pickle.dump(rf, files)
#########
# OR 
#########

# #load saved model
# with open('random_forest_gridsearch' , 'rb') as f:
#     rf = pickle.load(f)

In [None]:
pred = rf.predict(X_test_encoded[top50])

In [None]:
print("Random Forest - Grid Search: ")
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, pred)))

In [22]:
# Get numerical feature importances
importances = list(rf.feature_importances_)
feature_list = list(X_train_encoded[top50].columns)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 4)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
for pair in feature_importances:
    [print('Variable: {:20} Importance: {}'.format(*pair)) 

Variable: HOUROFDAY            Importance: 0.1749
Variable: Age_of_ride_years    Importance: 0.086
Variable: Age_of_ride_days     Importance: 0.0814
Variable: TL_rank              Importance: 0.0692
Variable: Ride_duration_min    Importance: 0.0556
Variable: Ride_name_walt disney's carousel of progress Importance: 0.0406
Variable: Temperature (C)      Importance: 0.0327
Variable: MKEMHCLOSE_HOUR      Importance: 0.0319
Variable: Wind Angle           Importance: 0.0288
Variable: MKOPEN_HOUR          Importance: 0.0272
Variable: CapacityLostWGT_MK   Importance: 0.0259
Variable: Wind Speed           Importance: 0.0254
Variable: Cloud Height         Importance: 0.021
Variable: WDW_TICKET_SEASON_none Importance: 0.0176
Variable: WDWMAXTEMP           Importance: 0.0167
Variable: WDWMINTEMP           Importance: 0.0162
Variable: Ride_name_prince charming regal carrousel Importance: 0.0152
Variable: CapacityLostWGT_EP   Importance: 0.0134
Variable: CapacityLost_EP      Importance: 0.0132
Varia

[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None]

In [84]:
# rf = RandomForestRegressor(n_estimators=100, min_samples_leaf=25, max_features=1, n_jobs=-1, random_state=0)
rf = RandomForestRegressor(n_estimators=50, max_depth=50, n_jobs=-1, random_state=0)
rf.fit(X_train_encoded[kbest_features], y_train)

# create an iterator object with write permission - model.pkl
with open('random_forest100_kbest50_50', 'wb') as files:
    pickle.dump(rf, files)

In [86]:
pred = rf.predict(X_test_encoded)

In [87]:
print("Random Forest - Best Params: ")
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, pred)))

Random Forest - Best Params: 
param_grid =  {'n_estimators': [10, 20], 'max_features': ['sqrt', 'log2', 1.0], 'max_depth': [1, 4, 5]}
Mean Absolute Error (MAE): 158.36439147926393
Mean Squared Error (MSE): 80764.11593927557
Root Mean Squared Error (RMSE): 284.19028121889664


# XGBoost Regressor

```n_estimators = 100```

In [75]:
xgb_r = xg.XGBRegressor(objective ='reg:squarederror',
                  n_estimators = 100, seed = 123)

In [76]:
# Fitting the model
xgb_r.fit(X_train_encoded, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=123,
             reg_alpha=0, reg_lambda=1, ...)

In [77]:
# Predict the model
pred = xgb_r.predict(X_test_encoded)

In [79]:
print("XGBoost n_estimators = 100")
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, pred)))

XGBoost n_estimators = 100
Mean Absolute Error (MAE): 81.52782540698011
Mean Squared Error (MSE): 37576.03206247993
Root Mean Squared Error (RMSE): 193.84538184460297


# Bayesian Ridge Regressor 

In [80]:
BayReg = BayesianRidge()
BayReg.fit(X_train_encoded, y_train)

BayesianRidge()

In [81]:
pred = BayReg.predict(X_test_encoded)

In [83]:
print("Bayesian Ridge Regression: ")
print('Mean Absolute Error (MAE):', metrics.mean_absolute_error(y_test, pred))
print('Mean Squared Error (MSE):', metrics.mean_squared_error(y_test, pred))
print('Root Mean Squared Error (RMSE):', np.sqrt(metrics.mean_squared_error(y_test, pred)))

Bayesian Ridge Regression: 
Mean Absolute Error (MAE): 141.46531343928152
Mean Squared Error (MSE): 69932.50152871764
Root Mean Squared Error (RMSE): 264.4475402205845
