This notebook explores tree-based machine learning algorithms. The best results so far came from a Random Forest Regressor, so here we evaluate other models of the same family: a single Decision Tree Regressor and an Extra Trees Regressor.

In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import accuracy_score, f1_score
from sklearn import metrics
import importlib.util

In [None]:
# Import the data-loading script by file path (relative to this notebook's
# working directory) instead of by package name.
loader_path = "../src/data/loadTrainTestData.py"
spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", loader_path)

# Build an empty module object from the spec, then run the script's
# top-level code inside it so its functions become attributes.
loadTrainPosted = importlib.util.module_from_spec(spec)
spec.loader.exec_module(loadTrainPosted)

# The loader returns the feature frames and targets already split
# into train and test parts.
X_train, X_test, y_train, y_test = loadTrainPosted.loadTrainTestPostedWaitTimes()

In [None]:
# Columns whose values are reduced to an integer `<name>_HOUR` feature by the
# preprocessing cell. Order matters: it fixes the order of the derived columns.
parse_times = [
    # MK* opening / closing columns
    *["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
      "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM", "MKCLOSETOM"],
    # EP* opening / closing columns
    *["EPOPEN", "EPCLOSE", "EPEMHOPEN", "EPEMHCLOSE",
      "EPOPENYEST", "EPCLOSEYEST", "EPOPENTOM", "EPCLOSETOM"],
    # HS* opening / closing columns
    *["HSOPEN", "HSCLOSE", "HSEMHOPEN", "HSEMHCLOSE",
      "HSOPENYEST", "HSCLOSEYEST", "HSOPENTOM", "HSCLOSETOM"],
    # AK* opening / closing columns (note: ordered differently from the others)
    *["AKOPEN", "AKCLOSE", "AKEMHOPEN", "AKOPENYEST",
      "AKCLOSEYEST", "AKEMHCLOSE", "AKOPENTOM", "AKCLOSETOM"],
    # *PRD*, *FIRE*, *SHW* time columns, grouped by the same four prefixes
    *["MKPRDDT1", "MKPRDDT2", "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2"],
    *["EPFIRET1", "EPFIRET2"],
    *["HSPRDDT1", "HSFIRET1", "HSFIRET2", "HSSHWNT1", "HSSHWNT2"],
    *["AKPRDDT1", "AKPRDDT2", "AKSHWNT1", "AKSHWNT2"],
]

In [80]:
def extract_hour_features(frame, time_columns):
    """Replace raw time columns with integer ``<column>_HOUR`` features.

    For every name in ``time_columns`` that is still present in ``frame``,
    missing values are replaced by the sentinel string "99", the leading one or
    two digits of each value are parsed as an integer, and the result is stored
    as a nullable ``Int8`` column named ``<column>_HOUR``. The raw column is
    then dropped.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table; it is not modified.
    time_columns : list of str
        Names of the string-valued time columns to convert.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``_HOUR`` columns appended in ``time_columns``
        order and the raw columns removed. If none of the columns is present
        (e.g. the cell is re-run), ``frame`` itself is returned unchanged.
    """
    present = [col for col in time_columns if col in frame.columns]
    if not present:
        return frame

    hour_features = {}
    for col in present:
        # The original lambda tested `x[0] != 0` (str vs int, always True), so it
        # always took the first two characters. Extracting the leading 1-2 digits
        # gives the same result for two-digit hours and also copes with values
        # such as "9:00" instead of failing in int().
        hour_text = frame[col].fillna("99").str.extract(r"^\s*(\d{1,2})", expand=False)
        hour_features[f"{col}_HOUR"] = hour_text.astype(int).astype("Int8")

    return frame.assign(**hour_features).drop(columns=present)


def fill_missing_values(frame):
    """Fill nulls column by column: back-fill first, then the column median.

    Back-fill copies the next valid value upwards, so trailing nulls at the end
    of a column survive it; those are filled with the median of the back-filled
    column. Values are written back with plain column assignment, because
    ``frame[col].fillna(..., inplace=True)`` is chained assignment and does not
    update ``frame`` under pandas copy-on-write.

    Parameters
    ----------
    frame : pandas.DataFrame
        Feature table; columns containing nulls are overwritten in place.

    Returns
    -------
    pandas.DataFrame
        The same ``frame`` object, for convenient chaining.
    """
    for col in frame.columns[frame.isnull().any()]:
        filled = frame[col].bfill()
        if filled.isnull().any():
            filled = filled.fillna(filled.median())
        frame[col] = filled
    return frame


# Apply identical preprocessing to both splits.
# NOTE(review): each split is imputed from its own values (test nulls use the
# test median), as in the original cell. Consider imputing the test split from
# training statistics instead; that would change the recorded metrics.
X_train = fill_missing_values(extract_hour_features(X_train, parse_times))
X_test = fill_missing_values(extract_hour_features(X_test, parse_times))

In [81]:
# Baseline: one depth-limited regression tree trained on every feature column.
dt = DecisionTreeRegressor(max_depth=6, random_state=42).fit(X_train, y_train)
y_pred = dt.predict(X_test)

In [82]:
# Compute each error once; RMSE is derived from the MSE already in hand.
# NOTE(review): in the recorded output RMSE (~470) is far larger than MAE (~13),
# which means a few test rows have very large errors. Possibly outlier or
# sentinel values in the target; worth checking y_test.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

Mean Absolute Error (MAE): 13.16068313035664
Mean Squared Error (MSE): 221682.33471597004
Root Mean Squared Error (RMSE): 470.83153538815776


In [83]:
# Two-column table: column 0 = feature name, column 1 = importance in the
# fitted decision tree, ordered from most to least important.
importance_table = pd.DataFrame([X_train.columns, dt.feature_importances_]).T
features = importance_table.sort_values([1], ascending = [False])

# Keep only features whose importance is above a near-zero threshold.
is_informative = features[1] > 0.000001
important_features = list(features.loc[is_informative, 0])
important_features

['Temperature (C)',
 'Ride_name_tomorrowland transit authority peoplemover',
 'Wind Angle',
 'Age_of_ride_days',
 'WDW_TICKET_SEASON_regular',
 'HOLIDAYN_prs',
 'AKHOURS',
 'Age_of_ride_years',
 'TL_rank',
 'Ride_type_big_drops',
 'MKFIREN_happily ever after',
 'Ride_duration_min',
 'MKHOURS',
 'WEATHER_WDWHIGH']

In [84]:
# Refit the same tree configuration on the selected feature subset only.
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]
dt = DecisionTreeRegressor(max_depth=6, random_state=42).fit(X_train_selected, y_train)
y_pred = dt.predict(X_test_selected)

In [85]:
# Test-set errors for the tree trained on the selected features.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

Mean Absolute Error (MAE): 13.145344036755082
Mean Squared Error (MSE): 221681.71600148827
Root Mean Squared Error (RMSE): 470.8308783432627


In [95]:
# Extra Trees ensemble at depth 6, trained on every feature column against the
# 'SACTMIN' target column (passed as a 1-D Series).
et = ExtraTreesRegressor(max_depth=6, random_state=42).fit(X_train, y_train['SACTMIN'])
y_pred = et.predict(X_test)

In [96]:
# Test-set errors for the depth-6 Extra Trees model on all features.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

Mean Absolute Error (MAE): 13.821088580265943
Mean Squared Error (MSE): 223012.6590672642
Root Mean Squared Error (RMSE): 472.2421614672542


In [97]:
# Two-column table: column 0 = feature name, column 1 = importance in the
# fitted Extra Trees model, ordered from most to least important.
importance_table = pd.DataFrame([X_train.columns, et.feature_importances_]).T
features = importance_table.sort_values([1], ascending = [False])

# Keep only features whose importance is above a near-zero threshold.
is_informative = features[1] > 0.000001
important_features = list(features.loc[is_informative, 0])
important_features

['Wind Angle',
 'Temperature (C)',
 'Wind Quality Code_passed gross limits check if element is present',
 'Wind Quality Code_passed all quality control checks, data originate from an ncei data source',
 'Visibility Distance (M)',
 'Wind Type Code_normal',
 'Ride_name_seven dwarfs mine train',
 'Wind Type Code_calm',
 'Ride_name_tomorrowland transit authority peoplemover',
 'MKCLOSEYEST_HOUR',
 'AKHOURS',
 'CapacityLost_AK',
 'Wind Speed',
 'Age_of_ride_days',
 'MKHOURSYEST',
 'WDW_TICKET_SEASON_regular',
 'Age_of_ride_years',
 'inSession_Drive_CA',
 'AKEMHCLOSE_HOUR',
 'AKCLOSETOM_HOUR',
 'HSSHWNGT',
 'inSession_SoCal',
 'inSession_California',
 'CapacityLostWGT_MK',
 'AKHOURSEMHTOM',
 'inSession_Central_FL',
 'inSession_Florida',
 'HSSHWNT2_HOUR',
 'WDW_TICKET_SEASON_peak',
 'YEAR',
 'CapacityLost_MK',
 'CapacityLost_HS',
 'Cloud Height',
 'inSession_NY_NJ',
 'Cloud Determination Code_missing',
 'AKCLOSE_HOUR',
 'EPEMHCLOSE_HOUR',
 'AKHOURSEMHYEST',
 'Fast_pass',
 'CapacityLostWGT_AK'

In [99]:
# Refit the depth-6 Extra Trees model on the selected feature subset only.
X_train_selected = X_train[important_features]
X_test_selected = X_test[important_features]
et = ExtraTreesRegressor(max_depth=6, random_state=42).fit(X_train_selected, y_train['SACTMIN'])
y_pred = et.predict(X_test_selected)

In [100]:
# Test-set errors for the depth-6 Extra Trees model on the selected features.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

Mean Absolute Error (MAE): 13.601841697360513
Mean Squared Error (MSE): 222103.11234275103
Root Mean Squared Error (RMSE): 471.27816875254365


In [101]:
# Shallower variant: Extra Trees limited to depth 3, trained on all features.
et = ExtraTreesRegressor(max_depth=3, random_state=42).fit(X_train, y_train['SACTMIN'])
y_pred = et.predict(X_test)

In [102]:
# Test-set errors for the depth-3 Extra Trees model.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

Mean Absolute Error (MAE): 16.277383186515056
Mean Squared Error (MSE): 222481.03455459536
Root Mean Squared Error (RMSE): 471.6789528424979


In [103]:
# Deeper variant: Extra Trees allowed to grow to depth 13, trained on all features.
et = ExtraTreesRegressor(max_depth=13, random_state=42).fit(X_train, y_train['SACTMIN'])
y_pred = et.predict(X_test)

In [104]:
# Test-set errors for the depth-13 Extra Trees model.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error (MAE):', mae)
print('Mean Squared Error (MSE):', mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(mse))

Mean Absolute Error (MAE): 12.51285246393772
Mean Squared Error (MSE): 221889.68008436688
Root Mean Squared Error (RMSE): 471.0516745372708
