In [1]:
import warnings
warnings.filterwarnings('ignore')

import importlib.util
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn import metrics
from sklearn.dummy import DummyRegressor

from sklearn.ensemble import RandomForestRegressor


import numpy as np
import pandas as pd

In [2]:
# Columns holding clock times as strings. A later cell turns each of them
# into an integer "<name>_HOUR" feature and drops the original column.
# The order is significant: it fixes the order of the generated columns.

# Opening/closing columns, one group of eight per park prefix (MK, EP, HS, AK).
_open_close_cols = [
    "MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
    "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM", "MKCLOSETOM",
    "EPOPEN", "EPCLOSE", "EPEMHOPEN", "EPEMHCLOSE",
    "EPOPENYEST", "EPCLOSEYEST", "EPOPENTOM", "EPCLOSETOM",
    "HSOPEN", "HSCLOSE", "HSEMHOPEN", "HSEMHCLOSE",
    "HSOPENYEST", "HSCLOSEYEST", "HSOPENTOM", "HSCLOSETOM",
    "AKOPEN", "AKCLOSE", "AKEMHOPEN", "AKOPENYEST",
    "AKCLOSEYEST", "AKEMHCLOSE", "AKOPENTOM", "AKCLOSETOM",
]

# Scheduled-event columns (presumably parades, fireworks and shows, judging
# by the PRD/FIRE/SHW name fragments -- TODO confirm against the data dictionary).
_event_time_cols = [
    "MKPRDDT1", "MKPRDDT2", "MKPRDNT1", "MKPRDNT2",
    "MKFIRET1", "MKFIRET2",
    "EPFIRET1", "EPFIRET2",
    "HSPRDDT1", "HSFIRET1", "HSFIRET2", "HSSHWNT1", "HSSHWNT2",
    "AKPRDDT1", "AKPRDDT2", "AKSHWNT1", "AKSHWNT2",
]

parse_times = _open_close_cols + _event_time_cols

# Load posted wait time datasets

In [None]:
# The loader lives in a plain script outside any installed package, so it is
# imported by file path (relative to the notebook's working directory).
loader_path = "../src/data/loadTrainTestData.py"
spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", loader_path)

# Create an empty module object from the spec, then execute the script's
# code inside that module's namespace.
loadTrainPosted = importlib.util.module_from_spec(spec)
spec.loader.exec_module(loadTrainPosted)

# The loader hands back four objects: features and targets for each split.
(X_train, X_test,
 y_train, y_test) = loadTrainPosted.loadTrainTestPostedWaitTimes()

### Convert key data points from date to integer

In [None]:
# Expand the timestamp columns into small nullable-integer calendar features.
# Both splits get exactly the same four derived columns.
for split_frame in (X_train, X_test):
    split_frame["MONTHOFYEAR"] = split_frame["date"].dt.month.astype("Int8")
    split_frame["YEAR"] = split_frame["date"].dt.year.astype("Int16")
    split_frame["DAYOFYEAR"] = split_frame["date"].dt.dayofyear.astype("Int16")
    split_frame["HOUROFDAY"] = split_frame["datetime"].dt.hour.astype("Int8")

### Sort by datetime before imputation (keeping y-values associated)

In [None]:
# Attach each target to its feature frame so the rows stay paired while the
# data is reordered chronologically.
train = pd.concat([X_train, y_train], axis=1)
train = train.sort_values(by="datetime")

test = pd.concat([X_test, y_test], axis=1)
test = test.sort_values(by="datetime")

In [None]:
# Separate the (now sorted) target column from the features again.
target_col = "POSTED_WAIT"

y_train = train[target_col]
X_train_impute = train.drop(columns=[target_col])

y_test = test[target_col]
X_test_impute = test.drop(columns=[target_col])

In [None]:
# The combined frames were only needed for sorting; features and targets have
# already been split back out of them, so drop these names.
del train, test

### Many open/close times, parade times, etc. are in HH:MM format. 

Convert to integer hour & fill nulls with 99.

A value of 99 means that particular event does not exist for that day (e.g. Magic Kingdom doesn't have a second parade).

In [None]:
# Reduce every HH:MM string column of the training features to its integer
# hour, stored as "<col>_HOUR". Missing values become the sentinel 99.
#
# The previous implementation tested `x[0] != 0`, comparing a character with
# the integer 0. That is always true, so it always took the first two
# characters and crashed on single-digit hours such as "9:00" ("9:" is not
# an integer). Extracting the leading one or two digits handles both "09:00"
# and "9:00", and still maps the "99" sentinel to 99.
for col in parse_times:
    hour_text = (
        X_train_impute[col]
        .fillna("99")
        .str.strip()
        .str.extract(r"^(\d{1,2})", expand=False)
    )
    X_train_impute[f"{col}_HOUR"] = hour_text.astype(int).astype("Int8")

# Drop the raw string columns in one pass now that every hour is extracted.
X_train_impute = X_train_impute.drop(columns=parse_times)

In [None]:
# Reduce every HH:MM string column of the test features to its integer hour,
# stored as "<col>_HOUR". Missing values become the sentinel 99.
#
# The previous implementation tested `x[0] != 0`, comparing a character with
# the integer 0. That is always true, so it always took the first two
# characters and crashed on single-digit hours such as "9:00" ("9:" is not
# an integer). Extracting the leading one or two digits handles both "09:00"
# and "9:00", and still maps the "99" sentinel to 99.
for col in parse_times:
    hour_text = (
        X_test_impute[col]
        .fillna("99")
        .str.strip()
        .str.extract(r"^(\d{1,2})", expand=False)
    )
    X_test_impute[f"{col}_HOUR"] = hour_text.astype(int).astype("Int8")

# Drop the raw string columns in one pass now that every hour is extracted.
X_test_impute = X_test_impute.drop(columns=parse_times)

In [None]:
# Fill the remaining gaps in the training features. The frame is sorted by
# datetime, so a back-fill copies the next later observation into each gap;
# any nulls left at the tail of a column fall back to that column's median.
#
# The result is assigned back explicitly: calling fillna(..., inplace=True)
# on `frame[col]` is chained assignment, which is not guaranteed to modify
# the frame, and fillna(method=...) is deprecated in favour of .bfill().
for col in X_train_impute.columns:
    column = X_train_impute[col]
    if not column.isnull().any():
        continue

    print(col)  # report which columns needed imputation
    filled = column.bfill()

    if filled.isnull().any():
        filled = filled.fillna(filled.median())

    X_train_impute[col] = filled

In [None]:
# Fill the remaining gaps in the test features. The frame is sorted by
# datetime, so a back-fill copies the next later observation into each gap;
# any nulls left at the tail of a column fall back to that column's median.
#
# The result is assigned back explicitly: calling fillna(..., inplace=True)
# on `frame[col]` is chained assignment, which is not guaranteed to modify
# the frame, and fillna(method=...) is deprecated in favour of .bfill().
#
# NOTE(review): the fallback median is computed from the test split itself,
# as before; confirm whether the training median should be used instead.
for col in X_test_impute.columns:
    column = X_test_impute[col]
    if not column.isnull().any():
        continue

    print(col)  # report which columns needed imputation
    filled = column.bfill()

    if filled.isnull().any():
        filled = filled.fillna(filled.median())

    X_test_impute[col] = filled

In [None]:
# Columns that must not reach the models: the raw timestamps (their calendar
# parts were extracted earlier) and 'Unnamed: 0' (presumably a leftover
# index column from a CSV round-trip -- TODO confirm).
non_feature_cols = ['date', 'datetime', 'Unnamed: 0']

X_train_encoded = X_train_impute.drop(columns=non_feature_cols)
X_test_encoded = X_test_impute.drop(columns=non_feature_cols)

In [None]:
# The imputed frames have been copied into the *_encoded frames (minus the
# non-feature columns), so these names are no longer needed.
del X_train_impute, X_test_impute

In [None]:
# Find boolean training features that are constant or nearly constant
# (variance not above 0.001) and remove them from both splits.
bool_frame = X_train_encoded.select_dtypes(include=['bool']).reset_index(drop=True)

selector = VarianceThreshold(threshold=0.001)
selector.fit(bool_frame)

# get_support() flags the columns that pass the threshold; invert it to get
# the ones to discard, keeping their original order.
kept_mask = selector.get_support()
concol = bool_frame.columns[~kept_mask].tolist()

del selector, bool_frame

# "Weather Type" is always retained, even if it falls below the threshold.
concol = [name for name in concol if name != "Weather Type"]

print("DROPPING BOOL: ", concol)
X_train_encoded.drop(columns=concol, inplace=True)
X_test_encoded.drop(columns=concol, inplace=True)

In [None]:
scaler = StandardScaler()

# The numeric columns of the training split define which features are scaled
# and in what order.
X_dtype_train = X_train_encoded.select_dtypes(include=[np.number]).reset_index(drop=True)
num_cols = list(X_dtype_train.columns)

# Select the test columns by the training column list rather than by an
# independent dtype query. The scaled values are later written back under
# `num_cols`, so both splits must present exactly the same columns in the
# same order; a missing column now fails loudly with a KeyError instead of
# silently mis-aligning features.
X_dtype_test = X_test_encoded[num_cols].reset_index(drop=True)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler.fit(X_dtype_train)
X_train_norm = scaler.transform(X_dtype_train)
X_test_norm = scaler.transform(X_dtype_test)

# Write the scaled values back over the original numeric columns.
X_test_encoded[num_cols] = X_test_norm
X_train_encoded[num_cols] = X_train_norm

In [None]:
# The scaled values now live in the *_encoded frames; drop the intermediate
# arrays and numeric-only frames used to produce them.
del X_train_norm, X_test_norm, X_dtype_train, X_dtype_test

# Dummy Regression - Baseline

In [None]:
# Baseline model: ignores the features and always predicts the mean of the
# training target.
lm_dummy_mean = DummyRegressor(strategy='mean')
lm_dummy_mean.fit(X_train_encoded, y_train)

y_predict_dummy_mean = lm_dummy_mean.predict(X_test_encoded)

In [None]:
# Test-set errors of the mean-predicting baseline.
dummy_mean_mse = metrics.mean_squared_error(y_test, y_predict_dummy_mean)
dummy_mean_mae = metrics.mean_absolute_error(y_test, y_predict_dummy_mean)

print(f"Mean squared error (dummy): {dummy_mean_mse:.2f}")

print('Mean Absolute Error (MAE):', dummy_mean_mae)


In [None]:
# Baseline model: ignores the features and always predicts the median of the
# training target.
lm_dummy_median = DummyRegressor(strategy='median')
lm_dummy_median.fit(X_train_encoded, y_train)

y_predict_dummy_median = lm_dummy_median.predict(X_test_encoded)

In [None]:
# Test-set errors of the median-predicting baseline.
dummy_median_mse = metrics.mean_squared_error(y_test, y_predict_dummy_median)
dummy_median_mae = metrics.mean_absolute_error(y_test, y_predict_dummy_median)

print(f"Mean squared error (dummy): {dummy_median_mse:.2f}")

print('Mean Absolute Error (MAE):', dummy_median_mae)

# Ridge Regression Grid Search on Alpha

In [None]:
# Ridge regression with the regularisation strength chosen by built-in
# cross-validation over three candidate alphas, scored on (negated) MAE.
ridge_alphas = [1e-1, 1, 10]
linRidge = RidgeCV(alphas=ridge_alphas, scoring='neg_mean_absolute_error')
linRidge = linRidge.fit(X_train_encoded, y_train)

In [30]:
# Summarise the fitted ridge model: intercept, R-squared on both splits, and
# how many coefficients are non-zero.
ridge_r2_train = linRidge.score(X_train_encoded, y_train)
ridge_r2_test = linRidge.score(X_test_encoded, y_test)
ridge_nonzero = np.sum(linRidge.coef_ != 0)

print('Disney dataset')
print(f'ridge regression linear model intercept: {linRidge.intercept_}')
print(f'R-squared score (training): {ridge_r2_train:.3f}')
print(f'R-squared score (test): {ridge_r2_test:.3f}')
print(f'Number of non-zero features: {ridge_nonzero}')

Disney dataset
ridge regression linear model intercept: -186.69367854762902
R-squared score (training): 0.173
R-squared score (test): 0.173
Number of non-zero features: 376


In [32]:
# The ridge model has been reported on and is not used below; drop the name.
del linRidge

# Random Forest Regression

In [18]:
# Random forest of 10 trees; the fixed random_state makes the fit repeatable.
regressor = RandomForestRegressor(random_state=0, n_estimators=10)

# Train on the encoded, scaled training features.
regressor.fit(X_train_encoded, y_train)

RandomForestRegressor(n_estimators=10, random_state=0)

In [None]:
# Predict the target for the test split with the fitted random forest.
predictions = regressor.predict(X_test_encoded)

In [25]:
# Test-set error metrics for the random forest.
# The forest's predictions are stored in `predictions`; the earlier code
# referenced an undefined name `pred`, which raises NameError on a fresh
# kernel.
rf_mae = metrics.mean_absolute_error(y_test, predictions)
rf_mse = metrics.mean_squared_error(y_test, predictions)

print('Mean Absolute Error (MAE):', rf_mae)
print('Mean Squared Error (MSE):', rf_mse)
print('Root Mean Squared Error (RMSE):', np.sqrt(rf_mse))

Mean Absolute Error (MAE): 32.55780451043366
Mean Squared Error (MSE): 14946.978712447142
Root Mean Squared Error (RMSE): 122.25783701852059
Mean Absolute Percentage Error (MAPE): inf
Accuracy: -inf


In [39]:
# Per-feature importance scores exposed by the fitted forest.
importances = regressor.feature_importances_

# argsort orders positions from least to most important; reverse that so the
# most important feature comes first.
ascending_indices = np.argsort(importances)
sorted_indices = ascending_indices[::-1]

In [42]:
# Show the names of the 50 features the forest ranks as most important.
top_feature_positions = sorted_indices[:50]
X_train_encoded.columns[top_feature_positions]

Index(['HOUROFDAY', 'Age_of_ride_years',
       'Ride_name_walt disney's carousel of progress', 'Age_of_ride_days',
       'Ride_duration_min', 'CapacityLostWGT_MK', 'MKEMHCLOSE_HOUR',
       'Temperature (C)', 'MKOPEN_HOUR', 'Wind Angle', 'TL_rank', 'Wind Speed',
       'WDW_TICKET_SEASON_none', 'Ride_name_prince charming regal carrousel',
       'Cloud Height', 'CapacityLost_MK', 'WDWMINTEMP', 'WDWMAXTEMP',
       'WDWMEANTEMP', 'WEATHER_WDWLOW', 'Park_area_fantasyland',
       'Ride_name_tomorrowland transit authority peoplemover', 'DAYOFYEAR',
       'HOLIDAYPX', 'MKEMHMORN', 'MKeventN_dah', 'DAYOFWEEK', 'MKCLOSE_HOUR',
       'MKFIREN_happily ever after', 'MKFIREN_happy hallowishes fireworks',
       'WEATHER_WDWHIGH', 'CapacityLost_EP', 'Visibility Distance (M)',
       'MKHOURSEMH', 'Ride_name_jungle cruise', 'MKEMHOPEN_HOUR',
       'Park_area_tomorrowland', 'MKHOURSEMHYEST', 'Height_req_inches',
       'Ride_type_dark', 'MKHOURSEMHTOM', 'MKFIRET1_HOUR', 'new_case',
       'MKH