In [1]:
import warnings
warnings.filterwarnings('ignore')

import swifter
import importlib.util
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.compose import make_column_selector as selector, ColumnTransformer, make_column_transformer
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics



import pandas as pd
import numpy as np
from scipy.stats import skew
import joblib
import altair as alt

In [2]:
parse_times = ["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
               "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM",
               "MKCLOSETOM","EPOPEN", "EPCLOSE", "EPEMHOPEN",
               "EPEMHCLOSE", "EPOPENYEST", "EPCLOSEYEST",
               "EPOPENTOM", "EPCLOSETOM", "HSOPEN", "HSCLOSE",
               "HSEMHOPEN", "HSEMHCLOSE", "HSOPENYEST", "HSCLOSEYEST",
               "HSOPENTOM", "HSCLOSETOM", "AKOPEN", "AKCLOSE",
               "AKEMHOPEN", "AKOPENYEST", "AKCLOSEYEST","AKEMHCLOSE",
               "AKOPENTOM", "AKCLOSETOM", "MKPRDDT1", "MKPRDDT2",
               "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2",
               "EPFIRET1", "EPFIRET2", "HSPRDDT1", "HSFIRET1",
               "HSFIRET2", "HSSHWNT1", "HSSHWNT2", "AKPRDDT1",
               "AKPRDDT2", "AKSHWNT1", "AKSHWNT2"]

In [3]:
# specify the module that needs to be
# imported relative to the path of the
# module
spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", "../src/data/loadTrainTestData.py")

# creates a new module based on spec
loadTrainPosted = importlib.util.module_from_spec(spec)

# executes the module in its own namespace
# when a module is imported or reloaded.
spec.loader.exec_module(loadTrainPosted)

X_train, X_test, y_train, y_test = loadTrainPosted.loadTrainTestPostedWaitTimes()

In [4]:
X_train["MONTHOFYEAR"] = X_train["date"].dt.month.astype("Int8")
X_train["YEAR"] = X_train["date"].dt.year.astype("Int16")
X_train["DAYOFYEAR"] = X_train["date"].dt.dayofyear.astype("Int16")
X_train["HOUROFDAY"] = X_train["datetime"].dt.hour.astype("Int8")

X_test["MONTHOFYEAR"] = X_test["date"].dt.month.astype("Int8")
X_test["YEAR"] = X_test["date"].dt.year.astype("Int16")
X_test["DAYOFYEAR"] = X_test["date"].dt.dayofyear.astype("Int16")
X_test["HOUROFDAY"] = X_test["datetime"].dt.hour.astype("Int8")

In [5]:
train = pd.concat([X_train, y_train], axis=1).sort_values(['datetime'])
test = pd.concat([X_test, y_test], axis=1).sort_values(['datetime'])

X_train_impute = train.drop(columns=["POSTED_WAIT"])
y_train = train["POSTED_WAIT"]

X_test_impute = test.drop(columns=["POSTED_WAIT"])
y_test = test["POSTED_WAIT"]

In [6]:
X_train_clean = X_train_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])
X_test_clean = X_test_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])

# Pipeline

Helper Function for Imputation & Log Transformation

In [7]:
allCols = list(X_train_clean.columns)

def imputeTransform(x):
    for col in x:
        print(col)
        if col in parse_times:
            x[col] = x[col].fillna("99")
            x[col] = x[col].apply(lambda h: h[:2] if h[0]!=0 else h[:1]).astype(int).astype("Int8")
            
        x[col] = x[col].fillna(method ='bfill')
        x[col] = x[col].fillna(x[col].median())
    
            
        if (x[col].dtype != "bool") and (abs(skew(list(x[col]))) > 0.8):
            x[f"log_{col}"] = x[col].swifter.apply(lambda k: np.log(k+20)) # +20 linear scale on all values to ensure no resulting -inf vals
            x.drop(columns=[col], inplace=True)
    return x

In [8]:
preprocessor = make_column_transformer(
      (VarianceThreshold(threshold=0.001), selector(dtype_include="bool")),
      (RobustScaler(), selector(dtype_include=np.number)), remainder='passthrough')


In [9]:
# rf = Pipeline(
#     steps=[("imputerAndLogTransformer", FunctionTransformer(imputeTransform)),
#            ("preprocessor", preprocessor), 
#            ("regressor", RandomForestRegressor(n_estimators=10, max_depth=50, n_jobs=-1, random_state=0))]
# )

# rf.fit(X_train_clean, y_train)

# joblib.dump(rf, 'pipeline.pkl' + '.gz', compress=('gzip', 5)) 

############
#OR
############

rf = joblib.load("pipeline.pkl.gz")

In [10]:
pred = rf.predict(X_test_clean)

Ride_type_thrill
Ride_type_spinning
Ride_type_slow
Ride_type_small_drops
Ride_type_big_drops
Ride_type_dark
Fast_pass
Classic
Age_interest_preschoolers
Age_interest_tweens
Age_interest_teens
Age_interest_adults
Height_req_inches
Ride_duration_min
Age_of_ride_days
Age_of_ride_years
TL_rank
TA_Stars
DAYOFWEEK
DAYOFYEAR
WEEKOFYEAR
MONTHOFYEAR
YEAR
HOLIDAYPX


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HOLIDAYM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HOLIDAY
WDWevent
WDWMAXTEMP
WDWMINTEMP
WDWMEANTEMP
MKevent
EPevent
HSevent
inSession


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Enrollment


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_wdw


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_dlr


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_sqrt_WDW


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_sqrt_DLR


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_California


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_DC


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Central_FL


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Drive1_FL


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Drive2_FL


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Drive_CA


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Florida


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Mardi_Gras
inSession_Midwest


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_NY_NJ


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_NY_NJ_PA


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_New_England


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_New_Jersey


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Nothwest
INSESSION_PLANES
inSession_SoCal


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

inSession_Southwest


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKEMHMORN
MKEMHMYEST
MKEMHMTOM
MKEMHEVE
MKHOURSEMH
MKHOURSEMHYEST
MKHOURSEMHTOM
MKEMHEYEST
MKEMHETOM
EPEMHMORN
EPEMHMYEST
EPEMHMTOM
EPEMHEVE
EPEMHEYEST
EPEMHETOM
EPHOURSEMH
EPHOURSEMHYEST
EPHOURSEMHTOM
HSEMHMORN
HSEMHMYEST
HSEMHMTOM
HSEMHEVE
HSEMHEYEST
HSEMHETOM
HSHOURSEMH
HSHOURSEMHYEST
HSHOURSEMHTOM
AKEMHMORN
AKEMHMYEST
AKEMHMTOM
AKHOURSEMH
AKHOURSEMHYEST
AKHOURSEMHTOM
MKOPEN


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKCLOSE


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKHOURS
MKEMHOPEN
MKEMHCLOSE
MKOPENYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKCLOSEYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKHOURSYEST
MKOPENTOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKCLOSETOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKHOURSTOM
EPOPEN


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPCLOSE


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPHOURS
EPEMHOPEN


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPEMHCLOSE


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPOPENYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPCLOSEYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPHOURSYEST
EPOPENTOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPCLOSETOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPHOURSTOM
HSOPEN


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSCLOSE


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSHOURS
HSEMHOPEN


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSEMHCLOSE


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSOPENYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSCLOSEYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSHOURSYEST
HSOPENTOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSCLOSETOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSHOURSTOM
AKOPEN


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

AKCLOSE


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

AKHOURS
AKEMHOPEN
AKEMHCLOSE
AKOPENYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

AKCLOSEYEST


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

AKHOURSYEST
AKOPENTOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

AKCLOSETOM


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

AKHOURSTOM
WEATHER_WDWHIGH
WEATHER_WDWLOW
CapacityLost_MK
CapacityLost_EP
CapacityLost_HS
CapacityLost_AK
CapacityLostWGT_MK
CapacityLostWGT_EP
CapacityLostWGT_HS
CapacityLostWGT_AK
MKPRDDAY


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKPRDDT1


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKPRDDT2


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKPRDNGT
MKPRDNT1
MKPRDNT2


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKFIREWK


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKFIRET1


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

MKFIRET2


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPFIREWK


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPFIRET1


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

EPFIRET2


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSPRDDT1
HSFIREWK
HSFIRET1


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSFIRET2


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSSHWNGT
HSSHWNT1


Dask Apply:   0%|          | 0/64 [00:00<?, ?it/s]

HSSHWNT2
AKPRDDT1
AKPRDDT2
AKSHWNGT
AKSHWNT1
AKSHWNT2
new_case
Wind Angle
Wind Speed
Cloud Height
Visibility Distance (M)
Temperature (C)
Weather Type
WDW_TICKET_SEASON_none
WDW_TICKET_SEASON_peak
WDW_TICKET_SEASON_regular
WDW_TICKET_SEASON_value
SEASON_christmas
SEASON_christmas peak
SEASON_columbus day
SEASON_easter
SEASON_fall
SEASON_halloween
SEASON_jersey week
SEASON_july 4th
SEASON_mardi gras
SEASON_martin luther king junior day
SEASON_memorial day
SEASON_none
SEASON_presidents week
SEASON_september low
SEASON_spring
SEASON_summer break
SEASON_thanksgiving
SEASON_winter
HOLIDAYN_ash
HOLIDAYN_ash|val
HOLIDAYN_cdm
HOLIDAYN_chv
HOLIDAYN_chv|pas
HOLIDAYN_cmd
HOLIDAYN_cmd|han
HOLIDAYN_cme
HOLIDAYN_col
HOLIDAYN_col|suk
HOLIDAYN_elc
HOLIDAYN_esm
HOLIDAYN_ess
HOLIDAYN_fat
HOLIDAYN_gfr
HOLIDAYN_hal
HOLIDAYN_hal|nvd
HOLIDAYN_han
HOLIDAYN_ind
HOLIDAYN_lab
HOLIDAYN_mem
HOLIDAYN_mgs
HOLIDAYN_mlk
HOLIDAYN_mot
HOLIDAYN_njc
HOLIDAYN_njc|vet
HOLIDAYN_none
HOLIDAYN_nvd
HOLIDAYN_nyd
HOLIDAYN_nye
HO

In [11]:
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
r2 = metrics.r2_score(y_test, pred)

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-SQUARED: ", r2)

MAE:  32.60092688790606
MSE:  14949.657417195534
RMSE:  122.26879167308203
R-SQUARED:  0.8232275272328865


In [12]:
print("(PREDICTED, ACTUAL)")
for x in zip(np.round(pred[3000:3020], 0), y_test[3000:3020].values):
    print(x)

(PREDICTED, ACTUAL)
(29.0, 30)
(24.0, 30)
(24.0, 30)
(14.0, 20)
(24.0, 20)
(60.0, 70)
(29.0, 30)
(6.0, 5)
(26.0, 20)
(25.0, 30)
(12.0, 20)
(12.0, 20)
(81.0, 95)
(26.0, 30)
(12.0, 10)
(29.0, 25)
(10.0, 10)
(6.0, 5)
(55.0, 55)
(35.0, 40)


# Regression Results Exploration

In [13]:
original_features = X_test_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])

all_test = pd.concat([original_features, pd.Series(pred)], axis = 1).rename(columns={0:"PREDICTED_WAIT"})

### Big Thunder Mountain Railroad


This ride is predicted to be down significantly less after 10AM with wait times remaining fairly consistent after 10AM.

The wait times are highly correlated with capacity metrics & whether there is a show/parade at Magic Kingdom.

In [226]:
mean_time_btm = pd.DataFrame(all_test[(all_test["Ride_name_big thunder mountain railroad"]==1) & 
                        (all_test["PREDICTED_WAIT"]>=0)].groupby("HOUROFDAY")["PREDICTED_WAIT"].mean()).reset_index()


mean_time_btm_chart = alt.Chart(mean_time_btm).mark_line().encode(
    x="HOUROFDAY:Q",
    y=alt.Y("PREDICTED_WAIT:Q", title="Predicted Wait Time (MINS)"),
    color = alt.value("#2C7AAF")
).properties(
    title={
      "text": ["Avg Predicted Wait Time by Hour"],
      "subtitle": ["Big Thunder Mountain Railroad", "(If the ride is not predicted to be down)"]
          }
)  

predicted_down = all_test[(all_test["Ride_name_big thunder mountain railroad"]==1) & 
                        (all_test["PREDICTED_WAIT"]<0)].groupby("HOUROFDAY")["PREDICTED_WAIT"].count()/\
                        all_test[(all_test["Ride_name_big thunder mountain railroad"]==1)].groupby("HOUROFDAY")["PREDICTED_WAIT"].count()

predicted_down_pct = pd.DataFrame(predicted_down)
predicted_down_pct["PREDICTED_WAIT"] = predicted_down_pct["PREDICTED_WAIT"]*100
predicted_down_pct = predicted_down_pct.rename(columns={"PREDICTED_WAIT":"PREDICTED PROB DOWN"}).reset_index()

predictedDown_BTM = alt.Chart(predicted_down_pct).mark_line().encode(
    x='HOUROFDAY',
    y=alt.Y('PREDICTED PROB DOWN', title = "Probability of being down (%)"),
    color = alt.value("#2C7AAF")
).properties(
    title={
      "text": ["Predicted Probability that Ride will be Down by Hour"],
      "subtitle": ["Big Thunder Mountain Railroad"]
    }
)

predictedDown_BTM | mean_time_btm_chart

In [231]:
btm = all_test[(all_test["Ride_name_big thunder mountain railroad"]==1)]

# Create correlation matrix
corr_mat = btm.corr(method='pearson')
  
# Convert correlation matrix to 1-D Series and sort
sorted_mat = corr_mat.unstack().sort_values()

btm_corr = pd.DataFrame(sorted_mat).reset_index().rename(columns={"level_0":"Variable1", "level_1":"Variable2", 0:"Correlation"}) 
pred_wait_corr = btm_corr[(btm_corr["Variable1"]=="PREDICTED_WAIT") & (btm_corr["Variable2"]!="PREDICTED_WAIT") &
                         ((btm_corr["Correlation"]>=0.05)|(btm_corr["Correlation"]<=-0.05))]

pred_wait_corr = pred_wait_corr.sort_values(by="Correlation")

bars = alt.Chart(pred_wait_corr).mark_bar().encode(
    x=alt.X('Variable2:N', sort='y'),
    y='Correlation:Q',
    color=alt.condition(
        alt.datum.Correlation > 0,
        alt.value("#EAC2B1"),  # The positive color
        alt.value("#2C7AAF")  # The negative color
    )
).properties(
    title={
    "text": ["Top Correlations Between Predicted Wait Time & Features"],
      "subtitle": ["Big Thunder Mountain Railroad"]
    }
).configure_axis(labelAngle=30)

bars