In [7]:
import warnings
warnings.filterwarnings('ignore')

import swifter
import importlib.util
from sklearn.preprocessing import RobustScaler, FunctionTransformer
from sklearn.compose import make_column_selector as selector, ColumnTransformer, make_column_transformer
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor


import pandas as pd
import numpy as np
from scipy.stats import skew
import joblib
import altair as alt
from altair_saver import save

import re
import json

In [8]:
parse_times = ["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
               "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM",
               "MKCLOSETOM","EPOPEN", "EPCLOSE", "EPEMHOPEN",
               "EPEMHCLOSE", "EPOPENYEST", "EPCLOSEYEST",
               "EPOPENTOM", "EPCLOSETOM", "HSOPEN", "HSCLOSE",
               "HSEMHOPEN", "HSEMHCLOSE", "HSOPENYEST", "HSCLOSEYEST",
               "HSOPENTOM", "HSCLOSETOM", "AKOPEN", "AKCLOSE",
               "AKEMHOPEN", "AKOPENYEST", "AKCLOSEYEST","AKEMHCLOSE",
               "AKOPENTOM", "AKCLOSETOM", "MKPRDDT1", "MKPRDDT2",
               "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2",
               "EPFIRET1", "EPFIRET2", "HSPRDDT1", "HSFIRET1",
               "HSFIRET2", "HSSHWNT1", "HSSHWNT2", "AKPRDDT1",
               "AKPRDDT2", "AKSHWNT1", "AKSHWNT2"]

In [9]:
def setup():
    with open("../data/processed/dtypes_parsed.json") as json_file:
        dtypes = json.load(json_file)

    return dtypes


def loadTrainTestPostedWaitTimes():
    """
            Loads train test data for posted wait times

            How to use:

            import importlib.util

            spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", "src/data/loadTrainTestData.py")
            loadTrainPosted = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(loadTrainPosted)

            X_train_posted, X_test_posted, y_train_posted, y_test_posted = loadTrainPosted.loadTrainTestPostedWaitTimes()

            Parameters
            ----------

            Returns
            -------
            rideDataDf_trainX - train data features for posted wait times
            rideDataDf_testX - test data features for posted wait times
            rideDataDf_trainY - train data targets for posted wait times
            rideDataDf_testY - test data targets for posted wait times

        """
    parse_dates = ['date', 'datetime']
    X_train_list = []
    y_train_list = []
    dtypes = setup()

    for year in range(2015, 2022):

        rideData = pd.read_csv(f'../data/processed/All_train_postedtimes{year}.csv', dtype=dtypes,
                               parse_dates=parse_dates, compression='gzip')
        rideDataX = rideData.drop(columns=["POSTED_WAIT"])
        rideDataY = rideData["POSTED_WAIT"]
        X_train_list.append(rideDataX)
        y_train_list.append(rideDataY)

    rideDataDf_trainX = pd.concat(X_train_list, ignore_index=True)
    rideDataDf_trainY = pd.concat(y_train_list, ignore_index=True)


    X_test_list = []
    y_test_list = []

    for year in range(2015, 2022):
        rideData = pd.read_csv(f'../data/processed/All_test_postedtimes{year}.csv', dtype=dtypes,
                               parse_dates=parse_dates, compression='gzip')
        rideDataX = rideData.drop(columns=["POSTED_WAIT"])
        rideDataY = rideData["POSTED_WAIT"]

        X_test_list.append(rideDataX)
        y_test_list.append(rideDataY)

    rideDataDf_testX = pd.concat(X_test_list, ignore_index=True)
    rideDataDf_testY = pd.concat(y_test_list, ignore_index=True)

    return rideDataDf_trainX, rideDataDf_testX, rideDataDf_trainY, rideDataDf_testY

In [None]:
X_train, X_test, y_train, y_test = loadTrainTestPostedWaitTimes()

In [10]:
X_train["MONTHOFYEAR"] = X_train["date"].dt.month.astype("Int8")
X_train["YEAR"] = X_train["date"].dt.year.astype("Int16")
X_train["DAYOFYEAR"] = X_train["date"].dt.dayofyear.astype("Int16")
X_train["HOUROFDAY"] = X_train["datetime"].dt.hour.astype("Int8")

X_test["MONTHOFYEAR"] = X_test["date"].dt.month.astype("Int8")
X_test["YEAR"] = X_test["date"].dt.year.astype("Int16")
X_test["DAYOFYEAR"] = X_test["date"].dt.dayofyear.astype("Int16")
X_test["HOUROFDAY"] = X_test["datetime"].dt.hour.astype("Int8")

In [11]:
train = pd.concat([X_train, y_train], axis=1).sort_values(['datetime'])
test = pd.concat([X_test, y_test], axis=1).sort_values(['datetime'])

X_train_impute = train.drop(columns=["POSTED_WAIT"])
y_train = train["POSTED_WAIT"]

X_test_impute = test.drop(columns=["POSTED_WAIT"])
y_test = test["POSTED_WAIT"]

In [12]:
X_train_clean = X_train_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])
X_test_clean = X_test_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])

# Pipeline

Helper Function for Imputation & Log Transformation

In [13]:
allCols = list(X_train_clean.columns)

def impute_transform(x):
    for col in x:
        if col in parse_times:
            x[col] = x[col].fillna("99")
            x[col] = x[col].apply(lambda h: h[:2] if h[0] != 0 else h[:1]).astype(int).astype("Int8")

        x[col] = x[col].fillna(method='bfill')
        x[col] = x[col].fillna(x[col].median())

        if (x[col].dtype != "bool") and (abs(skew(list(x[col]))) > 0.8):
            # +20 linear scale on all values to ensure no resulting -inf vals
            x[f"log_{col}"] = x[col].apply(lambda k: np.log(k + 20))

            x.drop(columns=[col], inplace=True)

    return x


In [14]:
preprocessor = make_column_transformer(
      (RobustScaler(), selector(dtype_include=np.number)), remainder='passthrough')


In [15]:
rf = Pipeline(
    steps=[("imputerAndLogTransformer", FunctionTransformer(impute_transform)),
           ("preprocessor", preprocessor), 
           ("regressor", RandomForestRegressor(n_estimators=10, max_depth=50, n_jobs=-1, random_state=0))]
)

rf.fit(X_train_clean, y_train)

joblib.dump(rf, 'pipeline.pkl' + '.gz', compress=('gzip', 5))

############
#OR
############

# rf = joblib.load("pipeline.pkl.gz")

['pipeline.pkl.gz']

In [16]:
pred = rf.predict(X_test_clean)

In [17]:
mae = metrics.mean_absolute_error(y_test, pred)
mse = metrics.mean_squared_error(y_test, pred)
rmse = np.sqrt(metrics.mean_squared_error(y_test, pred))
r2 = metrics.r2_score(y_test, pred)

print("MAE: ", mae)
print("MSE: ", mse)
print("RMSE: ", rmse)
print("R-SQUARED: ", r2)

MAE:  40.45313127097203
MSE:  22577.2899065588
RMSE:  150.25741215180966
R-SQUARED:  0.7326651661948325


-------------------------------

# Regression Results Exploration

## Feature Importance

In [42]:
feature_importance = pd.Series(rf.steps[2][1].feature_importances_, index=X_train_clean.columns).astype(float)
sorted_importances = feature_importance.sort_values(ascending=False)
top30 = pd.DataFrame(sorted_importances.head(30)).reset_index()
top30.columns = ["feature_name", "importance"]

In [43]:
alt.Chart(top30).mark_bar(color="#90C6FA").encode(
    x=alt.X('feature_name', sort='-y', axis=alt.Axis(labelAngle=30)),
    y='importance'
).properties(title=["Top 30 Features for Random Forest", "(10 trees, 50 max_depth)"])

## Wait Time Analysis by Ride

Taking an exploratory approach to predicted wait times. 

In this analysis, we are considering a prediction less than 0 minutes to mean the ride is "down" and we calculated probability that the ride is down by taking the number of preditions less than 0 during a given hour vs. all predictions for that hour. We are also only looking at predictions from 2018 or later to narrow in on the trends of the most recent years.

In [95]:
original_features = X_test_impute.drop(columns=['date', 'datetime', 'Unnamed: 0'])
all_test = pd.concat([original_features, pd.Series(pred)], axis = 1).rename(columns={0:"PREDICTED_WAIT"})

class RideResults:
    def __init__(self, rideCol, rideName, results, corrThreshold=0.05):
        self.rideCol = rideCol
        self.rideName = rideName
        self.results = results
        self.corrThreshold = corrThreshold
        
    def visualize_wait_times(self, lower_thresh, upper_thresh):

        ride_data_lower = self.results[(self.results[self.rideCol]==1) & (self.results["YEAR"]>=2018) &
                                (self.results["PREDICTED_WAIT"]>=0) & 
                                   (self.results["HOUROFDAY"]>6)].groupby("HOUROFDAY")["PREDICTED_WAIT"].quantile(lower_thresh)

        ride_data_upper = self.results[(self.results[self.rideCol]==1) & (self.results["YEAR"]>=2018) &
                                (self.results["PREDICTED_WAIT"]>=0) & 
                                   (self.results["HOUROFDAY"]>6)].groupby("HOUROFDAY")["PREDICTED_WAIT"].quantile(upper_thresh)

        chartData = pd.concat([ride_data_lower, ride_data_upper], axis=1).reset_index()
        chartData.columns = ["HOUROFDAY", "LOWER_QUANTILE", "UPPER_QUANTILE"]

        predicted_down = self.results[(self.results[self.rideCol]==1) & (self.results["YEAR"]>=2018) &
                            (self.results["PREDICTED_WAIT"]<0) & (self.results["HOUROFDAY"]>6)].groupby("HOUROFDAY")["PREDICTED_WAIT"].count()


        number_waits = self.results[(self.results[self.rideCol]==1) & (self.results["YEAR"]>=2018) &
                                    (self.results["HOUROFDAY"]>6)].groupby("HOUROFDAY")["PREDICTED_WAIT"].count()

        predicted_down_pct = pd.DataFrame(predicted_down/number_waits)
        predicted_down_pct["PREDICTED_WAIT"] = predicted_down_pct["PREDICTED_WAIT"].apply(lambda x: x*100)
        predicted_down_pct = predicted_down_pct.rename(columns={"PREDICTED_WAIT":"PREDICTED PROB DOWN"}).reset_index()

        chartData["PREDICTED PROB DOWN"] = predicted_down_pct["PREDICTED PROB DOWN"]

        base = alt.Chart(chartData).encode(
            alt.X('HOUROFDAY', axis=alt.Axis(title=None))
        )
        wait_time_area = base.mark_area(opacity=0.3, color='#90C6FA').encode(
            alt.Y('LOWER_QUANTILE:Q',
                  axis=alt.Axis(title=['Predicted Wait Times by Hour of Day', f'({int(lower_thresh*100)} to {int(upper_thresh*100)} Percentile)'], 
                                titleColor='#90C6FA')),
            alt.Y2('UPPER_QUANTILE:Q')
        )

        pred_down_line = base.mark_line(stroke='#EAC2B1', interpolate='monotone').encode(
            alt.Y('PREDICTED PROB DOWN:Q',
                  axis=alt.Axis(title='Predicted Probability Ride Will Be Down', titleColor='#EAC2B1'))
        )

        final_chart = alt.layer(wait_time_area, pred_down_line).resolve_scale(
            y = 'independent'
        ).properties(
                    title={
                    "text": ["Expected Wait Time vs. Expected % of Times the Ride will be down"],
                      "subtitle": [self.rideName]
                    })
            
        return final_chart
        
        
    def visualize_correlations(self):
        rideData = self.results[(self.results[self.rideCol]==1)]

        # Create correlation matrix
        corr_mat = rideData.corr(method='pearson')

        # Convert correlation matrix to 1-D Series and sort
        sorted_mat = corr_mat.unstack().sort_values()

        ride_corr = pd.DataFrame(sorted_mat).reset_index().rename(columns={"level_0":"Variable1", "level_1":"Variable2", 0:"Correlation"}) 
        pred_wait_corr = ride_corr[(ride_corr["Variable1"]=="PREDICTED_WAIT") & (ride_corr["Variable2"]!="PREDICTED_WAIT") &
                                 ((ride_corr["Correlation"]>=self.corrThreshold)|(ride_corr["Correlation"]<=-self.corrThreshold))]

        pred_wait_corr = pred_wait_corr.sort_values(by="Correlation")

        bars = alt.Chart(pred_wait_corr).mark_bar().encode(
            x=alt.X('Variable2:N', sort='y', title="Correlated Variable"),
            y='Correlation:Q',
            color=alt.condition(
                alt.datum.Correlation > 0,
                alt.value("#EAC2B1"),  # The positive color
                alt.value("#2C7AAF")  # The negative color
            )
        ).properties(
            title={
            "text": ["Top Correlations Between Predicted Wait Time & Features"],
              "subtitle": [self.rideName]
            }
        ).configure_axis(labelAngle=30)

        return bars

In [96]:
rideCols = [col for col in all_test.columns if col.startswith("Ride_name")]

rideResults = []
for ride in rideCols:
    match = re.match("Ride_name_(.+)$", ride)
    rideCol  = ride
    rideName = match.group(1).title()
    
    rideResults.append(RideResults(rideCol, rideName, all_test))

### Astro Orbiter


It seems Astro Orbiter wait times are highly influenced by capacity, age of ride, and presence of night time shows, especially at Magic Kingdom. This means that over time Astro Orbiter wait times have gone down but may spike on days when there is a nighttime show or parade happening at Magic Kingdom.


Astro Orbiter wait times are pretty consistent from park open to park close, however the probability that the ride will be down steadily drops to be at its minimum by 3:00PM.

In [97]:
astro_orbiter = rideResults[0].visualize_wait_times(.45, .55)
save(astro_orbiter, "../reports/figures/AstroOrbiterWaitTimes.html")
astro_orbiter

### Big Thunder Mountain Railroad


This ride is predicted to be down significantly less after 8PM with wait times remaining fairly consistent. Wait times do hit a noticeable low around lunch time so we'd recommend either hitting BTM around noon or after dark.

The wait times are highly correlated with capacity metrics & whether there is a show/parade at Magic Kingdom.

In [100]:
btm = rideResults[1].visualize_wait_times(.45, .55)
save(btm, "../reports/figures/BigThunderMountainWaitTimes.html")
btm

### Buzz Lightyear's Space Ranger Spin

The wait times for this ride are fairly consistent. Fit it in anytime you have some free time in your day!

In [102]:
buzz = rideResults[2].visualize_wait_times(.45, .55)
save(btm, "../reports/figures/BuzzLightyearSpaceRangerSpin.html")
buzz

### Dumbo the Flying Elephant

The wait times for this ride are fairly consistent. Fit it in anytime you have some free time in your day!

In [104]:
dumbo = rideResults[3].visualize_wait_times(.45, .55)
save(dumbo, "../reports/figures/DumboWaitTimes.html")
dumbo

### Haunted Mansion

Haunted mansion wait times and probability of being down drop around 3PM. This is a great time to hop on quickly!

In [106]:
haunted_mansion = rideResults[4].visualize_wait_times(.45, .55)
save(haunted_mansion, "../reports/figures/HauntedMansionWaitTimes.html")
haunted_mansion

### It's a Small World

In [108]:
small_world = rideResults[5].visualize_wait_times(.45, .55)
save(small_world, "../reports/figures/ItsaSmallWorldWaitTimes.html")
small_world

### Jungle Cruise

In [110]:
jungle_cruise = rideResults[6].visualize_wait_times(.45, .55)
save(jungle_cruise, "../reports/figures/JungleCruiseWaitTimes.html")
jungle_cruise

### Mad Tea Party

In [112]:
mad_tea_party = rideResults[7].visualize_wait_times(.45, .55)
save(mad_tea_party, "../reports/figures/MadTeamPartyWaitTimes.html")
mad_tea_party

### Peter Pan

In [113]:
peter_pan = rideResults[8].visualize_wait_times(.45, .55)
save(peter_pan, "../reports/figures/PeterPanWaitTimes.html")
peter_pan

### Pirates of the Caribbean

In [114]:
pirates = rideResults[9].visualize_wait_times(.45, .55)
save(pirates, "../reports/figures/PiratesWaitTimes.html")
pirates

### Prince Charming's Regal Carrousel


In [115]:
carousel = rideResults[10].visualize_wait_times(.45, .55)
save(carousel, "../reports/figures/CarouselWaitTimes.html")
carousel

### Seven Dwarfs' Mine Train

In [116]:
seven_dwarfs = rideResults[11].visualize_wait_times(.45, .55)
save(seven_dwarfs, "../reports/figures/7DwarfsWaitTimes.html")
seven_dwarfs

### Space Mountain

In [117]:
space_mtn = rideResults[12].visualize_wait_times(.45, .55)
save(space_mtn, "../reports/figures/SpaceMountainWaitTimes.html")
space_mtn

### Splash Mountain

In [118]:
splash_mtn = rideResults[13].visualize_wait_times(.45, .55)
save(splash_mtn, "../reports/figures/SplashMountainWaitTimes.html")
splash_mtn

### Barnstormer

In [119]:
barnstormer = rideResults[14].visualize_wait_times(.45, .55)
save(barnstormer, "../reports/figures/BarnstormerWaitTimes.html")
barnstormer

### Magic Carpets of Aladdin

In [120]:
aladdin = rideResults[15].visualize_wait_times(.45, .55)
save(aladdin, "../reports/figures/AladdinWaitTimes.html")
aladdin

### Many Adventures of Winnie the Pooh

In [121]:
pooh = rideResults[16].visualize_wait_times(.45, .55)
save(pooh, "../reports/figures/WinnieThePoohWaitTimes.html")
pooh

### Tomorrowland Speedway

In [122]:
speedway = rideResults[17].visualize_wait_times(.45, .55)
save(speedway, "../reports/figures/TomorrowlandSpeedwayWaitTimes.html")
speedway

### People Mover

In [123]:
people_mover = rideResults[18].visualize_wait_times(.45, .55)
save(people_mover, "../reports/figures/PeopleMoverWaitTimes.html")
people_mover

### Carousel of Progress

In [124]:
carousel_of_progress = rideResults[19].visualize_wait_times(.45, .55)
save(carousel_of_progress, "../reports/figures/CarouselofProgressWaitTimes.html")
carousel_of_progress