In [1]:
import warnings
warnings.filterwarnings('ignore')

import importlib.util
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold

import numpy as np
import pandas as pd

In [2]:
# Columns that hold clock times as "HH:MM" strings. Each one is later replaced
# by an integer "<column>_HOUR" feature. The order is exactly the original one.
parse_times = [
    # MK* park-hours columns
    "MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
    "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM", "MKCLOSETOM",
    # EP* park-hours columns
    "EPOPEN", "EPCLOSE", "EPEMHOPEN", "EPEMHCLOSE",
    "EPOPENYEST", "EPCLOSEYEST", "EPOPENTOM", "EPCLOSETOM",
    # HS* park-hours columns
    "HSOPEN", "HSCLOSE", "HSEMHOPEN", "HSEMHCLOSE",
    "HSOPENYEST", "HSCLOSEYEST", "HSOPENTOM", "HSCLOSETOM",
    # AK* park-hours columns (AKEMHCLOSE intentionally kept in its original slot)
    "AKOPEN", "AKCLOSE", "AKEMHOPEN", "AKOPENYEST",
    "AKCLOSEYEST", "AKEMHCLOSE", "AKOPENTOM", "AKCLOSETOM",
    # Event-time columns (names suggest parades, fireworks and shows --
    # TODO confirm against the data dictionary)
    "MKPRDDT1", "MKPRDDT2", "MKPRDNT1", "MKPRDNT2",
    "MKFIRET1", "MKFIRET2",
    "EPFIRET1", "EPFIRET2",
    "HSPRDDT1", "HSFIRET1", "HSFIRET2", "HSSHWNT1", "HSSHWNT2",
    "AKPRDDT1", "AKPRDDT2", "AKSHWNT1", "AKSHWNT2",
]

# Load posted wait time datasets

In [3]:
# Describe the helper module by name and by its file path; the relative path is
# resolved against the notebook's working directory.
spec = importlib.util.spec_from_file_location(
    "loadTrainTestPostedWaitTimes",
    "../src/data/loadTrainTestData.py",
)

# Create an empty module object from that spec, then execute the file's code
# inside the module's own namespace.
loadTrainPosted = importlib.util.module_from_spec(spec)
spec.loader.exec_module(loadTrainPosted)

# The loader hands back four objects: features and targets for train and test.
(X_train, X_test,
 y_train, y_test) = loadTrainPosted.loadTrainTestPostedWaitTimes()

### Convert key data points from date to integer

In [4]:
# Derive integer calendar features from the date/datetime columns, applying the
# identical conversions to the training frame and then to the test frame.
for frame in (X_train, X_test):
    day = frame["date"].dt
    frame["MONTHOFYEAR"] = day.month.astype("Int8")
    frame["YEAR"] = day.year.astype("Int16")
    frame["DAYOFYEAR"] = day.dayofyear.astype("Int16")
    frame["HOUROFDAY"] = frame["datetime"].dt.hour.astype("Int8")

### Sort by datetime before imputation (keeping y-values associated)

In [14]:
# Glue each target onto its feature frame so the pairing survives the sort,
# then order every split chronologically by its "datetime" column.
train = pd.concat((X_train, y_train), axis="columns").sort_values(by="datetime")
test = pd.concat((X_test, y_test), axis="columns").sort_values(by="datetime")

In [17]:
# Separate the time-sorted frames again: "POSTED_WAIT" is the target, every
# other column is a feature that goes on to imputation.
y_train = train["POSTED_WAIT"]
X_train_impute = train.drop("POSTED_WAIT", axis="columns")

y_test = test["POSTED_WAIT"]
X_test_impute = test.drop("POSTED_WAIT", axis="columns")

### Many open/close times, parade times, etc. are in HH:MM format. 

Convert to integer hour & fill nulls with 99.

This means that particular event does not exist for that day (e.g., Magic Kingdom doesn't have a second parade).

In [18]:
# Replace every "HH:MM" time column of the training features with an integer
# "<column>_HOUR" column.
for col in parse_times:
    # A missing time means the event does not take place that day; it is
    # encoded with the sentinel hour "99".
    times = X_train_impute[col].fillna("99")
    # Take the leading one or two digits as the hour. The earlier check
    # `x[0] != 0` compared a character with the integer 0, so it was always
    # true: a single-digit hour such as "9:00" was sliced to "9:" and int()
    # rejected it. int() already discards a leading zero ("09" -> 9).
    hours = times.str.extract(r"^\s*(\d{1,2})", expand=False)
    X_train_impute[f"{col}_HOUR"] = hours.astype(int).astype("Int8")
    # The raw string column is no longer needed once the hour is extracted.
    X_train_impute = X_train_impute.drop(columns=col)

In [19]:
# Replace every "HH:MM" time column of the test features with an integer
# "<column>_HOUR" column.
for col in parse_times:
    # A missing time means the event does not take place that day; it is
    # encoded with the sentinel hour "99".
    times = X_test_impute[col].fillna("99")
    # Take the leading one or two digits as the hour. The earlier check
    # `x[0] != 0` compared a character with the integer 0, so it was always
    # true: a single-digit hour such as "9:00" was sliced to "9:" and int()
    # rejected it. int() already discards a leading zero ("09" -> 9).
    hours = times.str.extract(r"^\s*(\d{1,2})", expand=False)
    X_test_impute[f"{col}_HOUR"] = hours.astype(int).astype("Int8")
    # The raw string column is no longer needed once the hour is extracted.
    X_test_impute = X_test_impute.drop(columns=col)

In [20]:
# Fill the remaining gaps in the training features, column by column, and print
# the name of every column that needed filling.
for col in X_train_impute.columns:
    column = X_train_impute[col]

    if column.isnull().any():
        print(col)
        # Back-fill: each gap takes the next non-null value further down the
        # frame. `.bfill()` replaces the deprecated `fillna(method='bfill')`.
        filled = column.bfill()

        # Gaps at the very end have nothing after them to copy from, so they
        # fall back to the median of the back-filled column.
        if filled.isnull().any():
            filled = filled.fillna(filled.median())

        # Assign the result back explicitly. Calling
        # `X_train_impute[col].fillna(..., inplace=True)` acts on a temporary
        # Series and leaves the frame untouched under pandas Copy-on-Write.
        X_train_impute[col] = filled

WDWMAXTEMP
WDWMINTEMP
WDWMEANTEMP
inSession
inSession_Enrollment
inSession_wdw
inSession_dlr
inSession_sqrt_WDW
inSession_sqrt_DLR
inSession_California
inSession_DC
inSession_Central_FL
inSession_Drive1_FL
inSession_Drive2_FL
inSession_Drive_CA
inSession_Florida
inSession_Mardi_Gras
inSession_Midwest
inSession_NY_NJ
inSession_NY_NJ_PA
inSession_New_England
inSession_New_Jersey
inSession_Nothwest
INSESSION_PLANES
inSession_SoCal
inSession_Southwest


In [21]:
# Fill the remaining gaps in the test features, column by column, and print the
# name of every column that needed filling.
for col in X_test_impute.columns:
    column = X_test_impute[col]

    if column.isnull().any():
        print(col)
        # Back-fill: each gap takes the next non-null value further down the
        # frame. `.bfill()` replaces the deprecated `fillna(method='bfill')`.
        filled = column.bfill()

        # Gaps at the very end have nothing after them to copy from, so they
        # fall back to the median of the back-filled column.
        # NOTE(review): this is the test split's own median; consider reusing
        # the training median so no test statistics are used -- confirm intent.
        if filled.isnull().any():
            filled = filled.fillna(filled.median())

        # Assign the result back explicitly. Calling
        # `X_test_impute[col].fillna(..., inplace=True)` acts on a temporary
        # Series and leaves the frame untouched under pandas Copy-on-Write.
        X_test_impute[col] = filled

WDWMAXTEMP
WDWMINTEMP
WDWMEANTEMP
inSession
inSession_Enrollment
inSession_wdw
inSession_dlr
inSession_sqrt_WDW
inSession_sqrt_DLR
inSession_California
inSession_DC
inSession_Central_FL
inSession_Drive1_FL
inSession_Drive2_FL
inSession_Drive_CA
inSession_Florida
inSession_Mardi_Gras
inSession_Midwest
inSession_NY_NJ
inSession_NY_NJ_PA
inSession_New_England
inSession_New_Jersey
inSession_Nothwest
INSESSION_PLANES
inSession_SoCal
inSession_Southwest


In [22]:
# Remove the raw 'date' / 'datetime' columns and the 'Unnamed: 0' column from
# both imputed frames; the results are the frames used for modelling.
X_train_encoded, X_test_encoded = (
    frame.drop(columns=['date', 'datetime', 'Unnamed: 0'])
    for frame in (X_train_impute, X_test_impute)
)

In [23]:
# The imputed frames have been copied into the *_encoded frames; release them.
del X_train_impute
del X_test_impute

In [24]:
# Work on the boolean columns of the training features only.
bool_features = X_train_encoded.select_dtypes(include=['bool']).reset_index(drop=True)

# Removing both constant and quasi-constant columns (variance <= 0.001).
selector = VarianceThreshold(threshold=0.001).fit(bool_features)

# Names of the columns the selector rejected, in their original order.
concol = list(bool_features.columns[~selector.get_support()])

# The temporaries are not needed beyond this point.
del selector, bool_features

# "Weather Type" is never dropped, even if the selector rejected it.
if "Weather Type" in concol:
    concol.remove("Weather Type")

print("DROPPING BOOL: ", concol)
# Drop the same columns from both splits so their layouts stay identical.
X_train_encoded = X_train_encoded.drop(columns=concol)
X_test_encoded = X_test_encoded.drop(columns=concol)

DROPPING BOOL:  ['HOLIDAYN_ash|val', 'HOLIDAYN_chv|pas', 'HOLIDAYN_cmd|han', 'HOLIDAYN_col|suk', 'HOLIDAYN_hal|nvd', 'HOLIDAYN_njc|vet', 'MKeventN_dah|emm', 'HSeventN_wdwsotf', "HSFIREN_disney's hollywood studios special july 4th fireworks presentation", "HSFIREN_new year's eve fireworks", 'Wind Speed Quality_a', 'Wind Speed Quality_p', 'Wind Speed Quality_passed gross limits check if element is present', 'Cloud Quality Code_erroneous, data originate from an ncei data source', 'Cloud Determination Code_statistically derived', 'Visibiliy Quality Code_p', 'Visibility Variability Code_variable', 'Temperature Quality Code_suspect, data originate from an ncei data source']


In [25]:
# Scaler that maps every numeric feature onto the [0, 1] range seen in training.
min_max_scaler = MinMaxScaler()

# The numeric columns of the training set define what gets scaled.
X_dtype_train = X_train_encoded.select_dtypes(include=[np.number]).reset_index(drop=True)
num_cols = list(X_dtype_train.columns)

# Take exactly the same columns, in the same order, from the test set.
# Running a second, independent select_dtypes on the test frame can return a
# different column set or order when a dtype differs between the splits, which
# would misalign the scaler input and the later `X_test_encoded[num_cols]`
# assignment.
X_dtype_test = X_test_encoded[num_cols].reset_index(drop=True)

In [26]:
# Learn the per-column min/max from the training data only, then apply that
# same scaling to both splits.
X_train_norm = min_max_scaler.fit(X_dtype_train).transform(X_dtype_train)
X_test_norm = min_max_scaler.transform(X_dtype_test)

# Write the scaled values back over the original numeric columns.
X_test_encoded[num_cols] = X_test_norm
X_train_encoded[num_cols] = X_train_norm

In [27]:
# Release the scaling intermediates; the scaled values now live in *_encoded.
del X_train_norm, X_test_norm
del X_dtype_train, X_dtype_test

In [None]:
# Ridge regression whose regularisation strength is chosen by RidgeCV's
# built-in cross-validation from the three candidate alphas; fitted on the
# training split only.
linRidge = RidgeCV(alphas=[1e-3, 1e-1, 1])
linRidge.fit(X_train_encoded, y_train)

In [None]:
# Report the fitted model: intercept, R-squared on each split, and the number
# of coefficients that are not exactly zero.
print('Disney dataset')
print(f'ridge regression linear model intercept: {linRidge.intercept_}')
print(f'R-squared score (training): {linRidge.score(X_train_encoded, y_train):.3f}')
print(f'R-squared score (test): {linRidge.score(X_test_encoded, y_test):.3f}')
print(f'Number of non-zero features: {np.sum(linRidge.coef_ != 0)}')

In [None]:
# Stack the train and test splits (train rows first) into one feature frame and
# one target series covering all available data.
X = pd.concat((X_train_encoded, X_test_encoded), axis=0)
y = pd.concat((y_train, y_test), axis=0)

In [None]:
# Refit the same RidgeCV configuration, this time on the combined data set.
linRidge = RidgeCV(alphas=[1e-3, 1e-1, 1])
linRidge.fit(X, y)

In [None]:
# Estimate the mean absolute error with repeated K-fold cross-validation.
# This cell previously used `scores`, `absolute`, `mean` and `std` without ever
# defining or importing them, so it raised NameError on a fresh kernel.
# NOTE(review): the fold settings below are a reviewer's choice -- tune as needed.
cv = RepeatedKFold(n_splits=5, n_repeats=3, random_state=1)

# Evaluate a plain Ridge model at the alpha that RidgeCV selected on the full data.
scores = cross_val_score(
    Ridge(alpha=linRidge.alpha_),
    X,
    y,
    scoring='neg_mean_absolute_error',
    cv=cv,
)

# force scores to be positive (scikit-learn reports MAE negated so that
# "greater is better" holds for every scorer)
scores = np.absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))