In [1]:
import pandas as pd 
import numpy as np
import datetime
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer

In [2]:
metadata = pd.read_csv("../Project-Disney-World-DSF/metadata.csv")
hauntedhouse = pd.read_csv("../Project-Disney-World-DSF/haunted_mansion[87].csv")

**Preparing Haunted House Data**

In [3]:
#compute percentage of na-values per column; result: 96.59% of actual waiting times are na
hauntedhouse.isna().mean()

#extract and drop SACTMIN column from Df
act_times = hauntedhouse.copy()["SACTMIN"]
hauntedhouse = hauntedhouse.drop(["SACTMIN"], axis=1)
hauntedhouse.shape

(319956, 3)

In [4]:
#format column date
hauntedhouse['date'] = pd.to_datetime(hauntedhouse['date'])
#format column datetime
hauntedhouse['datetime'] = pd.to_datetime(hauntedhouse['datetime'])


Handling Missing Data in Haunted House

In [5]:
#missing values are coded as -999; dataset does NOT include observations during covid-19 closure time
#imputation with ...

hauntedhouse.loc[hauntedhouse['SPOSTMIN'] == -999,'SPOSTMIN'] = np.nan
hauntedhouse["SPOSTMIN"] = hauntedhouse["SPOSTMIN"].fillna(method='bfill')
hauntedhouse["SPOSTMIN"] = hauntedhouse["SPOSTMIN"].fillna(hauntedhouse["SPOSTMIN"].median())
                
hauntedhouse    

Unnamed: 0,date,datetime,SPOSTMIN
0,2015-01-01,2015-01-01 08:23:09,10.0
1,2015-01-01,2015-01-01 08:37:13,10.0
2,2015-01-01,2015-01-01 08:37:31,10.0
3,2015-01-01,2015-01-01 08:44:11,10.0
4,2015-01-01,2015-01-01 08:51:12,10.0
...,...,...,...
319951,2021-12-28,2021-12-28 22:36:08,13.0
319952,2021-12-28,2021-12-28 22:42:15,13.0
319953,2021-12-28,2021-12-28 22:48:12,13.0
319954,2021-12-28,2021-12-28 22:54:10,13.0


**Preparing Metadata**

In [6]:
#looking at percentage of na-values per column
metadata.isna().mean().sort_values(ascending=False).head(20)
#metadata.shape

AKFIREN      1.000000
AKPRDDN      1.000000
AKPRDDT2     1.000000
AKPRDDT1     1.000000
HSPRDDN      1.000000
HSPRDDT1     1.000000
EPFIRET2     0.997595
MKFIRET2     0.995190
HSFIRET2     0.983646
HOLIDAYJ     0.979798
AKeventN     0.966811
WDWRaceN     0.958153
HSeventN     0.902838
HOLIDAYN     0.897066
MKPRDDT2     0.890332
WDWeventN    0.881193
MKeventN     0.743627
MKPRDNT2     0.711881
HSSHWNT2     0.658490
MKPRDNN      0.653199
dtype: float64

In [7]:
#remove variables related to Hollywood Studios Park in California (and not Walt Disney World in Florida)
metadata.columns.str.startswith('HS').sum() 
metadata.columns.str.endswith('_HS').sum() 

metadata = metadata.loc[:, ~metadata.columns.str.startswith('HS')]
metadata = metadata.loc[:, ~metadata.columns.str.endswith('_HS')]

metadata.shape

(2079, 145)

In [8]:
#filter out columns with all na-values
metadata.dropna(axis=1, how='all', inplace=True) #-> 6

#format date
metadata['DATE'] = pd.to_datetime(metadata['DATE'])


metadata.shape

(2079, 141)

In [9]:
#function that deals with string percentage values for columns that contain percentage of schools in session
def str_percent_to_float(dataframe):
    for col in dataframe.columns:
        if col.lower().startswith('insession'):
            dataframe[col] = dataframe[col].str.rstrip("%").astype(float)/100
            
str_percent_to_float(metadata)
metadata

Unnamed: 0,DATE,WDW_TICKET_SEASON,DAYOFWEEK,DAYOFYEAR,WEEKOFYEAR,MONTHOFYEAR,YEAR,SEASON,HOLIDAYPX,HOLIDAYM,...,MKFIREN,EPFIREWK,EPFIRET1,EPFIRET2,EPFIREN,AKPRDDAY,AKSHWNGT,AKSHWNT1,AKSHWNT2,AKSHWNN
0,2015-01-01,,5,0,0,1,2015,CHRISTMAS PEAK,0,5,...,Wishes Nighttime Spectacular,1,21:00,,IllumiNations: Reflections of Earth,0,0,,,
1,2015-01-02,,6,1,0,1,2015,CHRISTMAS,2,5,...,Wishes Nighttime Spectacular,1,21:00,,IllumiNations: Reflections of Earth,0,0,,,
2,2015-01-03,,7,2,0,1,2015,CHRISTMAS,3,0,...,Wishes Nighttime Spectacular,1,21:00,,IllumiNations: Reflections of Earth,0,0,,,
3,2015-01-04,,1,3,1,1,2015,CHRISTMAS,4,0,...,Wishes Nighttime Spectacular,1,21:00,,IllumiNations: Reflections of Earth,0,0,,,
4,2015-01-05,,2,4,1,1,2015,CHRISTMAS,5,0,...,Wishes Nighttime Spectacular,1,21:00,,IllumiNations: Reflections of Earth,0,0,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2074,2021-08-27,,6,238,34,8,2021,,11,0,...,Happily Ever After,1,22:00,,Epcot Forever,0,0,,,
2075,2021-08-28,,7,239,34,8,2021,,10,0,...,Happily Ever After,1,22:00,,Epcot Forever,0,0,,,
2076,2021-08-29,,1,240,35,8,2021,,9,0,...,Happily Ever After,1,22:00,,Epcot Forever,0,0,,,
2077,2021-08-30,,2,241,35,8,2021,,8,0,...,Happily Ever After,1,21:00,,Epcot Forever,0,0,,,


In [10]:
#function that deals with string times of form '9:00' or '25:00'; converts to hours since midnight (float) for consistency and usability
sww = ["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE", "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM", "MKCLOSETOM", "EPOPEN", "EPCLOSE", "EPEMHOPEN",
"EPEMHCLOSE", "EPOPENYEST", "EPCLOSEYEST", "EPOPENTOM", "EPCLOSETOM", "AKOPEN", "AKCLOSE", "AKEMHOPEN", "AKEMHCLOSE", "AKOPENYEST", "AKCLOSEYEST",
"AKOPENTOM", "AKCLOSETOM", "MKPRDDT1", "MKPRDDT2", "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2", "EPFIRET1", "EPFIRET2", "AKSHWNT1", "AKSHWNT2"]

for col in sww:
    metadata[col].fillna("99", inplace=True)  #to indicate outliers

metadata["MKCLOSE"][0]

'25:00'

In [11]:
def format_times(x):
    if len(x)==4:
        time = '0'+ x
    elif len(x)==5 and x > '24:00':
        hour = int(x[:2])-24
        minute = x[-2:]
        time = '0' + str(hour) + ':' + minute
    elif x == '24:00':
        time = '00:00'
    else:
        time = x
    return time

def str_times_to_numerical(dataframe):
    for col in sww:
        dataframe[col] = dataframe[col].apply(format_times)
        dataframe[col] = dataframe[col].apply(lambda y: y.rstrip(':'))
        dataframe[col] = dataframe[col].apply(lambda x: (float(x[:2])+(float(x[-2:])/60)) if x[0] != 0 else (float(x[1])+(float(x[-2:])/60)))

str_times_to_numerical(metadata)


In [12]:
#function for filling missing values
def imputation(dataframe):
        for col in dataframe.columns:
                dataframe[col] = dataframe[col].fillna(method='bfill')
                dataframe[col] = dataframe[col].fillna(dataframe[col].median())
        return dataframe

In [13]:
#one-hot encoding of categorical features
categorical_features = ["WDW_TICKET_SEASON", "SEASON", "HOLIDAYN", "WDWTICKETSEASON", "WDWRaceN", "WDWeventN", "WDWSEASON", "MKeventN", "EPeventN", "AKeventN", "HOLIDAYJ", "MKPRDDN", "MKPRDNN", "MKFIREN", "EPFIREN", "AKSHWNN"]

transformer = make_column_transformer(
    (OneHotEncoder(), categorical_features),
    remainder='passthrough')

transformed = transformer.fit_transform(metadata)
encoded_metadata = pd.DataFrame(transformed, columns=transformer.get_feature_names())
#new name of columns for i encoded column: 'onehotencoder__xi_oldcategoryname'

encoded_metadata.shape



(2079, 277)

Merge datasets

In [14]:
#change name of "DATE" column in metadata to fit with Haunted House 
encoded_metadata.rename(columns={"DATE":"date"}, inplace=True)

In [15]:
#merge metadata and waiting time data 
waittimes = pd.merge(hauntedhouse, encoded_metadata, how='left', on='date')

In [16]:
# brauchte noch diese Linie, dass funktioniert für mich (Annina)
waittimes["datetime"] = pd.to_datetime(waittimes.datetime, format = '%Y-%m-%d %H:%M:%S')

In [17]:
#Formatting of dates and times 
#drop date column due to redundancy
waittimes = waittimes.loc[:, waittimes.columns != "date"]

#create two new variables for hour and minute (day, month and year already included)
waittimes["HOUROFDAY"] = waittimes.copy()['datetime'].dt.hour
waittimes["MINUTEOFHOUR"] = waittimes.copy()['datetime'].dt.minute
#then drop datetime column
waittimes = waittimes.loc[:, waittimes.columns != 'datetime']


Feature Selection

In [19]:
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from math import sqrt

In [20]:
waittimes = imputation(waittimes)

Pearson correlation (linear regression)
-> problem: only for linear relations

In [21]:
from sklearn.feature_selection import r_regression, SelectKBest, f_regression, SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

X = waittimes.loc[:, waittimes.columns != "SPOSTMIN"]
y = waittimes['SPOSTMIN']
#standardize X
scaled_features = StandardScaler().fit_transform(X.values)
# apply Pearson correlation on standardized features
X_selection = SelectKBest(score_func=f_regression, k=200).fit_transform(scaled_features, y)

print("After selecting best 200 features:", X_selection.shape) 

After selecting best 200 features: (319956, 200)


In [22]:
features = list(waittimes.columns)
# standardizing the selected features to find column names
selector = SelectKBest(score_func=f_regression, k=200).fit(X_selection, y)
mask = selector.get_support()
# new list which will contain all kept K features

new_features = [] 
for bool, feature in zip(mask, features):
    if bool:
        new_features.append(feature)

#new_features

In [23]:
# applying linear regression to selected K features
ols_pearson = LinearRegression() 

x_train, x_test, y_train, y_test = train_test_split(X_selection, y, test_size = 0.25)
ols_pearson.fit(x_test, y_test)
y_predols= ols_pearson.predict(x_test)

# unfortunately r2 value is worse than linear regression without feature selection
r2 = ols_pearson.score(x_test, y_test)
print("R_squared: " + str(r2))

R_squared: 0.21913276658510172


In [24]:
print("MAE: " + str(metrics.mean_absolute_error(y_test, y_predols)))
print("MSE: " + str(metrics.mean_squared_error(y_test, y_predols)))

MAE: 13.196943598024115
MSE: 272.29900126313487


Spearman's rank correlation (Spearman's Rho)

-> works for non-linear relations, measures the monotonic relation between a pair of variables

In [21]:
from scipy import stats

X = waittimes.loc[:, waittimes.columns != "SPOSTMIN"]
y = waittimes['SPOSTMIN']

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
# applying Spearman correlation to data
rho_matrix = x_train.corr(method="spearman")
# print(rho_matrix)
# prints the rho coefficient for every feature in correlation with another

In [22]:
# function to get the names of features with a correlation over a chosen threshold
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr

# n = chosen threshold
n = 0.85
corr_features = correlation(x_train, n)
print(str(len(set(corr_features))) + " have a correlation over " + str(n))
#print(corr_features)

100 have a correlation over 0.85


In [23]:
# dropping the features with correlation over the chosen threshold
x_train.drop(corr_features,axis=1)
x_test.drop(corr_features,axis=1)

Unnamed: 0,onehotencoder__x0_peak,onehotencoder__x0_regular,onehotencoder__x0_value,onehotencoder__x0_nan,onehotencoder__x1_CHRISTMAS,onehotencoder__x1_CHRISTMAS PEAK,onehotencoder__x1_COLUMBUS DAY,onehotencoder__x1_EASTER,onehotencoder__x1_FALL,onehotencoder__x1_HALLOWEEN,...,WEATHER_WDWHIGH,WEATHER_WDWPRECIP,CapacityLost_MK,CapacityLost_AK,MKPRDDAY,MKPRDDT2,MKFIRET2,AKPRDDAY,HOUROFDAY,MINUTEOFHOUR
137081,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.9,0.12,437257.0,220778.0,1.0,100.65,100.65,0.0,17,45
134976,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,82.8,0.08,437257.0,220778.0,1.0,100.65,100.65,0.0,14,49
215789,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,82.7,0.07,463554.0,263674.0,1.0,100.65,100.65,0.0,20,42
49737,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,77.2,0.08,392861.0,210779.0,3.0,20.25,100.65,0.0,12,56
263029,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,72.3,0.12,433057.0,231777.0,0.0,100.65,100.65,0.0,14,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
93934,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,76.5,0.08,422858.0,220778.0,1.0,100.65,100.65,0.0,20,56
2612,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,70.7,0.08,354065.0,210779.0,1.0,100.65,100.65,0.0,14,54
297241,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,84.3,0.12,433857.0,231777.0,1.0,100.65,100.65,0.0,14,40
228980,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,92.2,0.24,463554.0,232777.0,1.0,100.65,100.65,0.0,11,16


In [28]:
# applying linear regression to selected features
ols_spearman = LinearRegression() 

ols_spearman.fit(x_test, y_test)
y_predols= ols_spearman.predict(x_test)

# unfortunately r2 value is the same as linear regression without feature selection
r2 = ols_spearman.score(x_test, y_test)
print("R_squared: " + str(r2))

R_squared: 0.22153519914862874


In [24]:
# applying random forest to selected features
rf = RandomForestRegressor(n_estimators=10, max_depth=50, n_jobs=-1, random_state=42)
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)

# r2 value is the same as random forest without feature selection
print("RMSE: " + str(round(sqrt(mean_squared_error(y_test, y_pred)), 2)))
print("R_squared: " + str(round(r2_score(y_test, y_pred), 2)))

RMSE: 6.95
R_squared: 0.86


Random Forest

In [25]:
X = waittimes.loc[:, waittimes.columns != "SPOSTMIN"]
y = np.array(waittimes["SPOSTMIN"])
X.shape, y.shape

((319956, 278), (319956,))

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [27]:
#Random Forest
#does not accept NaN?
# Wieso classifier genutzt? Ist doch ein regression problem?...
rf = RandomForestClassifier(n_estimators=10, max_depth=50, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("RMSE: " + str(round(sqrt(mean_squared_error(y_test, y_pred)), 2)))
print("R_squared: " + str(round(r2_score(y_test, y_pred), 2)))

RMSE: 10.67
R_squared: 0.68


Random Forest (Regression)

In [21]:
X = waittimes.loc[:, waittimes.columns != "SPOSTMIN"]
y = np.array(waittimes["SPOSTMIN"])
X.shape, y.shape

((319956, 278), (319956,))

In [23]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [24]:
#Random Forest
rf = RandomForestRegressor(n_estimators=10, max_depth=50, n_jobs=-1, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("RMSE: " + str(round(sqrt(mean_squared_error(y_test, y_pred)), 2)))
print("R_squared: " + str(round(r2_score(y_test, y_pred), 2)))

RMSE: 7.09
R_squared: 0.86


Random Forest Cross Validation

In [25]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix

scores = cross_val_score(rf, X_train, y_train, cv=10, scoring='roc_auc')

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 352, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/Library/Frameworks/Python.framework/Versions/3.10/lib/python3.10/site-p

KeyboardInterrupt: 

In [None]:
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

In [26]:
from sklearn.model_selection import cross_validate
from sklearn.svm import SVC

clf = SVC()
scoring = {'acc': 'accuracy',
           'prec_macro': 'precision_macro',
           'rec_micro': 'recall_macro'}
scores = cross_validate(clf, X_train, y_train, scoring=scoring,
                         cv=5, return_train_score=True)
print(scores.keys())
print(scores['test_acc'])  



Regression (OLS)

In [34]:
from sklearn import metrics

In [35]:
# standardization doesn't work on datetime format, tried to standardize while excluding them but didn't work
datetime_cols = list(waittimes.select_dtypes(include=['datetime64']).columns)
datetime_cols


['date', 'datetime']

In [36]:
columns_list = list(waittimes.columns)
columns_list.remove("SPOSTMIN")


In [38]:
x = waittimes[columns_list]
y = waittimes['SPOSTMIN']

ols = LinearRegression() 

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25)


In [None]:
ols.fit(x_test, y_test)
y_predols= ols.predict(x_test)

r2 = ols.score(x_test, y_test)
r2

0.2204370048074198

In [None]:
print(metrics.mean_absolute_error(y_test, y_predols))
print(metrics.mean_squared_error(y_test, y_predols))

13.216513612215056
273.494886599498
