In [1]:
import warnings
warnings.filterwarnings('ignore')

import importlib.util
from sklearn.linear_model import Ridge, RidgeCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold

import numpy as np
import pandas as pd

In [2]:
parse_times = ["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
               "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM",
               "MKCLOSETOM","EPOPEN", "EPCLOSE", "EPEMHOPEN",
               "EPEMHCLOSE", "EPOPENYEST", "EPCLOSEYEST",
               "EPOPENTOM", "EPCLOSETOM", "HSOPEN", "HSCLOSE",
               "HSEMHOPEN", "HSEMHCLOSE", "HSOPENYEST", "HSCLOSEYEST",
               "HSOPENTOM", "HSCLOSETOM", "AKOPEN", "AKCLOSE",
               "AKEMHOPEN", "AKOPENYEST", "AKCLOSEYEST","AKEMHCLOSE",
               "AKOPENTOM", "AKCLOSETOM", "MKPRDDT1", "MKPRDDT2",
               "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2",
               "EPFIRET1", "EPFIRET2", "HSPRDDT1", "HSFIRET1",
               "HSFIRET2", "HSSHWNT1", "HSSHWNT2", "AKPRDDT1",
               "AKPRDDT2", "AKSHWNT1", "AKSHWNT2"]

In [3]:
# specify the module that needs to be
# imported relative to the path of the
# module
spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", "../src/data/loadTrainTestData.py")

# creates a new module based on spec
loadTrainPosted = importlib.util.module_from_spec(spec)

# executes the module in its own namespace
# when a module is imported or reloaded.
spec.loader.exec_module(loadTrainPosted)

X_train, X_test, y_train, y_test = loadTrainPosted.loadTrainTestPostedWaitTimes()

In [4]:
X_train["MONTHOFYEAR"] = X_train["date"].dt.month.astype("Int8")
X_train["YEAR"] = X_train["date"].dt.year.astype("Int16")
X_train["DAYOFYEAR"] = X_train["date"].dt.dayofyear.astype("Int16")
X_train["HOUROFDAY"] = X_train["datetime"].dt.hour.astype("Int8")

X_test["MONTHOFYEAR"] = X_test["date"].dt.month.astype("Int8")
X_test["YEAR"] = X_test["date"].dt.year.astype("Int16")
X_test["DAYOFYEAR"] = X_test["date"].dt.dayofyear.astype("Int16")
X_test["HOUROFDAY"] = X_test["datetime"].dt.hour.astype("Int8")

In [28]:
X_train_impute = X_train.set_index(['datetime']).sort_values(['datetime'])

In [29]:
X_test_impute = X_test.set_index(['datetime']).sort_values(['datetime'])

In [None]:
for col in parse_times:
    X_train_impute[col] =  X_train_impute[col].fillna("99")
    X_train_impute[f"{col}_HOUR"] = X_train_impute[col].apply(lambda x: x[:2] if x[0]!=0 else x[:1]).astype(int).astype("Int8")
    X_train_impute.drop(columns = col, inplace=True)

In [None]:
for col in parse_times:
    X_test_impute[col] =  X_test_impute[col].fillna("99")
    X_test_impute[f"{col}_HOUR"] = X_test_impute[col].apply(lambda x: x[:2] if x[0]!=0 else x[:1]).astype(int).astype("Int8")
    X_test_impute.drop(columns = col, inplace=True)

In [None]:
for col in X_train_impute.columns:
    nulls = X_train_impute[col].isnull().sum()
    
    if nulls>0:
        print(col)
        X_train_impute[col].fillna(method ='bfill', inplace=True)
    
        if X_train_impute[col].isnull().sum()>0:
            X_train_impute[col].fillna(X_train_impute[col].median(), inplace=True)

In [None]:
for col in X_test_impute.columns:
    nulls = X_test_impute[col].isnull().sum()
    
    if nulls>0:
        print(col)
        X_test_impute[col]= X_test_impute[col].fillna(method ='bfill')
        
        if X_test_impute[col].isnull().sum()>0:
            X_test_impute[col].fillna(X_test_impute[col].median(), inplace=True)

In [None]:
X_train_encoded = X_train_impute.reset_index().drop(columns=['date', 'datetime', 'Unnamed: 0'])
X_test_encoded = X_test_impute.reset_index().drop(columns=['date', 'datetime', 'Unnamed: 0'])

In [None]:
del X_train_impute, X_test_impute

In [None]:
X_dtype = X_train_encoded.select_dtypes(include=['bool']).reset_index(drop=True)

var_thr = VarianceThreshold(threshold=0.001)  # Removing both constant and quasi-constant
var_thr.fit(X_dtype)

concol = [column for column in X_dtype.columns
          if column not in X_dtype.columns[var_thr.get_support()]]


del var_thr, X_dtype

if "Weather Type" in concol:
    concol.remove("Weather Type")

print(f"DROPPING BOOL: ", concol)
X_train_encoded.drop(concol, axis=1, inplace=True)
X_test_encoded.drop(concol, axis=1, inplace=True)

In [None]:
min_max_scaler = MinMaxScaler()


X_dtype_train = X_train_encoded.select_dtypes(include=[np.number]).reset_index(drop=True)
num_cols = list(X_dtype_train.columns)

X_dtype_test = X_test_encoded.select_dtypes(include=[np.number]).reset_index(drop=True)

In [None]:
X_train_norm = min_max_scaler.fit_transform(X_dtype_train)
X_test_norm = min_max_scaler.transform(X_dtype_test)

X_train_encoded[num_cols] = X_train_norm
X_test_encoded[num_cols] = X_test_norm

In [None]:
del X_train_norm, X_test_norm, X_dtype_train, X_dtype_test

In [20]:

# print(dict(X_train_encoded.dtypes))
# linRidge = Ridge(alpha=20.0).fit(X_train_encoded, y_train)

linRidge = Ridge(alpha=1e-3).fit(X_train_encoded, y_train)

# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

MemoryError: Unable to allocate 5.82 GiB for an array with shape (243, 3213788) and data type object

In [51]:
print('Disney dataset')
print('ridge regression linear model intercept: {}'
     .format(linRidge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linRidge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linRidge.score(X_train_encoded, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linRidge.score(X_test_encoded, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linRidge.coef_ != 0)))

Disney dataset
ridge regression linear model intercept: -50.517693830458136
ridge regression linear model coeff:
[-4.61169756e+00 -4.22018678e+00  4.66775990e-01 -2.23988865e+00
  1.08503019e+00  1.28489139e+00 -2.98012262e-01  9.90083683e+00
  4.61169756e+00  1.06496857e+00  1.06496857e+00  1.06496857e+00
  2.02208487e+00  9.11988164e-01 -1.62638411e+01 -1.62638411e+01
 -1.28008464e+00  1.33191789e+00  9.26246815e+00 -6.36065689e+02
  6.26235452e+02  6.19106845e+00 -2.07231537e+02 -4.16935953e+00
  7.55414811e-01 -2.47756735e-01 -5.06843333e+00  1.04110680e+01
  1.24099165e+01 -2.35532685e+01 -3.76496991e+00 -1.21369598e-01
 -1.82869126e+00  9.76369215e+00 -4.35343466e+00 -1.06343639e+01
 -7.06197404e+00 -1.56252522e+01  4.93834125e+01 -4.10220895e+01
 -3.15623905e+00  8.59795814e+00 -5.39098655e+00  9.49814050e-02
  3.23273407e+00  7.14756094e+00  6.08091725e+00  3.10208394e-01
  1.89459616e+00 -2.90856284e+00  5.90439719e-01 -1.39336036e+00
  7.54726860e-01 -9.50220430e+00  2.184651