In [1]:
import warnings
warnings.filterwarnings('ignore')

import importlib.util
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler

In [48]:
# specify the module that needs to be
# imported relative to the path of the
# module
spec = importlib.util.spec_from_file_location("loadTrainTestPostedWaitTimes", "../src/data/loadTrainTestData.py")

# creates a new module based on spec
loadTrainPosted = importlib.util.module_from_spec(spec)

# executes the module in its own namespace
# when a module is imported or reloaded.
spec.loader.exec_module(loadTrainPosted)

X_train, X_test, y_train, y_test = loadTrainPosted.loadTrainTestPostedWaitTimes()

In [25]:
X_train["MONTHOFYEAR"] = X_train["date"].dt.month.astype("Int8")
X_train["YEAR"] = X_train["date"].dt.year.astype("Int16")
X_train["DAYOFYEAR"] = X_train["date"].dt.dayofyear.astype("Int16")
X_train["HOUROFDAY"] = X_train["datetime"].dt.hour.astype("Int8")

X_test["MONTHOFYEAR"] = X_test["date"].dt.month.astype("Int8")
X_test["YEAR"] = X_test["date"].dt.year.astype("Int16")
X_test["DAYOFYEAR"] = X_test["date"].dt.dayofyear.astype("Int16")
X_test["HOUROFDAY"] = X_test["datetime"].dt.hour.astype("Int8")

In [26]:
parse_times = ["MKOPEN", "MKCLOSE", "MKEMHOPEN", "MKEMHCLOSE",
               "MKOPENYEST", "MKCLOSEYEST", "MKOPENTOM",
               "MKCLOSETOM","EPOPEN", "EPCLOSE", "EPEMHOPEN",
               "EPEMHCLOSE", "EPOPENYEST", "EPCLOSEYEST",
               "EPOPENTOM", "EPCLOSETOM", "HSOPEN", "HSCLOSE",
               "HSEMHOPEN", "HSEMHCLOSE", "HSOPENYEST", "HSCLOSEYEST",
               "HSOPENTOM", "HSCLOSETOM", "AKOPEN", "AKCLOSE",
               "AKEMHOPEN", "AKOPENYEST", "AKCLOSEYEST",
               "AKOPENTOM", "AKCLOSETOM", "MKPRDDT1", "MKPRDDT2",
               "MKPRDNT1", "MKPRDNT2", "MKFIRET1", "MKFIRET2",
               "EPFIRET1", "EPFIRET2", "HSPRDDT1", "HSFIRET1",
               "HSFIRET2", "HSSHWNT1", "HSSHWNT2", "AKPRDDT1",
               "AKPRDDT2", "AKSHWNT1", "AKSHWNT2"]

In [10]:
X_train_encoded = X_train.drop(columns=['date', 'datetime'])
X_test_encoded = X_test.drop(columns=['date', 'datetime'])

In [50]:
for col in parse_times:
    X_train_encoded[col] =  X_train_encoded[col].fillna("99")
    X_train_encoded[f"{col}_HOUR"] = X_train_encoded[col].apply(lambda x: x[:2] if x[0]!=0 else x[:1]).astype(int)
    X_train_encoded.drop(columns = col, inplace=True)

9    2249710
8     956138
6       4134
7       3806
Name: MKOPEN_HOUR, dtype: int64
23    722687
22    720897
0     646068
21    363533
18    260337
19    196191
20    159619
1     135463
16      3764
5       2262
2       1586
15       826
17       555
Name: MKCLOSE_HOUR, dtype: int64
9    1550657
8     993198
7     663695
6       6238
Name: MKEMHOPEN_HOUR, dtype: int64
0     1087343
22     654827
23     639904
21     315575
1      240327
20     125190
2       91360
3       28017
19      13197
18      10641
16       3764
5        2262
15        826
17        555
Name: MKEMHCLOSE_HOUR, dtype: int64
9    2267790
8     938513
7       3793
6       3692
Name: MKOPENYEST_HOUR, dtype: int64
22    734467
23    720394
0     636692
21    372709
18    245430
19    192412
20    168084
1     130124
16      6637
2       3229
5       1924
15      1649
17        37
Name: MKCLOSEYEST_HOUR, dtype: int64
9    2270422
8     935824
7       3881
6       3661
Name: MKOPENTOM_HOUR, dtype: int64
22    738582
2

In [51]:
dict(X_train_encoded.dtypes)

{'Unnamed: 0': dtype('int64'),
 'Ride_type_thrill': dtype('bool'),
 'Ride_type_spinning': dtype('bool'),
 'Ride_type_slow': dtype('bool'),
 'Ride_type_small_drops': dtype('bool'),
 'Ride_type_big_drops': dtype('bool'),
 'Ride_type_dark': dtype('bool'),
 'Fast_pass': dtype('bool'),
 'Classic': dtype('bool'),
 'Age_interest_preschoolers': dtype('bool'),
 'Age_interest_tweens': dtype('bool'),
 'Age_interest_teens': dtype('bool'),
 'Age_interest_adults': dtype('bool'),
 'Height_req_inches': Int16Dtype(),
 'Ride_duration_min': Float64Dtype(),
 'Age_of_ride_days': Int16Dtype(),
 'Age_of_ride_years': Float64Dtype(),
 'TL_rank': Int8Dtype(),
 'TA_Stars': Float64Dtype(),
 'DAYOFWEEK': Int8Dtype(),
 'DAYOFYEAR': Int16Dtype(),
 'WEEKOFYEAR': Int8Dtype(),
 'MONTHOFYEAR': Int8Dtype(),
 'YEAR': Int16Dtype(),
 'HOLIDAYPX': Int8Dtype(),
 'HOLIDAYM': Int8Dtype(),
 'HOLIDAY': dtype('bool'),
 'WDWevent': dtype('bool'),
 'WDWMAXTEMP': dtype('float64'),
 'WDWMINTEMP': dtype('float64'),
 'WDWMEANTEMP': dtyp

In [52]:
min_max_scaler = MinMaxScaler()

X_train_norm = min_max_scaler.fit_transform(X_train_encoded)
X_test_norm = min_max_scaler.transform(X_test_encoded)

KeyboardInterrupt: 

In [None]:

# print(dict(X_train_encoded.dtypes))
linRidge = Ridge(alpha=20.0).fit(X_train_encoded, y_train)


# X_train_scaled = scaler.fit_transform(X_train)
# X_test_scaled = scaler.transform(X_test)

# linridge = Ridge(alpha=20.0).fit(X_train_scaled, y_train)

In [None]:
print('Crime dataset')
print('ridge regression linear model intercept: {}'
     .format(linRidge.intercept_))
print('ridge regression linear model coeff:\n{}'
     .format(linRidge.coef_))
print('R-squared score (training): {:.3f}'
     .format(linRidge.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linRidge.score(X_test, y_test)))
print('Number of non-zero features: {}'
     .format(np.sum(linRidge.coef_ != 0)))

In [50]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import VarianceThreshold
import json


with open("../data/interim/dtypes.json") as json_file:
    dtypes = json.load(json_file)

rideDataLst = []
for year in range(2015, 2022):
    print(year)
    rideData = pd.read_csv(f"../data/interim/RideData{year}Weather.csv", dtype=dtypes, compression='gzip')
    rideDataLst.append(rideData)

2015
2016
2017
2018
2019
2020
2021


In [43]:
X = pd.concat(rideDataLst).drop(columns=["SACTMIN", "SPOSTMIN"])
X_num = X.select_dtypes(include=np.number).reset_index(drop=True)

In [45]:
print(type(X_num))

<class 'pandas.core.frame.DataFrame'>


In [56]:
threshold = 0

X_num = X.select_dtypes(include=np.number).reset_index(drop=True)

print(f"BEFORE THRESHOLD {threshold}: ", X_num.shape)

var_thr = VarianceThreshold(threshold = 0.05) #Removing both constant and quasi-constant
var_thr.fit(X_num)

var_thr.get_support()

concol = [column for column in X_num.columns 
          if column not in X_num.columns[var_thr.get_support()]]

X.drop(concol,axis=1)

for features in concol:
    print(features)

BEFORE THRESHOLD 0:  (1616601, 135)
Ride_type_scary
Age_interest_kids
WDWevent
WDWrace
AKevent
AKEMHEVE
AKEMHEYEST
AKEMHETOM
WEATHER_WDWPRECIP
MKFIREWK
EPFIREWK
HSPRDDAY
HSFIREWKS
AKPRDDAY
new_case
Source Element
Weather Type
Weather Type Observation
Weather Code Quality Code


In [59]:
X["AKEMHEVE"].value_counts()

0    1616601
Name: AKEMHEVE, dtype: Int64