In [1]:
# Colab-only step: attach Google Drive to the runtime's filesystem at
# /content/drive. force_remount=True requests a fresh mount even if the
# drive is already attached. The project paths configured later in the
# notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import joblib

# Gradient-boosted tree libraries (LightGBM, XGBoost)
import lightgbm as lgb
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import holidays
# Holiday calendar object for the United States from the `holidays` package.
# NOTE(review): not referenced in the cells shown here — presumably consumed
# by the training script executed later or left over from feature
# engineering; confirm before removing.
us_holidays = holidays.country_holidays('US')

# Let pandas print up to 500 rows / 500 columns and use a 1000-character
# wide console before truncating or wrapping output.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Project locations on the mounted Drive: the root folder plus the
# sub-folders holding the input data and the saved models/predictions.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:
# Read the model-input table, parse the hourly timestamp column and order
# the rows by station cluster, then by hour.
df = pd.read_csv(f'{data_dir}/final_model_input_partial_scale_3.csv')
df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])

# Chronological hold-out: rows strictly before 2024-01-01 form the training
# set, rows from that instant onwards form the test set. The column is
# already datetime64 after the conversion above, so it is compared directly.
split_point = pd.to_datetime("2024-01-01 00:00:00")
hour_stamps = df['started_at_hourly']
df_train = df[hour_stamps < split_point]
df_test = df[hour_stamps >= split_point]

# Drop the full frame and the split helpers; the cells shown below only use
# the two partitions.
del df, hour_stamps, split_point

In [5]:
# Restore the previously saved scaler object from the models folder.
# joblib.load unpickles the file, so only load artifacts from a trusted
# source (see the scikit-learn persistence note linked in this cell's output).
# NOTE(review): target_scaler is not referenced in the cells shown here —
# presumably the training script executed next reads it as a global; confirm.
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
# Run the shared training script inside the notebook's namespace so that the
# names it defines become available to the cells below, and so the script can
# see what the notebook has already imported/defined.
#
# `execfile` was removed from the builtins in Python 3; it only resolves when
# the hosting environment injects a compatibility shim. The portable
# replacement is exec() on the compiled file contents. Passing the path to
# compile() keeps real file names in tracebacks. The path is built from
# `directory` instead of repeating the Drive location.
script_path = f'{directory}/models_training/model_training.py'
with open(script_path, encoding='utf-8') as script_file:
    exec(compile(script_file.read(), script_path, 'exec'))

Using device: cpu


# GNN Mean Embedding

In [7]:
# Model input columns, in the exact order they are handed to the learners:
# station/calendar fields, current weather readings, 1h/2h/24h lags of demand
# and of selected weather variables, two demand aggregates, and finally the
# 50 "dim_mean_*" embedding dimensions. 'flag_added' is deliberately left out
# (it was disabled in the original list).
FEATURES = [
    'start_station_cluster',
    *(f'started_at_{part}'
      for part in ('year', 'month', 'day', 'hour',
                   'week', 'quarter', 'dayofweek')),
    'is_holiday',
    *('temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'pres', 'coco'),
    *(f'{signal}_lag_{hours}_h'
      for signal in ('demand', 'temp', 'prcp', 'rhum', 'wspd')
      for hours in (1, 2, 24)),
    'total_demand_1h',
    'demand_degrees_1h',
    *(f'dim_mean_{i}' for i in range(50)),
]

## XGBoost

In [8]:
# Train on the pre-2024 rows and evaluate on the 2024 hold-out using the
# FEATURES columns, selecting the 'xgb' model option.
# train_model is not defined in the visible cells — presumably it comes from
# the model_training.py script executed earlier (TODO confirm). It returns two
# objects: the fitted model and the test predictions (saved via .to_csv in a
# later cell). The recorded output shows MSE/RMSE/MAE/MAPE for all rows and
# for the "Non-zero" and "Zeros" subsets.
model, model_preds = train_model(df_train, df_test, FEATURES, model='xgb')

overall
MSE: 3.647029207344911
RMSE: 1.9097196672142513
MAE: 1.024807643990257
MAPE: 0.41650298269082386

Non-zero
MSE: 7.392688142556299
RMSE: 2.71894982347161
MAE: 1.7099444317900678
MAPE: 0.3576559447835239

Zeros
MSE: 0.5844602223788978
RMSE: 0.7644999819351848
MAE: 0.4646181785133301
MAPE: 0.4646181785133301


In [None]:
# Scratch log of XGBoost test metrics from earlier runs, grouped by run date.
# Each group holds three bare numbers. For the 11-28 group they equal the
# overall MSE, RMSE and MAPE printed by the training cell above; the second
# value of every group is the square root of the first, so the older groups
# presumably follow the same MSE / RMSE / MAPE order.
# Only the last bare expression is displayed when the cell runs, so an output
# recorded before the newest group was appended will not match it.
#11-01
3.189635202926419
1.7859549834546276
0.37270374298234266

#11-09 scaled
3.637830331305797
1.9073097103789403
0.4170961189192818

#11-09 unscaled
3.67041725196534
1.9158333048481384
0.4197110921569622

#11-28
3.647029207344911
1.9097196672142513
0.41650298269082386

0.4197110921569622

In [None]:
# Persist the XGBoost run: `suf` tags both output files with the embedding
# variant, model type and run date.
suf = "_e_xgb_11_28"

# Write the test predictions to CSV, then drop the reference right away
# (presumably to release memory before the model is serialized).
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

# Serialize the fitted model with joblib and drop it so the names `model` /
# `model_preds` are free for the next algorithm's cells.
filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

## LightGBM

In [9]:
# Same train/evaluate call as for the other learners, selecting the 'lgbm'
# model option: fit on the pre-2024 rows, score on the 2024 hold-out, using
# the FEATURES columns.
# train_model is not defined in the visible cells — presumably it comes from
# the model_training.py script executed earlier (TODO confirm). It returns the
# fitted model and the test predictions; the recorded output shows LightGBM's
# training log followed by MSE/RMSE/MAE/MAPE for all rows and for the
# "Non-zero" and "Zeros" subsets.
model, model_preds = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.234584 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15689
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 84
[LightGBM] [Info] Start training from score 1.005143
overall
MSE: 3.831372334603823
RMSE: 1.957389162789
MAE: 1.0583796088275395
MAPE: 0.4315368878008171

Non-zero
MSE: 7.752158382077213
RMSE: 2.7842698112929383
MAE: 1.7605971129315732
MAPE: 0.36709768947239374

Zeros
MSE: 0.6256138943998073
RMSE: 0.7909575806576528
MAE: 0.4842244105118419
MAPE: 0.4842244105118419


In [None]:
# Scratch log of LightGBM test metrics from earlier runs, grouped by run date.
# Each group holds three bare numbers. For the 11-28 group they equal the
# overall MSE, RMSE and MAPE printed by the training cell above; the 11-15
# group presumably follows the same order.
# Only the last bare expression is displayed when the cell runs.
#11-15
3.8252097525133104
1.9558143451036734
0.4318934265290617

#11-28
3.831372334603823
1.957389162789
0.4315368878008171

0.4315368878008171

In [None]:
# Persist the LightGBM run: `suf` tags both output files with the embedding
# variant, model type and run date.
suf = "_e_lgbm_11_28"

# Write the test predictions to CSV, then drop the reference right away
# (presumably to release memory before the model is serialized).
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

# Serialize the fitted model with joblib and drop it so the names `model` /
# `model_preds` are free for the next algorithm's cells.
filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model


## Random Forest

In [10]:
# Same train/evaluate call as for the other learners, selecting the 'rf'
# model option: fit on the pre-2024 rows, score on the 2024 hold-out, using
# the FEATURES columns.
# train_model is not defined in the visible cells — presumably it comes from
# the model_training.py script executed earlier (TODO confirm). It returns the
# fitted model and the test predictions; the recorded output shows
# MSE/RMSE/MAE/MAPE for all rows and for the "Non-zero" and "Zeros" subsets.
model, model_preds = train_model(df_train, df_test, FEATURES, model='rf')

overall
MSE: 4.141888648505085
RMSE: 2.035163052068577
MAE: 1.0912857092249184
MAPE: 0.45691381547723675

Non-zero
MSE: 8.259875803873008
RMSE: 2.8739999658790896
MAE: 1.7895989888091752
MAPE: 0.3793618774199258

Zeros
MSE: 0.7748923574221989
RMSE: 0.880279704083991
MAE: 0.5203227281215812
MAPE: 0.5203227281215812


In [None]:
# Scratch log of Random Forest test metrics from earlier runs. Each group
# holds three bare numbers, presumably overall MSE, RMSE and MAPE in that
# order (the second value of every group is the square root of the first).
# The first group carries no date label.
# NOTE(review): the 11-28 values below differ slightly from the metrics
# printed by the training cell above (MSE 4.1419 / RMSE 2.0352 / MAPE 0.4569),
# which suggests run-to-run variation — confirm whether the forest is seeded.
# Only the last bare expression is displayed when the cell runs.
3.582259568051494
1.8926858080652198
0.40263418701670023

#11_01
3.597199686712592
1.8966285051935163
0.4085506307097107

#11-28
4.144683733881307
2.035849634398697
0.45704513095680854

0.4085506307097107

In [None]:
# Persist the Random Forest run: `suf` tags both output files with the
# embedding variant, model type and run date.
suf = "_e_rf_11_28"
# Write the test predictions to CSV, then drop the reference right away
# (presumably to release memory before the model is serialized).
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds
# Serialize the fitted model with joblib, then drop the in-memory reference.
filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model