In [1]:
# Mount Google Drive at /content/drive so the Thesis folder referenced by the
# path constants further down is reachable from this runtime.
# force_remount=True asks Colab to redo the mount even if one is already active.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import joblib

# check xgboost version
from xgboost import XGBRegressor

#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor

import holidays
# US holiday calendar object from the `holidays` package.
# NOTE(review): not referenced in the visible cells — presumably consumed by the
# training script executed later in this notebook; confirm or remove.
us_holidays = holidays.country_holidays('US')

# Raise pandas' display limits so wide/long frames are shown without truncation.
# set_option accepts several (option, value) pairs in a single call.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Google Drive locations: project root, input data, and persisted models/predictions.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:
# Read the prepared model-input table, parse the hourly timestamp column, and
# order the rows by station cluster and then by hour.
df = (
    pd.read_csv(f'{data_dir}/final_model_input_partial_scale_4.csv')
    .assign(started_at_hourly=lambda frame: pd.to_datetime(frame['started_at_hourly']))
    .sort_values(by=['start_station_cluster', 'started_at_hourly'])
)

# Chronological hold-out: everything from 2024-01-01 00:00:00 onwards is the
# test set, everything strictly before it is the training set.
split_point = pd.to_datetime("2024-01-01 00:00:00")
df_test = df[df['started_at_hourly'] >= split_point]
df_train = df[df['started_at_hourly'] < split_point]

# The full frame is no longer needed once the two partitions exist.
del df, split_point

In [5]:
# Load the target scaler object that was persisted with joblib under models_dir.
# NOTE(review): not referenced in the visible cells — presumably read by the
# training script executed in the next cell; confirm.
# joblib files are pickle-based, so only load artefacts you produced yourself.
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

In [6]:
# Execute the shared training script inside this notebook's global namespace so
# that whatever it defines becomes available to the cells below (presumably
# including train_model, which is called later but not defined in this notebook).
#
# `execfile` is not a Python 3 builtin; it only resolves when the hosting
# kernel happens to inject a compatibility shim. The exec/compile form below is
# the documented Python 3 replacement and behaves the same on any kernel.
# The path is derived from `directory` instead of repeating the absolute path.
_training_script = f'{directory}/models_training/model_training.py'
with open(_training_script, encoding='utf-8') as _script_file:
    # compile() with the real filename keeps tracebacks pointing at the script.
    exec(compile(_script_file.read(), _training_script, 'exec'), globals())
del _training_script, _script_file

Using device: cpu


# GNN Mean-Variance Embedding

In [7]:
# Model input columns for this experiment, in the order they are handed to the
# training routine.
#
# Candidate columns currently switched off (kept here for reference):
#   'flag_added'
#   weather: 'temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'pres', 'coco'
#   lags (1 h / 2 h / 24 h): demand_lag_*, temp_lag_*, prcp_lag_*, rhum_lag_*, wspd_lag_*
#   variance embedding: dim_var_0 .. dim_var_49
FEATURES = [
    # Station-cluster identifier and calendar fields.
    'start_station_cluster',
    'started_at_year',
    'started_at_month',
    'started_at_day',
    'started_at_hour',
    'started_at_week',
    'started_at_quarter',
    'started_at_dayofweek',
    'is_holiday',
    # Hourly demand aggregates.
    "total_demand_1h",
    "demand_degrees_1h",
    # Mean embedding dimensions dim_mean_0 .. dim_mean_49.
    *(f"dim_mean_{i}" for i in range(50)),
]

## XGBoost

In [8]:
# Train and evaluate the 'xgb' variant on the train/test partitions using FEATURES.
# train_model is not defined in the visible cells (presumably it comes from the
# training script executed earlier); it returns the model and its predictions and
# prints MSE/RMSE/MAE/MAPE for the "overall", "Non-zero" and "Zeros" subsets.
model, model_preds  = train_model(df_train, df_test, FEATURES, model='xgb')

overall
MSE: 4.112374062928124
RMSE: 2.027898928183583
MAE: 1.0717779573241577
MAPE: 0.43646158777503297

Non-zero
MSE: 8.34630349904037
RMSE: 2.8889969711026646
MAE: 1.7826971541819536
MAPE: 0.37036043081567327

Zeros
MSE: 0.6505797031487478
RMSE: 0.8065852113377406
MAE: 0.49050798032590226
MAPE: 0.49050798032590226


In [None]:
# Scratch log of XGBoost test metrics from earlier runs, grouped by run date.
# Each triple is MSE, RMSE, MAPE: the "11-09 scaled" values equal the overall
# MSE/RMSE/MAPE printed by the XGBoost cell above, and in every group the second
# number is the square root of the first.
# NOTE(review): these are bare expressions, not assignments — they have no effect
# other than the last one being displayed as the cell output.
#11_01
3.581524793531
1.8924916891577093
0.3959399998524082


#11-09 scaled
4.112374062928123
2.0278989281835824
0.43646158777503297

#11-09 unscaled
4.163518283080992
2.040470113253559
0.4386774517867249

#11-28
4.157300202907574
2.0389458558057822
0.4420209656932694

# Repeats the third value of the "11-09 unscaled" run; as the final expression
# it is the value the cell displays.
0.4386774517867249

In [None]:
# Suffix identifying this run in the artefact file names.
suf = "_i_xgb_11_28"
filename = f'{models_dir}/demand_model{suf}.sav'

# Write the test-set predictions and the trained model side by side in models_dir.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
joblib.dump(model, filename)

# Release both references before the next model family is trained.
del model_preds, model

## LightGBM

In [9]:
# Train and evaluate the 'lgbm' variant on the same partitions and FEATURES.
# train_model is not defined in the visible cells (presumably it comes from the
# training script executed earlier); it returns the model and its predictions and
# prints MSE/RMSE/MAE/MAPE for the "overall", "Non-zero" and "Zeros" subsets.
model, model_preds  = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.910249 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 13428
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 61
[LightGBM] [Info] Start training from score 1.005143
overall
MSE: 4.631116870511952
RMSE: 2.152002990358506
MAE: 1.1832700762218398
MAPE: 0.533267690437183

Non-zero
MSE: 9.066963150584561
RMSE: 3.0111398424159184
MAE: 1.7951700721513506
MAPE: 0.3501856774144816

Zeros
MSE: 1.0042289251842313
RMSE: 1.002112231830463
MAE: 0.6829613236501669
MAPE: 0.6829613236501669


In [None]:
# Suffix identifying this run in the artefact file names.
suf = "_i_lgbm_11_28"
filename = f'{models_dir}/demand_model{suf}.sav'

# Write the test-set predictions and the trained model side by side in models_dir.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
joblib.dump(model, filename)

# Release both references before the next model family is trained.
del model_preds, model

In [None]:
# Scratch log of LightGBM test metrics from an earlier run.
# Presumably MSE, RMSE, MAPE in that order (the second value is the square root
# of the first) — TODO confirm. The values differ slightly from the LightGBM
# output printed above, so they come from a different run.
# NOTE(review): bare expressions; only the last one is displayed by the cell.
#11-28
4.645069382106288
2.1552423024120255
0.5353495889045643

## Random Forest

In [10]:
# Train and evaluate the 'rf' variant on the same partitions and FEATURES.
# train_model is not defined in the visible cells (presumably it comes from the
# training script executed earlier); it returns the model and its predictions and
# prints MSE/RMSE/MAE/MAPE for the "overall", "Non-zero" and "Zeros" subsets.
model, model_preds  = train_model(df_train, df_test, FEATURES, model='rf')

overall
MSE: 3.7217060574607004
RMSE: 1.92917237629526
MAE: 1.0256527207028774
MAPE: 0.41478722067521495

Non-zero
MSE: 7.499608617960336
RMSE: 2.738541330336341
MAE: 1.7242966947290452
MAPE: 0.36631534861820436

Zeros
MSE: 0.6327736666217284
RMSE: 0.7954707201536261
MAE: 0.4544193533847735
MAPE: 0.4544193533847735


In [None]:
# Suffix identifying this run in the artefact file names.
suf = "_i_rf_11_28"
filename = f'{models_dir}/demand_model{suf}.sav'

# Write the test-set predictions and the trained model side by side in models_dir.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
joblib.dump(model, filename)

# Release both references now that everything has been persisted.
del model_preds, model

In [None]:
# Scratch log of Random Forest test metrics from earlier runs, grouped by run date.
# Presumably MSE, RMSE, MAPE in that order (in each group the second value is the
# square root of the first) — TODO confirm.
# NOTE(review): bare expressions; only the last one is displayed by the cell.
#11_01
3.273679430845745
1.8093312109300896
0.3716006671932742

#11_28
3.723580769803055
1.9296582002528466
0.41509447338288163



# Repeats the third value of the "11_01" run; as the final expression it is the
# value the cell displays.
0.3716006671932742