In [1]:
from google.colab import drive
# Mount Google Drive at /content/drive so the Thesis data and model folders
# used below are reachable as ordinary file paths. force_remount=True is passed
# so the mount is re-established even when the cell is re-run.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import joblib

# check xgboost version
from xgboost import XGBRegressor

#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor

import holidays
# US holiday calendar from the `holidays` package. It is not referenced again
# in the visible cells — presumably it is consumed by the training script that
# is executed into this namespace later (TODO confirm, otherwise it is unused).
us_holidays = holidays.country_holidays('US')

# Raise pandas' display limits so wide / long frames are shown without
# truncation when inspected in the notebook.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
pd.options.display.width = 1000

# Project root on the mounted Drive, plus the sub-folders that hold the input
# data and the persisted models / predictions.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:
suf = "_normal_xgb_11_08"

# Load the saved returns predictions for this run suffix and bring them into
# the shape the demand frame is merged on: the end-station cluster is relabelled
# as the start-station cluster and the prediction column is renamed to `returns`.
returns = pd.read_csv(models_dir + "/test_returns_predictions" + suf + ".csv")
returns = returns.rename(
    columns={"end_station_cluster": "start_station_cluster", "pred": "returns"}
)
# Keep only the two merge keys and the feature itself.
returns = returns[['started_at_hourly', 'returns', 'start_station_cluster']]

In [6]:
# Chronological boundary of the hold-out set: rows at or after this instant go
# to df_test, everything earlier goes to df_train.
TEST_START = pd.Timestamp("2024-01-01 00:00:00")

df = pd.read_csv(f'{data_dir}/final_model_input_partial_scale_3.csv')

# Attach the predicted returns per (hour, start cluster).
# validate='many_to_one' makes pandas raise if `returns` contains duplicate
# keys, which would otherwise silently duplicate rows of the demand frame.
# NOTE(review): the keys are compared as read from the CSVs (before datetime
# parsing), so both files must format `started_at_hourly` identically —
# otherwise nothing matches and the fillna below hides it as all-zero returns.
# NOTE(review): fillna(0) zero-fills every NaN in the merged frame, not only
# the unmatched `returns` values — confirm that is intended for the other
# feature columns.
# NOTE(review): the file name suggests the returns predictions cover the test
# period only; if so, `returns` is 0 for all of df_train — confirm.
df = df.merge(
    returns,
    on=['started_at_hourly', 'start_station_cluster'],
    how='left',
    validate='many_to_one',
).fillna(0)

# Parse the timestamp once; the split below reuses the parsed column instead of
# converting it again for every comparison.
df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])

hour_stamp = df['started_at_hourly']
df_test = df[hour_stamp >= TEST_START]
df_train = df[hour_stamp < TEST_START]

# Drop the references to the full frame so only the two splits stay in memory.
del df, hour_stamp

In [7]:
# Load the scaler object previously saved as target_scaler.sav in the models
# directory. It is not used in the visible cells — presumably the training
# script executed into this namespace relies on it (TODO confirm).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load artifacts produced by this project. The recorded output links
# to scikit-learn's model-persistence limitations page, which suggests a
# warning was emitted on load (possibly a version mismatch) — confirm.
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [8]:
# Execute the shared training script inside this notebook's global namespace so
# that the names it defines (presumably including `train_model`, which later
# cells call) become available here.
# `execfile` is not a Python 3 builtin — it only resolves when the hosting
# environment happens to inject it — so the file is read and exec'd explicitly.
# Passing the path to compile() keeps tracebacks pointing at the real file.
training_script = f"{directory}/models_training/model_training.py"
with open(training_script, encoding="utf-8") as script_file:
    exec(compile(script_file.read(), training_script, "exec"), globals())

Using device: cpu


# GNN Variance Embedding

In [9]:
# Ordered list of model input columns (85 names). The order is kept exactly as
# before: identifier, calendar parts, holiday flag, current weather, 1/2/24-hour
# lags, aggregate demand signals, predicted returns, then the 50 `dim_mean_*`
# columns.
FEATURES = [
    'start_station_cluster',
    # Calendar decomposition of the hourly timestamp.
    *(f"started_at_{part}" for part in (
        "year", "month", "day", "hour", "week", "quarter", "dayofweek")),
    'is_holiday',
    # 'flag_added' is deliberately left out (it was commented out originally).
    # Current-hour weather readings.
    'temp', 'dwpt', 'rhum', 'prcp', 'wdir', 'wspd', 'pres', 'coco',
    # Lagged signals: every signal at 1, 2 and 24 hours back.
    *(f"{signal}_lag_{hours}_h"
      for signal in ("demand", "temp", "prcp", "rhum", "wspd")
      for hours in (1, 2, 24)),
    "total_demand_1h",
    "demand_degrees_1h",
    'returns',
    # Fifty `dim_mean_0` .. `dim_mean_49` columns.
    *(f"dim_mean_{i}" for i in range(50)),
]

## XGBoost

In [10]:
# Train the 'xgb' variant on the pre-2024 split and evaluate it on the 2024+
# split using the FEATURES columns. `train_model` is not defined in this
# notebook — presumably it comes from the executed model_training.py (confirm).
# It returns the fitted model and a predictions object; the recorded output
# shows it also prints overall / non-zero / zero-demand error metrics.
model, model_preds = train_model(df_train, df_test, FEATURES, model='xgb')

overall
MSE: 3.534265459201861
RMSE: 1.8799642175323075
MAE: 0.9973491322851492
MAPE: 0.3947481051421412

Non-zero
MSE: 7.20667522373176
RMSE: 2.684525139336892
MAE: 1.6938633612235703
MAPE: 0.3542543082153698

Zeros
MSE: 0.5315873084478928
RMSE: 0.7291003418240132
MAE: 0.4278571117806337
MAPE: 0.4278571117806337


In [11]:
# Scratch log of XGBoost test metrics from earlier runs, keyed by run date.
# Each triple appears to be MSE, RMSE, MAPE: the 11-28 entry equals the
# "overall" MSE / RMSE / MAPE printed by the XGBoost training cell.
# These are bare expressions, so only the last one is echoed as cell output.
#11_01
3.1785156823909997
1.7828392194449278
0.374173850198251

#11_08
3.063089630068344
1.7501684576258205
0.35523741143766596

#11-09 scaled
3.536247192876756
1.880491210528982
0.39338313284835247

#11-09 unscaled
3.6902228556581362
1.9209952773648706
0.41701083809631545


#11-28
3.534265459201861
1.8799642175323075
0.3947481051421412

0.3947481051421412

In [None]:
suf = "_j_xgb_11_28"

# Write the current predictions to CSV under this run suffix, then release the
# frame to free memory.
model_preds.to_csv(models_dir + "/test_predictions" + suf + ".csv")
del model_preds

# Serialize the fitted model with joblib next to the predictions and release it
# as well, so the next training cell starts from a clean slate.
filename = models_dir + "/demand_model" + suf + ".sav"
joblib.dump(model, filename)
del model

## LightGBM

In [12]:
# Train the 'lgbm' variant on the pre-2024 split and evaluate it on the 2024+
# split using the FEATURES columns. `train_model` is not defined in this
# notebook — presumably it comes from the executed model_training.py (confirm).
# It returns the fitted model and a predictions object; the recorded output
# shows LightGBM's training log followed by overall / non-zero / zero metrics.
model, model_preds = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.184918 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 15944
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 85
[LightGBM] [Info] Start training from score 1.005143
overall
MSE: 3.6358338733382563
RMSE: 1.9067862683946137
MAE: 1.0132713098049062
MAPE: 0.40259074141784906

Non-zero
MSE: 7.419498714770608
RMSE: 2.723875679022559
MAE: 1.7134235384671923
MAPE: 0.35585330367011214

Zeros
MSE: 0.5421900597102779
RMSE: 0.7363355618943567
MAE: 0.44080474588510865
MAPE: 0.44080474588510865


In [None]:
suf = "_j_lgbm_11_28"

# Export the current predictions as CSV for this run suffix and drop the frame
# afterwards.
model_preds.to_csv(models_dir + "/test_predictions" + suf + ".csv")
del model_preds

# Dump the fitted model via joblib under the same suffix, then drop it too.
filename = models_dir + "/demand_model" + suf + ".sav"
joblib.dump(model, filename)
del model

In [None]:
# Scratch log of the LightGBM run: the three values equal the "overall"
# MSE, RMSE and MAPE printed by the LightGBM training cell.
#11-28
3.6358338733382563
1.9067862683946137
0.40259074141784906

## Random Forest

In [13]:
# Train the 'rf' variant on the pre-2024 split and evaluate it on the 2024+
# split using the FEATURES columns. `train_model` is not defined in this
# notebook — presumably it comes from the executed model_training.py (confirm).
# It returns the fitted model and a predictions object; the recorded output
# shows overall / non-zero / zero-demand error metrics.
model, model_preds = train_model(df_train, df_test, FEATURES, model='rf')

overall
MSE: 3.7961633952104554
RMSE: 1.9483745520844946
MAE: 1.0436927873034172
MAPE: 0.4315783650758218

Non-zero
MSE: 7.636987008087836
RMSE: 2.763509907361983
MAE: 1.7242217217020086
MAPE: 0.3634639656446628

Zeros
MSE: 0.6557847657131763
RMSE: 0.8098053875550448
MAE: 0.48727084822383193
MAPE: 0.48727084822383193


In [None]:
# Scratch log of Random Forest test metrics from earlier runs, keyed by run
# date. Presumably each triple is MSE, RMSE, MAPE, as in the printed "overall"
# block. NOTE(review): the 11_28 values are close to, but not identical with,
# the metrics recorded in the Random Forest training cell output, so they
# likely come from a different run — confirm before quoting them.
#11_01
3.6042873699256828
1.8984960810930538
0.4084309933647605

#11_28
3.78203289464364
1.9447449433392647
0.4310134176146017

0.4084309933647605

In [None]:
suf = "_j_rf_11_28"

# Save the current predictions to a suffix-tagged CSV, then free the frame.
model_preds.to_csv(models_dir + "/test_predictions" + suf + ".csv")
del model_preds

# Store the fitted model with joblip-compatible `.sav` naming under the same
# suffix, then free the model object.
filename = models_dir + "/demand_model" + suf + ".sav"
joblib.dump(model, filename)
del model