In [10]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [11]:

import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.decomposition import PCA
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

import joblib

# check xgboost version
from xgboost import XGBRegressor

#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor
import holidays

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

directory = '/content/drive/MyDrive/Thesis'
data_dir = directory + "/Data"
models_dir = directory + "/models"

In [12]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [13]:
suf = "_normal_xgb_11_08"

returns = pd.read_csv(f"{models_dir}/test_returns_predictions{suf}.csv").rename(
    columns={
        "end_station_cluster": "start_station_cluster",
        "pred": "returns"
        }
    )[['started_at_hourly', 'returns', 'start_station_cluster']]

In [5]:
df = pd.read_csv(f'{data_dir}/final_model_input_partial_scale_4.csv')
pca = PCA(n_components=15, svd_solver='arpack')
cols = [f"dim_mean_{i}" for i in range(50)]
cols.extend([f"dim_var_{i}" for i in range(50)])
m = pca.fit_transform(df[cols])
df_pca = pd.DataFrame(m, columns=[f"dim_mean_pca_{i}" for i in range(15)])

df = pd.concat([df, df_pca], axis=1)
df = df.merge(returns, on=['started_at_hourly', 'start_station_cluster'], how='left').fillna(0)

df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])

df_test = df[pd.to_datetime(
    df['started_at_hourly']) >= pd.to_datetime("2024-01-01 00:00:00")]
df_train = df[pd.to_datetime(
    df['started_at_hourly']) < pd.to_datetime("2024-01-01 00:00:00")]

del df
del df_pca
del m

In [6]:
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

In [7]:
execfile('/content/drive/MyDrive/Thesis/models_training/model_training.py')

Using device: cpu


# GNN Variance Embedding

In [None]:
FEATURES = ['start_station_cluster', 'started_at_year',
            'started_at_month', 'started_at_day',
            'started_at_hour',
            'started_at_week',
            'started_at_quarter',
            'started_at_dayofweek',
            'is_holiday',
            #'flag_added',
            "temp",
            "dwpt",
            "rhum",
            "prcp",
            "wdir",
            "wspd",
            "pres",
            "coco",
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            "temp_lag_1_h",
            "temp_lag_2_h",
            "temp_lag_24_h",
            "prcp_lag_1_h",
            "prcp_lag_2_h",
            "prcp_lag_24_h",
            "rhum_lag_1_h",
            "rhum_lag_2_h",
            "rhum_lag_24_h",
            "wspd_lag_1_h",
            "wspd_lag_2_h",
            "wspd_lag_24_h",
            # "coco_lag_1_h",
            # "coco_lag_2_h",
            # "coco_lag_24_h",
            "mean_gnn_cluster_demand_1h",
            "total_gnn_cluster_demand_1h",
            "total_demand_1h",
            "demand_degrees_1h",
            "returns"
]
FEATURES.extend([f"dim_mean_pca_{i}" for i in range(15)])

## XGBoost

In [24]:
model, model_preds = train_model(df_train, df_test, FEATURES, model='xgb')

overall
MSE: 3.5222329685487885
RMSE: 1.876761297701119
MAE: 0.9917181586341258
MAPE: 0.39208096407077486

Non-zero
MSE: 7.195642903461277
RMSE: 2.6824695531284743
MAE: 1.6860098611609182
MAPE: 0.3529895406249029

Zeros
MSE: 0.5187370469175498
RMSE: 0.720234022327153
MAE: 0.42404334588244
MAPE: 0.42404334588244


In [None]:
#11-13
3.5379750563532233
1.8809505725438995
0.3987008225026617

#11-28
3.5330230608451085
1.8796337571040558
0.39265990380160887

0.39265990380160887

In [None]:
suf = "_n_xgb_01_11"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

## LightGBM

In [None]:
model, model_preds = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.799340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7529
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 52
[LightGBM] [Info] Start training from score 1.005143
overall
MSE: 3.604162432399839
RMSE: 1.898463176466649
MAE: 1.0053561979837935
MAPE: 0.39688443035904

Non-zero
MSE: 7.364610213404925
RMSE: 2.7137815338388838
MAE: 1.7087870999010193
MAPE: 0.356127128054372

Zeros
MSE: 0.5295016202721359
RMSE: 0.7276686198209567
MAE: 0.4302088873125976
MAPE: 0.4302088873125976


In [None]:
suf = "_n_lgbm_01_11"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

## Random Forest

In [None]:
model, model_preds = train_model(df_train, df_test, FEATURES, model='rf')

In [None]:
#11_01
3.6042873699256828
1.8984960810930538
0.4084309933647605

#11_28

In [None]:
suf = "_n_rf_01_11"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model