In [None]:
# Colab-only setup: mount Google Drive at /content/drive so the Thesis data and
# model folders referenced throughout this notebook are reachable.
# force_remount=True requests a fresh mount even if the drive is already attached.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
!unzip /content/drive/MyDrive/Thesis/Data/compressed_data.zip
!unzip /content/drive/MyDrive/Thesis/Data/cosine_similarity.zip

Archive:  /content/drive/MyDrive/Thesis/Data/compressed_data.zip
  inflating: demand_graphs.pkl.npz   
  inflating: final_model_input_partial_scale_4.csv  
Archive:  /content/drive/MyDrive/Thesis/Data/cosine_similarity.zip
  inflating: cosine_similarity.csv   


In [None]:
!unzip /content/drive/MyDrive/Thesis/Data/returns.zip

Archive:  /content/drive/MyDrive/Thesis/Data/returns.zip
  inflating: demand_model_o_xgb_01_11.sav  
  inflating: test_predictions_o_xgb_01_11.csv  


In [None]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import joblib

from sklearn.decomposition import PCA
# check xgboost version
from xgboost import XGBRegressor


#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor
import holidays

# Widen pandas' console display limits (rows, columns, line width) in one call;
# set_option accepts alternating option-name / value pairs.
pd.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

# Project layout on the mounted Google Drive; sub-folders hang off `directory`.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:

# Execute the shared training script from Drive (it prints the torch device it uses).
# Presumably this is where train_model / test_given_model, used in later cells,
# come from — they are not defined anywhere in this notebook; confirm in the script.
# NOTE(review): execfile is not a Python 3 builtin; this call relies on the
# runtime providing it. `%run -i <path>` is the portable IPython equivalent to
# consider — verify namespace behaviour before switching.
execfile('/content/drive/MyDrive/Thesis/models_training/model_training.py')

Using device: cpu


In [None]:
# Load the previously saved target scaler from the models folder.
# joblib.load unpickles the file, so only load artifacts produced by this project.
# NOTE(review): target_scaler is not referenced again in the visible cells;
# presumably helpers from model_training.py read it as a global — confirm.
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:

#################################
## Cosine Similarity
#################################

# Join keys identifying one (hour, station cluster) observation.
KEY_COLS = ['started_at_hourly', 'start_station_cluster']

# Split boundaries: rows before TEST_START train the model, rows from
# TEST_START up to (not including) HOLD_OUT_START form the test split, and rows
# from HOLD_OUT_START onwards form the hold-out split.
TEST_START = pd.to_datetime("2024-01-01 00:00:00")
HOLD_OUT_START = pd.to_datetime("2024-03-25 00:00:00")

cosine_similarity = pd.read_csv('cosine_similarity.csv').drop(columns=['Unnamed: 0'])
cols = [f"cosine_sim_{i}" for i in range(183)]
# float32 halves the memory footprint of the similarity columns.
cosine_similarity[cols] = cosine_similarity[cols].astype('float32')
cosine_similarity['started_at_hourly'] = pd.to_datetime(cosine_similarity['started_at_hourly'])

#################################
## PCA
#################################

PCA_DIM = 1

pca_test = cosine_similarity[cosine_similarity['started_at_hourly'] >= TEST_START]
pca_train = cosine_similarity[cosine_similarity['started_at_hourly'] < TEST_START]

del cosine_similarity

# Keep the join keys aside; they still carry the original CSV row labels.
pca_train_time_clusters = pca_train[KEY_COLS]
pca_test_time_clusters = pca_test[KEY_COLS]

# Re-bind instead of drop(inplace=True): the frames above are filtered slices,
# and mutating a slice in place triggers SettingWithCopyWarning.
pca_train = pca_train.drop(columns=KEY_COLS)
pca_test = pca_test.drop(columns=KEY_COLS)

pca = PCA(n_components=PCA_DIM, svd_solver='arpack')

# The projection is fitted on the training period only and then applied to the
# later rows, so no information from the test/hold-out period leaks into it.
m = pca.fit_transform(pca_train[cols])
n = pca.transform(pca_test[cols])
del pca_train

pca_cols = [f"cosine_sim_pca_{i}" for i in range(PCA_DIM)]

# FIX: build the PCA frames on the same row labels as their key frames.
# pandas aligns column assignment on the index; with a default RangeIndex the
# keys assigned below were matched to the wrong rows (or became NaT/NaN), and
# the later left-merge + fillna(0) silently zeroed the PCA feature.
df_train_pca = pd.DataFrame(m, columns=pca_cols, index=pca_train_time_clusters.index)
df_test_pca = pd.DataFrame(n, columns=pca_cols, index=pca_test_time_clusters.index)

del m
del n

df_train_pca[KEY_COLS] = pca_train_time_clusters[KEY_COLS]
df_test_pca[KEY_COLS] = pca_test_time_clusters[KEY_COLS]

# Rows were selected by comparing started_at_hourly, so a missing timestamp
# here can only mean the keys were attached to the wrong rows.
assert df_train_pca['started_at_hourly'].notna().all(), "train PCA keys misaligned"
assert df_test_pca['started_at_hourly'].notna().all(), "test PCA keys misaligned"

#################################
#################################

df = pd.read_csv('final_model_input_partial_scale_4.csv')

suf = "_normal_xgb_01_11"

# Predicted returns are keyed by end cluster in the file; rename so they join
# onto the demand rows by start cluster.
returns = pd.read_csv(f"{models_dir}/test_returns_predictions{suf}.csv").rename(
    columns={
        "end_station_cluster": "start_station_cluster",
        "pred": "returns"
        }
    )[['started_at_hourly', 'returns', 'start_station_cluster']]

#################################
#################################

# NOTE(review): at this point started_at_hourly is still the raw CSV string in
# both frames, so the two files must format timestamps identically to match.
# Rows without a returns prediction get 0 (fillna applies to every column).
df = df.merge(returns, on=KEY_COLS, how='left').fillna(0)

df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])

df_test = df[df['started_at_hourly'] >= TEST_START]
df_train = df[df['started_at_hourly'] < TEST_START]

del df
del returns

df_train = df_train.merge(df_train_pca, on=KEY_COLS, how='left').fillna(0)

df_test = df_test.merge(df_test_pca, on=KEY_COLS, how='left').fillna(0)

# Carve the hold-out period off the end of the test period.
df_hold_out = df_test[df_test['started_at_hourly'] >= HOLD_OUT_START]
df_test = df_test[df_test['started_at_hourly'] < HOLD_OUT_START]
# df_train_pca / df_test_pca are intentionally left in memory (deletes disabled).
#del df_train_pca
#del df_test_pca


#################################
#################################

# GNN Variance Embedding

In [None]:
# Ordered list of model input columns. The order is kept exactly as before,
# because the feature frame handed to the models is selected with this list.
FEATURES = [
    'start_station_cluster',
    # Calendar parts of the start hour ('started_at_year' is currently disabled).
    *(f"started_at_{part}"
      for part in ("month", "day", "hour", "week", "quarter", "dayofweek")),
    'is_holiday',  # 'flag_added' is currently disabled
    # Un-lagged measurement columns (presumably weather readings).
    "temp", "dwpt", "rhum", "prcp", "wdir", "wspd", "pres", "coco",
    # 1 h, 2 h and 24 h lags, grouped per signal ("coco" lags are currently disabled).
    *(f"{signal}_lag_{hours}_h"
      for signal in ("demand", "temp", "prcp", "rhum", "wspd")
      for hours in (1, 2, 24)),
    "mean_gnn_cluster_demand_1h",
    "total_gnn_cluster_demand_1h",
    "total_demand_1h",
    "demand_degrees_1h",
    "returns",
    # One column per PCA component of the cosine-similarity block
    # (the 50 "dim_mean_*" columns are currently disabled).
    *(f"cosine_sim_pca_{i}" for i in range(PCA_DIM)),
]

## XGBoost

In [None]:
# Train with model='xgb' on df_train and evaluate on df_test; the call prints
# overall / non-zero / zero-demand error metrics (see the output below).
# train_model is not defined in this notebook — presumably it comes from the
# model_training.py script executed earlier; confirm its return contract there.
# model_preds is written to CSV and model is persisted in a later cell;
# preds_list is not used in the visible cells.
model, model_preds, preds_list = train_model(df_train, df_test, FEATURES, model='xgb')

overall
MSE: 3.4385015032398725
RMSE: 1.8543196874433145
MAE: 0.9873230419200121
MAPE: 0.3942077604799864
### Standard Deviation
MSE: 0.0
RMSE: 2.340555645717801e-16
MAE: 0.0
MAPE: 5.851389114294502e-17
### Variance
MSE: 0.0
RMSE: 5.478200730701471e-32
MAE: 0.0
MAPE: 3.4238754566884194e-33

Non-zero
MSE: 7.019512216780733
RMSE: 2.6494362073431263
MAE: 1.6766834387513974
MAPE: 0.3535476897758064
### Standard Deviation
MSE: 0.0
RMSE: 4.681111291435602e-16
MAE: 2.340555645717801e-16
MAPE: 5.851389114294502e-17
### Variance
MSE: 0.0
RMSE: 2.1912802922805884e-31
MAE: 5.478200730701471e-32
MAPE: 3.4238754566884194e-33

#####
Zeros
MSE: 0.5290595469456003
RMSE: 0.7273647963337242
MAE: 0.4272426003941237
MAPE: 0.4272426003941237
### Standard Deviation
MSE: 1.1702778228589004e-16
RMSE: 1.1702778228589004e-16
MAE: 5.851389114294502e-17
MAPE: 5.851389114294502e-17
### Variance
MSE: 1.3695501826753678e-32
RMSE: 1.3695501826753678e-32
MAE: 3.4238754566884194e-33
MAPE: 3.4238754566884194e-33


In [None]:
# Score the already-fitted model on the hold-out split (rows from 2024-03-25 on);
# the call prints the same overall / non-zero / zeros metric groups.
# test_given_model is not defined in this notebook — presumably provided by
# model_training.py; confirm there.
preds_holdout = test_given_model(df_hold_out, model, FEATURES)

overall
MSE: 3.9522885862355226
RMSE: 1.9880363644147767
MAE: 1.0621618331080735
MAPE: 0.4054599371932882

Non-zero
MSE: 7.793040025395449
RMSE: 2.7916016953346783
MAE: 1.7526389049439484
MAPE: 0.3512115785649531

#####
Zeros
MSE: 0.565498436929647
RMSE: 0.7519963011409344
MAE: 0.45329635918748096
MAPE: 0.45329635918748096


In [None]:
# Number of rows in the test split (displayed as the cell's output).
df_test.shape[0]

In [None]:
# Disabled experiment: per-row local explanations aggregated into a mean
# feature-importance ranking. The explain_instance/as_list calls look like the
# LIME tabular API — confirm before reviving.
# NOTE(review): `explainer` is not defined in the visible notebook, and
# `.loc[i]` with a positional counter will not match df_test's row labels after
# the date filtering above; use `.iloc[i]` if this is re-enabled.
#aggregated_importance = []
#
#for i in range(df_test.shape[0]):
#    row = df_test[FEATURES].loc[i].values#.reshape(1, -1)
#    explanation = explainer.explain_instance(
#        data_row=row,
#        predict_fn=model.predict  # Prediction function of the model
#    )
#    # Extract feature importance
#    feature_importance = dict(explanation.as_list())
#    aggregated_importance.append(feature_importance)
#
## Aggregate feature importance
#importance_df = pd.DataFrame(aggregated_importance).fillna(0)  # Handle missing features
#mean_importance = importance_df.mean().sort_values(ascending=False)

In [None]:
# Manual log of earlier XGBoost runs, kept as bare numeric expressions.
# Each entry lists three values: the first is the MSE and the second its square
# root (RMSE); the third is presumably the overall MAPE — confirm.
# The "#MM-DD (k dim)" tags presumably give the run date and the number of
# embedding/PCA dimensions used — confirm.
#11-15
3.519954687632939
1.8761542281041126
0.39561652953215365


#11-28 (20 dim)
3.524785445848623
1.877441196375701
0.39610593754659995

#11-29 (no dim)
3.563436345796625
1.887706636582238
0.4006454580255268

#11-29 (5 dim)
3.504096872035053
1.8719233082674762
0.39420467949842114

#11-29 (6 dim)
3.524516281760969
1.8773695112473114
0.39833392477077356

#11-29 (1 dim)
3.4851078325990033
1.8668443514655964
0.39848029011836106

In [None]:
# Suffix tagging every artifact written by this XGBoost run.
suf = "_o_xgb_02_25_holdout"

# Persist the test and hold-out predictions next to the saved models.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
preds_holdout.to_csv(f"{models_dir}/holdout_predictions{suf}.csv")

# Serialize the fitted model itself.
filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)

# Release the large objects before the next model family is trained.
del model_preds, preds_holdout, model


## LightGBM

In [None]:
# The two extend calls stay disabled: FEATURES already contains the
# cosine_sim_pca_* columns from the feature-list cell, so re-enabling the second
# one would add them twice.
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

# Same train/test frames and feature list as the XGBoost run, with model='lgbm'.
# train_model is presumably provided by model_training.py — confirm there.
model, model_preds, preds_list = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.750807 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3954
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 37
[LightGBM] [Info] Start training from score 1.005143
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.169623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3944
[LightGBM] [Info] Number of data points in the train set: 1140300, number of used features: 37
[LightGBM] [Info] Start training from score 1.005143
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.186575 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

In [None]:
# Manual log of earlier LightGBM runs, kept as bare numeric expressions.
# Per entry: MSE, its square root (RMSE), and a third value that is presumably
# the overall MAPE — confirm. The trailing value repeats the last entry's third
# number and is what the cell displays.
#11-28
3.622206095204831
1.9032094196921239
0.3979671041602479

#11-29
3.5923026180976714
1.895337072422125
0.3977005786689495

0.3977005786689495

In [None]:
# Suffix tagging every artifact written by this LightGBM run.
suf = "_o_lgbm_01_27"

# Persist the test predictions, then serialize the fitted model.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)

# Release the large objects before the next model family is trained.
del model_preds, model

## Random Forest

In [None]:
# The two extend calls stay disabled: FEATURES already contains the
# cosine_sim_pca_* columns from the feature-list cell, so re-enabling the second
# one would add them twice.
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

# Same train/test frames and feature list as the other runs, with model='rf'.
# train_model is presumably provided by model_training.py — confirm there.
model, model_preds, preds_list = train_model(df_train, df_test, FEATURES, model='rf')

In [None]:
# Manual log of earlier Random Forest runs, kept as bare numeric expressions.
# Per entry: MSE, its square root (RMSE), and a third value that is presumably
# the overall MAPE — confirm.
#11_01
3.6042873699256828
1.8984960810930538
0.4084309933647605

#11_28
3.7858322010061154
1.9457215116778956
0.44009657232024235

In [None]:
# Suffix tagging every artifact written by this Random Forest run.
suf = "_o_rf_01_27"

# Persist the test predictions, then serialize the fitted model.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)

# Release the large objects now that everything is on disk.
del model_preds, model