In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!unzip /content/drive/MyDrive/Thesis/Data/compressed_data.zip


Archive:  /content/drive/MyDrive/Thesis/Data/compressed_data.zip
  inflating: demand_graphs.pkl.npz   
  inflating: final_model_input_partial_scale_4.csv  


In [3]:
!unzip /content/drive/MyDrive/Thesis/Data/cosine_similarity.zip

Archive:  /content/drive/MyDrive/Thesis/Data/cosine_similarity.zip
  inflating: cosine_similarity.csv   


In [4]:
!unzip /content/drive/MyDrive/Thesis/Data/returns.zip

Archive:  /content/drive/MyDrive/Thesis/Data/returns.zip
  inflating: demand_model_o_xgb_01_11.sav  
  inflating: test_predictions_o_xgb_01_11.csv  


In [5]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error

import joblib

from sklearn.decomposition import PCA
# check xgboost version
from xgboost import XGBRegressor


#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor
import holidays

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

directory = '/content/drive/MyDrive/Thesis'
data_dir = directory + "/Data"
models_dir = directory + "/models"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [6]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [7]:

execfile('/content/drive/MyDrive/Thesis/models_training/model_training.py')

Using device: cpu


In [8]:
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [9]:
@scope.define
def to_int(x):
    """
    Custom Hyperopt scope function to cast a value to an integer.

    :param x: Value to be cast to an integer.
    :type x: Any
    :return: Integer representation of the input value.
    :rtype: int
    """
    return int(x)


In [10]:

#################################
## Cosine Similarity
#################################

cosine_similarity = pd.read_csv(f'cosine_similarity.csv').drop(columns=['Unnamed: 0'])
cols = [f"cosine_sim_{i}" for i in range(183)]
cosine_similarity[cols] = cosine_similarity[cols].astype('float32')
cosine_similarity['started_at_hourly'] = pd.to_datetime(cosine_similarity['started_at_hourly'])

#################################
## PCA
#################################

PCA_DIM = 1

pca_test = cosine_similarity[cosine_similarity['started_at_hourly'] >= pd.to_datetime("2024-01-01 00:00:00")]
pca_train = cosine_similarity[cosine_similarity['started_at_hourly'] < pd.to_datetime("2024-01-01 00:00:00")]

del cosine_similarity

pca_train_time_clusters = pca_train[['started_at_hourly', 'start_station_cluster']]
pca_test_time_clusters = pca_test[['started_at_hourly', 'start_station_cluster']]

pca_train.drop(columns=['started_at_hourly', 'start_station_cluster'], inplace=True)
pca_test.drop(columns=['started_at_hourly', 'start_station_cluster'], inplace=True)

pca = PCA(n_components=PCA_DIM, svd_solver='arpack')

m = pca.fit_transform(pca_train[cols])
n = pca.transform(pca_test[cols])
del pca_train

n.shape


df_train_pca = pd.DataFrame(m, columns=[f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])
df_test_pca = pd.DataFrame(n, columns=[f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

del m
del n

df_train_pca[['started_at_hourly', 'start_station_cluster']] = pca_train_time_clusters[['started_at_hourly', 'start_station_cluster']]
df_test_pca[['started_at_hourly', 'start_station_cluster']] = pca_test_time_clusters[['started_at_hourly', 'start_station_cluster']]

#################################
#################################

df = pd.read_csv(f'final_model_input_partial_scale_4.csv')

suf = "_normal_xgb_01_11"

returns = pd.read_csv(f"{models_dir}/test_returns_predictions{suf}.csv").rename(
    columns={
        "end_station_cluster": "start_station_cluster",
        "pred": "returns"
        }
    )[['started_at_hourly', 'returns', 'start_station_cluster']]

filtered_clusters = [
    35.0,
    42.0,
    53.0,
    62.0,
    65.0,
    69.0,
    71.0,
    79.0,
    81.0,
    86.0,
    88.0,
    91.0,
    94.0,
    99.0,
    105.0,
    122.0,
    145.0,
    154.0
    ]
df = df[~df['start_station_cluster'].isin(filtered_clusters)]

#################################
#################################

df = df.merge(returns, on=['started_at_hourly', 'start_station_cluster'], how='left').fillna(0)

df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])
df_test = df[df['started_at_hourly'] >= pd.to_datetime("2024-01-01 00:00:00")]
df_train = df[df['started_at_hourly'] < pd.to_datetime("2024-01-01 00:00:00")]

del df
del returns

df_train = df_train.merge(df_train_pca, on=[
    'started_at_hourly', 'start_station_cluster'
    ], how='left').fillna(0)

df_test = df_test.merge(df_test_pca, on=[
    'started_at_hourly', 'start_station_cluster'
    ], how='left').fillna(0)

df_hold_out = df_test[df_test['started_at_hourly'] >= pd.to_datetime("2024-03-25 00:00:00")]
df_test = df_test[df_test['started_at_hourly'] < pd.to_datetime("2024-03-25 00:00:00")]

#del df_train_pca
#del df_test_pca


#################################
#################################

# GNN Variance Embedding

In [11]:
#started_at_month, started_at_day, started_at_week,
# started_at_quarter, is_holiday, wspd

FEATURES = ['start_station_cluster',# 'started_at_year',
            #'started_at_month',
            #'started_at_day',
            'started_at_hour',
            #'started_at_week',
            #'started_at_quarter',
            'started_at_dayofweek',
            #'is_holiday',
            #'flag_added',
            "temp",
            "dwpt",
            "rhum",
            "prcp",
            "wdir",
            #"wspd",
            "pres",
            "coco",
            "demand_lag_1_h",
            "demand_lag_2_h",
            "demand_lag_24_h",
            "temp_lag_1_h",
            "temp_lag_2_h",
            "temp_lag_24_h",
            "prcp_lag_1_h",
            "prcp_lag_2_h",
            "prcp_lag_24_h",
            "rhum_lag_1_h",
            "rhum_lag_2_h",
            "rhum_lag_24_h",
            "wspd_lag_1_h",
            "wspd_lag_2_h",
            "wspd_lag_24_h",
            "mean_gnn_cluster_demand_1h",
            "total_gnn_cluster_demand_1h",
            "total_demand_1h",
            "demand_degrees_1h",
            "returns"
]
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

## XGBoost

In [13]:
started_at_day = df_test['started_at_day']
started_at_month = df_test['started_at_month']
started_at_week = df_test['started_at_week']
started_at_year = df_test['started_at_year']
started_at_hour = df_test['started_at_hour']
started_at_quarter = df_test['started_at_quarter']
started_at_dayofweek = df_test['started_at_dayofweek']
model, model_preds, preds_list = train_model(df_train, df_test, FEATURES, model='xgb', tuning=False)
model_preds['started_at_day'] = started_at_day
model_preds['started_at_month'] = started_at_month
model_preds['started_at_week'] = started_at_week
model_preds['started_at_year'] = started_at_year
model_preds['started_at_hour'] = started_at_hour
model_preds['started_at_quarter'] = started_at_quarter
model_preds['started_at_dayofweek'] = started_at_dayofweek

overall
MSE: 1.6052396673885305
RMSE: 1.2669805315743923
MAE: 0.7364119261303899
MAPE: 0.37396028556060024
### Standard Deviation
MSE: 4.681111291435602e-16
RMSE: 2.340555645717801e-16
MAE: 1.1702778228589004e-16
MAPE: 5.851389114294502e-17
### Variance
MSE: 2.1912802922805884e-31
RMSE: 5.478200730701471e-32
MAE: 1.3695501826753678e-32
MAPE: 3.4238754566884194e-33

Non-zero
MSE: 3.4277172212593583
RMSE: 1.851409522839115
MAE: 1.2673945447092603
MAPE: 0.34075140078431965
### Standard Deviation
MSE: 4.681111291435602e-16
RMSE: 2.340555645717801e-16
MAE: 0.0
MAPE: 5.851389114294502e-17
### Variance
MSE: 2.1912802922805884e-31
RMSE: 5.478200730701471e-32
MAE: 0.0
MAPE: 3.4238754566884194e-33

#####
Zeros
MSE: 0.43443157231113005
RMSE: 0.6591142331274071
MAE: 0.39529455716355116
MAPE: 0.39529455716355116
### Standard Deviation
MSE: 0.0
RMSE: 1.1702778228589004e-16
MAE: 0.0
MAPE: 0.0
### Variance
MSE: 0.0
RMSE: 1.3695501826753678e-32
MAE: 0.0
MAPE: 0.0


In [14]:
started_at_day = df_hold_out['started_at_day']
started_at_month = df_hold_out['started_at_month']
started_at_week = df_hold_out['started_at_week']
started_at_year = df_hold_out['started_at_year']
started_at_hour = df_hold_out['started_at_hour']
started_at_quarter = df_hold_out['started_at_quarter']
started_at_dayofweek = df_hold_out['started_at_dayofweek']
preds_holdout = test_given_model(df_hold_out, model, FEATURES)
preds_holdout['started_at_day'] = started_at_day
preds_holdout['started_at_month'] = started_at_month
preds_holdout['started_at_week'] = started_at_week
preds_holdout['started_at_year'] = started_at_year
preds_holdout['started_at_hour'] = started_at_hour
preds_holdout['started_at_quarter'] = started_at_quarter
preds_holdout['started_at_dayofweek'] = started_at_dayofweek

overall
MSE: 1.866301480198346
RMSE: 1.3661264510279953
MAE: 0.7937983631654444
MAPE: 0.38955512810388687

Non-zero
MSE: 3.8399491393229668
RMSE: 1.9595788168182893
MAE: 1.321094071254138
MAPE: 0.3382962036341432

#####
Zeros
MSE: 0.48728959489359625
RMSE: 0.6980613117009109
MAE: 0.42537036885616325
MAPE: 0.42537036885616325


In [None]:

# overall
# MSE: 1.6091632690662536
# RMSE: 1.2685279930164148
# MAE: 0.7361779641989485
# MAPE: 0.3715039931210286
# Non-zero
# MSE: 3.4297751772131315
# RMSE: 1.851965220303322
# MAE: 1.269109121830633
# MAPE: 0.3404813521942264
# Zeros
# MSE: 0.431886378596269
# RMSE: 0.6571806285917662
# MAE: 0.3915644124125004
# MAPE: 0.3915644124125004

In [None]:
# remove started_at_month, started_at_day, started_at_week,
# started_at_quarter, is_holiday, wspd
# MSE: 1.6111181531005658
# RMSE: 1.2692982916165
# MAE: 0.7368968664689701
# MAPE: 0.3723765984434761
# Non-zero
# MSE: 3.431842588001555
# RMSE: 1.8525233029577672
# MAE: 1.2680101498075016
# MAPE: 0.3397737788835909
# Zeros
# MSE: 0.43376849855242555
# RMSE: 0.6586110373751912
# MAE: 0.393458821327036
# MAPE: 0.393458821327036


# overall
# MSE: 1.6294893097682621
# RMSE: 1.276514516082078
# MAE: 0.7408827790644698
# MAPE: 0.37533044059343734
# Non-zero
# MSE: 3.4627153298287077
# RMSE: 1.8608372658103953
# MAE: 1.2726231833224586
# MAPE: 0.34175868603922716
# Zeros
# MSE: 0.4440556548702337
# RMSE: 0.6663750106886014
# MAE: 0.397039213689089
# MAPE: 0.397039213689089

# Remove Month
# overall
# MSE: 1.6287163715084982
# RMSE: 1.2762117267555952
# MAE: 0.7416295395139874
# MAPE: 0.3762386808549814
# Non-zero
# MSE: 3.4556012433987138
# RMSE: 1.858924754636054
# MAE: 1.2723808700703776
# MAPE: 0.34192757470879753
# Zeros
# MSE: 0.44738314414609787
# RMSE: 0.6688670601443144
# MAE: 0.39842554680461706
# MAPE: 0.39842554680461706

#Remove Month and day
# MSE: 1.622845100710954
# RMSE: 1.273909376961703
# MAE: 0.739609583012184
# MAPE: 0.3740044328553058
# Non-zero
# MSE: 3.4528571644218076
# RMSE: 1.8581865257346495
# MAE: 1.2728596588723815
# MAPE: 0.3418606787306867
# Zeros
# MSE: 0.4394897122026063
# RMSE: 0.6629402025843706
# MAE: 0.39478980657198165
# MAPE: 0.39478980657198165

#Remove Holiday
# overall
# MSE: 1.6161437089286634
# RMSE: 1.2712764093338094
# MAE: 0.7380665373691346
# MAPE: 0.37320572690397497
# Non-zero
# MSE: 3.436946315998566
# RMSE: 1.8539002982896804
# MAE: 1.269840426478804
# MAPE: 0.3407368777706423
# Zeros
# MSE: 0.4387435052835275
# RMSE: 0.6623771624109088
# MAE: 0.394201319415361
# MAPE: 0.394201319415361

In [None]:
1.6252472891786742
1.2748518695043256
0.37497985772931636

# Removing month, technically better but maybe because of the limited test data to three months
1.6200344008372791
1.2728057199892209
0.3743123330000771

# Removing dwpt
1.622011464474316
1.273582138880063
0.373634900250201


0.373634900250201

In [15]:
suf = "_o_xgb_02_25_holdout_best"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

preds_holdout.to_csv(f"{models_dir}/holdout_predictions{suf}.csv")
del preds_holdout


filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model


## LightGBM

In [None]:
model, model_preds = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.145786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3741
[LightGBM] [Info] Number of data points in the train set: 1008560, number of used features: 35
[LightGBM] [Info] Start training from score 1.002970
overall
MSE: 1.6523804453712883
RMSE: 1.2854495110160058
MAE: 0.7469085035765901
MAPE: 0.3766677858565356

Non-zero
MSE: 3.5306674156607283
RMSE: 1.8790070291674612
MAE: 1.284205979459194
MAPE: 0.34140271080612744

Zeros
MSE: 0.4378086665656721
RMSE: 0.6616711166173661
MAE: 0.39947152437530264
MAPE: 0.39947152437530264


In [None]:
#11-28
3.622206095204831
1.9032094196921239
0.3979671041602479

#11-29
3.5923026180976714
1.895337072422125
0.3977005786689495

0.3977005786689495

In [None]:
suf = "_o_lgbm_11_29_best"


model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
del model_preds

filename = f'{models_dir}/demand_model{suf}.sav'
joblib.dump(model, filename)
del model

## Random Forest

In [None]:
model, model_preds = train_model(df_train, df_test, FEATURES, model='rf')

In [None]:
#11_01
3.6042873699256828
1.8984960810930538
0.4084309933647605

#11_28
3.7858322010061154
1.9457215116778956
0.44009657232024235

0.44009657232024235

In [None]:
#suf = "_o_rf_11_29"
#
#
#model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
#del model_preds
#
#filename = f'{models_dir}/demand_model{suf}.sav'
#joblib.dump(model, filename)
#del model