In [1]:
# Mount Google Drive at /content/drive; the data and model paths used below
# live under this mount point.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
import math
import torch
import pickle
import numpy as np
import pandas as pd
#import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from datetime import datetime, timedelta
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, mean_absolute_error
import joblib

from sklearn.decomposition import PCA
# check xgboost version
from xgboost import XGBRegressor


#torch stuff
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

#from torch_geometric.utils import dense_to_sparse


import lightgbm as lgb
from lightgbm import LGBMRegressor
import holidays

# Let pandas print wide/long frames without truncating them.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Project root on the mounted Drive and the sub-folders read/written below.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [3]:
from hyperopt.pyll.base import scope
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [4]:

# Run the shared training script inside this notebook's global namespace so the
# names it defines become available to the cells below (presumably including
# `train_model`, which is called later but not defined in this notebook).
#
# `execfile` was removed from the language in Python 3 and only resolves when
# the hosting environment happens to inject it, so the documented replacement
# (read + compile + exec) is used instead. Compiling with the real path keeps
# tracebacks pointing at the script file.
_training_script = f'{directory}/models_training/model_training.py'
with open(_training_script, encoding='utf-8') as _script_file:
    exec(compile(_script_file.read(), _training_script, 'exec'), globals())

Using device: cpu


In [5]:
# Load the target scaler persisted with joblib in the models directory.
# NOTE(review): not referenced directly in the cells shown here — presumably it
# is read as a global by the code loaded from model_training.py; confirm.
target_scaler = joblib.load(f'{models_dir}/target_scaler.sav')

In [6]:
@scope.define
def to_int(x):
    """
    Hyperopt scope helper (registered through ``@scope.define``) that coerces
    a value to ``int``.

    :param x: Value to convert; anything the built-in ``int`` accepts.
    :type x: Any
    :return: The result of ``int(x)``.
    :rtype: int
    """
    as_integer = int(x)
    return as_integer


In [7]:

#################################
## Cosine Similarity
#################################

# Cosine-similarity features per (hour, start cluster); the CSV's stray
# 'Unnamed: 0' index column is dropped on load.
cosine_similarity = pd.read_csv(f'{data_dir}/cosine_similarity.csv').drop(columns=['Unnamed: 0'])
# 183 similarity columns — presumably one per station cluster; TODO confirm.
cols = [f"cosine_sim_{i}" for i in range(183)]
# Down-cast the similarity columns to float32 (presumably to reduce memory).
cosine_similarity[cols] = cosine_similarity[cols].astype('float32')
cosine_similarity['started_at_hourly'] = pd.to_datetime(cosine_similarity['started_at_hourly'])

#################################
## PCA
#################################

# Number of principal components kept from the similarity columns.
PCA_DIM = 1

# Chronological split: rows from 2024-01-01 onwards form the test period.
pca_test = cosine_similarity[cosine_similarity['started_at_hourly'] >= pd.to_datetime("2024-01-01 00:00:00")]
pca_train = cosine_similarity[cosine_similarity['started_at_hourly'] < pd.to_datetime("2024-01-01 00:00:00")]

del cosine_similarity

# Keep the join keys aside; they are re-attached to the PCA output below.
pca_train_time_clusters = pca_train[['started_at_hourly', 'start_station_cluster']]
pca_test_time_clusters = pca_test[['started_at_hourly', 'start_station_cluster']]

# Non-inplace drop: pca_train / pca_test are slices of another frame, and an
# inplace drop on a slice raises SettingWithCopyWarning.
pca_train = pca_train.drop(columns=['started_at_hourly', 'start_station_cluster'])
pca_test = pca_test.drop(columns=['started_at_hourly', 'start_station_cluster'])

# Fit on the training period only, then project the test period with the same
# components so no test information enters the fit.
pca = PCA(n_components=PCA_DIM, svd_solver='arpack')

m = pca.fit_transform(pca_train[cols])
n = pca.transform(pca_test[cols])
del pca_train

df_train_pca = pd.DataFrame(m, columns=[f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])
df_test_pca = pd.DataFrame(n, columns=[f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

del m
del n

# Re-attach the join keys POSITIONALLY. The PCA frames have a fresh 0..n-1
# index while the key frames still carry the row labels of the original CSV;
# pandas aligns on index when assigning columns, so without reset_index the
# keys would be NaN or shifted wherever those labels are not exactly 0..n-1
# (always the case for the test slice unless it starts at row 0). The later
# left-merge + fillna(0) would then silently zero out the PCA feature.
df_train_pca[['started_at_hourly', 'start_station_cluster']] = (
    pca_train_time_clusters[['started_at_hourly', 'start_station_cluster']].reset_index(drop=True)
)
df_test_pca[['started_at_hourly', 'start_station_cluster']] = (
    pca_test_time_clusters[['started_at_hourly', 'start_station_cluster']].reset_index(drop=True)
)

#################################
#################################

# Tag of the returns-model run whose test predictions are joined in as the
# "returns" feature.
suf = "_normal_xgb_11_08"

# Station clusters kept for this experiment (float ids, as in the original list).
filtered_clusters = [
    float(cluster_id)
    for cluster_id in (
        35, 42, 53, 62, 65, 69, 71, 79, 81,
        86, 88, 91, 94, 99, 105, 122, 145, 154,
    )
]

# Model input, restricted to the selected clusters.
df = pd.read_csv(f'{data_dir}/final_model_input_partial_scale_4.csv')
df = df.loc[df['start_station_cluster'].isin(filtered_clusters)]

# Predicted returns, re-keyed from end cluster to start cluster so they can be
# joined onto the demand rows.
returns = pd.read_csv(f"{models_dir}/test_returns_predictions{suf}.csv")
returns = returns.rename(
    columns={"end_station_cluster": "start_station_cluster", "pred": "returns"}
)
returns = returns[['started_at_hourly', 'returns', 'start_station_cluster']]

#################################
#################################

# Left join keeps every demand row; fillna(0) then zero-fills EVERY column that
# still has missing values, including "returns" for rows without a prediction.
df = pd.merge(
    df,
    returns,
    how='left',
    on=['started_at_hourly', 'start_station_cluster'],
).fillna(0)

df['started_at_hourly'] = pd.to_datetime(df['started_at_hourly'])
df = df.sort_values(by=['start_station_cluster', 'started_at_hourly'])

# Chronological split at 2024-01-01 (test = on/after, train = before).
df_test = df.loc[df['started_at_hourly'].ge(pd.to_datetime("2024-01-01 00:00:00"))]
df_train = df.loc[df['started_at_hourly'].lt(pd.to_datetime("2024-01-01 00:00:00"))]

del df, returns

# Attach the PCA-compressed cosine-similarity feature to each split; rows
# without a match get 0 through fillna(0).
df_train = pd.merge(
    df_train,
    df_train_pca,
    how='left',
    on=['started_at_hourly', 'start_station_cluster'],
).fillna(0)

df_test = pd.merge(
    df_test,
    df_test_pca,
    how='left',
    on=['started_at_hourly', 'start_station_cluster'],
).fillna(0)

#del df_train_pca
#del df_test_pca


#################################
#################################

# GNN Variance Embedding

## XGBoost

In [8]:
# FEATURES = ['start_station_cluster', #
#             #'started_at_year',#
#             #'started_at_month',
#             'started_at_day',
#             'started_at_hour',#
#             #'started_at_week',
#             #'started_at_quarter',
#             'started_at_dayofweek',
#             'is_holiday',
#             #'flag_added',
#             "temp",
#             #"dwpt",
#             "rhum",
#             "prcp",
#             "wdir",
#             #"wspd",
#             "pres",
#             "demand_lag_1_h",
#             "demand_lag_2_h",
#             "demand_lag_24_h",
#             "prcp_lag_1_h",
#             "prcp_lag_2_h",
#             "prcp_lag_24_h",
#             "mean_gnn_cluster_demand_1h",
#             "total_gnn_cluster_demand_1h",
#             "total_demand_1h",
#             "demand_degrees_1h",
#             "returns"
# ]

# #FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
# #FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

In [48]:
# Feature set for the XGBoost demand model (groups below follow column names).
# Candidates currently left out: started_at_year, started_at_month,
# started_at_week, started_at_quarter, is_holiday, flag_added, temp, dwpt,
# wdir, wspd, and the temp_/wspd_ lag columns.
FEATURES = [
    # cluster id and calendar fields
    "start_station_cluster",
    "started_at_day",
    "started_at_hour",
    "started_at_dayofweek",
    # same-hour rhum / prcp / pres / coco columns
    "rhum",
    "prcp",
    "pres",
    "coco",
    # 1 h, 2 h and 24 h lags of demand, prcp and rhum (in that order)
    *[
        f"{signal}_lag_{hours}_h"
        for signal in ("demand", "prcp", "rhum")
        for hours in (1, 2, 24)
    ],
    # aggregated demand columns
    "mean_gnn_cluster_demand_1h",
    "total_gnn_cluster_demand_1h",
    "total_demand_1h",
    "demand_degrees_1h",
    # predicted returns joined in from the returns model
    "returns",
]
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

In [49]:
# Calendar columns of the test set, captured before training and attached to
# the prediction frame afterwards.
# NOTE(review): presumably captured up front because train_model may alter
# df_test or omit these columns from its output — confirm.
calendar_columns = [
    'started_at_day',
    'started_at_month',
    'started_at_week',
    'started_at_year',
    'started_at_hour',
    'started_at_quarter',
    'started_at_dayofweek',
]
(
    started_at_day,
    started_at_month,
    started_at_week,
    started_at_year,
    started_at_hour,
    started_at_quarter,
    started_at_dayofweek,
) = (df_test[column] for column in calendar_columns)

# Fit XGBoost without hyper-parameter tuning; train_model prints the metrics.
model, model_preds = train_model(df_train, df_test, FEATURES, model='xgb', tuning=False)

# Re-attach the calendar columns in the same order as they were captured.
for column, values in zip(
    calendar_columns,
    (
        started_at_day,
        started_at_month,
        started_at_week,
        started_at_year,
        started_at_hour,
        started_at_quarter,
        started_at_dayofweek,
    ),
):
    model_preds[column] = values

overall
MSE: 16.080076871885485
RMSE: 4.00999711619416
MAE: 2.698637979669594
MAPE: 0.45012757189971053

Non-zero
MSE: 17.815312049061045
RMSE: 4.220818883707407
MAE: 2.8996487537781332
MAPE: 0.3589064469851769

Zeros
MSE: 2.728713215655714
RMSE: 1.6518817196324058
MAE: 1.152007310095613
MAPE: 1.152007310095613


In [None]:
# overall
# MSE: 16.080076871885485
# RMSE: 4.00999711619416
# MAE: 2.698637979669594
# MAPE: 0.45012757189971053
# Non-zero
# MSE: 17.815312049061045
# RMSE: 4.220818883707407
# MAE: 2.8996487537781332
# MAPE: 0.3589064469851769
# Zeros
# MSE: 2.728713215655714
# RMSE: 1.6518817196324058
# MAE: 1.152007310095613
# MAPE: 1.152007310095613

In [None]:
16.62309778288935
4.077143336073598
0.4559626791820638

16.683375499705036
4.084528797756852
0.4575149102875578

16.508282242728246
4.06303854802391
0.46395306431865185

16.611400070346974
4.075708535990641
0.45878085618772735

16.360039137812965
4.044754521329195
0.4570649011034489

16.29366632392327
4.03654138142089
0.4527781879189763

16.271867749976934
4.033840322816079
0.4521287239350817

16.08825524565287
4.011016734651312
0.4526198632614155

16.083943659451712
4.010479230647094
0.4507647119805387

0.4507647119805387

In [None]:
#11-15
3.519954687632939
1.8761542281041126
0.39561652953215365


#11-28 (20 dim)
3.524785445848623
1.877441196375701
0.39610593754659995

#11-29 (no dim)
3.563436345796625
1.887706636582238
0.4006454580255268

#11-29 (5 dim)
3.504096872035053
1.8719233082674762
0.39420467949842114

#11-29 (6 dim)
3.524516281760969
1.8773695112473114
0.39833392477077356

#11-29 (1 dim)
3.4851078325990033
1.8668443514655964
0.39848029011836106

0.39848029011836106

In [None]:
# Run tag appended to the artefact names of this XGBoost experiment.
suf = "_o_xgb_01_11_worst"
filename = f'{models_dir}/demand_model{suf}.sav'

# Persist the test-set predictions and the fitted model in the models directory.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
joblib.dump(model, filename)

# Drop both objects; the next section rebinds these names.
del model_preds, model

## LightGBM

In [None]:
# Feature set for the LightGBM demand model (groups below follow column names).
# Candidates currently left out: started_at_year, started_at_month,
# started_at_week, started_at_quarter, flag_added, dwpt, wspd.
FEATURES = [
    # cluster id, calendar fields and holiday flag
    "start_station_cluster",
    "started_at_day",
    "started_at_hour",
    "started_at_dayofweek",
    "is_holiday",
    # same-hour temp / rhum / prcp / wdir / pres columns
    "temp",
    "rhum",
    "prcp",
    "wdir",
    "pres",
    # 1 h, 2 h and 24 h lags of demand and prcp (in that order)
    *[
        f"{signal}_lag_{hours}_h"
        for signal in ("demand", "prcp")
        for hours in (1, 2, 24)
    ],
    # aggregated demand columns
    "mean_gnn_cluster_demand_1h",
    "total_gnn_cluster_demand_1h",
    "total_demand_1h",
    "demand_degrees_1h",
    # predicted returns joined in from the returns model
    "returns",
]
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

# Fit the LightGBM variant on the train split and predict the test split.
# train_model is not defined in this notebook (presumably it comes from the
# model_training.py script loaded earlier); it returns the fitted model and a
# predictions frame.
# NOTE(review): `tuning` is left at train_model's default here, whereas the
# XGBoost call passes tuning=False — confirm this is intended.
model, model_preds = train_model(df_train, df_test, FEATURES, model='lgbm')

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.012786 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2088
[LightGBM] [Info] Number of data points in the train set: 131740, number of used features: 21
[LightGBM] [Info] Start training from score 1.021779
16.42132775098073
4.0523237470593
0.46564148816927287


In [None]:
#11-28
3.622206095204831
1.9032094196921239
0.3979671041602479

#11-29
3.5923026180976714
1.895337072422125
0.3977005786689495

0.3977005786689495

In [None]:
# Run tag appended to the artefact names of this LightGBM experiment.
suf = "_o_lgbm_11_29_worst"
filename = f'{models_dir}/demand_model{suf}.sav'

# Persist the test-set predictions and the fitted model in the models directory.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
joblib.dump(model, filename)

# Drop both objects; the next section rebinds these names.
del model_preds, model

## Random Forest

In [None]:
# Feature set for the Random Forest demand model (groups below follow column
# names). Candidates currently left out: started_at_year, started_at_month,
# started_at_week, started_at_quarter, flag_added, dwpt, wspd.
FEATURES = [
    # cluster id, calendar fields and holiday flag
    "start_station_cluster",
    "started_at_day",
    "started_at_hour",
    "started_at_dayofweek",
    "is_holiday",
    # same-hour temp / rhum / prcp / wdir / pres columns
    "temp",
    "rhum",
    "prcp",
    "wdir",
    "pres",
    # 1 h, 2 h and 24 h lags of demand and prcp (in that order)
    *[
        f"{series}_lag_{offset}_h"
        for series in ("demand", "prcp")
        for offset in (1, 2, 24)
    ],
    # aggregated demand columns
    "mean_gnn_cluster_demand_1h",
    "total_gnn_cluster_demand_1h",
    "total_demand_1h",
    "demand_degrees_1h",
    # predicted returns joined in from the returns model
    "returns",
]
#FEATURES.extend([f"dim_mean_{i}" for i in range(50)])
#FEATURES.extend([f"cosine_sim_pca_{i}" for i in range(PCA_DIM)])

# Fit the Random Forest variant on the train split and predict the test split.
# train_model is not defined in this notebook (presumably it comes from the
# model_training.py script loaded earlier); it returns the fitted model and a
# predictions frame.
# NOTE(review): `tuning` is left at train_model's default here, whereas the
# XGBoost call passes tuning=False — confirm this is intended.
model, model_preds = train_model(df_train, df_test, FEATURES, model='rf')

17.855937960927964
4.22562870599488
0.49296428809942744


In [None]:
#11_01
3.6042873699256828
1.8984960810930538
0.4084309933647605

#11_28
3.7858322010061154
1.9457215116778956
0.44009657232024235

0.44009657232024235

In [None]:
# Run tag appended to the artefact names of this Random Forest experiment.
suf = "_o_rf_11_29_worst"
filename = f'{models_dir}/demand_model{suf}.sav'

# Persist the test-set predictions and the fitted model in the models directory.
model_preds.to_csv(f"{models_dir}/test_predictions{suf}.csv")
joblib.dump(model, filename)

# Drop both objects now that they are saved.
del model_preds, model