In [None]:

# Mount Google Drive at /content/drive; the project paths configured further
# down in this notebook live under that mount point.
# force_remount=True asks Colab to mount again even if a mount already exists.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:

import math
import torch
import pickle
import joblib
import numpy as np
import pandas as pd
import torch.nn as nn
from datetime import datetime, timedelta
from sklearn.metrics import mean_squared_error

In [None]:
# Project root on the mounted Google Drive, plus the two sub-folders used by
# this notebook: one for input data, one for serialized models.
directory = '/content/drive/MyDrive/Thesis'
data_dir = f"{directory}/Data"
models_dir = f"{directory}/models"

In [None]:
# Load the timestamp arrays and the pre-computed GNN embeddings.
#
# np.load on an .npz archive returns a lazy NpzFile that keeps the underlying
# file open, so it is used as a context manager: the arrays we need are read
# out inside the block and the archive is closed on exit. The name `timestamps`
# stays bound afterwards (to the closed archive).
with np.load(f'{data_dir}/demand_graph_timestamps.pkl.npz') as timestamps:
    # Keep entries arr_8757 .. arr_{len-2}.
    # NOTE(review): the 8757 start offset and the dropped final entry are
    # presumably chosen to line up with the embedding/target windows used
    # later in the notebook -- confirm.
    stamps = [timestamps[f'arr_{i}'] for i in range(8757, len(timestamps)-1)]

# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load files you produced yourself. If the file holds a plain tensor,
# passing weights_only=True would also silence the FutureWarning -- confirm the
# stored object type before switching.
gnn_embedding = torch.load(f'{data_dir}/gnn_cnn_2024-11-02 14:35:41.074099_output_embedding.pt')

  gnn_embedding = torch.load(f'{data_dir}/gnn_cnn_2024-11-02 14:35:41.074099_output_embedding.pt')


In [None]:
# Station-cluster ids covered by the model. The position of an id in this list
# is the column it occupies once demand is reshaped to (hours, num_stations),
# so the order must not change.
station_clusters = [
    # contiguous run 74..203
    *range(74, 204),
    # irregular ids above 203
    205, 206, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219,
    220, 222, 223, 224, 225, 226, 227, 228, 230, 231, 232, 233, 234,
    235, 237, 238, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
    250, 251, 252, 253, 255, 257, 258, 261, 264, 265, 266, 274, 275,
    278,
]

num_stations = len(station_clusters)

In [None]:
# Build a dense (datetime x station cluster) demand table: every hour in
# `stamps` paired with every id in `station_clusters`, with demand taken from
# the CSV where a row exists and 0 otherwise.
df = pd.read_csv(f'{data_dir}/final_model_input_partial_scale_2.csv')

# The first four values of each stamp's first row are used as
# (year, month, day, hour).
datetimes = [datetime(int(arr[0][0]), int(arr[0][1]), int(arr[0][2]), int(arr[0][3])) for arr in stamps]

# Rename the column just once, then parse it so it can be joined against the
# datetime objects built above.
df = df.rename(columns={"started_at_hourly": "datetime"})
df['datetime'] = pd.to_datetime(df['datetime'])

# All (datetime, station cluster) combinations, datetime-major: the station id
# varies fastest, which is what the later reshape to num_stations columns
# relies on.
all_combinations = pd.DataFrame(
    [(dt, sc) for dt in datetimes for sc in station_clusters],
    columns=["datetime", "start_station_cluster"]
)

# Left-merge so every combination is kept; 'demand' is NaN where the CSV has
# no matching row.
output_df = all_combinations.merge(
    df[['demand', 'start_station_cluster', 'datetime']],
    on=["datetime", "start_station_cluster"],
    how="left"
)

# A left merge emits one row per match, so duplicate (datetime, cluster) keys
# in the CSV would add rows and shift every later value out of its column.
# Fail loudly here instead of producing a misaligned target matrix.
if len(output_df) != len(all_combinations):
    raise ValueError(
        f"Merge produced {len(output_df)} rows, expected {len(all_combinations)}: "
        "the demand CSV contains duplicate (datetime, start_station_cluster) keys."
    )

# Hours/stations without a recorded demand count as zero demand.
output_df['demand'] = output_df['demand'].fillna(0)

In [None]:
# Drop the large intermediates; only output_df, stamps and gnn_embedding are
# needed from here on.
del df, all_combinations, timestamps

In [None]:
# Flat demand vector, ordered datetime-major with the station id varying fastest.
target_array = output_df['demand'].to_numpy()

n_rows = target_array.shape[0]

# Fold the flat vector into one row per datetime and one column per station.
target = np.reshape(target_array, (-1, num_stations))

In [None]:
# Split at index 8760. The targets are taken one step later than the
# embeddings (target rows 1..8760 pair with embedding rows 0..8759).
# NOTE(review): presumably so that the embedding of hour t predicts the demand
# of hour t+1 -- confirm.
train_target, test_target = target[1:8761], target[8761:]
# The final embedding is dropped because it has no target one step ahead of it.
train_embedding, test_embedding = gnn_embedding[:8760], gnn_embedding[8760:-1]

In [None]:
# Sanity check: (number of test rows, number of stations). The row count must
# equal len(test_embedding) because the two are paired by index.
test_target.shape

(2181, 183)

In [None]:
# Sanity check: number of test embeddings; must match the first dimension of
# test_target.
len(test_embedding)

2181

In [None]:


class LSTMModel(nn.Module):
    """Bidirectional LSTM encoder followed by a linear read-out.

    The sequence is summarized by the final hidden states of the top LSTM
    layer in BOTH directions, concatenated, and mapped to ``output_dim`` values.

    Note: the read-out layer takes ``2 * hidden_dim`` features, so checkpoints
    saved from an earlier version of this class whose head took ``hidden_dim``
    features are not shape-compatible.

    Args:
        embedding_dim: Size of each input feature vector.
        hidden_dim: Hidden size of the LSTM, per direction.
        output_dim: Number of values predicted per sequence.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability between stacked LSTM layers. It is
            ignored when ``num_layers == 1``, because ``nn.LSTM`` has no
            between-layer position to apply it and would only emit a warning.
    """

    def __init__(self, embedding_dim, hidden_dim, output_dim, num_layers=1, dropout=0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            bidirectional=True,
            # nn.LSTM applies dropout after every layer except the last one.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        # Two directions -> the summary vector has 2 * hidden_dim features.
        self.fc = nn.Linear(2 * hidden_dim, output_dim)

    def forward(self, x):
        """Map ``x`` of shape [batch_size, seq_len, embedding_dim] to [batch_size, output_dim]."""
        # hidden: [num_layers * 2, batch_size, hidden_dim]; the last two entries
        # are the top layer's forward and backward final states.
        _, (hidden, _) = self.lstm(x)
        # Using hidden[-1] alone would keep only the backward direction.
        summary = torch.cat((hidden[-2], hidden[-1]), dim=-1)
        return self.fc(summary)

In [None]:
# Prefer the GPU when one is visible to PyTorch; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cpu


In [None]:
# Convert both target matrices to float32 tensors (copies of the NumPy data).
train_labels, test_labels = (
    torch.tensor(split, dtype=torch.float32)
    for split in (train_target, test_target)
)

In [None]:
from torch.utils.data import Dataset, DataLoader

class SequenceDataset(Dataset):
    """Index-aligned pairing of input sequences with their labels.

    Item ``i`` is the tuple ``(sequences[i], labels[i])``; the dataset length
    is the length of ``sequences``.
    """

    def __init__(self, sequences, labels):
        # Both containers are stored as given (no copy, no validation).
        self.labels = labels
        self.sequences = sequences

    def __len__(self):
        """Return the number of sequences."""
        n_items = len(self.sequences)
        return n_items

    def __getitem__(self, idx):
        """Return the ``(sequence, label)`` pair stored at position ``idx``."""
        sample = self.sequences[idx]
        label = self.labels[idx]
        return sample, label

In [None]:
# Training pairs, served in shuffled mini-batches of 48.
dataset = SequenceDataset(sequences=train_embedding, labels=train_labels)
train_dataloader = DataLoader(
    dataset=dataset,
    batch_size=48,
    shuffle=True,
)

In [None]:
# Model hyper-parameters.
embedding_dim = 50
hidden_dim = 128
num_layers = 2

# LSTM model. num_layers is passed explicitly; without it the model silently
# falls back to the constructor default instead of the value configured above.
model = LSTMModel(
    embedding_dim=embedding_dim,
    hidden_dim=hidden_dim,
    output_dim=num_stations,
    num_layers=num_layers,
).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, weight_decay=1e-4)

epochs = 50
# Training loop
model.train()
for epoch in range(epochs):  # Number of epochs
    # Accumulate a sample-weighted sum so the epoch log reports the mean loss
    # over the whole epoch rather than the loss of whichever batch came last.
    running_loss = 0.0
    seen_samples = 0
    for batch in train_dataloader:
        inputs, targets = batch
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)  # Forward pass
        loss = criterion(outputs, targets)  # Mean squared error of this batch
        loss.backward()  # Backpropagation

        # Clip gradients to a global norm of 1.0 before the update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()  # Update weights

        batch_size = inputs.size(0)
        running_loss += loss.item() * batch_size
        seen_samples += batch_size

    # max(..., 1) keeps the log line well-defined for an empty dataloader.
    print(f"Epoch {epoch+1}, Loss: {running_loss / max(seen_samples, 1):.4f}")



Epoch 1, Loss: 5.4820
Epoch 2, Loss: 6.1794
Epoch 3, Loss: 6.5951
Epoch 4, Loss: 2.8701
Epoch 5, Loss: 3.3345
Epoch 6, Loss: 5.2246
Epoch 7, Loss: 3.2831
Epoch 8, Loss: 7.3891
Epoch 9, Loss: 2.8601
Epoch 10, Loss: 2.9744
Epoch 11, Loss: 4.0277
Epoch 12, Loss: 4.1545
Epoch 13, Loss: 4.0793
Epoch 14, Loss: 2.4864
Epoch 15, Loss: 2.6482
Epoch 16, Loss: 4.3978
Epoch 17, Loss: 4.1200
Epoch 18, Loss: 2.7842
Epoch 19, Loss: 1.9577
Epoch 20, Loss: 4.5259
Epoch 21, Loss: 2.7899
Epoch 22, Loss: 1.3351
Epoch 23, Loss: 3.5982
Epoch 24, Loss: 2.9588
Epoch 25, Loss: 2.5110
Epoch 26, Loss: 4.6321
Epoch 27, Loss: 2.3855
Epoch 28, Loss: 1.7447
Epoch 29, Loss: 3.1415
Epoch 30, Loss: 2.5405
Epoch 31, Loss: 2.4176
Epoch 32, Loss: 1.7186
Epoch 33, Loss: 1.5220
Epoch 34, Loss: 2.0811
Epoch 35, Loss: 2.7598
Epoch 36, Loss: 3.3148
Epoch 37, Loss: 2.3711
Epoch 38, Loss: 1.6538
Epoch 39, Loss: 3.6659
Epoch 40, Loss: 2.9593
Epoch 41, Loss: 4.2195
Epoch 42, Loss: 1.3082
Epoch 43, Loss: 3.1453
Epoch 44, Loss: 2.47

In [None]:
# Test pairs, served in their original order.
# shuffle must stay False for evaluation: the predictions are later compared
# position-by-position against the flattened test labels, so a shuffled loader
# would pair each prediction with the wrong target.
dataset = SequenceDataset(test_embedding, test_labels)
test_dataloader = DataLoader(dataset, batch_size=48, shuffle=False)

In [None]:
# Run the model over the test loader and collect every prediction into one
# flat 1-D NumPy array, in loader order.
batch_outputs = []
model.eval()
with torch.no_grad():
    for inputs, targets in test_dataloader:
        inputs = inputs.to(device)
        predictions = model(inputs)  # Shape: [batch_size, output_dim]
        # Move each batch to the CPU once; extending a Python list with 0-d
        # tensors is slow and cannot be converted by NumPy/scikit-learn when
        # the tensors live on a GPU.
        batch_outputs.append(predictions.detach().cpu())

if batch_outputs:
    preds = torch.cat(batch_outputs).reshape(-1).numpy()
else:
    # Empty loader: keep `preds` defined as an empty float32 vector.
    preds = np.empty(0, dtype=np.float32)

In [None]:
# Persist the trained model under models_dir, then release it.
# The suffix is the same timestamp string that appears in the embedding file
# name loaded earlier in this notebook.
suf = "2024-11-02 14:35:41.074099"

filename = f'{models_dir}/gnn_lstm_{suf}.sav'
# NOTE(review): joblib.dump pickles the whole model object, so loading the file
# later requires the LSTMModel class to be importable/defined; saving
# model.state_dict() with torch.save is the more portable option -- consider
# switching together with whatever code loads this file.
joblib.dump(model, filename)
# Drop the reference to the model; tensors computed earlier are unaffected.
del model

In [None]:
# Flatten the test labels row by row into one 1-D vector so they can be
# compared element-wise with the flat list of predictions.
all_tests = torch.reshape(test_labels, (-1,))

In [None]:
# Inspect the predictions tensor left over from the last batch of the
# evaluation loop (not the full test set).
predictions

tensor([[ 2.2160e-01,  7.2856e-01,  9.3312e-01,  ..., -6.1146e-08,
          2.5188e-12,  1.5824e-06],
        [ 3.0785e-01,  1.0109e+00,  1.2486e+00,  ..., -7.4423e-08,
          1.6979e-12,  8.1800e-07],
        [ 3.6023e-01,  1.2128e+00,  1.4402e+00,  ..., -7.6610e-08,
          1.4282e-12,  6.2063e-07],
        ...,
        [ 1.9510e-01,  6.1997e-01,  8.1353e-01,  ..., -4.6711e-08,
          2.7647e-12,  1.4031e-06],
        [ 3.8940e-01,  1.3252e+00,  1.5435e+00,  ..., -6.9205e-08,
          1.4055e-12,  4.8114e-07],
        [ 4.0848e-02,  1.0548e-01,  1.9283e-01,  ..., -1.9408e-07,
         -1.6586e-12,  8.4182e-07]])

In [None]:
# Root-mean-squared error over every (hour, station) pair of the test set.
# Arguments are named explicitly; squared error is symmetric, so the value does
# not depend on which side is treated as the ground truth.
rmse_score = np.sqrt(
    mean_squared_error(y_true=all_tests, y_pred=preds)
)

In [None]:
# Display the test-set RMSE computed in the previous cell.
rmse_score

2.5110908