In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [23]:
import os
import math
import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch_geometric.nn import GCNConv, global_mean_pool

import torch
import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch.utils.data import Dataset, DataLoader, random_split

from torch_geometric.data import Data
from torch_geometric.utils import dense_to_sparse

In [4]:
directory = '/content/drive/MyDrive/Thesis'
data_dir = directory + "/Data"
models_dir = directory + "/models"

In [5]:
distance = pd.read_csv(f'{data_dir}/distance_matrix_centroids.csv')

In [6]:
distance['end_station_cluster'].min()

74

In [7]:
timestamps = np.load(f'{data_dir}/demand_graph_timestamps.pkl.npz')
stamps = [timestamps[f'arr_{i}'] for i in range(17516, len(timestamps))]

In [8]:
station_clusters = [
    74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,  85,  86,
    87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,  99,
    100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
    113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
    126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
    139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151,
    152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164,
    165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177,
    178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190,
    191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
    205, 206, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219,
    220, 222, 223, 224, 225, 226, 227, 228, 230, 231, 232, 233, 234,
    235, 237, 238, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
    250, 251, 252, 253, 255, 257, 258, 261, 264, 265, 266, 274, 275,
    278
]

num_stations = len(station_clusters)

In [9]:
df = pd.read_csv(f'{data_dir}/final_model_input_partial_scale_2.csv')[['start_station_cluster', 'started_at_hourly', 'demand']]
datetimes = [datetime.datetime(int(arr[0][0]), int(arr[0][1]), int(arr[0][2]), int(arr[0][3])) for arr in stamps]

# Rename the column just once
df = df.rename(columns={"started_at_hourly": "datetime"})
df['datetime'] = pd.to_datetime(df['datetime'])
# Create a DataFrame with all combinations of datetimes and station_clusters
all_combinations = pd.DataFrame(
    [(dt, sc) for dt in datetimes for sc in station_clusters],
    columns=["datetime", "start_station_cluster"]
)

# Merge with the original DataFrame to include 'demand' where it exists
output_df = all_combinations.merge(
    df[['demand', 'start_station_cluster', 'datetime']],
    on=["datetime", "start_station_cluster"],
    how="left"
)

# Fill missing values with 0
output_df['demand'] = output_df['demand'].fillna(0)

output_df = output_df[output_df['datetime'] >= datetimes[0]]
target_array = output_df['demand'].values



In [10]:
del df
del all_combinations
del timestamps
del output_df

In [11]:
target = target_array.reshape(-1, num_stations)

In [18]:
loaded = np.load(f'{data_dir}/demand_graphs.pkl.npz')
demand_graphs = [loaded[f'arr_{i}'] for i in range(17515, len(loaded)-1)]
adj_matrices = torch.tensor(demand_graphs)

# Number of stations
num_stations = adj_matrices.shape[1]
del demand_graphs

In [19]:
from torch_geometric.data import Data
from torch_geometric.utils import dense_to_sparse

# Convert adjacency matrices to PyTorch Geometric format for each hour
data_list = []

for i in range(adj_matrices.shape[0]):  # Iterate over each hour
    adj_matrix = adj_matrices[i]
    y = target[i]
    # Convert dense adjacency matrix to edge_index (sparse representation)
    edge_index, edge_attr = dense_to_sparse(adj_matrix)
    # Use an identity matrix for node features (can be replaced with other features)
    x = torch.eye(num_stations)
    # Create a Data object
    y = torch.tensor(y, dtype=torch.float32)
    data = Data(x=adj_matrix.float(), y=y, edge_index=edge_index, edge_attr=edge_attr)
    data_list.append(data)

print(data_list[0])

for data in data_list:
    data.edge_attr = data.edge_attr.float()

Data(x=[183, 183], edge_index=[2, 103], edge_attr=[103], y=[183])


In [20]:
adj_matrices.shape

torch.Size([2184, 183, 183])

In [33]:
target_array.shape

(399672,)

In [22]:
del target
del adj_matrices
#del target_array

In [24]:
from torch_geometric.nn import GCNConv
import torch.nn.functional as F

class GNNForDemandPrediction(torch.nn.Module):
    def __init__(self, in_channels, out_channels=50, h1=100, num_layers=2, fc_hidden_dim = 256, dropout_prob=0.2):
        super(GNNForDemandPrediction, self).__init__()
        self.convs = nn.ModuleList()
        self.convs.append(GCNConv(in_channels, h1))

        for _ in range(num_layers - 2):
            self.convs.append(GCNConv(h1, h1))

        self.convs.append(GCNConv(h1, out_channels))

        self.fc1 = torch.nn.Linear(out_channels, fc_hidden_dim)
        self.fc2 = torch.nn.Linear(fc_hidden_dim, in_channels)
        self.dropout_prob = dropout_prob
        self.leaky_relu = nn.LeakyReLU()
    def forward(self, x, edge_index, batch):
        for conv in self.convs[:-1]:
          x = self.leaky_relu(conv(x, edge_index))
          x = F.dropout(x, p=self.dropout_prob, training=self.training)

        x = self.convs[-1](x, edge_index)

        x = global_mean_pool(x, batch)
        x = self.leaky_relu(self.fc1(x))
        # Predict demand using a fully connected layer
        predicted_demand = self.fc2(x)  # Shape: [num_edges]
        #print(predicted_demand.shape)
        return predicted_demand  # Return predicted demand for each edge

    def forward(self, x, edge_index, batch):
        for conv in self.convs[:-1]:
          x = self.leaky_relu(conv(x, edge_index))
          x = F.dropout(x, p=self.dropout_prob, training=self.training)

        x = self.convs[-1](x, edge_index)

        x = global_mean_pool(x, batch)
        x = self.leaky_relu(self.fc1(x))
        # Predict demand using a fully connected layer
        predicted_demand = self.fc2(x)  # Shape: [num_edges]
        #print(predicted_demand.shape)
        return predicted_demand  # Return predicted demand for each edge
    # New method to extract node embeddings

    def get_node_embeddings(self, x, edge_index, batch):
        for conv in self.convs[:-1]:
          x = self.leaky_relu(conv(x, edge_index))

        x = self.convs[-1](x, edge_index)

        x = global_mean_pool(x, batch)

In [25]:
# Instantiate the model
model = GNNForDemandPrediction(
    in_channels=num_stations,
    h1=100
)

model.load_state_dict(torch.load(f'{models_dir}/gnn_cnn_2023_2024-11-19 00:24:00.973781.pth'))

  model.load_state_dict(torch.load(f'{models_dir}/gnn_cnn_2023_2024-11-19 00:24:00.973781.pth'))


<All keys matched successfully>

In [26]:
from torch_geometric.loader import DataLoader
dataloader = DataLoader(data_list, batch_size=32, shuffle=True)

In [27]:
# Training function

def test(model, data_list):
    model.eval()  # Set the model to training mode
    total_loss = 0  # Keep track of total loss

    preds = []
    # Loop over the data (for each hour)
    for batch in dataloader:
        predicted_demand = model(batch.x, batch.edge_index, batch.batch)
        # Calculate loss (difference between predicted and actual demand)
        #print(data.y)
        preds.extend(predicted_demand.reshape(-1))
    return preds
# Train the model using data from all hours
preds = test(model, data_list)

In [35]:
from sklearn.metrics import mean_absolute_percentage_error

In [39]:
preds = [i.detach().numpy() for i in preds]

In [42]:
target_array

array([0., 2., 2., ..., 0., 0., 0.])

In [40]:
mean_absolute_percentage_error(target_array, preds)

734112278166999.4

In [None]:
output_list = []
for i in data_list:
  output_list.append(model.get_node_embeddings(i))

In [None]:
del data_list

In [None]:
torch.save(output_list, f'{data_dir}/gnn_cnn_2024-11-02 14:35:41.074099_output_embedding.pt')

In [None]:
timestamps = np.load(f'{data_dir}/demand_graph_timestamps.pkl.npz')
stamps = [timestamps[f'arr_{i}'][0] for i in range(8758, len(timestamps))]

In [None]:
len(stamps)

10942

In [None]:
stamps[0]

array([2.023e+03, 1.000e+00, 1.000e+00, 0.000e+00, 1.000e+00])

In [None]:
timestamps = np.load(f'{data_dir}/demand_graph_timestamps.pkl.npz')
stamps = [timestamps[f'arr_{i}'][0] for i in range(0, 8758)]

In [None]:
stamps[-1]

array([2022.,   12.,   31.,   23.,    0.])

In [None]:
model.get_node_embeddings(data_list[i])

tensor([[ 2.4265e-03, -2.5289e-02,  2.0832e-02,  ...,  7.7369e-02,
          7.1916e-06,  1.3067e-02],
        [ 2.4265e-03, -2.5289e-02,  2.0832e-02,  ...,  7.7369e-02,
          7.1916e-06,  1.3067e-02],
        [ 2.4265e-03, -2.5289e-02,  2.0832e-02,  ...,  7.7369e-02,
          7.1916e-06,  1.3067e-02],
        ...,
        [ 2.4265e-03, -2.5289e-02,  2.0832e-02,  ...,  7.7369e-02,
          7.1916e-06,  1.3067e-02],
        [ 2.4265e-03, -2.5289e-02,  2.0832e-02,  ...,  7.7369e-02,
          7.1916e-06,  1.3067e-02],
        [ 2.4265e-03, -2.5289e-02,  2.0832e-02,  ...,  7.7369e-02,
          7.1916e-06,  1.3067e-02]], grad_fn=<AddBackward0>)