In [1]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
processed_data_loc = '/content/drive/MyDrive/Colab Notebooks/GBDS_Project/processed_data/'

In [3]:
import numpy as np

In [4]:
features_chunks = []
graph_chunks = []

for file_count in range(30):
  feature_chunk = np.load(f'{processed_data_loc}features/features_156_clusters_2024_4_11__{file_count}.npy')
  graph_chunk = np.load(f'{processed_data_loc}graphs/graphs_156_clusters_2024_4_11__{file_count}.npy')
  features_chunks.append(feature_chunk.astype(np.float16))
  graph_chunks.append(graph_chunk.astype(np.int16))

# for file_count in range(30):
#   np.save(f'{processed_data_loc}temp_features/features_156_clusters_2024_4_11__{file_count}.npy', features_chunks[file_count])
#   np.save(f'{processed_data_loc}temp_graphs/graphs_156_clusters_2024_4_11__{file_count}.npy', graph_chunks[file_count])
features = np.concatenate(features_chunks, axis=0)
del features_chunks
graphs = np.concatenate(graph_chunks, axis=0)
del graph_chunks

In [5]:
!pip install pyg_lib torch_scatter torch_sparse torch_cluster torch_spline_conv -f https://data.pyg.org/whl/torch-2.2.0+cu121.html
!pip install torch_geometric

Looking in links: https://data.pyg.org/whl/torch-2.2.0+cu121.html
Collecting pyg_lib
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/pyg_lib-0.4.0%2Bpt22cu121-cp310-cp310-linux_x86_64.whl (2.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m68.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_scatter
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_scatter-2.1.2%2Bpt22cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m104.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_sparse
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_sparse-0.6.18%2Bpt22cu121-cp310-cp310-linux_x86_64.whl (5.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.0/5.0 MB[0m [31m100.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch_cluster
  Downloading https://data.pyg.org/whl/torch-2.2.0%2Bcu121/torch_cluster-1.6.3%2

In [6]:

import torch.nn as nn
from torch.utils.tensorboard import SummaryWriter
from torch_geometric.nn import GCNConv
import torch

import os
import math
from tqdm import tqdm

In [7]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
config = {
    'seed': 20010106,      # Your seed number, you can pick your lucky number. :)
    'select_all': True,   # Whether to use all features.
    'test_ratio': 0.2,   # validation_size = train_size * valid_ratio
    'n_epochs': 500,     # Number of epochs.
    'batch_size': 16,
    'learning_rate': 5e-5,
    'early_stop': 50,    # If model has not improved for this many consecutive epochs, stop training.
    'save_path': '/content/drive/MyDrive/Colab Notebooks/GBDS_Project/models/GNNLSTM.ckpt'  # Your model will be saved here.
}

In [8]:
device

'cuda'

In [9]:
class GNN_LSTM_model(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(GNN_LSTM_model, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers,x.size(0), self.hidden_size).to(device) # 初始化隐藏状态h0
        c0 = torch.zeros(self.num_layers,x.size(0), self.hidden_size).to(device)  # 初始化记忆状态c0
        #print(f"x.shape:{x.shape},h0.shape:{h0.shape},c0.shape:{c0.shape}")
        out, _ = self.lstm(x, (h0, c0))  # LSTM前向传播
        out = self.fc(out[:, -1, :])  # 取最后一个时间步的输出作为预测结果
        return out

In [10]:
def trainer(train_loader, valid_loader, model, config, device):

    criterion = nn.MSELoss() # Define your loss function, do not modify this.

    # Define your optimization algorithm.
    optimizer=torch.optim.Adam(model.parameters(),lr=config['learning_rate'],betas=(0.5,0.999))

    writer = SummaryWriter() # Writer of tensoboard.

    if not os.path.isdir('./models'):
        os.mkdir('./models') # Create directory of saving models.

    n_epochs, best_loss, step, early_stop_count = config['n_epochs'], math.inf, 0, 0

    for epoch in range(n_epochs):
        model.train() # Set your model to train mode.
        loss_record = []

        # tqdm is a package to visualize your training progress.
        train_pbar = tqdm(train_loader, position=0, leave=True)

        for x, y in train_pbar:
            optimizer.zero_grad()               # Set gradient to zero.
            x, y = x.to(device), y.to(device)   # Move your data to device.
            pred = model(x)
#             print("y:",y)
#             print("pred:",pred)
            loss = criterion(pred, y)
            loss.backward()                     # Compute gradient(backpropagation).
            optimizer.step()                    # Update parameters.
            step += 1
            loss_record.append(loss.detach().item())

            # Display current epoch number and loss on tqdm progress bar.
            train_pbar.set_description(f'Epoch [{epoch+1}/{n_epochs}]')
            train_pbar.set_postfix({'loss': loss.detach().item()})

        mean_train_loss = sum(loss_record)/len(loss_record)
        writer.add_scalar('Loss/train', mean_train_loss, step)

        model.eval() # Set your model to evaluation mode.
        loss_record = []
        for x, y in valid_loader:
            x, y = x.to(device), y.to(device)
            with torch.no_grad():
                pred = model(x)
                loss = criterion(pred, y)

            loss_record.append(loss.item())

        mean_valid_loss = sum(loss_record)/len(loss_record)
        print(f'Epoch [{epoch+1}/{n_epochs}]: Train loss: {mean_train_loss:.4f}, Test loss: {mean_valid_loss:.4f}')
        writer.add_scalar('Loss/Test', mean_valid_loss, step)

        if mean_valid_loss < best_loss:
            best_loss = mean_valid_loss
            torch.save(model.state_dict(), config['save_path']) # Save your best model
            print('Saving model with loss {:.3f}...'.format(best_loss))
            early_stop_count = 0
        else:
            early_stop_count += 1

        if early_stop_count >= config['early_stop']:
            print('\nModel is not improving, so we halt the training session.')
            return

In [80]:
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, data, time_step=72):
        self.data = data
        self.time_step = time_step

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
      dataX = self.data[idx: idx+self.time_step]
      dataX = dataX.reshape(len(dataX), self.time_step,-1)
      dataY = self.data[idx+self.time_step]
      return dataX, dataY

In [81]:
dataset = CustomDataset(graphs)

In [82]:
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

In [85]:
iterator = iter(dataloader)


In [86]:
X, Y = next(iterator)

In [88]:
X.shape

torch.Size([64, 72, 72, 338])

In [68]:
X.shape

torch.Size([64, 72, 72, 338])

In [73]:
graphs.shape

(29708, 156, 156)

In [79]:
graphs[0:690].reshape(len(graphs[0:690]), 72,-1).shape

(690, 72, 338)

In [59]:
def split_data(data, time_step=24):
    dataX=[]
    datay=[]
    dataNew = []
    for i in range(len(data)-time_step):
        dataX.append(data[i:i+time_step])
        datay.append(data[i+time_step])
        dataNew.append(data[i:i+time_step+1])
    dataX=np.array(dataX).reshape(len(dataX),time_step,-1)
    datay=np.array(datay)
    dataNew=np.array(dataNew).reshape(len(dataNew),time_step+1,-1)
    return dataX,datay,dataNew

In [60]:
dataX, datay, dataNew=split_data(graphs[0:600])
print(f"dataX.shape:{dataX.shape},datay.shape:{datay.shape},dataNew.shape:{dataNew.shape}")

dataX.shape:(576, 24, 24336),datay.shape:(576, 156, 156),dataNew.shape:(576, 25, 24336)


In [None]:
graphs.shape

(29708, 156, 156)

In [None]:
dataX[0]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int16)

In [None]:
graphs.shape

(29708, 156, 156)

In [None]:
dataX[0][0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int16)