In [1]:
%%bash
# The cell magic above must stay on the first line of the cell, otherwise
# IPython does not treat the cell as bash.
# The wheel index below must match the installed torch and CUDA versions
# (here: torch 2.3.0 + cu121).
pip install torch-scatter -f https://data.pyg.org/whl/torch-2.3.0+cu121.html
pip install torch-sparse -f https://data.pyg.org/whl/torch-2.3.0+cu121.html
pip install torch-cluster -f https://data.pyg.org/whl/torch-2.3.0+cu121.html
pip install torch-spline-conv -f https://data.pyg.org/whl/torch-2.3.0+cu121.html

Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_scatter-2.1.2%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (10.9 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 10.9/10.9 MB 41.4 MB/s eta 0:00:00
Installing collected packages: torch-scatter
Successfully installed torch-scatter-2.1.2+pt23cu121
Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_sparse-0.6.18%2Bpt23cu121-cp310-cp310-linux_x86_64.whl (5.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 5.1/5.1 MB 36.3 MB/s eta 0:00:00
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.18+pt23cu121
Looking in links: https://data.pyg.org/whl/torch-2.3.0+cu121.html
Collecting torch-cluster
  Downloading https://data.pyg.org/whl/torch-2.3.0%2Bcu121/torch_cluster-1.6.3%2Bpt23cu121-cp310-cp310-linux_x86_6

In [2]:
!pip install torch_geometric==2.2.0
!pip install torch_geometric_temporal

Collecting torch_geometric==2.2.0
  Downloading torch_geometric-2.2.0.tar.gz (564 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/565.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 kB[0m [31m32.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.2.0-py3-none-any.whl size=773275 sha256=42526ba5bab3d75d6cdfda507a80c4ab47ad2285bf5d2577ff3411d228e5cf20
  Stored in directory: /root/.cache/pip/wheels/c8/e4/83/5e964867e23f8a61cb8c5d5b9477617b710e96e6ebf1844562
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geometric-2.2.0
Collecting torch_geometric_temporal
  Downloading torch_geometric_temporal-0.54.0.tar.gz 

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import os

import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv
from torch_geometric_temporal.nn.recurrent import GConvLSTM
from torch_geometric_temporal.signal import temporal_signal_split
from torch_geometric_temporal.dataset import METRLADatasetLoader
from torch_geometric_temporal.nn.recurrent import GConvGRU
from torch_geometric_temporal.signal import StaticGraphTemporalSignal

In [4]:
# Mount Google Drive at /content/drive so the dataset path used later in the
# notebook is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
def search_data(sequence_length, num_of_depend, label_start_idx, num_for_predict, units, points_per_hour):
    """Locate the input windows that precede a prediction target.

    Window ``i`` (``i = 1 .. num_of_depend``) starts
    ``points_per_hour * units * i`` steps before ``label_start_idx`` and is
    ``num_for_predict`` steps long.

    Parameters
    ----------
    sequence_length: int, total number of time steps in the sequence
    num_of_depend: int, number of windows to look back
    label_start_idx: int, index of the first step to predict
    num_for_predict: int, number of steps to predict (also the window length)
    units: int, multiplier applied to ``points_per_hour`` for the look-back stride
    points_per_hour: int, number of points per hour, must be strictly positive

    Returns
    ----------
    list of (start_idx, end_idx) tuples ordered from the oldest window to the
    most recent one, or None when the target runs past the end of the sequence
    or any window would start before index 0.

    Raises
    ----------
    ValueError: if ``points_per_hour`` is not greater than 0.
    """
    # A stride of 0 would place every window on the label itself, so 0 is
    # rejected together with negative values (as the message already states).
    if points_per_hour <= 0:
        raise ValueError("points_per_hour should be greater than 0!")

    if label_start_idx + num_for_predict > sequence_length:
        return None

    x_idx = []
    for i in range(1, num_of_depend + 1):
        start_idx = label_start_idx - points_per_hour * units * i
        if start_idx < 0:
            # Not enough history for this window: the whole sample is dropped.
            return None
        x_idx.append((start_idx, start_idx + num_for_predict))

    # Windows were collected newest-first; callers expect chronological order.
    return x_idx[::-1]

In [6]:
def get_sample_indices(data_sequence, num_of_hours, label_start_idx, num_for_predict, points_per_hour=12):
    """Slice one (input, target) pair out of the full sequence.

    Parameters
    ----------
    data_sequence: np.ndarray, indexed along its first axis by time step
    num_of_hours: int, number of look-back windows; 0 disables the input part
    label_start_idx: int, index of the first step to predict
    num_for_predict: int, number of steps in the target
    points_per_hour: int, stride between look-back windows, default 12

    Returns
    ----------
    (hour_sample, target): hour_sample is the concatenation of the look-back
    windows along the time axis (None when ``num_of_hours`` is 0) and target
    holds ``num_for_predict`` steps starting at ``label_start_idx``.
    (None, None) is returned when the target or any window does not fit
    inside the sequence.
    """
    total_steps = data_sequence.shape[0]
    label_end_idx = label_start_idx + num_for_predict

    if label_end_idx > total_steps:
        return None, None

    hour_sample = None
    if num_of_hours > 0:
        windows = search_data(total_steps, num_of_hours, label_start_idx, num_for_predict, 1, points_per_hour)
        if not windows:
            return None, None

        pieces = []
        for lo, hi in windows:
            pieces.append(data_sequence[lo:hi])
        hour_sample = np.concatenate(pieces, axis=0)

    # num_for_predict decides how many time steps the target covers.
    target = data_sequence[label_start_idx:label_end_idx]

    return hour_sample, target

In [7]:
def read_and_generate_dataset(graph_signal_matrix_filename, num_of_hours, num_for_predict, points_per_hour=12):
    """Build chronological train/validation/test sets from an ``.npz`` file.

    Parameters
    ----------
    graph_signal_matrix_filename: str, path of an ``.npz`` archive holding the
        raw sequence under the key ``'data'`` (time is the first axis)
    num_of_hours: int, number of look-back windows used as model input
    num_for_predict: int, number of time steps in each target
    points_per_hour: int, stride between look-back windows, default 12

    Returns
    ----------
    training_set, validation_set, testing_set: each is a list of arrays
        stacked over the samples of that split, in the order
        [inputs (only when ``num_of_hours > 0``), targets, timestamps].
        Samples are split 60% / 20% / 20% in index order. A split without
        samples is an empty list.
    """
    # np.load returns an NpzFile that keeps the archive open; read the array
    # inside a context manager so the file handle is released right away.
    with np.load(graph_signal_matrix_filename) as npz_file:
        data_seq = npz_file['data']  # raw sequence

    all_samples = []
    for idx in range(data_seq.shape[0]):
        hour_sample, target = get_sample_indices(data_seq, num_of_hours, idx, num_for_predict, points_per_hour)

        # Both None means the sample does not fit inside the sequence.
        if hour_sample is None and target is None:
            continue

        sample = []

        if num_of_hours > 0:
            hour_sample = np.expand_dims(hour_sample, axis=0).transpose((0, 2, 3, 1))  # (1, N, F, T)
            sample.append(hour_sample)

        # Only feature 0 is kept as the label: (1, N, T), T == num_for_predict.
        target = np.expand_dims(target, axis=0).transpose((0, 2, 3, 1))[:, :, 0, :]
        sample.append(target)

        # The label start index doubles as the sample's timestamp.
        time_sample = np.expand_dims(np.array([idx]), axis=0)  # (1, 1)
        sample.append(time_sample)

        all_samples.append(sample)

    split_line1 = int(len(all_samples) * 0.6)
    split_line2 = int(len(all_samples) * 0.8)

    # zip(*samples) regroups per-sample lists into per-field tuples, which are
    # then stacked along the batch axis.
    training_set = [np.concatenate(i, axis=0) for i in zip(*all_samples[:split_line1])]
    validation_set = [np.concatenate(i, axis=0) for i in zip(*all_samples[split_line1: split_line2])]
    testing_set = [np.concatenate(i, axis=0) for i in zip(*all_samples[split_line2:])]

    return training_set, validation_set, testing_set

In [None]:
# Load the data from file
# weights_only=True/False controls load-time safety; this is my own file, so False is fine
# data = torch.load('gcn_data_hobbies.pt', weights_only=False)

In [None]:
# Open the .npz archive from the current working directory for a quick look
# at its contents (the dataset build below reads the copy on Google Drive).
data = np.load('gcn_data.npz')
#graph_signal_matrix_filename = 'TestFile/PEMS04/PEMS04.npz'

In [None]:
# List the arrays stored in the archive.
data.keys()

KeysView(NpzFile 'gcn_data.npz' with keys: data)

In [None]:
data['data'].shape
# 5650 items, one entry per day, each with its features and label: (364, 5650, 2)
# NOTE(review): the original note said 365 days, but the first axis is 364 — confirm.

(364, 5650, 2)

In [9]:
# Path of the .npz archive on the mounted Google Drive.
hobbies_files = '/content/drive/MyDrive/Dissertation/DissertationData/gcn_data.npz'
#hobbies_files = 'TestFile/PEMS04/PEMS04.npz'
# #data['combined_data'].shape

# num_of_vertices = 5650  # number of nodes
# points_per_hour = 1  # one data point per hour
# num_for_predict = 1  # predict the next day's data
# num_of_hours = 1  # 24 hours; num_for_predict and num_of_hours are the same

num_of_vertices = 5650
points_per_hour = 1  # one data point per hour (NOTE(review): the data looks daily — confirm the unit)
num_for_predict = 1  # predict a single time step
num_of_hours = 1     # use one look-back window as input


# Build the train / validation / test splits with read_and_generate_dataset
training_set, validation_set, testing_set = read_and_generate_dataset(hobbies_files,
                                                                      num_of_hours=num_of_hours,
                                                                      num_for_predict=num_for_predict,
                                                                      points_per_hour=points_per_hour)


In [10]:
def normalization(train, val, test):
    '''
    Standardize the three splits with statistics computed on ``train`` only.

    The mean and standard deviation are taken per feature, i.e. over the
    batch, node and time axes (0, 1, 3).

    Parameters
    ----------
    train, val, test: np.ndarray (B,N,F,T)
    Returns
    ----------
    stats: dict, two keys: '_mean' and '_std', each of shape (1,1,F,1);
           '_std' holds the values actually used for scaling, so a feature
           that is constant in ``train`` reports 1 instead of 0
    train_norm, val_norm, test_norm: np.ndarray,
                                     shape is the same as original
    '''

    assert train.shape[1:] == val.shape[1:] and val.shape[1:] == test.shape[1:]  # ensure the num of nodes is the same
    mean = train.mean(axis=(0,1,3), keepdims=True)
    std = train.std(axis=(0,1,3), keepdims=True)
    # A constant feature has std == 0 and would turn every normalized value
    # into NaN/inf; scaling by 1 maps it to 0 on the training data instead.
    std = np.where(std == 0, 1.0, std)
    print('mean.shape:',mean.shape)
    print('std.shape:',std.shape)

    def normalize(x):
        return (x - mean) / std

    train_norm = normalize(train)
    val_norm = normalize(val)
    test_norm = normalize(test)

    return {'_mean': mean, '_std': std}, train_norm, val_norm, test_norm

In [11]:
# Each split is a list laid out as [inputs..., target, timestamp]; everything
# except the last two entries is model input and is joined along the time axis.
train_x = np.concatenate(training_set[:-2], axis=-1)  # (B,N,F,T')
val_x = np.concatenate(validation_set[:-2], axis=-1)
test_x = np.concatenate(testing_set[:-2], axis=-1)

# Second-to-last entry of each split: the prediction targets.
train_target = training_set[-2]  # (B,N,T)
val_target = validation_set[-2]
test_target = testing_set[-2]

# Last entry of each split: the label start index of every sample.
train_timestamp = training_set[-1]  # (B,1)
val_timestamp = validation_set[-1]
test_timestamp = testing_set[-1]

# Only the inputs are normalized (with training-set statistics); the targets
# are stored unscaled.
(stats, train_x_norm, val_x_norm, test_x_norm) = normalization(train_x, val_x, test_x)

# Gather everything in one dictionary keyed by split, plus the statistics.
all_data = {'train': { 'x': train_x_norm, 'target': train_target,'timestamp': train_timestamp},
            'val': {'x': val_x_norm, 'target': val_target, 'timestamp': val_timestamp},
            'test': {'x': test_x_norm, 'target': test_target, 'timestamp': test_timestamp},
            'stats': {'_mean': stats['_mean'], '_std': stats['_std']} }

mean.shape: (1, 1, 2, 1)
std.shape: (1, 1, 2, 1)


In [12]:
# Sanity check: print the shape of every array in each split, then the
# normalization statistics (shape and values) computed on the training inputs.
print('train x:', all_data['train']['x'].shape)
print('train target:', all_data['train']['target'].shape)
print('train timestamp:', all_data['train']['timestamp'].shape)
print()
print('val x:', all_data['val']['x'].shape)
print('val target:', all_data['val']['target'].shape)
print('val timestamp:', all_data['val']['timestamp'].shape)
print()
print('test x:', all_data['test']['x'].shape)
print('test target:', all_data['test']['target'].shape)
print('test timestamp:', all_data['test']['timestamp'].shape)
print()
print('train data _mean :', all_data['stats']['_mean'].shape, all_data['stats']['_mean'])
print('train data _std :', all_data['stats']['_std'].shape, all_data['stats']['_std'])

train x: (217, 5650, 2, 1)
train target: (217, 5650, 1)
train timestamp: (217, 1)

val x: (73, 5650, 2, 1)
val target: (73, 5650, 1)
val timestamp: (73, 1)

test x: (73, 5650, 2, 1)
test target: (73, 5650, 1)
test timestamp: (73, 1)

train data _mean : (1, 1, 2, 1) [[[[0.69391216]
   [0.69402634]]]]
train data _std : (1, 1, 2, 1) [[[[2.19701253]
   [2.19933857]]]]


In [13]:
# File name of the source archive without directory; everything after the
# first '.' is dropped.
file = os.path.basename(hobbies_files).split('.')[0]
dirpath = '.'  # write next to the notebook's working directory
#filename = os.path.join(dirpath, file + '_r' + str(num_of_hours) + '_d' + str(num_of_days) + '_w' + str(num_of_weeks)) + '_astcgn'
filename = os.path.join(dirpath, f"{file}_Aug13_astcgn")
print('save file:', filename)
# Store all splits and the normalization statistics in one compressed archive.
# NOTE: per the NumPy docs, savez_compressed appends '.npz' to the name when
# it is missing, so the file on disk is "<filename>.npz".
np.savez_compressed(filename,
                train_x=all_data['train']['x'],train_target=all_data['train']['target'],train_timestamp=all_data['train']['timestamp'],
                val_x=all_data['val']['x'], val_target=all_data['val']['target'],val_timestamp=all_data['val']['timestamp'],
                test_x=all_data['test']['x'], test_target=all_data['test']['target'], test_timestamp=all_data['test']['timestamp'],
                mean=all_data['stats']['_mean'], std=all_data['stats']['_std'])

save file: ./gcn_data_Aug13_astcgn


In [14]:
# Display the raw (un-normalized) training split: a list of arrays.
# NOTE(review): this prints a very long array repr; showing only the shapes
# would keep the notebook output readable.
training_set

[array([[[[0],
          [0]],
 
         [[0],
          [1]],
 
         [[0],
          [1]],
 
         ...,
 
         [[0],
          [0]],
 
         [[0],
          [0]],
 
         [[0],
          [0]]],
 
 
        [[[0],
          [0]],
 
         [[1],
          [0]],
 
         [[1],
          [0]],
 
         ...,
 
         [[0],
          [0]],
 
         [[0],
          [0]],
 
         [[0],
          [2]]],
 
 
        [[[0],
          [0]],
 
         [[0],
          [0]],
 
         [[0],
          [1]],
 
         ...,
 
         [[0],
          [1]],
 
         [[0],
          [0]],
 
         [[2],
          [0]]],
 
 
        ...,
 
 
        [[[0],
          [0]],
 
         [[5],
          [0]],
 
         [[3],
          [0]],
 
         ...,
 
         [[0],
          [0]],
 
         [[0],
          [0]],
 
         [[0],
          [0]]],
 
 
        [[[0],
          [0]],
 
         [[0],
          [0]],
 
         [[0],
          [1]],
 
         ...,
 


In [None]:
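# Data loading and splitting:
# num_of_hours = 1 means only the current time step is used as input.
# num_for_predict = 1 means only the next time step is predicted.

# Shapes of the generated data:
# train_x has shape (B, 5650, 2, 1): each sample contains 5650 nodes, each node has 2 features, and there is only 1 time step.
# train_target has shape (B, 5650, 1): each sample has 5650 nodes, and each node has a target value for 1 time step.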