In [1]:
import pandas as pd
import os
import torch
import numpy as np
from torch.utils.data import Dataset
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [2]:
data_path = '../data/' 

location_path = os.path.join(data_path, 'sdwpf_baidukddcup2022_turb_location.CSV')
location      = pd.read_csv(location_path)
location.describe()

Unnamed: 0,TurbID,x,y
count,134.0,134.0,134.0
mean,67.5,2666.760881,5986.126609
std,38.826537,1829.842484,3346.503906
min,1.0,0.0,0.0
25%,34.25,1014.935725,3182.294825
50%,67.5,3246.54185,5841.89434
75%,100.75,4320.609625,8814.83295
max,134.0,5501.4529,12121.00426


In [3]:
data = pd.read_csv(os.path.join(data_path, 'train/wtbdata_245days.csv')) 
data.fillna(method='bfill', inplace=True)
print(data.shape)
print(data.columns)
print('缺失值：', data.isna().sum().sum())
data.head()

(4727520, 13)
Index(['TurbID', 'Day', 'Tmstamp', 'Wspd', 'Wdir', 'Etmp', 'Itmp', 'Ndir',
       'Pab1', 'Pab2', 'Pab3', 'Prtv', 'Patv'],
      dtype='object')
缺失值： 0


Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv
0,1,1,00:00,6.17,-3.99,30.73,41.8,25.92,1.0,1.0,1.0,-0.25,494.66
1,1,1,00:10,6.17,-3.99,30.73,41.8,25.92,1.0,1.0,1.0,-0.25,494.66
2,1,1,00:20,6.27,-2.18,30.6,41.63,20.91,1.0,1.0,1.0,-0.24,509.76
3,1,1,00:30,6.42,-0.73,30.52,41.52,20.91,1.0,1.0,1.0,-0.26,542.53
4,1,1,00:40,6.25,0.89,30.49,41.38,20.91,1.0,1.0,1.0,-0.23,509.36


In [24]:
def group_data(data, col_start, group_column='TurbID'):
    group_ids = sorted(data[group_column].unique())
    col_use = data.columns[col_start:]
    res = []
    for group in group_ids:
        tmp = data[data[group_column] == group][col_use].values
        res.append(tmp)

    return np.array(res)

data2 = group_data(data, 3)
print(data2.shape)

(134, 35280, 10)


In [28]:
data3 = data2.transpose((0, 2, 1))
data3.shape

(134, 10, 35280)

In [32]:
means = np.mean(data3, axis=(0, 2))
stds = np.std(data3, axis=(0, 2))
print(f'means: {means.shape}, stds: {stds.shape}')

X = (data3 - means.reshape(1, -1, 1)) / stds.reshape(1, -1, 1)
X.shape

means: (10,), stds: (10,)


(134, 10, 35280)

In [6]:
day_len = 24*6
train_days = 200 * day_len
val_days = 20 * day_len
test_days = 25 * day_len
print(f'day_len: {day_len}, train_days: {train_days}, val_days: {val_days}, test_days: {test_days}')
print('sum: ', train_days + val_days + test_days)

train_original_data = data3[:, :, :train_days]
val_original_data = data3[:, :, train_days:train_days + val_days]
test_original_data = data3[:, :, train_days + val_days:]
print(f'train_original_data: {train_original_data.shape}, \n'
      f'val_original_data: {val_original_data.shape}, \n'
      f'test_original_data: {test_original_data.shape}.')

day_len: 144, train_days: 28800, val_days: 2880, test_days: 3600
sum:  35280
train_original_data: (134, 10, 28800), 
val_original_data: (134, 10, 2880), 
test_original_data: (134, 10, 3600).


In [7]:
X = train_original_data  # (134, 10, 28800)
num_timesteps_input = 12   # 288极易爆内存
num_timesteps_output = 3
target_col = -1
indices = [(i, i + (num_timesteps_input + num_timesteps_output)) 
           for i in range(X.shape[2] - (num_timesteps_input + num_timesteps_output) + 1)]

print(indices[:10])

features, target = [], []
for i, j in indices:
    features.append(X[:, :, i:i + num_timesteps_input].transpose(0, 2, 1)) # 前num_timesteps_input个
    target.append(X[:, target_col, i + num_timesteps_input: j])  # 后num_timesteps_output个

features = np.array(features)
target = np.array(target)
print(f'features: {features.shape}, target: {target.shape}')

[(0, 15), (1, 16), (2, 17), (3, 18), (4, 19), (5, 20), (6, 21), (7, 22), (8, 23), (9, 24)]
features: (28786, 134, 12, 10), target: (28786, 134, 3)


In [34]:
batch_size = 128
for i in range(0, len(indices), batch_size):
    indices_part = indices[i:i + batch_size]
    print(len(indices_part))


128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
128
114


In [35]:
def generate_dataset(X, indices, input_time_steps=12, output_time_steps=3, target_col=-1, to_tensor=True):
    
    # [(i, j)]: i: window start point, j: window end point
    # indices = [(i, i + (input_time_steps + output_time_steps)) 
    #        for i in range(X.shape[2] - (input_time_steps + output_time_steps) + 1)] 
    
    features, target = [], []
    for i, j in indices:
        features.append(X[:, :, i:i + input_time_steps].transpose(0, 2, 1)) # 前num_timesteps_input个
        target.append(X[:, target_col, i + input_time_steps: j])  # 后num_timesteps_output个
        
    if to_tensor:
        return torch.from_numpy(np.array(features)), torch.from_numpy(np.array(target))
    else:
        return np.array(features, target)

batch_size = 128
indices_part = indices[0:batch_size]
num_timesteps_input = 288
num_timesteps_output = 288
X_train, Y_train = generate_dataset(train_original_data, indices_part, num_timesteps_input, num_timesteps_output)
print(f'X_train: {X_train.shape}, Y_train: {Y_train.shape}')

X_train: torch.Size([128, 134, 288, 10]), Y_train: torch.Size([128, 134, 0])


In [31]:
from sklearn.metrics.pairwise import euclidean_distances
def get_adjency_matrix(data_path, threshold=1000):
    # 读取数据集
    df = pd.read_csv(data_path)

    # 提取坐标列
    coordinates = df[['x', 'y']].values

    # 计算欧氏距离矩阵
    distance_matrix = euclidean_distances(coordinates)

    # 将距离矩阵转换为邻接矩阵（可以根据阈值定义邻接关系）
    # threshold = 10  # 距离阈值，根据具体情况调整
    adjacency_matrix = (distance_matrix < threshold).astype(int)

    return adjacency_matrix

def get_normalized_adj(A):
    """
    度规范化邻接矩阵，有助于在图神经网络中更好地处理不同节点度之间的差异，使得信息传递更为平滑和稳定。
    Returns the degree normalized adjacency matrix.
    """

    # 将对角线元素设为1，考虑自身因素
    A = A + np.diag(np.ones(A.shape[0], dtype=np.float32))

    # 计算度矩阵D
    D = np.array(np.sum(A, axis=1)).reshape((-1,))

    # 将度矩阵D中小于10e-5的元素设为10e-5，避免除零错误
    D[D <= 10e-5] = 10e-5    # Prevent infs

    # 计算度矩阵D的倒数（逆）
    diag = np.reciprocal(np.sqrt(D))

    # 度规范化矩阵A
    # \hat{A} = D^{-\frac{1}{2}} A D^{-\frac{1}{2}} 
    A_wave = np.multiply(np.multiply(diag.reshape((-1, 1)), A),
                         diag.reshape((1, -1)))
    
    return A_wave

In [32]:
location_path = '../data/sdwpf_baidukddcup2022_turb_location.CSV'
A = get_adjency_matrix(location_path)
A_wave = get_normalized_adj(A)
A_wave.shape

(134, 134)

In [58]:
from models import STGCN

# A_wave = torch.from_numpy(A_wave)
net = STGCN(A_wave.shape[0],
            X_train.shape[3],
            num_timesteps_input,
            num_timesteps_output)

# device = torch.device('cpu')
# print(f'device: {device}')
# A_wave = A_wave.to(device)
# X_train = X_train.to(device)
# net.to(device)

x = X_train[:5]
x = x.to(torch.float32)
print(x.shape)  # torch.Size([5, 134, 12, 10])

A_wave = A_wave.to(torch.float32)
out = net(A_wave, x)
out.shape

torch.Size([5, 134, 12, 10])


torch.Size([5, 134, 3])

In [51]:
# 代码如下：
import torch.nn as nn
num_features = X_train.shape[-1]
conv1 = nn.Conv2d(num_features, 64, (1, 3)).to(torch.float32)
conv2 = nn.Conv2d(num_features, 64, (1, 3))

x = X_train[:5]
print(x.shape)  # torch.Size([5, 134, 12, 10])
x = x.permute(0, 3, 1, 2)
print(x.shape)  # torch.Size([5, 10, 134, 12])
x = x.to(torch.float32)
temp = conv1(x) + torch.sigmoid(conv2(x))
print(temp.shape)

# 但是 temp = conv1(x) + torch.sigmoid(conv2(x)) 这行代码报错：
# RuntimeError: Input type (double) and bias type (float) should be the same
# 可以看看是为什么吗？

torch.Size([5, 134, 12, 10])
torch.Size([5, 10, 134, 12])
torch.Size([5, 64, 134, 10])


In [37]:
print(type(A_wave), type(X_train))
print(A_wave.dtype, X_train.dtype, Y_train.dtype)

<class 'torch.Tensor'> <class 'torch.Tensor'>
torch.float64 torch.float64 torch.float64
