In [15]:
import pandas as pd
import numpy as np

import torch
import torch.nn.functional as F

from sklearn.metrics.pairwise import cosine_similarity

# ***Data Preprocessing***

In [2]:
# Read the evaluation sales table, then keep only the rows whose cat_id
# is "HOBBIES". The trailing expression displays the filtered shape.
df = pd.read_csv('m5-forecasting-accuracy/sales_train_evaluation.csv')
is_hobbies = df['cat_id'].eq('HOBBIES')
df_hobbies = df.loc[is_hobbies]
df_hobbies.shape

(5650, 1947)

In [3]:
# Check df_hobbies for missing values. The per-row mask is computed once and
# reused both for the yes/no decision and for listing the affected rows.
rows_with_nan = df_hobbies.isnull().any(axis=1)
if not rows_with_nan.any():
    print("CSV 檔案中不存在 null 或 NaN 值。")
else:
    print("CSV 檔案中存在 null 或 NaN 值。")
    # Show every row that holds at least one null/NaN entry.
    print("包含 NaN 的行：")
    print(df_hobbies[rows_with_nan])

CSV 檔案中不存在 null 或 NaN 值。


In [4]:
# Step 3: collect the names of all day columns (those prefixed with "d_").
is_day_column = df_hobbies.columns.str.startswith('d_')
date_columns = df_hobbies.columns[is_day_column].tolist()
total_days = len(date_columns)  # how many day columns were found

# Step 4: keep only the per-day sales values.
sales_data_days = df_hobbies.loc[:, date_columns]

print(sales_data_days.shape)  # expected: (number of rows, total_days)

(5650, 1941)


In [5]:
# Write the day-level sales table to a CSV file without the index column.
# Any failure is reported on stdout rather than raised, so later cells still run.
hobbies_csv_path = "hobbies.csv"
try:
    sales_data_days.to_csv(hobbies_csv_path, index=False)
    print("File saved successfully")
except Exception as err:
    print(f"An error occurred: {err}")

File saved successfully


In [6]:
# Step 5: turn the daily sales table into float tensors. Features are every
# day column except the last; labels are every day column except the first,
# i.e. the same series shifted forward by one day.
feature_days = sales_data_days.iloc[:, :-1].to_numpy()
label_days = sales_data_days.iloc[:, 1:].to_numpy()
node_features_tensor = torch.tensor(feature_days, dtype=torch.float)
labels_tensor = torch.tensor(label_days, dtype=torch.float)

# Both tensors have one row per item and (number of day columns - 1) columns.
print("Node features shape:", node_features_tensor.shape)
print("Labels shape:", labels_tensor.shape)

# Step 6: pairwise cosine similarity between items. Each row is scaled to
# unit L2 norm first, so the product of the scaled rows yields the cosines.
normalized_features = F.normalize(node_features_tensor, p=2, dim=1)
cosine_sim_matrix = normalized_features @ normalized_features.t()

# Step 7: item pairs whose similarity exceeds the threshold become edges.
# The diagonal (an item compared with itself) is not excluded here.
threshold = 0.5
edge_mask = cosine_sim_matrix > threshold
adj_matrix = edge_mask.float()

# Step 8: coordinates of the selected pairs, laid out as a [2, num_edges] edge_index.
edge_index = edge_mask.nonzero(as_tuple=False).t()

# Step 9: similarity value of each selected pair, in the same order as edge_index.
edge_attr = cosine_sim_matrix[edge_mask]

# Everything above stays a PyTorch tensor; no NumPy round-trip is needed.
print("Edge index shape:", edge_index.shape)
print("Edge attr shape:", edge_attr.shape)

Node features shape: torch.Size([5650, 1940])
Labels shape: torch.Size([5650, 1940])
Edge index shape: torch.Size([2, 47624])
Edge attr shape: torch.Size([47624])


In [7]:
# Step 10: cut the feature and label series into consecutive, non-overlapping
# windows of `timesteps` days each.
timesteps = 1  # window length in days; this affects the running speed on CPU/GPU
# One node per item. Read the count from the tensor instead of hard-coding it,
# so the reshape below stays correct if the filtered item set changes.
num_nodes = node_features_tensor.size(0)
num_features = 1  # number of values recorded per item per day

# View the features as [num_nodes, num_features, num_days].
x = node_features_tensor.view(num_nodes, num_features, -1)

# View the labels as [num_nodes, num_days].
y = labels_tensor.view(num_nodes, -1)

# Slice the features into windows of shape [num_nodes, num_features, timesteps].
# Only full windows are produced: trailing days that do not fill a window are
# dropped. Each window is converted to a NumPy array.
features_list = [
    x[:, :, start:start + timesteps].numpy()
    for start in range(0, x.size(2) - timesteps + 1, timesteps)
]

# Slice the labels the same way into windows of shape [num_nodes, timesteps],
# also as NumPy arrays.
targets_list = [
    y[:, start:start + timesteps].numpy()
    for start in range(0, y.size(1) - timesteps + 1, timesteps)
]

In [11]:
# Bundle the graph structure (edge_index, edge_attr) with the windowed
# features and targets, then write them to disk as a single file.
dataset_components = {
    'edge_index': edge_index,
    'edge_attr': edge_attr,
    'features': features_list,
    'targets': targets_list,
}
torch.save(dataset_components, 'static_graph_temporal_signal.pt')

In [14]:
# Save the adjacency matrix to a .npy file.
# adj_matrix is a torch tensor, so convert it to a NumPy array explicitly
# instead of relying on np.save's implicit conversion, which fails for
# tensors that live on a GPU or require grad.
np.save('adj_matrix.npy', adj_matrix.detach().cpu().numpy())