# <div style="text-align:center; padding:15px; color:white; margin:0; font-size:150%; font-family:'Times New Roman'; background-color:#7E8083   ; overflow:hidden"><b>Import libraries and load data</b></div>

In [2]:
import os

import torch
import torch.nn as nn
import numpy as np
import pandas as pd

In [3]:
max_length = 490  # 输入序列的最大长度
input_size = 23  # 特征数量（根据你的数据集调整）
lstm_cell = 64  # LSTM 单元数
output_size = 5  # 预测的 Commit Count
dir = '../../data/predict/dataset'

In [4]:
tensor_list = []
tar_list = []

def count_files_in_directory(directory):
    # 获取目录下的所有文件和文件夹
    with os.scandir(directory) as entries:
        for entry in entries:
            if entry.is_file():
                df = pd.read_csv(entry)
                df = df.drop(columns=['Target'])
                
                tensor_list.append(torch.tensor(df.values, dtype=torch.float32))

    if tensor_list:
        max_len = max(t.size(0) for t in tensor_list)
        padded_tensors = [torch.cat([t, torch.zeros(max_len - t.size(0), t.size(1))]) for t in tensor_list]
        x_tensor = torch.stack(padded_tensors)
    else:
        x_tensor = torch.tensor([])

    return x_tensor

X_train_tensor = count_files_in_directory(dir)

print(X_train_tensor.size())

torch.Size([224, 490, 24])


In [4]:
def create_sliding_window(data, window_size, target_cols):
    """
    将时间序列数据分割成滑动窗口形式。
    
    Args:
        data: shape (samples, time_steps, features)，输入数据张量。
        window_size: 滑动窗口大小。
        target_cols: 目标列索引列表（多任务）。
    
    Returns:
        X_windows: shape (num_windows, window_size, features)，输入窗口。
        y_windows: shape (num_windows, len(target_cols))，目标值。
    """
    X_windows, y_windows = [], []
    samples, time_steps, features = data.shape

    for sample_idx in range(samples):
        for start_idx in range(time_steps - window_size):
            # 滑动窗口的输入特征
            X_windows.append(data[sample_idx, start_idx:start_idx + window_size, :])
            # 滑动窗口的目标值
            y_windows.append(data[sample_idx, start_idx + window_size, target_cols])
    
    return torch.tensor([np.array(t) for t in X_windows], dtype=torch.float32), torch.tensor(y_windows, dtype=torch.float32)


# 构造滑动窗口
X_windows, y_windows = create_sliding_window(X_train_tensor, 5, [0])
print(f"X_windows shape: {X_windows.shape}")  # (num_windows, window_size, features)
print(f"y_windows shape: {y_windows.shape}")  # (num_windows, len(target_cols))



X_windows shape: torch.Size([104760, 5, 24])
y_windows shape: torch.Size([104760])


  return torch.tensor([np.array(t) for t in X_windows], dtype=torch.float32), torch.tensor(y_windows, dtype=torch.float32)


# <div style="text-align:center; padding:15px; color:white; margin:0; font-size:150%; font-family:'Times New Roman'; background-color:#7E8083   ; overflow:hidden"><b>Data</b></div>

In [None]:
class MultiTaskBiLSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_tasks, dropout=0.3):
        """
        多任务双向 LSTM 模型
        
        Args:
            input_size: 输入特征数量。
            hidden_size: LSTM 隐藏单元数。
            num_layers: LSTM 层数。
            num_tasks: 任务数量（即目标列数量）。
            dropout: Dropout 比例。
        """
        super(MultiTaskBiLSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.num_tasks = num_tasks
        
        # 共享双向 LSTM 层
        self.bilstm = nn.LSTM(input_size, hidden_size, num_layers, 
                              batch_first=True, bidirectional=True, dropout=dropout)
        
        # 为每个任务定义独立的全连接层
        self.task_layers = nn.ModuleList([nn.Linear(hidden_size * 2, 1) for _ in range(num_tasks)])
    
    def forward(self, x):
        """
        Args:
            x: 输入张量，形状为 (batch_size, time_steps, input_size)。
        
        Returns:
            outputs: 每个任务对应的输出列表，长度为 num_tasks, 每个元素形状为 (batch_size,)。
        """
        # LSTM 前向传播
        lstm_out, _ = self.bilstm(x)  # lstm_out shape: (batch_size, time_steps, hidden_size * 2)
        
        # 取最后一个时间步的输出
        last_out = lstm_out[:, -1, :]  # shape: (batch_size, hidden_size * 2)
        
        # 每个任务的输出
        outputs = [layer(last_out) for layer in self.task_layers]
        outputs = torch.cat(outputs, dim=1).view(-1)  # 将每个任务的输出拼接成一个张量
        return outputs

# 实例化模型
model = MultiTaskBiLSTM(23, lstm_cell, 2, 1)

# 打印模型结构
print(model)

# 定义损失函数和优化器
criterion = nn.MSELoss()  # 对于回归任务，使用均方误差损失
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

MultiTaskBiLSTM(
  (bilstm): LSTM(23, 64, num_layers=2, batch_first=True, dropout=0.3, bidirectional=True)
  (task_layers): ModuleList(
    (0): Linear(in_features=128, out_features=1, bias=True)
  )
)


: 

In [None]:
# 训练循环示例
num_epochs = 50
model.train()
for epoch in range(num_epochs):
    optimizer.zero_grad()
    outputs = model(X_windows)  # 前向传播
    print(outputs)
    print(y_windows)
    loss = criterion(outputs, y_windows)  # 计算损失
    loss.backward()  # 反向传播
    optimizer.step()  # 更新权重
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

tensor([nan, nan, nan,  ..., nan, nan, nan], grad_fn=<ViewBackward0>)
tensor([32.,  3.,  7.,  ...,  0.,  0.,  0.])
