In [None]:
import pandas as pd
import torch
import numpy as np
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import KFold

df = pd.read_csv("/opt/hdd_1/research_hub/csr_project/Green_patent_dataset/merged_dataset/csr_embeddings_forward_count_value.csv")
company_ids = df['ticker'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

folds = []
for train_idx, test_idx in kf.split(company_ids):
    train_companies = company_ids[train_idx]
    test_companies = company_ids[test_idx]
    folds.append((train_companies, test_companies))


In [14]:
class CompanySequenceDataset(torch.utils.data.Dataset):
    def __init__(self, df, company_list):
        self.df = df[df['ticker'].isin(company_list)].copy()
        self.company_groups = self.df.groupby('ticker')

        self.x_seq = []
        self.y_seq = []
        self.lengths = []

        for _, group in self.company_groups:
            group = group.sort_values("year")
            x = torch.tensor(group[[f'dim_{i}' for i in range(1024)]].values, dtype=torch.float32)
            y = torch.tensor(group[['patents_count', 'total_5yr_forward_citations', 'total_values_real']].values, dtype=torch.float32)
            self.x_seq.append(x)
            self.y_seq.append(y)
            self.lengths.append(len(group))

        # padding
        self.x_seq = pad_sequence(self.x_seq, batch_first=True)  # [B, max_seq_len, 1024]
        self.y_seq = pad_sequence(self.y_seq, batch_first=True)  # [B, max_seq_len, 3]

    def __len__(self):
        return len(self.lengths)

    def __getitem__(self, idx):
        return self.x_seq[idx], self.y_seq[idx], self.lengths[idx]


In [15]:
import torch.nn as nn

class LSTM_MTL(nn.Module):
    def __init__(self, input_dim=1024, hidden_dim=256):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.head_count = nn.Linear(hidden_dim, 1)
        self.head_citation = nn.Linear(hidden_dim, 1)
        self.head_value = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        lstm_out, _ = self.lstm(x)  # [B, T, H]
        y1 = self.head_count(lstm_out).squeeze(-1)    # [B, T]
        y2 = self.head_citation(lstm_out).squeeze(-1)
        y3 = self.head_value(lstm_out).squeeze(-1)
        return y1, y2, y3


In [16]:
def compute_loss(y_pred, y_true, lengths):
    y1_pred, y2_pred, y3_pred = y_pred
    y1_true = y_true[:,:,0]
    y2_true = y_true[:,:,1]
    y3_true = y_true[:,:,2]

    loss_fn = nn.MSELoss(reduction='none')
    mask = torch.arange(y_true.shape[1])[None, :].to(lengths.device) < lengths[:, None]

    loss1 = loss_fn(y1_pred, y1_true) * mask
    loss2 = loss_fn(y2_pred, y2_true) * mask
    loss3 = loss_fn(y3_pred, y3_true) * mask

    total_loss = (loss1 + loss2 + loss3).sum() / mask.sum()
    return total_loss


In [None]:
def smape(y_pred, y_true, mask):
    numerator = torch.abs(y_pred - y_true)
    denominator = (torch.abs(y_pred) + torch.abs(y_true)) / 2
    smape = numerator / (denominator + 1e-8)
    return (smape * mask).sum() / mask.sum()

In [None]:
def evaluate_metrics(model, dataloader):
    model.eval()

    y_true_total = []
    y_pred_total = []

    with torch.no_grad():
        for x_batch, y_batch, lengths in dataloader:
            x_batch, y_batch, lengths = x_batch.to(device), y_batch.to(device), lengths.to(device)
            y1_pred, y2_pred, y3_pred = model(x_batch)
            y_pred = torch.stack([y1_pred, y2_pred, y3_pred], dim=2)  # [B, T, 3]

            mask = torch.arange(y_batch.shape[1])[None, :].to(lengths.device) < lengths[:, None]
            mask = mask.unsqueeze(-1).expand_as(y_batch)  # [B, T, 3]

            y_true_total.append(y_batch[mask])
            y_pred_total.append(y_pred[mask])

    # 拼接所有 batch 結果
    y_true_all = torch.cat(y_true_total, dim=0).cpu().numpy()  # [N]
    y_pred_all = torch.cat(y_pred_total, dim=0).cpu().numpy()  # [N]

    metrics = {}

    for i, name in enumerate(["count", "citation", "value"]):
        y_true_i = y_true_all[i::3]
        y_pred_i = y_pred_all[i::3]

        mse = np.mean((y_pred_i - y_true_i) ** 2)
        mae = np.mean(np.abs(y_pred_i - y_true_i))
        rmse = np.sqrt(mse)
        smape_i = np.mean(2 * np.abs(y_pred_i - y_true_i) / (np.abs(y_pred_i) + np.abs(y_true_i) + 1e-8))

        metrics[name] = {
            "MSE": mse,
            "MAE": mae,
            "RMSE": rmse,
            "SMAPE": smape_i
        }

    return metrics

In [None]:
for fold_id, (train_coms, test_coms) in enumerate(folds):
    train_dataset = CompanySequenceDataset(df, train_coms)
    test_dataset = CompanySequenceDataset(df, test_coms)

    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

    model = LSTM_MTL().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    for epoch in range(20):
        model.train()
        for x_batch, y_batch, lengths in train_loader:
            x_batch, y_batch, lengths = x_batch.to(device), y_batch.to(device), lengths.to(device) 
            preds = model(x_batch)
            loss = compute_loss(preds, y_batch, lengths)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
        metrics = evaluate_metrics(model, val_loader)
    print(f"[Fold {fold_id}] Epoch {epoch} | Train Loss: {loss.item():.4f}")
    for task, task_metrics in metrics.items():
        print(f"  [{task}] MSE: {task_metrics['MSE']:.2f}, MAE: {task_metrics['MAE']:.2f}, RMSE: {task_metrics['RMSE']:.2f}, SMAPE: {task_metrics['SMAPE']:.2f}")



[Fold 0] Epoch 19 | Train Loss: 2712.5825 | Val Loss: 113072.1212
[Fold 1] Epoch 19 | Train Loss: 345181.8438 | Val Loss: 194926.8376
[Fold 2] Epoch 19 | Train Loss: 2221.2410 | Val Loss: 735801.4321
[Fold 3] Epoch 19 | Train Loss: 4980.2764 | Val Loss: 29732.5675
[Fold 4] Epoch 19 | Train Loss: 298545.4062 | Val Loss: 31078.2586
