In [8]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import KFold

df = pd.read_csv("../data/csr_embeddings_leq2019_filled_gp.csv")
company_ids = df['ticker'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

folds = []
for train_idx, test_idx in kf.split(company_ids):
    train_companies = company_ids[train_idx]
    test_companies = company_ids[test_idx]
    folds.append((train_companies, test_companies))

In [None]:
class SlidingWindowDataset(torch.utils.data.Dataset):
    def __init__(self, df, company_list, window_size=5):
        self.x_seq = []
        self.y_seq = []

        df = df[df['ticker'].isin(company_list)].copy()
        for _, group in df.groupby('ticker'):
            group = group.sort_values('year')
            x_all = torch.tensor(group[[f'dim_{i}' for i in range(1024)]].values, dtype=torch.float32)
            y_all = torch.tensor(group[['patents_count', 'total_5yr_forward_citations', 'total_values_real']].values, dtype=torch.float32)
            y_all = torch.log1p(y_all)  # log transform
            # 5 years of data for x, 1 year of data for y
            for i in range(len(group) - window_size):
                self.x_seq.append(x_all[i:i+window_size])       # [5, 1024]
                self.y_seq.append(y_all[i+window_size])         # [3]

    def __len__(self):
        return len(self.x_seq)

    def __getitem__(self, idx):
        return self.x_seq[idx], self.y_seq[idx]

In [10]:
# --- LSTM + MTL ---
class LSTM_MTL_Stepwise(nn.Module):
    def __init__(self, input_dim=1024, hidden_dim=256):
        super().__init__()
        self.lstm_cell = nn.LSTMCell(input_dim, hidden_dim)
        self.head_count = nn.Linear(hidden_dim, 1)
        self.head_citation = nn.Linear(hidden_dim, 1)
        self.head_value = nn.Linear(hidden_dim, 1)

    def forward(self, x):  # x: [B, 5, 1024]
        B, T, D = x.shape
        h, c = torch.zeros(B, 256).to(x.device), torch.zeros(B, 256).to(x.device)
        for t in range(T):
            h, c = self.lstm_cell(x[:, t, :], (h, c))  # 單步 forward 且更新 state
        # 最終 t=5 時的 h 用來預測第6年
        y1 = self.head_count(h)
        y2 = self.head_citation(h)
        y3 = self.head_value(h)
        return torch.cat([y1, y2, y3], dim=1)  # [B, 3]

In [11]:
# --- Metrics (expm1 to reverse log1p) ---
def compute_metrics(y_pred, y_true):
    y_pred = torch.expm1(y_pred).cpu().numpy()
    y_true = torch.expm1(y_true).cpu().numpy()
    metrics = {}
    for i, name in enumerate(["count", "citation", "value"]):
        y_p, y_t = y_pred[:, i], y_true[:, i]
        mse = np.mean((y_p - y_t) ** 2)
        mae = np.mean(np.abs(y_p - y_t))
        rmse = np.sqrt(mse)
        smape = np.mean(2 * np.abs(y_p - y_t) / (np.abs(y_p) + np.abs(y_t) + 1e-8))
        metrics[name] = {"MSE": mse, "MAE": mae, "RMSE": rmse, "SMAPE": smape}
    return metrics

In [None]:
# --- Training ---
company_ids = df['ticker'].unique()
# 5 folds 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = [(company_ids[train], company_ids[test]) for train, test in kf.split(company_ids)]

for fold_id, (train_coms, test_coms) in enumerate(folds):
    train_dataset = SlidingWindowDataset(df, train_coms)
    test_dataset = SlidingWindowDataset(df, test_coms)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

    model = LSTM_MTL_Stepwise().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    for epoch in range(100):
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            preds = model(x_batch)  # [B, 3]
            loss = loss_fn(preds, y_batch)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        all_preds, all_y = [], []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                preds = model(x_batch)
                all_preds.append(preds)
                all_y.append(y_batch)
        all_preds = torch.cat(all_preds)
        all_y = torch.cat(all_y)
        metrics = compute_metrics(all_preds, all_y)

        print(f"[Fold {fold_id}] Epoch {epoch} | Train Loss: {loss.item():.4f}")
        for task, vals in metrics.items():
            print(f"  [{task}] MSE: {vals['MSE']:.2f}, MAE: {vals['MAE']:.2f}, RMSE: {vals['RMSE']:.2f}, SMAPE: {vals['SMAPE']:.2f}")


[Fold 0] Epoch 0 | Train Loss: 3.6867
  [count] MSE: 995.55, MAE: 11.14, RMSE: 31.55, SMAPE: 1.74
  [citation] MSE: 46709.38, MAE: 72.46, RMSE: 216.12, SMAPE: 1.81
  [value] MSE: 388878.31, MAE: 277.10, RMSE: 623.60, SMAPE: 1.90
[Fold 0] Epoch 1 | Train Loss: 5.2801
  [count] MSE: 976.48, MAE: 11.57, RMSE: 31.25, SMAPE: 1.70
  [citation] MSE: 46227.84, MAE: 73.96, RMSE: 215.01, SMAPE: 1.78
  [value] MSE: 385472.72, MAE: 278.94, RMSE: 620.86, SMAPE: 1.86
[Fold 0] Epoch 2 | Train Loss: 6.0120
  [count] MSE: 967.47, MAE: 11.74, RMSE: 31.10, SMAPE: 1.69
  [citation] MSE: 46473.50, MAE: 73.06, RMSE: 215.58, SMAPE: 1.80
  [value] MSE: 390573.31, MAE: 276.02, RMSE: 624.96, SMAPE: 1.92
[Fold 0] Epoch 3 | Train Loss: 3.0335
  [count] MSE: 1016.58, MAE: 10.71, RMSE: 31.88, SMAPE: 1.89
  [citation] MSE: 47094.59, MAE: 71.38, RMSE: 217.01, SMAPE: 1.89
  [value] MSE: 392832.69, MAE: 274.79, RMSE: 626.76, SMAPE: 1.96
[Fold 0] Epoch 4 | Train Loss: 4.6928
  [count] MSE: 981.76, MAE: 11.12, RMSE: 31.3