In [59]:
import pandas as pd
import torch
import numpy as np
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import KFold

df = pd.read_csv("../data/csr_embeddings_leq2019_filled_gp.csv")
company_ids = df['ticker'].unique()
kf = KFold(n_splits=5, shuffle=True, random_state=42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

folds = []
for train_idx, test_idx in kf.split(company_ids):
    train_companies = company_ids[train_idx]
    test_companies = company_ids[test_idx]
    folds.append((train_companies, test_companies))

In [60]:
df

Unnamed: 0,file_name,dim_0,dim_1,dim_2,dim_3,dim_4,dim_5,dim_6,dim_7,dim_8,...,dim_1019,dim_1020,dim_1021,dim_1022,dim_1023,ticker,year,total_5yr_forward_citations,patents_count,total_values_real
0,NASDAQ_AAPL_2014,-0.091830,0.293040,-0.179719,0.083607,-0.032453,0.012720,-0.027948,0.030055,0.105388,...,-0.223406,-0.325722,0.009091,-0.170071,0.163041,AAPL,2014,751.0,110.0,4231.820586
1,NASDAQ_AAPL_2015,-0.085034,0.276596,-0.139275,0.068203,-0.015527,-0.055515,-0.033100,0.032737,0.124428,...,-0.212523,-0.331629,-0.030166,-0.150414,0.180043,AAPL,2015,910.0,122.0,4498.666235
2,NASDAQ_AAPL_2016,-0.067159,0.277524,-0.148442,0.092723,-0.000616,-0.040362,-0.032554,0.009260,0.143966,...,-0.216122,-0.327691,-0.037364,-0.151454,0.178229,AAPL,2016,892.0,123.0,4672.332352
3,NASDAQ_AAPL_2017,-0.053141,0.259755,-0.160064,0.098964,-0.006857,-0.001506,-0.052370,0.012755,0.146752,...,-0.213012,-0.317366,-0.034005,-0.147262,0.175490,AAPL,2017,801.0,96.0,4542.854141
4,NASDAQ_AAPL_2018,-0.070878,0.338528,-0.095608,0.093072,-0.027314,-0.033892,-0.052765,0.021670,0.055076,...,-0.190008,-0.347579,-0.004540,-0.125971,0.124562,AAPL,2018,381.0,88.0,3391.064455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1097,NYSE_XOM_2019,-0.050290,0.340054,-0.000729,-0.018335,-0.089113,0.006662,-0.126592,-0.014879,0.054125,...,-0.087151,-0.416121,-0.037118,-0.023944,0.063998,XOM,2019,0.0,0.0,0.000000
1098,NYSE_XRX_2016,-0.094249,0.258920,-0.112034,-0.031890,-0.040051,0.081830,-0.113565,-0.021055,0.120458,...,-0.078453,-0.269516,0.010672,-0.152912,0.172861,XRX,2016,10.0,4.0,18.897989
1099,NYSE_XRX_2017,-0.116771,0.273073,0.085469,-0.172837,-0.261041,-0.295415,-0.034079,0.221862,0.033038,...,-0.213148,-0.552106,-0.062122,-0.010183,0.144048,XRX,2017,56.0,6.0,4.897709
1100,NYSE_XRX_2018,-0.120495,0.256214,-0.124283,-0.008754,-0.040228,0.083673,-0.134553,-0.033440,0.127288,...,-0.078939,-0.259030,0.024605,-0.203625,0.167591,XRX,2018,0.0,0.0,0.000000


In [61]:
class SlidingWindowDataset(torch.utils.data.Dataset):
    def __init__(self, df, company_list, window_size=5):
        self.x_seq = []
        self.y_seq = []

        df = df[df['ticker'].isin(company_list)].copy()
        for _, group in df.groupby('ticker'):
            group = group.sort_values('year')
            x_all = torch.tensor(group[[f'dim_{i}' for i in range(1024)]].values, dtype=torch.float32)
            y_all = torch.tensor(group[['patents_count', 'total_5yr_forward_citations', 'total_values_real']].values, dtype=torch.float32)
            y_all = torch.log1p(y_all)  # log transform
            # 5 years of data for x, 1 year of data for y
            for i in range(len(group) - window_size):
                self.x_seq.append(x_all[i:i+window_size])       # [5, 1024]
                self.y_seq.append(y_all[i+window_size])         # [3]

    def __len__(self):
        return len(self.x_seq)

    def __getitem__(self, idx):
        return self.x_seq[idx], self.y_seq[idx]

In [62]:
# --- LSTM + MTL ---
class LSTM_MTL_Stepwise(nn.Module):
    def __init__(self, input_dim=1024, hidden_dim=256):
        super().__init__()
        self.lstm_cell = nn.LSTMCell(input_dim, hidden_dim)
        self.head_count = nn.Linear(hidden_dim, 1)
        self.head_citation = nn.Linear(hidden_dim, 1)
        self.head_value = nn.Linear(hidden_dim, 1)

    def forward(self, x):  # x: [B, 5, 1024]
        B, T, D = x.shape
        h, c = torch.zeros(B, 256).to(x.device), torch.zeros(B, 256).to(x.device)
        for t in range(T):
            h, c = self.lstm_cell(x[:, t, :], (h, c))  # 單步 forward 且更新 state
        # 最終 t=5 時的 h 用來預測第6年
        y1 = self.head_count(h)
        y2 = self.head_citation(h)
        y3 = self.head_value(h)
        return torch.cat([y1, y2, y3], dim=1)  # [B, 3]

In [None]:
# --- Metrics (expm1 to reverse log1p) ---
def compute_metrics(y_pred, y_true):
    y_pred = torch.expm1(y_pred).cpu().numpy()
    y_true = torch.expm1(y_true).cpu().numpy()
    metrics = {}
    for i, name in enumerate(["count", "citation", "value"]):
        y_p, y_t = y_pred[:, i], y_true[:, i]
        mse = np.mean((y_p - y_t) ** 2)
        mae = np.mean(np.abs(y_p - y_t))
        rmse = np.sqrt(mse)
        smape = np.mean(2 * np.abs(y_p - y_t) / (np.abs(y_p) + np.abs(y_t) + 1e-8))
        metrics[name] = {"MSE": mse, "MAE": mae, "RMSE": rmse, "SMAPE": smape}
    return metrics

In [None]:
# --- Training ---
company_ids = df['ticker'].unique()
# 5 folds 
kf = KFold(n_splits=5, shuffle=True, random_state=42)
folds = [(company_ids[train], company_ids[test]) for train, test in kf.split(company_ids)]

for fold_id, (train_coms, test_coms) in enumerate(folds):
    train_dataset = SlidingWindowDataset(df, train_coms)
    test_dataset = SlidingWindowDataset(df, test_coms)
    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = torch.utils.data.DataLoader(test_dataset, batch_size=16, shuffle=False)

    model = LSTM_MTL_Stepwise().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.MSELoss()

    for epoch in range(100):
        model.train()
        for x_batch, y_batch in train_loader:
            x_batch, y_batch = x_batch.to(device), y_batch.to(device)
            
            preds = model(x_batch)  # [B, 3]
            # backward
            loss = loss_fn(preds, y_batch)
            # Apply sample weights
            # Compute the mean loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        model.eval()
        all_preds, all_y = [], []
        with torch.no_grad():
            for x_batch, y_batch in val_loader:
                x_batch, y_batch = x_batch.to(device), y_batch.to(device)
                preds = model(x_batch)
                all_preds.append(preds)
                all_y.append(y_batch)
        all_preds = torch.cat(all_preds)
        all_y = torch.cat(all_y)
        metrics = compute_metrics(all_preds, all_y)

        print(f"[Fold {fold_id}]  {epoch} | Train Loss: {loss.item():.4f}")
        for task, vals in metrics.items():
            print(f"  [{task}] MSE: {vals['MSE']:.2f}, MAE: {vals['MAE']:.2f}, RMSE: {vals['RMSE']:.2f}, SMAPE: {vals['SMAPE']:.2f}")


(35, 3) (35, 3)
[Fold 0]  0 | Train Loss: 1.2831
  [count] MSE: 2529.21, MAE: 26.77, RMSE: 50.29, SMAPE: 1.05
  [citation] MSE: 122375.19, MAE: 190.28, RMSE: 349.82, SMAPE: 1.30
  [value] MSE: 913238.00, MAE: 681.29, RMSE: 955.63, SMAPE: 1.14
(35, 3) (35, 3)
[Fold 0]  1 | Train Loss: 0.6212
  [count] MSE: 2644.71, MAE: 27.33, RMSE: 51.43, SMAPE: 1.09
  [citation] MSE: 124844.90, MAE: 191.25, RMSE: 353.33, SMAPE: 1.30
  [value] MSE: 941444.31, MAE: 689.23, RMSE: 970.28, SMAPE: 1.15
(35, 3) (35, 3)
[Fold 0]  2 | Train Loss: 1.0120
  [count] MSE: 2491.46, MAE: 26.61, RMSE: 49.91, SMAPE: 1.04
  [citation] MSE: 118842.69, MAE: 188.93, RMSE: 344.74, SMAPE: 1.28
  [value] MSE: 933068.38, MAE: 686.80, RMSE: 965.95, SMAPE: 1.15
(35, 3) (35, 3)
[Fold 0]  3 | Train Loss: 1.2684
  [count] MSE: 2404.15, MAE: 26.32, RMSE: 49.03, SMAPE: 1.02
  [citation] MSE: 117190.62, MAE: 188.35, RMSE: 342.33, SMAPE: 1.28
  [value] MSE: 909829.56, MAE: 680.12, RMSE: 953.85, SMAPE: 1.13
(35, 3) (35, 3)
[Fold 0]  4 

In [None]:
train_dataset = SlidingWindowDataset(df, train_coms)
test_dataset = SlidingWindowDataset(df, test_coms)

print(f"Fold {fold_id} | Train samples: {len(train_dataset)} | Test samples: {len(test_dataset)}")

Fold 4 | Train samples: 360 | Test samples: 66


In [67]:
def company_sample_breakdown(df, company_list, window_size=5):
    summary = []
    for ticker in company_list:
        group = df[df['ticker'] == ticker].sort_values('year')
        years = group['year'].nunique()
        samples = max(0, years - window_size)
        summary.append({
            'ticker': ticker,
            'years_available': years,
            'samples_generated': samples
        })
    return pd.DataFrame(summary).sort_values('samples_generated', ascending=False)

# Example for Fold 4 (train)
train_coms = folds[4][0]
test_coms = folds[4][1]

train_breakdown = company_sample_breakdown(df, train_coms)
test_breakdown = company_sample_breakdown(df, test_coms)

print(f"Train Breakdown (Fold 4):")
print(train_breakdown)
print(f"\nTotal Samples (Train): {train_breakdown['samples_generated'].sum()}")

print(f"\nTest Breakdown (Fold 4):")
print(test_breakdown)
print(f"\nTotal Samples (Test): {test_breakdown['samples_generated'].sum()}")


Train Breakdown (Fold 4):
    ticker  years_available  samples_generated
127     PG               21                 16
37    SBUX               19                 14
78     CVX               18                 13
96     IBM               18                 13
119    NKE               17                 12
..     ...              ...                ...
139    TEL                2                  0
147    VMW                4                  0
145    UTX                1                  0
144    TXT                2                  0
151    WNC                1                  0

[152 rows x 3 columns]

Total Samples (Train): 360

Test Breakdown (Fold 4):
   ticker  years_available  samples_generated
9       A               20                 15
35    WMT               15                 10
5     STX               14                  9
32    TNC               11                  6
36    XOM               10                  5
28    ORA                9                  4
26    LMT 