# BasicLSTM

## Preparation

### Import modules

In [295]:
# Prediction using LSTM, GRU-LSTM, xLSTM
import copy
import math
from typing import List

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from pandas import DataFrame
from sklearn.model_selection import KFold, GroupShuffleSplit
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import DataLoader, Dataset, Subset

import thesis_utils.datastruc as tuds
import thesis_utils.models as tumod

### Configuration

In [296]:
# --- Output persistence ---
SAVE_ENABLED = True
SERIAL_NUMBER = "NOT_SET"

# --- Model hyper-parameters ---
HORIZON = 1
BATCH_SIZE = 128
NUM_EPOCHS = 25
HIDDEN_SIZE = 128
N_LAYERS = 3
DROPOUT = 0.2
EMBEDDING_SIZE = 32

# --- Training setup ---
TARGET = "EXPORT_centered"
# dist cepii categorical columns; later cells append further feature names to this list
FEATURES = [
  "contig",
  "comlang_off",
  "colony",
  "smctry",
]
N_SPLITS = 5
PATIENCE = 5
LEARNING_RATE = 0.01
WEIGHT_DECAY = 0.01
RANDOM_SEED = 16
KEEP_FRAC = 1.0
N_LAGS = 2
SUBSAMPLE_ENABLED = True
N_DYADS = 10

EPS = 1e-19

SANCTION_COLS = ["arms", "military", "trade", "travel", "other"]

# --- Torch setup: seed the global RNG and run everything on the CPU ---
torch.manual_seed(RANDOM_SEED)
device = torch.device("cpu")

### Load Data

In [297]:
# Read the parquet file once; all later cells work on `df`, a deep copy, so
# `processed` keeps the data exactly as loaded.
processed = pd.read_parquet(
  path="../../data/model/processed.parquet",
  engine="fastparquet",
)
df: DataFrame = processed.copy(deep=True)

### Sort, shift and compute data

In [298]:
# Build the dyad key "<reporter>_<partner>", then order rows by dyad and year
# with a fresh 0..n-1 index.
df = (
  df
  .assign(dyad_id=df["ISO3_reporter"] + "_" + df["ISO3_partner"])
  .sort_values(by=["dyad_id", "Year"])
  .reset_index(drop=True)
)

In [299]:
# Optionally restrict the panel to N_DYADS dyads drawn at random without replacement
# (reproducible through RANDOM_SEED).
if SUBSAMPLE_ENABLED:
  dyad_subsample = pd.Series(df["dyad_id"].unique()).sample(n=N_DYADS, random_state=RANDOM_SEED, replace=False)
  # Take an explicit copy: later cells add columns to `df`, and writing into a
  # filtered slice triggers pandas' chained-assignment warnings.
  df = df[df["dyad_id"].isin(dyad_subsample)].copy()
print(df["dyad_id"].nunique())

10


In [300]:
# Row-wise sum over the sanction columns, stored as an integer column.
sanction_total = df[SANCTION_COLS].sum(axis="columns")
df["sanction"] = sanction_total.astype(int)

### Coerce numerical values and convert dyad_id to categorical

In [301]:
# Columns that must be numeric. Values that cannot be parsed become NaN
# (errors="coerce") and the affected rows are then dropped.
num_cols = [
  "distw",
  "GDP_reporter",
  "GDP_partner",
  "sanction",
  "contig",
  "comlang_off",
  "colony",
  "smctry",
  "Year",
]
df[num_cols] = (
  df[num_cols]
  .apply(pd.to_numeric, errors="coerce")
  .astype(float)
)
df = df.dropna(subset=num_cols)

In [302]:
# Year back to integer (it was cast to float during numeric coercion).
df["Year"] = df["Year"].astype(int)
# Turn the key column(s) into categoricals whose categories are the sorted unique values.
for col in ("dyad_id",):
  ordered_levels = sorted(df[col].unique())
  df[col] = pd.Categorical(df[col], categories=ordered_levels)

### Min-max scale and center data

In [303]:
# Min-max scale each listed column to [0, 1] and shift it to [-0.5, 0.5];
# the result is stored in a new "<col>_centered" column.
# NOTE(review): min/max are computed over all rows of `df`, i.e. before the
# train/validation/test split further down — confirm this is intended.
center_columns = ["distw", "GDP_reporter", "GDP_partner", "EXPORT"]
for col in center_columns:
  col_max = df[col].max()
  col_min = df[col].min()
  # A constant column has zero range; dividing by it would fill the new column
  # with NaN and the later df.dropna() would then discard every row.
  # Using 1.0 maps such a column to the constant -0.5 instead.
  col_range = col_max - col_min
  if col_range == 0:
    col_range = 1.0
  df[col + "_centered"] = (df[col] - col_min) / col_range - 0.5

# Guarded so that re-running this cell does not add the feature twice.
if "distw_centered" not in FEATURES:
  FEATURES.append("distw_centered")

In [304]:
# For every lagged column add "<col>_lag1" .. "<col>_lag{N_LAGS}", shifting within
# each dyad so values never cross from one dyad into another.
lag_cols = ["GDP_reporter_centered", "GDP_partner_centered", "sanction"]
for col in lag_cols:
  index = 1
  while index <= N_LAGS:
    shifted = df.groupby("dyad_id", observed=True)[col].shift(index)
    df[f"{col}_lag{index}"] = shifted
    index += 1

In [305]:
# Drop every row that has a NaN in any column (this includes the first N_LAGS
# rows of each dyad, whose lag columns are empty).
df = df.dropna()
# Register the lag columns as model features. Names already present are skipped
# so that re-running this cell does not duplicate entries in FEATURES.
lag_features = [f"{c}_lag{index}" for c in lag_cols for index in range(1, N_LAGS + 1)]
FEATURES += [name for name in lag_features if name not in FEATURES]

In [306]:
FEATURES

['contig',
 'comlang_off',
 'colony',
 'smctry',
 'distw_centered',
 'GDP_reporter_centered_lag1',
 'GDP_reporter_centered_lag2',
 'GDP_partner_centered_lag1',
 'GDP_partner_centered_lag2',
 'sanction_lag1',
 'sanction_lag2']

### Split data

In [237]:
# Embeddings: give each dyad category an integer id (its position in the
# category list) and store that id per row.
dyad_categories = df["dyad_id"].cat.categories
dyad_to_idx = dict(zip(dyad_categories, range(len(dyad_categories))))
df["dyad_idx"] = df["dyad_id"].map(dyad_to_idx).astype(int)

In [238]:
# Split into Train, Validation and Test sets.
# Grouping by dyad_id keeps all rows of one dyad inside a single set.
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=RANDOM_SEED)

# First split: hold out the test dyads. Both arrays are row positions in `df`.
trainval_idx, test_idx = next(gss.split(df, groups=df["dyad_id"]))
test_df = df.iloc[test_idx]
trainval_df = df.iloc[trainval_idx]

# Second split: GroupShuffleSplit returns positions relative to `trainval_df`.
# Map them back to positions in `df` so train_idx, val_idx and test_idx share
# one coordinate system and are disjoint. Previously train_idx/val_idx stayed
# relative to the intermediate frame and could overlap test_idx when used together.
rel_train_idx, rel_val_idx = next(gss.split(trainval_df, groups=trainval_df["dyad_id"]))
train_idx = trainval_idx[rel_train_idx]
val_idx = trainval_idx[rel_val_idx]
val_df = df.iloc[val_idx]
train_df = df.iloc[train_idx]
# NOTE(review): these indices are later passed to Subset(dataset, ...); whether
# dataset items line up one-to-one with rows of `df` depends on
# tuds.make_panel_datasets_dyad — confirm.

In [239]:
# Cast the feature columns of the training frame to float32.
# Assigning through `.loc[:, FEATURES] = ...` writes the values in place (pandas >= 2.0)
# and leaves the column dtype at float64; it also writes into a slice of `df`.
# `astype` with a per-column mapping returns a new frame with the requested dtypes.
train_df = train_df.astype({feature: "float32" for feature in FEATURES})

# Train

## Define Fold and Epoch steps
_For reusability_

In [240]:
# Shuffled K-fold splitter with N_SPLITS folds, seeded for reproducibility.
kf = KFold(
  n_splits=N_SPLITS,
  shuffle=True,
  random_state=RANDOM_SEED,
)

In [241]:
# Define epoch step
def epoch_step(model: nn.Module, optimizer: Optimizer, criterion: nn.Module,
               scheduler: LRScheduler, train_loader: DataLoader, val_loader: DataLoader,
               device: torch.device) -> float:
  """Run one training pass and one validation pass, then step the scheduler.

  Both loaders must yield ``(X, y, di)`` triples; the model is called as
  ``model(X, di)`` and its output is compared with ``y`` through ``criterion``.
  (``di`` is presumably the dyad index per sample — confirm against the dataset.)

  Args:
    model: Network to train; switched to train mode, then to eval mode.
    optimizer: Optimizer stepped once per training batch.
    criterion: Loss module. The RMSE below assumes it returns the mean squared
      error of the batch (e.g. ``nn.MSELoss()`` with default reduction).
    scheduler: Stepped once per call with the validation RMSE as metric.
    train_loader: Batches used for the optimisation pass.
    val_loader: Batches used to compute the validation RMSE.
    device: Device every batch tensor is moved to.

  Returns:
    Validation RMSE over all validation samples.

  Raises:
    ValueError: If ``val_loader`` yields no samples.
  """
  model.train()
  print("Training...")
  for X, y, di in train_loader:
    X, y, di = (t.to(device, non_blocking=True) for t in (X, y, di))
    optimizer.zero_grad()
    loss = criterion(model(X, di), y)
    loss.backward()
    optimizer.step()
  print("Training done.")

  model.eval()
  weighted_loss = 0.0
  n_samples = 0
  with torch.no_grad():
    for X, y, di in val_loader:
      X, y, di = (t.to(device, non_blocking=True) for t in (X, y, di))
      # Weight each batch loss by its size: a plain mean of batch means
      # over-weights a short final batch.
      n_batch = y.size(0)
      weighted_loss += criterion(model(X, di), y).item() * n_batch
      n_samples += n_batch

  if n_samples == 0:
    raise ValueError("val_loader yielded no samples; cannot compute validation RMSE")

  val_rmse = math.sqrt(weighted_loss / n_samples)
  scheduler.step(val_rmse)
  print(f"Validation RMSE: {val_rmse:.4f} for epoch")
  return val_rmse

In [242]:
# Define fold step
def fold_step(fold: int, train_idx: List, val_idx: List,
              dataset: Dataset, batch_size: int, num_epochs: int, patience: int,
              model: nn.Module, device: torch.device,
              optimizer: Optimizer, criterion: nn.Module, scheduler: LRScheduler) -> tuple[float, dict]:
  """Train ``model`` on one fold with early stopping and score it on the fold's validation split.

  Args:
    fold: Fold number, used only in log messages.
    train_idx: Indices into ``dataset`` used for training.
    val_idx: Indices into ``dataset`` used for validation and final scoring.
    dataset: Dataset whose items are ``(X, y, di)`` triples.
    batch_size: Batch size for both loaders.
    num_epochs: Maximum number of epochs.
    patience: Number of consecutive epochs without sufficient improvement
      (more than 1e-4 in validation RMSE) tolerated before stopping.
    model: Network to train; on return it holds the best weights found.
    device: Device the batches are moved to.
    optimizer: Optimizer passed to ``epoch_step``.
    criterion: Loss module passed to ``epoch_step``.
    scheduler: LR scheduler passed to ``epoch_step``.

  Returns:
    ``(rmse, best_state)``: RMSE of the best weights on the validation split and
    a snapshot of the corresponding ``state_dict``.
  """
  train_loader = DataLoader(
    Subset(dataset, train_idx),
    batch_size=batch_size,
    shuffle=True,
    num_workers=4,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory=True
  )

  val_loader = DataLoader(
    Subset(dataset, val_idx),
    batch_size=batch_size,
    shuffle=False,
    num_workers=10,
    persistent_workers=True,
    prefetch_factor=2,
    pin_memory=False
  )

  best_state = copy.deepcopy(model.state_dict())
  best_rmse = float("inf")
  patience_left = patience

  print(f"Start epoch train for fold {fold}")
  for epoch in range(num_epochs):
    val_rmse = epoch_step(model=model, optimizer=optimizer, criterion=criterion,
                          scheduler=scheduler, train_loader=train_loader, val_loader=val_loader,
                          device=device)
    print(f"Epoch {epoch + 1:02d}/{num_epochs}  |  val RMSE: {val_rmse:.4f}")

    if val_rmse < best_rmse - 1e-4:
      # Reset to the caller's patience (was a hard-coded 10).
      best_rmse, patience_left = val_rmse, patience
      # state_dict() returns references to the live parameters; without a deep
      # copy the snapshot would keep changing with every later optimizer step.
      best_state = copy.deepcopy(model.state_dict())
    else:
      patience_left -= 1
      # `<=` so a non-positive patience also stops instead of counting down forever.
      if patience_left <= 0:
        print("Early stop.")
        break

  # Restore the best weights before scoring the validation split.
  model.load_state_dict(best_state)
  model.eval()
  preds, truth = [], []
  with torch.no_grad():
    for X, y, di in val_loader:
      X, di = (t.to(device, non_blocking=True) for t in (X, di))
      preds.append(model(X, di).cpu())
      truth.append(y)
  preds = torch.cat(preds).numpy()
  truth = torch.cat(truth).numpy()

  rmse = np.sqrt(((preds - truth) ** 2).mean())
  mae = np.abs(preds - truth).mean()
  r2 = 1 - ((preds - truth) ** 2).sum() / ((truth - truth.mean()) ** 2).sum()
  print(f" Fold {fold}  RMSE {rmse:.4f} | MAE {mae:.4f} | R² {r2:.4f}")

  # best_state is already an independent snapshot (deep-copied above).
  return rmse, best_state


## Train Raw dataset

### Split dataset

In [243]:
# Build the torch dataset from `df` with the configured features/target/horizon.
# The helper also returns a dyad -> index mapping, which rebinds `dyad_to_idx`.
dataset, dyad_to_idx = tuds.make_panel_datasets_dyad(
  data=df, features=FEATURES, target=TARGET, horizon=HORIZON,
)

In [244]:
# One DataLoader per split. All three share the same settings; only the
# training loader shuffles its samples.
_loader_options = dict(
  batch_size=BATCH_SIZE,
  num_workers=10,
  persistent_workers=True,
  prefetch_factor=2,
  pin_memory=False,
)

train_loader = DataLoader(Subset(dataset, train_idx), shuffle=True, **_loader_options)
val_loader = DataLoader(Subset(dataset, val_idx), shuffle=False, **_loader_options)
test_loader = DataLoader(Subset(dataset, test_idx), shuffle=False, **_loader_options)

### Train model

In [245]:
# Run identifier for this experiment; saving is switched off for this run.
SERIAL_NUMBER = "BasicLSTM-RawData"
SAVE_ENABLED = False

In [246]:
# Trackers for the best fold seen so far: start with "no state" and an
# infinite RMSE so the first fold always wins the comparison.
best_fold_rmse = math.inf
best_fold_state = None

In [247]:
# K-fold cross-validation: a fresh model, optimizer and scheduler per fold;
# the state of the fold with the lowest validation RMSE is kept.
# NOTE(review): the folds are drawn from every index of `dataset`, so they are
# not restricted to the train split created earlier and rebind train_idx/val_idx.
# Confirm the held-out test samples are meant to take part in cross-validation.
for fold, (train_idx, val_idx) in enumerate(kf.split(np.arange(len(dataset))), 1):
  model = tumod.DyadLSTM(
    n_features=len(FEATURES),
    n_layers=N_LAYERS,
    n_dyads=len(dyad_to_idx),
    embed_dim=EMBEDDING_SIZE,  # was a hard-coded 32, duplicating the config value
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
    horizon=HORIZON,
  ).to(device=device)

  criterion = nn.MSELoss()
  optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
  # NOTE(review): the scheduler uses the same patience value that is passed to
  # fold_step for early stopping — confirm the learning rate can actually be
  # reduced before early stopping ends the fold.
  scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=PATIENCE
  )

  print(f"=== FOLD {fold}/{N_SPLITS} ===")
  fold_rmse, best_state = fold_step(fold=fold,
                                    train_idx=train_idx,
                                    val_idx=val_idx,
                                    dataset=dataset,
                                    batch_size=BATCH_SIZE,
                                    num_epochs=NUM_EPOCHS,
                                    patience=PATIENCE,
                                    model=model,
                                    device=device,
                                    optimizer=optimizer,
                                    criterion=criterion,
                                    scheduler=scheduler)
  if fold_rmse < best_fold_rmse:
    best_fold_rmse = fold_rmse
    best_fold_state = copy.deepcopy(best_state)

=== FOLD 1/5 ===
Start epoch train for fold 1
Training...
Training done.
Validation RMSE: 0.2935 for epoch
Epoch 01/25  |  val RMSE: 0.2935
Training...
Training done.
Validation RMSE: 0.1683 for epoch
Epoch 02/25  |  val RMSE: 0.1683
Training...
Training done.
Validation RMSE: 0.1175 for epoch
Epoch 03/25  |  val RMSE: 0.1175
Training...
Training done.
Validation RMSE: 0.0743 for epoch
Epoch 04/25  |  val RMSE: 0.0743
Training...
Training done.
Validation RMSE: 0.0799 for epoch
Epoch 05/25  |  val RMSE: 0.0799
Training...
Training done.
Validation RMSE: 0.0754 for epoch
Epoch 06/25  |  val RMSE: 0.0754
Training...
Training done.
Validation RMSE: 0.0637 for epoch
Epoch 07/25  |  val RMSE: 0.0637
Training...
Training done.
Validation RMSE: 0.0417 for epoch
Epoch 08/25  |  val RMSE: 0.0417
Training...
Training done.
Validation RMSE: 0.0777 for epoch
Epoch 09/25  |  val RMSE: 0.0777
Training...
Training done.
Validation RMSE: 0.0843 for epoch
Epoch 10/25  |  val RMSE: 0.0843
Training...
Tr

## Train Lagged features 🐌