# Preparation

## Import modules

In [16]:
import copy
import math
from typing import List

import numpy as np
import pandas as pa
import torch
import torch.nn as nn
import torch.optim as optim
from pandas import DataFrame
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LRScheduler
from torch.utils.data import DataLoader, Dataset, Subset

import thesis_utils.datastruc as tuds
import thesis_utils.models as tumod

## Configuration

In [17]:
# --- Output persistence -------------------------------------------------------
# Defaults only; the experiment sections further down rebind both names.
SAVE_ENABLED = True
SERIAL_NUMBER = "NOT_SET"

# --- Model hyper-parameters ---------------------------------------------------
SEQ_LEN = 5
HORIZON = 1
BATCH_SIZE = 256
NUM_EPOCHS = 20
HIDDEN_SIZE = 128
N_LAYERS = 2
DROPOUT = 0.3

# --- Training setup -----------------------------------------------------------
TARGET = "gravity_trade"
FEATURES = [
  "GDP_reporter",
  "GDP_partner",
  "distw",
  "TOTAL",
  # sanctions categorical
  "arms",
  "military",
  "trade",
  "financial",
  "travel",
  "other",
  # dist cepii categorical
  "contig",
  "comlang_off",
  "colony",
  "smctry",
  # additional features
  "fyear",
  "GDP_yearly_average",
]
N_SPLITS = 3
PATIENCE = 15
LEARNING_RATE = 0.01
WEIGHT_DECAY = 0.01
RANDOM_SEED = 16
KEEP_FRAC = 1

# --- Torch runtime ------------------------------------------------------------
# Seed torch's default generator, then use Apple's MPS backend when present and
# fall back to the CPU otherwise.
torch.manual_seed(RANDOM_SEED)
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")

## Load Data

In [18]:
# Read the preprocessed panel from disk. All later edits happen on an independent
# deep copy (`df`), so `processed` keeps the data exactly as loaded.
processed = pa.read_parquet("../data/model/processed.parquet", engine="fastparquet")
df: DataFrame = processed.copy()

### Sort, shift and compute data

In [19]:
# Key every row by its reporter/partner pair ("<reporter>_<partner>"), then order the
# frame by pair and, within each pair, by year. The index is rebuilt as 0..n-1.
df["dyad_id"] = df["ISO3_reporter"].add("_").add(df["ISO3_partner"])
df = df.sort_values(["dyad_id", "Year"]).reset_index(drop=True)

In [20]:
# Target column: log(1 + GDP_reporter * GDP_partner / distw)
df["gravity_trade"] = np.log1p(df["GDP_reporter"].mul(df["GDP_partner"]).div(df["distw"]))

# Total trade volume is the sum of both flow directions
df["TOTAL"] = df["IMPORT"].add(df["EXPORT"])

# Expose the calendar year as a model feature under its own name
df["fyear"] = df["Year"]

## Normalize

In [21]:
# Scale on a copy so `df` keeps the unscaled values.
# NOTE(review): both scalers are fitted on the complete frame, i.e. before the
# train/validation/test split is drawn — confirm this is acceptable for the evaluation.
df_scaled: DataFrame = df.copy()

# Min-max scaling for the GDP / trade-volume / year columns
scale_columns_minmax = ["GDP_reporter", "GDP_partner", "TOTAL", "fyear", "GDP_yearly_average"]
scaler_mm = MinMaxScaler()
df_scaled[scale_columns_minmax] = scaler_mm.fit_transform(df[scale_columns_minmax])

# Robust scaling for the distance column
scaler_rb = RobustScaler()
df_scaled[["distw"]] = scaler_rb.fit_transform(df[["distw"]])

## Split data

In [22]:
# Split into Train, Validation and Test sets (positional row indices of df_scaled)
idx = np.arange(len(df_scaled))

# Hold out 20 % of all rows as the test set.
train_idx, test_idx = train_test_split(
  idx, test_size=0.20, random_state=RANDOM_SEED
)

# Carve 20 % of the remaining rows out as the validation set.
# The fraction must be a float: an integer `test_size` is interpreted by scikit-learn
# as an absolute number of samples, so `test_size=20` would leave only 20 validation rows.
train_idx, val_idx = train_test_split(
  train_idx, test_size=0.20, random_state=RANDOM_SEED
)

# Train

## Define Fold and Epoch steps
_For reusability_

In [23]:
# Shuffled K-fold splitter shared by every cross-validation loop below;
# the fixed random_state keeps the fold assignment reproducible.
kf = KFold(N_SPLITS, shuffle=True, random_state=RANDOM_SEED)

In [24]:
# Define epoch step
def epoch_step(model: nn.Module, optimizer: Optimizer, criterion: nn.Module,
               scheduler: LRScheduler, train_loader: DataLoader, val_loader: DataLoader,
               device: any) -> float:
  """Run one training epoch followed by one validation pass.

  Args:
    model: Network to train; switched to train mode for the first loop and to
      eval mode for the validation loop.
    optimizer: Optimizer stepped once per training batch.
    criterion: Loss module. The square root of its validation average is reported,
      so the result is an RMSE only for a mean-reduced squared-error loss.
    scheduler: Scheduler whose ``step`` is called with the validation RMSE.
    train_loader: Yields ``(X, y, _)`` triples; the third element is ignored.
    val_loader: Yields ``(X, y, _)`` triples; the third element is ignored.
    device: Device the batches are moved to before the forward pass.

  Returns:
    The validation RMSE of this epoch.

  Raises:
    ValueError: If ``val_loader`` yields no samples.
  """
  model.train()
  for X, y, _ in train_loader:
    X = X.to(device)
    y = y.to(device)
    optimizer.zero_grad()
    loss = criterion(model(X), y)
    loss.backward()
    optimizer.step()

  model.eval()
  weighted_loss = 0.0
  n_samples = 0
  with torch.no_grad():
    for X, y, _ in val_loader:
      X = X.to(device)
      y = y.to(device)
      # Weight each batch loss by its sample count so a smaller final batch does not
      # count as much as a full one (a plain mean over batches would bias the RMSE).
      n_batch = y.size(0)
      weighted_loss += criterion(model(X), y).item() * n_batch
      n_samples += n_batch

  if n_samples == 0:
    raise ValueError("val_loader yielded no samples; cannot compute a validation RMSE.")

  val_rmse = math.sqrt(weighted_loss / n_samples)
  scheduler.step(val_rmse)
  return val_rmse

In [25]:
# Define fold step
def fold_step(fold: int, train_idx: List, val_idx: List,
              dataset: Dataset, batch_size: int, num_epochs: int, patience: int,
              model: nn.Module, device: any,
              optimizer: Optimizer, criterion: nn.Module, scheduler: LRScheduler) -> (float, dict):
  """Train ``model`` on one fold with early stopping and score it on the fold's validation part.

  Args:
    fold: Fold number, used only in the printed summary line.
    train_idx: Indices into ``dataset`` used for training.
    val_idx: Indices into ``dataset`` used for validation.
    dataset: Dataset whose items unpack as ``(X, y, _)``.
    batch_size: Batch size of both loaders.
    num_epochs: Upper bound on the number of epochs.
    patience: Number of consecutive epochs without improvement after which
      training stops early.
    model: Network to train; on return it holds the best weights seen.
    device: Device the batches are moved to.
    optimizer: Optimizer passed through to ``epoch_step``.
    criterion: Loss module passed through to ``epoch_step``.
    scheduler: Scheduler passed through to ``epoch_step``.

  Returns:
    ``(rmse, best_state)``: the validation RMSE of the restored best weights and an
    independent copy of the corresponding ``state_dict``.
  """
  train_loader = DataLoader(
    Subset(dataset, train_idx),
    batch_size=batch_size,
    shuffle=True
  )

  val_loader = DataLoader(
    Subset(dataset, val_idx),
    batch_size=batch_size,
    shuffle=False
  )

  best_state = copy.deepcopy(model.state_dict())
  best_rmse = float("inf")
  patience_left = patience

  for epoch in range(num_epochs):
    val_rmse = epoch_step(model=model, optimizer=optimizer, criterion=criterion,
                          scheduler=scheduler, train_loader=train_loader, val_loader=val_loader,
                          device=device)
    print(f"Epoch {epoch + 1:02d}/{num_epochs}  |  val RMSE: {val_rmse:.4f}")

    # An epoch only counts as an improvement if it beats the best RMSE by more than 1e-4.
    if val_rmse < best_rmse - 1e-4:
      # Reset the counter to the caller's patience (not a fixed number of epochs).
      best_rmse, patience_left = val_rmse, patience
      # state_dict() hands out references to the live parameter tensors, so the
      # snapshot must be deep-copied; otherwise it silently follows later updates.
      best_state = copy.deepcopy(model.state_dict())
    else:
      patience_left -= 1
      # `<=` also terminates when patience was given as 0 or a negative number.
      if patience_left <= 0:
        print("Early stop.")
        break

  # Restore the best weights before computing the fold metrics.
  model.load_state_dict(best_state)
  model.eval()
  preds, truth = [], []
  with torch.no_grad():
    for X, y, _ in val_loader:
      X = X.to(device)
      preds.append(model(X).cpu())
      truth.append(y)
  preds = torch.cat(preds).numpy()
  truth = torch.cat(truth).numpy()

  # NOTE(review): the metrics assume model output and target have the same shape;
  # otherwise NumPy broadcasting would distort them — confirm against the dataset/model.
  rmse = np.sqrt(((preds - truth) ** 2).mean())
  mae = np.abs(preds - truth).mean()
  r2 = 1 - ((preds - truth) ** 2).sum() / ((truth - truth.mean()) ** 2).sum()
  print(f" Fold {fold}  RMSE {rmse:.4f} | MAE {mae:.4f} | R² {r2:.4f}")

  # best_state is already a private copy (load_state_dict copies values into the model).
  return rmse, best_state


## Raw dataset

### Split dataset

In [26]:
# Build the torch dataset for the raw (un-lagged, un-windowed) experiment from the
# scaled frame. Only the first return value is used; the second one is discarded.
dataset, _ = tuds.make_panel_datasets(
  data=df_scaled,
  features=FEATURES,
  target=TARGET,
  horizon=HORIZON,
  keep_frac=KEEP_FRAC,
)

Shape original data:  (1099125, 32)
Shape sampled:  (1099125, 32)
Shape remainder:  (0, 32)


In [27]:
# One DataLoader per split. Only the training loader shuffles; validation and test
# keep their index order.
# NOTE(review): relies on train_idx / val_idx / test_idx still holding the hold-out
# split and on those positions being valid for `dataset` — verify.
train_loader = DataLoader(Subset(dataset, train_idx), batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(Subset(dataset, val_idx), batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(Subset(dataset, test_idx), batch_size=BATCH_SIZE, shuffle=False)

### Train model

In [28]:
# Output settings for the raw-data run: saving is switched off.
# NOTE(review): the label says "LSTM", but the training cell below instantiates
# tumod.BasicGRU — confirm the intended serial number before enabling saving.
SERIAL_NUMBER = "BasicLSTM-RawData"
SAVE_ENABLED = False

In [29]:
# Trackers for the best fold of the upcoming cross-validation loop:
# no weights yet, and an RMSE that any real fold will beat.
best_fold_state, best_fold_rmse = None, float("inf")

In [None]:
# K-fold cross-validation on the raw dataset: a fresh model, optimizer and scheduler per
# fold; the weights of the fold with the lowest validation RMSE are kept.
# The loop variables are deliberately not called `train_idx` / `val_idx`: rebinding those
# notebook-level names would overwrite the hold-out split that other cells index with.
for fold, (fold_train_idx, fold_val_idx) in enumerate(kf.split(np.arange(len(dataset))), 1):
  model = tumod.BasicGRU(
    n_features=len(FEATURES),
    n_layers=N_LAYERS,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
    horizon=HORIZON).to(device=device)

  criterion = nn.MSELoss()
  optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
  # NOTE(review): the scheduler waits PATIENCE epochs before halving the LR, the same
  # budget early stopping uses — confirm the LR reduction can actually take effect.
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=PATIENCE
  )

  print(f"=== FOLD {fold}/{N_SPLITS} ===")
  fold_rmse, best_state = fold_step(fold=fold,
                                    train_idx=fold_train_idx,
                                    val_idx=fold_val_idx,
                                    dataset=dataset,
                                    batch_size=BATCH_SIZE,
                                    num_epochs=NUM_EPOCHS,
                                    patience=PATIENCE,
                                    model=model,
                                    device=device,
                                    optimizer=optimizer,
                                    criterion=criterion,
                                    scheduler=scheduler)
  # Keep an independent copy of the best fold's weights.
  if fold_rmse < best_fold_rmse:
    best_fold_rmse = fold_rmse
    best_fold_state = copy.deepcopy(best_state)

=== FOLD 1/3 ===
Epoch 01/20  |  val RMSE: 1.8568
Epoch 02/20  |  val RMSE: 1.7172
Epoch 03/20  |  val RMSE: 1.7307
Epoch 04/20  |  val RMSE: 1.5907
Epoch 05/20  |  val RMSE: 1.5882
Epoch 06/20  |  val RMSE: 1.3018
Epoch 07/20  |  val RMSE: 1.1094
Epoch 08/20  |  val RMSE: 1.1029
Epoch 09/20  |  val RMSE: 1.0905
Epoch 10/20  |  val RMSE: 1.1655
Epoch 11/20  |  val RMSE: 0.9776
Epoch 12/20  |  val RMSE: 1.0695


## Lagged features

### Split dataset and lag features

In [11]:
# Build the torch dataset for the lagged-feature experiment.
# LAGGED_COLS are the columns handed to the helper as `lag_columns`, LAG is the `lag` argument.
LAG = 3
LAGGED_COLS = ["GDP_partner", "GDP_reporter", "TOTAL"]
dataset, _ = tuds.make_panel_laggedsets(
  data=df_scaled,
  features=FEATURES,
  target=TARGET,
  lag=LAG,
  lag_columns=LAGGED_COLS,
  keep_frac=KEEP_FRAC,
)

Shape original data:  (1099125, 32)
Shape sampled:  (1099125, 32)
Shape remainder:  (0, 32)


In [12]:
# One DataLoader per split, each backed by 4 worker processes. Only the training
# loader shuffles.
# NOTE(review): relies on train_idx / val_idx / test_idx still holding the hold-out
# split and on those positions being valid for the lagged `dataset` — verify.
train_loader = DataLoader(Subset(dataset, train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(Subset(dataset, val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(Subset(dataset, test_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

### Train model

In [13]:
# Output settings for the lagged-feature run: saving is switched off.
# NOTE(review): the label says "LSTM" and ends in "5", while the training cell below
# builds tumod.BasicGRU and LAG is 3 — confirm the intended serial number.
SERIAL_NUMBER = "BasicLSTM-LaggedFeatures5"
SAVE_ENABLED = False

In [14]:
# Reset the best-fold trackers before the lagged-feature cross-validation:
# no weights yet, and an RMSE that any real fold will beat.
best_fold_state, best_fold_rmse = None, float("inf")

In [15]:
# Feature list for the lagged model: every base feature, followed by
# "<column>_lag1" … "<column>_lag<LAG>" for each lagged column (column-major order).
LAGGED_FEATURES = list(FEATURES) + [
  f"{column}_lag{step}"
  for column in LAGGED_COLS
  for step in range(1, LAG + 1)
]

In [None]:
# K-fold cross-validation on the lagged dataset: a fresh model, optimizer and scheduler
# per fold; the weights of the fold with the lowest validation RMSE are kept.
# The loop variables are deliberately not called `train_idx` / `val_idx`: rebinding those
# notebook-level names would overwrite the hold-out split that other cells index with.
for fold, (fold_train_idx, fold_val_idx) in enumerate(kf.split(np.arange(len(dataset))), 1):
  print(f"=== FOLD {fold}/{N_SPLITS} ===")
  # The input width covers the base features plus the generated lag columns.
  model = tumod.BasicGRU(
    n_features=len(LAGGED_FEATURES),
    n_layers=N_LAYERS,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
    horizon=HORIZON).to(device=device)

  criterion = nn.MSELoss()
  optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
  # NOTE(review): the scheduler waits PATIENCE epochs before halving the LR, the same
  # budget early stopping uses — confirm the LR reduction can actually take effect.
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=PATIENCE
  )

  fold_rmse, best_state = fold_step(fold=fold,
                                    train_idx=fold_train_idx,
                                    val_idx=fold_val_idx,
                                    dataset=dataset,
                                    batch_size=BATCH_SIZE,
                                    num_epochs=NUM_EPOCHS,
                                    patience=PATIENCE,
                                    model=model,
                                    device=device,
                                    optimizer=optimizer,
                                    criterion=criterion,
                                    scheduler=scheduler)
  # Keep an independent copy of the best fold's weights.
  if fold_rmse < best_fold_rmse:
    best_fold_rmse = fold_rmse
    best_fold_state = copy.deepcopy(best_state)

=== FOLD 1/3 ===
Epoch 01/20  |  val RMSE: 1.5268
Epoch 02/20  |  val RMSE: 1.3515
Epoch 03/20  |  val RMSE: 1.1812
Epoch 04/20  |  val RMSE: 1.0388
Epoch 05/20  |  val RMSE: 0.6907
Epoch 06/20  |  val RMSE: 0.5752
Epoch 07/20  |  val RMSE: 0.9570
Epoch 08/20  |  val RMSE: 0.6555
Epoch 09/20  |  val RMSE: 0.5155


## Train Sliding Window

In [None]:
# Build the torch dataset for the sliding-window experiment from the scaled frame,
# passing SEQ_LEN as the window length and HORIZON as the horizon.
# Only the first return value is used; the second one is discarded.
dataset, _ = tuds.make_panel_slidingwindows(
  data=df_scaled,
  features=FEATURES,
  target=TARGET,
  seq_len=SEQ_LEN,
  horizon=HORIZON,
  keep_frac=KEEP_FRAC,
)

### Split dataset

In [None]:
# One DataLoader per split, each backed by 4 worker processes. Only the training
# loader shuffles.
# NOTE(review): relies on train_idx / val_idx / test_idx still holding the hold-out
# split and on those positions being valid for the windowed `dataset` — verify.
train_loader = DataLoader(Subset(dataset, train_idx), batch_size=BATCH_SIZE, shuffle=True, num_workers=4)
val_loader = DataLoader(Subset(dataset, val_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=4)
test_loader = DataLoader(Subset(dataset, test_idx), batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

### Train model

In [None]:
# Output settings for the sliding-window run: saving is switched off and the label
# carries the window length.
# NOTE(review): the label says "LSTM", but the training cell below instantiates
# tumod.BasicGRU — confirm the intended serial number before enabling saving.
SERIAL_NUMBER = "BasicLSTM-SlidingWindow{}".format(SEQ_LEN)
SAVE_ENABLED = False

In [None]:
# Reset the best-fold trackers before the sliding-window cross-validation:
# no weights yet, and an RMSE that any real fold will beat.
best_fold_state, best_fold_rmse = None, float("inf")

In [None]:
# K-fold cross-validation on the sliding-window dataset: a fresh model, optimizer and
# scheduler per fold; the weights of the fold with the lowest validation RMSE are kept.
# The loop variables are deliberately not called `train_idx` / `val_idx`: rebinding those
# notebook-level names would overwrite the hold-out split that other cells index with.
for fold, (fold_train_idx, fold_val_idx) in enumerate(kf.split(np.arange(len(dataset))), 1):
  print(f"=== FOLD {fold}/{N_SPLITS} ===")
  model = tumod.BasicGRU(
    n_features=len(FEATURES),
    n_layers=N_LAYERS,
    hidden_size=HIDDEN_SIZE,
    dropout=DROPOUT,
    horizon=HORIZON).to(device=device)

  criterion = nn.MSELoss()
  optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
  # NOTE(review): the scheduler waits PATIENCE epochs before halving the LR, the same
  # budget early stopping uses — confirm the LR reduction can actually take effect.
  scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=PATIENCE
  )

  fold_rmse, best_state = fold_step(fold=fold,
                                    train_idx=fold_train_idx,
                                    val_idx=fold_val_idx,
                                    dataset=dataset,
                                    batch_size=BATCH_SIZE,
                                    num_epochs=NUM_EPOCHS,
                                    patience=PATIENCE,
                                    model=model,
                                    device=device,
                                    optimizer=optimizer,
                                    criterion=criterion,
                                    scheduler=scheduler)
  # Keep an independent copy of the best fold's weights.
  if fold_rmse < best_fold_rmse:
    best_fold_rmse = fold_rmse
    best_fold_state = copy.deepcopy(best_state)