In [1]:
# --- Host memory report ---
# Sizes are shown in decimal gigabytes (bytes / 1e9).
import psutil

ram_gb = psutil.virtual_memory().total / 1e9
used_gb = psutil.virtual_memory().used / 1e9
print(f"RAM: {used_gb:.2f} GB / {ram_gb:.2f} GB used")

# --- GPU report ---
import torch

if not torch.cuda.is_available():
    print("No GPU detected")
else:
    # Device index 0 is the only device queried.
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # memory_allocated() is read for device 0 and converted from bytes.
    print(f"VRAM: {torch.cuda.memory_allocated(0) / 1e9:.2f} GB allocated")

RAM: 1.02 GB / 179.37 GB used
GPU: NVIDIA A100-SXM4-80GB
VRAM: 0.00 GB allocated



# MAT Training - Fold 1 (Weighted MSE)
# * **Path:** Drive/MAT_Weighted_Loss/Modality-aware-transformer
# * **Split:** Train (2010-2015), Val (2016), Test (2017)
# * **Loss:** Weighted MSE (Alpha=100)

In [2]:
import sys
import gc
import torch
import torch.nn as nn
import torch.optim as optim
from pathlib import Path
from google.colab import drive
from torch.utils.data import DataLoader

In [6]:
# --- 1. SETUP ENV ---
# Mount Google Drive.
# The mount point must be the absolute directory "/content/drive". Colab then
# exposes the user's files under "/content/drive/MyDrive", which is the prefix
# PROJECT_ROOT depends on. Mounting at the relative path
# "content/drive/MyDrive" would never produce that layout.
drive.mount("/content/drive")

# Exact path to the project inside Drive.
PROJECT_ROOT = "/content/drive/MyDrive/MAT_Weighted_Loss/Modality-aware-transformer"

# Validate the path before touching sys.path, so a typo in the folder name
# fails here instead of surfacing later as a confusing ImportError.
if not Path(PROJECT_ROOT).exists():
    raise FileNotFoundError(
        f"Could not find project at {PROJECT_ROOT}. Please check the folder names in Drive."
    )

# Make the project's packages importable; guarded so re-running the cell does
# not add duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

print(f"Project Root set to: {PROJECT_ROOT}")

KeyboardInterrupt: 

In [None]:
# --- 2. IMPORTS FROM SRC ---
# Project-local modules; they resolve only if PROJECT_ROOT is on sys.path.
try:
    from src.utils.data_loader import load_and_merge_data, prepare_scaled_fold
    from src.models.dataset import FinancialDataset
    from src.models.architectures.mat import MAT
    from src.training.callbacks import EarlyStopping
    from src.training.engine import train_epoch, validate_epoch
    from src.evaluation.predictions.inference import WalkForwardEvaluator
    print("SUCCESS: Local modules imported.")
except ImportError as e:
    print(f"ERROR: Could not import modules. {e}")
    print("Double check that 'src' folder exists inside the PROJECT_ROOT.")
    # Re-raise after printing the hint: every later cell needs these names, so
    # continuing would only defer the failure to an unrelated NameError.
    raise

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# get_device_name() raises when CUDA is unavailable, so only query it on GPU.
device_label = torch.cuda.get_device_name(0) if device.type == "cuda" else "CPU"
print(f"Using Device: {device} ({device_label})")

In [None]:
# ## 3. Define Custom Loss
# We define this here to ensure it's available immediately.

class WeightedMSE(nn.Module):
    """
    Mean squared error with a per-element weight that grows with |target|.

    Each squared error is multiplied by ``1 + alpha * |target|`` and the
    weighted errors are then averaged over all elements:

        loss = mean((pred - target)^2 * (1 + alpha * |target|))

    A target of zero therefore gets weight 1, and larger-magnitude targets
    get proportionally larger weights.

    Args:
        alpha: Multiplier applied to |target| when building the weights.
    """

    def __init__(self, alpha=100.0):
        super().__init__()
        self.alpha = alpha

    def forward(self, preds, targets):
        """Return the scalar weighted MSE between ``preds`` and ``targets``."""
        squared_residual = (preds - targets).pow(2)
        # Element-wise weights: 1 where the target is zero, larger elsewhere.
        target_weight = targets.abs() * self.alpha + 1
        return (squared_residual * target_weight).mean()

# %% [markdown]

In [None]:
# ## 4. Data Loading & Preparation

DATA_DIR = Path(PROJECT_ROOT) / "data"

# Step 1: load the merged frame for the requested date range.
print(f"Loading data from {DATA_DIR}...")
df_main = load_and_merge_data(
    DATA_DIR,
    start_date="2010-01-01",
    end_date="2023-12-31",
)

# Step 2: (start, end) date boundaries of each partition for Fold 1.
split = dict(
    train=("2010-01-01", "2015-12-31"),
    val=("2016-01-01", "2016-12-31"),
    test=("2017-01-01", "2017-12-31"),
)

# Step 3: columns that are excluded from the numeric input features.
non_feature_cols = [
    "date",
    "permno",
    "target",
    "emb_mean",
    "sent_score_mean",
    "sent_pos_mean",
    "sent_neg_mean",
    "log_n_news",
    "sent_score_std",
]

# all_num_cols: every remaining column, in frame order (has_news included).
# scale_cols:   the same list minus "has_news", which is passed through unscaled.
all_num_cols = []
scale_cols = []
for col in df_main.columns:
    if col in non_feature_cols:
        continue
    all_num_cols.append(col)
    if col != "has_news":
        scale_cols.append(col)

print(f"Total Input Features: {len(all_num_cols)}")
print(f"Features being scaled: {len(scale_cols)}")

# Step 4: scale the selected columns and cut the frame into the three partitions.
print("Scaling and splitting data...")
df_train, df_val, df_test = prepare_scaled_fold(df_main, scale_cols, split)

In [None]:
# %% [markdown]
# ## 5. Create Datasets & Loaders (A100 OPTIMIZED)

# %%
# Configuration for High-RAM + A100
WINDOW_SIZE = 60
FORECAST_HORIZON = 1

# Loader settings tuned for an A100 runtime.
BATCH_SIZE = 4096   # Large batch size for throughput
NUM_WORKERS = 8     # Worker processes per DataLoader; set to 0 to debug in-process


def _make_loader(dataset, shuffle):
    """Build a DataLoader that uses the notebook-wide batch and worker settings.

    Args:
        dataset: Dataset to wrap.
        shuffle: Whether to reshuffle the samples every epoch.

    Returns:
        A ``DataLoader`` configured with BATCH_SIZE, NUM_WORKERS and pinned memory.
    """
    return DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=NUM_WORKERS,
        pin_memory=True,
        # Keeps workers alive between epochs (saves startup time). DataLoader
        # raises ValueError for persistent_workers=True with num_workers == 0,
        # so only enable it when workers are actually in use.
        persistent_workers=NUM_WORKERS > 0,
    )


print("Creating PyTorch Datasets...")
train_ds = FinancialDataset(df_train, window_size=WINDOW_SIZE, forecast_horizon=FORECAST_HORIZON)
val_ds   = FinancialDataset(df_val,   window_size=WINDOW_SIZE, forecast_horizon=FORECAST_HORIZON)
test_ds  = FinancialDataset(df_test,  window_size=WINDOW_SIZE, forecast_horizon=FORECAST_HORIZON)

# Drop the notebook's references to the scaled frames now that the datasets exist.
# NOTE: after this, re-running this cell requires re-running the scaling cell first.
del df_train, df_val, df_test
gc.collect()

print(f"Train Size: {len(train_ds)}")
print(f"Val Size:   {len(val_ds)}")
print(f"Test Size:  {len(test_ds)}")

# Loaders: only the training loader shuffles; validation and test keep dataset order.
train_loader = _make_loader(train_ds, shuffle=True)
val_loader = _make_loader(val_ds, shuffle=False)
test_loader = _make_loader(test_ds, shuffle=False)

print(f"Data Loaders Ready! Batch Size: {BATCH_SIZE}")

In [None]:
# ## 6. Initialize Model & Training

# Seed PyTorch's random number generators before the model is built, so that
# weight initialisation and the shuffled training order are repeatable when the
# notebook is restarted and re-run.
SEED = 42
torch.manual_seed(SEED)

# num_input_dim is derived from all_num_cols so it always matches the number of
# input feature columns selected during data preparation.
model = MAT(
    num_input_dim=len(all_num_cols), 
    n_sent=5, 
    d_model=128,
    forecast_horizon=FORECAST_HORIZON
).to(device)

optimizer = optim.AdamW(model.parameters(), lr=1e-4)

# --- USING WEIGHTED MSE ---
criterion = WeightedMSE(alpha=100.0) 

# Save checkpoints to Drive so they survive a Colab disconnect.
ckpt_dir = Path(PROJECT_ROOT) / "models"
ckpt_dir.mkdir(parents=True, exist_ok=True)
ckpt_path = ckpt_dir / "mat_weighted_fold1.pt"

# EarlyStopping is given the checkpoint path and is called once per epoch with
# the validation loss and the model.
early_stopping = EarlyStopping(patience=5, path=str(ckpt_path), verbose=True)

print("Starting Training...")
EPOCHS = 20

for epoch in range(EPOCHS):
    t_loss = train_epoch(model, train_loader, optimizer, criterion, device)
    v_loss = validate_epoch(model, val_loader, criterion, device)
    
    print(f"Epoch {epoch+1}/{EPOCHS} | Train Loss: {t_loss:.6f} | Val Loss: {v_loss:.6f}")
    
    early_stopping(v_loss, model)
    # Stop as soon as the callback sets its early_stop flag.
    if early_stopping.early_stop:
        print("Early Stopping Triggered!")
        break

In [None]:
# ## 7. Inference & Variance Check

print("Loading Best Model for Inference...")
# map_location lets a checkpoint written on GPU load on whatever device this
# session is using (e.g. after reconnecting to a CPU-only runtime).
model.load_state_dict(torch.load(ckpt_path, map_location=device))

evaluator = WalkForwardEvaluator(model, device)
print("Running Inference on Test Set (2017)...")
df_preds = evaluator.predict_fold(test_loader, fold_name="MAT_Weighted_2017")

# --- Stats Check ---
# Compare the spread of the predictions with the spread of the targets, using
# the 'pred' and 'target' columns of the returned frame.
print("\n--- Variance Analysis ---")
pred_std = df_preds['pred'].std()
target_std = df_preds['target'].std()
# Only divide when the target spread is a positive number. A zero or NaN std
# (constant targets, fewer than two rows) would otherwise give inf/NaN, and the
# threshold check below would silently report success.
ratio = pred_std / target_std if target_std > 0 else float("nan")

print(f"Pred Std Dev:   {pred_std:.6f}")
print(f"Target Std Dev: {target_std:.6f}")
print(f"Ratio: {ratio:.4f}")

if ratio != ratio:  # NaN is the only value that is not equal to itself
    print("⚠️ WARNING: Variance ratio is undefined (zero or missing standard deviation).")
elif ratio < 0.05:
    print("⚠️ WARNING: Predictions still look collapsed (Low Variance).")
else:
    print("✅ SUCCESS: Model variance looks healthy.")

# Save Predictions to Drive
save_path = Path(PROJECT_ROOT) / "data" / "processed" / "predictions" / "mat_weighted_2017.csv"
save_path.parent.mkdir(parents=True, exist_ok=True)
df_preds.to_csv(save_path, index=False)
print(f"Predictions saved to {save_path}")