### Setup

In [31]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
import torch # type: ignore
import torch.nn as nn # type: ignore
from torch.utils.data import DataLoader # type: ignore
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts # type: ignore
from torch.utils.tensorboard import SummaryWriter # type: ignore
import pandas as pd # type: ignore
from model import PitcherTransformerModel, BatterTransformerModel, PretrainingTransformer
from dataset import MLB_Batter_Dataset, MLB_Pitcher_Dataset, PretrainingDataset
from config import pitcher_features, batter_features
from utils import compute_metrics
from datetime import datetime
import os


### Training Loops

#### Pretraining

In [6]:
##############################################
# Pretraining Training Loop
##############################################
def train_pretraining_model(model, dataloader, num_epochs=10, lr=1e-3, device='cpu'):
    """Train a reconstruction model by minimising MSE between its output and its input.

    Args:
        model: Module called as ``model(batch)``; its output is compared against ``batch``.
        dataloader: Iterable of input tensors that also exposes ``.dataset`` with a length.
            The original comment describes batches as [batch, seq_len, input_dim].
        num_epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to Adam.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object after training.
    """
    model.to(device)
    adam = torch.optim.Adam(model.parameters(), lr=lr)
    mse = nn.MSELoss()

    for epoch_no in range(1, num_epochs + 1):
        model.train()
        loss_sum = 0.0
        for batch in dataloader:
            inputs = batch.to(device)
            adam.zero_grad()
            batch_loss = mse(model(inputs), inputs)
            batch_loss.backward()
            adam.step()
            # Weight the batch mean by its sample count so the epoch figure is per-sample.
            loss_sum += batch_loss.item() * inputs.size(0)
        epoch_loss = loss_sum / len(dataloader.dataset)
        print(f"Pretraining Epoch {epoch_no}/{num_epochs} - Loss: {epoch_loss:.4f}")
    return model

##############################################
# Training Loop for Masked Pretraining
##############################################
def train_masked_pretraining_model(model, dataloader, num_epochs=10, lr=1e-3, device='cpu'):
    """Train a masked-reconstruction model, scoring only the masked positions.

    Args:
        model: Module called as ``model(batch)`` that returns ``(reconstructed, mask)``.
            NOTE(review): ``mask`` is used as an index into the per-element loss, so it is
            presumably a boolean tensor of shape [batch, seq_len] - confirm against the model.
        dataloader: Iterable of input tensors that also exposes ``.dataset`` with a length.
        num_epochs: Number of passes over ``dataloader``.
        lr: Learning rate handed to Adam.
        device: Device the model and every batch are moved to.

    Returns:
        The same ``model`` object after training.
    """
    model.to(device)
    adam = torch.optim.Adam(model.parameters(), lr=lr)
    # Unreduced MSE: the reduction happens below, over masked elements only.
    elementwise_mse = nn.MSELoss(reduction='none')

    for epoch_no in range(1, num_epochs + 1):
        model.train()
        loss_sum = 0.0
        for batch in dataloader:
            inputs = batch.to(device)
            adam.zero_grad()
            recon, mask = model(inputs)
            per_element = elementwise_mse(recon, inputs)
            # Broadcast the mask across the trailing feature dimension.
            element_mask = mask.unsqueeze(-1).expand_as(per_element)
            # Fall back to the plain mean when nothing was masked in this batch.
            batch_loss = (
                per_element[element_mask].mean()
                if element_mask.sum() > 0
                else per_element.mean()
            )
            batch_loss.backward()
            adam.step()
            loss_sum += batch_loss.item() * inputs.size(0)

        epoch_loss = loss_sum / len(dataloader.dataset)
        print(f"Epoch {epoch_no}/{num_epochs}, Loss: {epoch_loss:.4f}")

    return model

#### Supervised Training

In [9]:
def train_model(model, train_dataloader, val_dataloader, num_epochs=10, lr=1e-3, device='cpu'):
    """Train a two-sequence regression model with MSE loss, logging and best-model checkpointing.

    Each batch from either dataloader is unpacked as ``(primary_seq, opponent_seq, targets)``
    and the model is called as ``model(primary_seq, opponent_seq)``.

    Side effects:
        * Creates ``models/experiment1_<timestamp>/`` and writes ``best_model.pth`` there
          whenever the validation loss improves.
        * Writes TensorBoard scalars to ``runs/experiment1_<timestamp>/``.
        * Prints one summary line per epoch.

    Args:
        model: Module to train; moved to ``device``.
        train_dataloader: Training batches; must support ``len()`` and expose ``.dataset``.
        val_dataloader: Validation batches; must expose ``.dataset`` with a length.
        num_epochs: Number of training epochs.
        lr: Initial Adam learning rate (modulated by cosine annealing with warm restarts).
        device: Device the model and batches are moved to.

    Returns:
        The same ``model`` object with the weights from the final epoch (not necessarily
        the best checkpoint).
    """
    model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

    # Take the timestamp once so the checkpoint and log directories of a run share a suffix.
    run_name = f"experiment1_{datetime.now().strftime('%Y%m%d-%H%M%S')}"
    model_dir = os.path.join("models", run_name)
    os.makedirs(model_dir, exist_ok=True)
    best_model_path = os.path.join(model_dir, "best_model.pth")
    best_val_loss = float('inf')

    batches_per_epoch = len(train_dataloader)
    writer = SummaryWriter(log_dir=os.path.join("runs", run_name))

    # try/finally so the event file is closed even if training is interrupted.
    try:
        for epoch in range(num_epochs):
            # Training Phase
            model.train()
            running_loss = 0.0
            for batch_idx, batch in enumerate(train_dataloader):
                primary_seq, opponent_seq, targets = batch
                primary_seq = primary_seq.to(device)
                opponent_seq = opponent_seq.to(device)
                targets = targets.to(device)

                optimizer.zero_grad()
                outputs = model(primary_seq, opponent_seq)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                # Fractional epoch = epoch + progress within THIS epoch. A counter that keeps
                # growing across epochs would advance the schedule at twice the intended rate.
                scheduler.step(epoch + batch_idx / batches_per_epoch)

                running_loss += loss.item() * primary_seq.size(0)
            train_loss = running_loss / len(train_dataloader.dataset)

            # Validation Phase
            model.eval()
            val_running_loss = 0.0
            all_preds, all_targets = [], []
            with torch.no_grad():
                for batch in val_dataloader:
                    primary_seq, opponent_seq, targets = batch
                    primary_seq = primary_seq.to(device)
                    opponent_seq = opponent_seq.to(device)
                    targets = targets.to(device)
                    outputs = model(primary_seq, opponent_seq)
                    loss = criterion(outputs, targets)
                    val_running_loss += loss.item() * primary_seq.size(0)
                    all_preds.append(outputs)
                    all_targets.append(targets)
            val_loss = val_running_loss / len(val_dataloader.dataset)
            all_preds = torch.cat(all_preds)
            all_targets = torch.cat(all_targets)
            # compute_metrics is a project helper; its result is unpacked as (mae, rmse, r2).
            mae, rmse, r2 = compute_metrics(all_preds, all_targets)

            print(f"Epoch {epoch+1}/{num_epochs} - "
                  f"Train Loss: {train_loss:.4f}, Val Loss: {val_loss:.4f}, "
                  f"MAE: {mae:.4f}, RMSE: {rmse:.4f}, R2: {r2:.4f}")

            # Log metrics to TensorBoard
            writer.add_scalars("Loss", {"train": train_loss, "val": val_loss}, epoch)
            writer.add_scalar("MAE/val", mae, epoch)
            writer.add_scalar("RMSE/val", rmse, epoch)
            writer.add_scalar("R2/val", r2, epoch)
            writer.add_scalar("LearningRate", optimizer.param_groups[0]['lr'], epoch)
            writer.flush()

            # Checkpointing: save the model only when validation loss improved.
            if val_loss < best_val_loss:
                best_val_loss = val_loss
                torch.save({
                    'epoch': epoch,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'val_loss': val_loss,
                }, best_model_path)
                print(f"--> Saved best model with val_loss: {val_loss:.4f} at epoch {epoch+1}")
    finally:
        writer.close()

    return model


## Run

### Load Data

In [25]:
# Keep only the model features plus the identifier, matchup and target columns.
batter_df_original = pd.read_csv('data/train_batters.csv').loc[
    :, batter_features + ['opp_starting_pitcher', 'PLAYER-ID', 'DATE', 'TEAM', 'OPPONENT', 'next_target_H']
]
pitcher_df_original = pd.read_csv('data/train_pitchers.csv').loc[
    :, pitcher_features + ['PLAYER-ID', 'DATE', 'TEAM', 'OPPONENT', 'next_target_SO']
]

n_batter = len(batter_df_original)
n_pitcher = len(pitcher_df_original)

# Positional 90/10 split in file order: the first 90% of rows train, the remainder validate.
batter_cut = int(n_batter * 0.9)
train_batter_df = batter_df_original.iloc[:batter_cut]
val_batter_df = batter_df_original.iloc[batter_cut:]

pitcher_cut = int(n_pitcher * 0.9)
train_pitcher_df = pitcher_df_original.iloc[:pitcher_cut]
val_pitcher_df = pitcher_df_original.iloc[pitcher_cut:]

for split_label, split_frame in (
    ("Train batter data:", train_batter_df),
    ("Validation batter data:", val_batter_df),
    ("Train pitcher data:", train_pitcher_df),
    ("Validation pitcher data:", val_pitcher_df),
):
    print(split_label, split_frame.shape)


Train batter data: (122263, 15)
Validation batter data: (13585, 15)
Train pitcher data: (8459, 15)
Validation pitcher data: (940, 15)


### Setup Pretraining Dataloaders

In [None]:
# Build one pretraining dataset and one shuffled loader per player type.
# The column names and sequence length are shared; only the frame and feature list differ.
pretrain_dataset_kwargs = dict(
    player_id_col='PLAYER-ID',
    date_col='DATE',
    sequence_length=3,
)

batter_pretrain_dataset = PretrainingDataset(
    train_batter_df,
    useful_stats_cols=batter_features,
    **pretrain_dataset_kwargs,
)
batter_pretrain_loader = DataLoader(batter_pretrain_dataset, batch_size=32, shuffle=True)

pitcher_pretrain_dataset = PretrainingDataset(
    df=train_pitcher_df,
    useful_stats_cols=pitcher_features,
    **pretrain_dataset_kwargs,
)
pitcher_pretrain_loader = DataLoader(pitcher_pretrain_dataset, batch_size=32, shuffle=True)

### Create Pretraining Models

In [19]:
# Instantiate (but do not yet train) one PretrainingTransformer per player type.
# Both share the same architecture settings and differ only in input width.
pretrain_arch = dict(model_dim=64, n_heads=4, num_layers=2, sequence_length=3)

batter_pretrain_model = PretrainingTransformer(
    input_dim=len(batter_features),
    **pretrain_arch,
)

pitcher_pretrain_model = PretrainingTransformer(
    input_dim=len(pitcher_features),
    **pretrain_arch,
)

### Run Pretraining

In [20]:
# Pretrain the pitcher encoder; the trained model replaces the untrained instance.
print("Starting Pitcher Pretraining....")
pitcher_pretrain_model = train_pretraining_model(
    model=pitcher_pretrain_model,
    dataloader=pitcher_pretrain_loader,
    num_epochs=100,
    lr=5e-4,
    device='cuda',
)

# Batter pretraining is currently disabled; uncomment to run it.
# print("\nStarting Batter Pretraining....")
# batter_pretrain_model = train_pretraining_model(
#     batter_pretrain_model, 
#     batter_pretrain_loader, 
#     num_epochs=25, 
#     lr=1e-3, 
#     device='cuda'
# )

Starting Pitcher Pretraining....
Pretraining Epoch 1/100 - Loss: 642.0559
Pretraining Epoch 2/100 - Loss: 479.2214
Pretraining Epoch 3/100 - Loss: 314.1846
Pretraining Epoch 4/100 - Loss: 185.2379
Pretraining Epoch 5/100 - Loss: 101.8665
Pretraining Epoch 6/100 - Loss: 59.9848
Pretraining Epoch 7/100 - Loss: 29.7881
Pretraining Epoch 8/100 - Loss: 15.6668
Pretraining Epoch 9/100 - Loss: 12.0270
Pretraining Epoch 10/100 - Loss: 5.3355
Pretraining Epoch 11/100 - Loss: 3.4583
Pretraining Epoch 12/100 - Loss: 2.7421
Pretraining Epoch 13/100 - Loss: 1.8877
Pretraining Epoch 14/100 - Loss: 2.9260
Pretraining Epoch 15/100 - Loss: 0.8205
Pretraining Epoch 16/100 - Loss: 1.8411
Pretraining Epoch 17/100 - Loss: 0.5377
Pretraining Epoch 18/100 - Loss: 0.5478
Pretraining Epoch 19/100 - Loss: 1.3938
Pretraining Epoch 20/100 - Loss: 0.3483
Pretraining Epoch 21/100 - Loss: 0.5432
Pretraining Epoch 22/100 - Loss: 2.0234
Pretraining Epoch 23/100 - Loss: 0.5058
Pretraining Epoch 24/100 - Loss: 2.4208
Pr

### Supervised Dataloaders

In [26]:
# Create the supervised datasets. Each takes the primary player's frame first and the
# opposing player type's frame second.
batter_dataset = MLB_Batter_Dataset(
    train_batter_df, 
    train_pitcher_df, 
    sequence_length=3, 
    batter_features=batter_features, 
    pitcher_features=pitcher_features
    )

pitcher_dataset = MLB_Pitcher_Dataset(
    train_pitcher_df, 
    train_batter_df, 
    sequence_length=3, 
    pitcher_features=pitcher_features, 
    batter_features=batter_features
    )

pitcher_dataset_val = MLB_Pitcher_Dataset(
    val_pitcher_df, 
    val_batter_df, 
    sequence_length=3, 
    pitcher_features=pitcher_features, 
    batter_features=batter_features
    )

# The batter loader is disabled, so batter_dataset is built but not consumed in this cell.
#batter_loader = DataLoader(batter_dataset, batch_size=2, shuffle=True)
pitcher_loader = DataLoader(pitcher_dataset, batch_size=32, shuffle=True)
# Validation needs no shuffling; a fixed order keeps evaluation passes reproducible.
pitcher_loader_val = DataLoader(pitcher_dataset_val, batch_size=32, shuffle=False)

### Create Supervised Models

In [None]:
# Architecture settings shared by both supervised models.
supervised_arch = dict(model_dim=64, n_heads=4, num_layers=2, sequence_length=3)

batter_model = BatterTransformerModel(
    batter_input_dim=len(batter_features),
    opp_pitcher_input_dim=len(pitcher_features),
    **supervised_arch,
)
# Load the pretrained encoder weights into the batter model.
# NOTE(review): batter pretraining is commented out in the run cell, so unless it was run
# elsewhere these are the weights of an untrained PretrainingTransformer - confirm.
batter_model.load_pretrained_encoder(batter_pretrain_model)

# The pitcher model additionally sets an explicit dropout rate.
pitcher_model = PitcherTransformerModel(
    pitcher_input_dim=len(pitcher_features),
    opp_batter_input_dim=len(batter_features),
    dropout=0.3,
    **supervised_arch,
)
pitcher_model.load_pretrained_encoder(pitcher_pretrain_model)

### Train Models

In [34]:
# Supervised training of the pitcher model on the train/validation loaders.
print("Training Pitcher Model:")
trained_pitcher_model = train_model(
    model=pitcher_model,
    train_dataloader=pitcher_loader,
    val_dataloader=pitcher_loader_val,
    num_epochs=1000,
    lr=5e-4,
    device='cuda',
)

# Batter training is disabled.
# NOTE(review): the call below passes no validation loader, which train_model requires.
#print("Training Batter Model:")
#trained_batter_model = train_model(batter_model, batter_loader, num_epochs=5, lr=5e-3, device='cuda')

Training Pitcher Model:
Epoch 1/1000 - Train Loss: 7.6310, Val Loss: 7.0077, MAE: 2.1534, RMSE: 2.6472, R2: 0.0089
--> Saved best model with val_loss: 7.0077 at epoch 1
Epoch 2/1000 - Train Loss: 7.0708, Val Loss: 7.0373, MAE: 2.1669, RMSE: 2.6528, R2: 0.0047
Epoch 3/1000 - Train Loss: 6.6798, Val Loss: 6.8552, MAE: 2.1325, RMSE: 2.6182, R2: 0.0304
--> Saved best model with val_loss: 6.8552 at epoch 3
Epoch 4/1000 - Train Loss: 6.8322, Val Loss: 6.7990, MAE: 2.1132, RMSE: 2.6075, R2: 0.0384
--> Saved best model with val_loss: 6.7990 at epoch 4
Epoch 5/1000 - Train Loss: 6.6461, Val Loss: 6.7688, MAE: 2.1138, RMSE: 2.6017, R2: 0.0427
--> Saved best model with val_loss: 6.7688 at epoch 5
Epoch 6/1000 - Train Loss: 6.4847, Val Loss: 6.4714, MAE: 2.0520, RMSE: 2.5439, R2: 0.0847
--> Saved best model with val_loss: 6.4714 at epoch 6
Epoch 7/1000 - Train Loss: 6.3567, Val Loss: 6.4311, MAE: 2.0456, RMSE: 2.5360, R2: 0.0904
--> Saved best model with val_loss: 6.4311 at epoch 7
Epoch 8/1000 - 

KeyboardInterrupt: 

### Debug and Eval

In [None]:
# Print the first few samples of the pitcher dataset for a visual sanity check.
# Each item unpacks into three parts; going by the printed labels these are presumably
# the pitcher's sequence, the opposing batters' sequence, and the target.
for sample_idx in range(3):
    pitcher_seq, opp_batter_seq, target = pitcher_dataset[sample_idx]
    print(f"Sample {sample_idx+1}")
    print("pitcher Sequence:", pitcher_seq, sep="\n")
    print("Opposing batter Sequence:", opp_batter_seq, sep="\n")
    print("Target:", target, sep="\n")
    print("===")