# CNN Ensemble Model Training

This notebook creates, trains, and saves a CNN ensemble model for protein activity prediction.

# Setup and Config

In [None]:
! pip install tqdm numpy pandas torch seaborn scikit-learn

In [None]:
import sys

# Add the mounted directory to the module search path (the `src.*` imports in the
# next cell presumably resolve through it). Guarded so that re-running this cell
# does not keep appending duplicate entries.
if 'mount' not in sys.path:
    sys.path.append('mount')

In [None]:
import json
from pathlib import Path

import pandas as pd
import torch

import src.data as data
import src.models as models
import src.training as training

In [None]:
# Input location and the artifacts this notebook writes
TRAINING_DATA_PATH = "mount/data/esm2_15b_embeddings_and_meta.csv"
OUTPUTS_DIR = Path("mount/outputs")
MODEL_PATH = OUTPUTS_DIR.joinpath("cnn_ensemble_model.pth")
METRICS_PATH = OUTPUTS_DIR.joinpath("training_metrics.json")

# Data Loading

In [None]:
# Read the training table from disk and report how many rows were loaded
df = pd.read_csv(filepath_or_buffer=TRAINING_DATA_PATH)
print(f"Loaded {len(df)} sequences for training from {TRAINING_DATA_PATH}")

# Training Configuration

In [None]:
class TrainingConfig:
    """Container for the hyperparameters handed to the training routine.

    All values are keyword-only and default to the settings this notebook was
    originally run with, so ``TrainingConfig()`` is unchanged while individual
    values can be overridden, e.g. ``TrainingConfig(epochs=5)`` for a quick run.

    NOTE(review): the attributes are consumed by ``training.train_variant_cnn``,
    which is defined outside this notebook; the descriptions below are inferred
    from the attribute names and should be confirmed against that function.

    Attributes:
        learning_rate: presumably the optimizer step size.
        batch_size: presumably the number of samples per batch.
        epochs: presumably the number of passes over the training data.
        weight_decay: presumably the optimizer's weight-decay coefficient.
        gradient_clip: presumably the gradient clipping threshold.
    """

    def __init__(
        self,
        *,
        learning_rate: float = 1e-3,
        batch_size: int = 64,
        epochs: int = 200,
        weight_decay: float = 0.01,
        gradient_clip: float = 1.0,
    ):
        self.learning_rate = learning_rate
        self.batch_size = batch_size
        self.epochs = epochs
        self.weight_decay = weight_decay
        self.gradient_clip = gradient_clip

    def __repr__(self) -> str:
        # Show every setting so the active configuration is visible when the
        # object is displayed as a cell result.
        settings = ", ".join(f"{name}={value!r}" for name, value in vars(self).items())
        return f"{type(self).__name__}({settings})"

In [None]:
# Instantiate the hyperparameters with their defaults
config = TrainingConfig()

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Data Preparation

In [None]:
# Wrap the loaded DataFrame in the project's ESMDataHolder (from src.data).
data_holder = data.ESMDataHolder(df)
# Loader over the full dataset; not referenced again in the cells shown in this notebook.
all_loader = data_holder.loader_all_data()
# NOTE(review): the split happens inside train_val_split() and no random seed is set
# in this notebook — confirm whether the split is deterministic across runs.
train_loader, val_loader = data_holder.train_val_split()

# len() of each loader is reported as its number of batches.
print(f"Training batches: {len(train_loader)}")
print(f"Validation batches: {len(val_loader)}")

# Model Creation and Training

In [None]:
# Build the ensemble first, then move it onto the selected device
model = models.Ensemble()
model = model.to(device)
print(f"Created CNN ensemble model on {device}")

In [None]:
# Run training; the result dict carries the trained model and the recorded metrics
training_result = training.train_variant_cnn(
    model,
    train_loader,
    val_loader,
    config,
    device,
    updates=True,
)

# Continue with the model object returned by the training routine
model = training_result['model']
print(f"\nTraining completed! Final R²: {training_result['final_r2']:.4f}")

100%|██████████| 200/200 [24:54<00:00,  7.47s/it, train_loss=0.1220, val_loss=0.1666, val_r2=0.8505]


Training completed! Final R²: 0.8505





# Model Evaluation

In [None]:
# Recompute R² for the trained model on the validation loader
val_r2 = training.r2_score_for_model_and_loader(
    model,
    val_loader,
)
print(f"Validation R² score: {val_r2:.4f}")

Validation R² score: 0.8505


In [None]:
# Print the end-of-training metrics, one per line, from a single print call
print(
    "\n=== Training Summary ===",
    f"Final training loss: {training_result['train_losses'][-1]:.4f}",
    f"Final validation loss: {training_result['val_losses'][-1]:.4f}",
    f"Final validation R²: {training_result['final_r2']:.4f}",
    f"Best validation R²: {max(training_result['val_r2_history']):.4f}",
    sep="\n",
)


=== Training Summary ===
Final training loss: 0.1220
Final validation loss: 0.1666
Final validation R²: 0.8505
Best validation R²: 0.8509


# Save Trained Model

In [None]:
# Create the outputs directory, including any missing parent directories,
# if it doesn't exist yet
OUTPUTS_DIR.mkdir(parents=True, exist_ok=True)

# Save the trained model's parameters (state_dict) rather than the whole object
torch.save(model.state_dict(), MODEL_PATH)
print(f"Model saved to: {MODEL_PATH}")

# Collect the headline metrics as plain Python numbers so they serialise to JSON
metrics = {
    'final_r2': float(training_result['final_r2']),
    'final_train_loss': float(training_result['train_losses'][-1]),
    'final_val_loss': float(training_result['val_losses'][-1]),
    'best_val_r2': float(max(training_result['val_r2_history'])),
    'epochs_trained': len(training_result['train_losses'])
}

# Explicit encoding keeps the metrics file identical across platforms
with open(METRICS_PATH, 'w', encoding='utf-8') as f:
    json.dump(metrics, f, indent=2)
print(f"Training metrics saved to: {METRICS_PATH}")

Model saved to: mount/outputs/cnn_ensemble_model.pth
Training metrics saved to: mount/outputs/training_metrics.json


# Training Complete

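The CNN ensemble model has been trained, and its weights (a `state_dict`) have been saved to `mount/outputs/cnn_ensemble_model.pth`, with summary metrics in `mount/outputs/training_metrics.json`. You can now load the saved weights in the sequence generation notebook (`02_generate_sequences.ipynb`).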