# Sequence Generation and Activity Prediction

This notebook loads the trained CNN ensemble model, generates new protein sequences, and predicts their activities.

# Setup and Imports

In [1]:
! pip install tqdm numpy pandas torch fair-esm fairscale

Collecting fair-esm
  Downloading fair_esm-2.0.0-py3-none-any.whl.metadata (37 kB)
Collecting fairscale
  Downloading fairscale-0.4.13.tar.gz (266 kB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Downloading fair_esm-2.0.0-py3-none-any.whl (93 kB)
Building wheels for collected packages: fairscale
  Building wheel for fairscale (pyproject.toml) ... [?25ldone
[?25h  Created wheel for fairscale: filename=fairscale-0.4.13-py3-none-any.whl size=332208 sha256=32e44e3858ec7eefa622f72e0cd725371c3edb12600b1f8d44cde39404e85ec2
  Stored in directory: /tmp/pip-ephem-wheel-cache-541g2cuj/wheels/95/ef/96/5044bde220b2ea299bdc6ec05051e0ef187fad45b341d1c273
Successfully built fairscale
Installing collected packages: fair-esm, fairscale
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2/2[0m [fairscale]/2[0m [fairs

In [3]:
import math
import random
from pathlib import Path
from typing import List, Set, Tuple

import numpy as np
import pandas as pd
import torch
from tqdm import tqdm

import src.models as models

In [None]:
# Input: CSV of existing sequences, read in the filtering step
EXISTING_SEQUENCES_PATH = "data/seq_and_score.csv"

# All other files this notebook reads or writes live under OUTPUTS_DIR
OUTPUTS_DIR = Path("outputs")

# Artifacts read by this notebook
MODEL_PATH = OUTPUTS_DIR.joinpath("cnn_ensemble_model.pth")
METRICS_PATH = OUTPUTS_DIR.joinpath("training_metrics.json")

# Artifacts written by this notebook
FASTA_PATH = OUTPUTS_DIR.joinpath("new_seqs.fasta")
EMBEDDINGS_DIR = OUTPUTS_DIR.joinpath("new_seq_embeddings")
RESULTS_CSV_PATH = OUTPUTS_DIR.joinpath("all_sequences_predictions.csv")
RESULTS_PT_PATH = OUTPUTS_DIR.joinpath("all_sequences_predictions.pt")
TOP_5_FASTA_PATH = OUTPUTS_DIR.joinpath("top_5_sequences.fasta")
BOTTOM_5_FASTA_PATH = OUTPUTS_DIR.joinpath("bottom_5_sequences.fasta")

# Load Trained Model

In [5]:
# Use the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Build the ensemble on the chosen device before restoring its weights
model = models.Ensemble().to(device)

if not MODEL_PATH.exists():
    raise FileNotFoundError(f"Trained model not found at {MODEL_PATH}. Please run 01_train_model.ipynb first.")

# Restore the saved parameters, mapping them onto the selected device
state_dict = torch.load(MODEL_PATH, map_location=device)
model.load_state_dict(state_dict)
model.eval()
print(f"Loaded trained model from {MODEL_PATH} on {device}")

Loaded trained model from mount/outputs/cnn_ensemble_model.pth on cuda


In [6]:
# Show the metrics recorded at training time, if the metrics file is present
import json

if not METRICS_PATH.exists():
    print(f"Training metrics not found at {METRICS_PATH}.")
else:
    with open(METRICS_PATH, 'r') as metrics_file:
        metrics = json.load(metrics_file)
    print(f"Model training metrics:")
    print(f"  Final R²: {metrics['final_r2']:.4f}")
    print(f"  Epochs trained: {metrics['epochs_trained']}")

Model training metrics:
  Final R²: 0.8505
  Epochs trained: 200


# Sequence Generation

In [7]:
# Wild-type GFP sequence
# Parent sequence for mutant generation. Mutation labels used elsewhere in this
# notebook index into this string with 0-based positions.
wt_seq = "MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK"
print(f"Wild-type sequence length: {len(wt_seq)}")

Wild-type sequence length: 238


In [8]:
def generate_mutant_sequences(wt_sequence: str, n_mutations: int = 4, n_sequences: int = 10) -> List[Tuple[str, str]]:
    """
    Generate unique protein sequences with exactly n substitutions from a wild-type sequence.

    Each mutant is described by a key of the form "<pos><aa>:<pos><aa>:...", where
    <pos> is the 0-based position in ``wt_sequence`` and <aa> is the substituted
    residue. Positions appear in the key in the order they were sampled.

    Args:
        wt_sequence (str): Wild-type protein sequence using single letter amino acid codes
        n_mutations (int): Number of mutations per sequence
        n_sequences (int): Number of mutant sequences to generate

    Returns:
        List[Tuple[str, str]]: (key, mutated_sequence) pairs in generation order.
        Every mutated sequence appears at most once.

    Raises:
        ValueError: If n_mutations is negative or larger than the sequence length,
            or if n_sequences exceeds the number of distinct mutants that can exist.
    """
    # An ordered tuple (not a set) so random.choice sees the same candidate order
    # on every run; set iteration order over strings varies between processes.
    VALID_AA = tuple('ACDEFGHIKLMNPQRSTVWY')

    if not 0 <= n_mutations <= len(wt_sequence):
        raise ValueError(
            f"n_mutations must be between 0 and the sequence length ({len(wt_sequence)}), got {n_mutations}"
        )

    # Upper bound on the number of distinct mutants (exact when every wild-type
    # residue is a standard amino acid). Asking for more would never terminate.
    max_alternatives = max(
        (sum(aa != residue for aa in VALID_AA) for residue in set(wt_sequence)),
        default=0,
    )
    max_distinct = math.comb(len(wt_sequence), n_mutations) * max_alternatives ** n_mutations
    if n_sequences > max_distinct:
        raise ValueError(
            f"Cannot generate {n_sequences} distinct sequences: at most {max_distinct} exist "
            f"with {n_mutations} mutation(s)"
        )

    def mutate_sequence(seq: str, positions: List[int]) -> Tuple[str, str]:
        """Substitute a random different residue at each position; return (key, sequence)."""
        seq_list = list(seq)
        key = []
        for pos in positions:
            # Every valid residue except the one already present at this position
            candidates = [aa for aa in VALID_AA if aa != seq[pos]]
            mutation = random.choice(candidates)
            seq_list[pos] = mutation
            key.append(f"{pos}{mutation}")
        return ":".join(key), ''.join(seq_list)

    # Deduplicate on the sequence itself: the same sequence can be reached with the
    # positions sampled in a different order, which yields a different key.
    seen_sequences: Set[str] = set()
    mutants: List[Tuple[str, str]] = []

    while len(mutants) < n_sequences:
        mutation_positions = random.sample(range(len(wt_sequence)), n_mutations)
        key, new_sequence = mutate_sequence(wt_sequence, mutation_positions)

        if new_sequence not in seen_sequences:
            seen_sequences.add(new_sequence)
            mutants.append((key, new_sequence))

    return mutants

In [9]:
# Generate mutant sequences
# Seed Python's RNG first so the sampled mutation positions are repeatable from run to run.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

print("Generating mutant sequences...")
mutants = generate_mutant_sequences(wt_seq, n_mutations=4, n_sequences=1000)
print(f"Generated {len(mutants)} mutant sequences")

Generating mutant sequences...
Generated 1000 mutant sequences


# Filter Out Existing Sequences

In [10]:
# Collect the sequences already present in the dataset so they can be excluded
df_seqs = pd.read_csv(EXISTING_SEQUENCES_PATH)
existing_seqs = set(df_seqs["sequence"])
print(f"Found {len(existing_seqs)} existing sequences to filter out from {EXISTING_SEQUENCES_PATH}")

# Keep only the (key, sequence) pairs whose sequence is not already in the dataset
new_seqs = []
for mutation_key, mutant_seq in mutants:
    if mutant_seq in existing_seqs:
        continue
    new_seqs.append((mutation_key, mutant_seq))
print(f"After filtering: {len(new_seqs)} new sequences to evaluate")

Found 51715 existing sequences to filter out from mount/data/seq_and_score.csv
After filtering: 1000 new sequences to evaluate


In [11]:
# Make sure the output directory exists, then write one FASTA record per new sequence
OUTPUTS_DIR.mkdir(exist_ok=True)

# Each record is a ">key" header line followed by the sequence line
fasta_records = [f">{key}\n{seq}\n" for key, seq in new_seqs]
with open(FASTA_PATH, "w") as fh:
    fh.writelines(fasta_records)

print(f"Saved {len(new_seqs)} sequences to {FASTA_PATH}")

Saved 1000 sequences to mount/outputs/new_seqs.fasta


# Generate Embeddings for New Sequences

In [12]:
# Refresh the package index, (re)install ca-certificates and rebuild the system CA bundle.
# NOTE(review): presumably needed so the HTTPS checkpoint download in the next cell can be verified — confirm.
! apt update && apt install -y ca-certificates && update-ca-certificates

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB]                [0m[33m
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [3253 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [48.5 kB]
Get:7 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [5235 kB]
Get:8 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1271 kB]
Get:9 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB]33m[33m
Get:10 http://archive.ubuntu.com/ubuntu jammy/multiverse amd64 Packages [266 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB]
Get:13 htt

In [13]:
# Generate embeddings using ESM-2 model
print(f"Generating embeddings for {len(new_seqs)} sequences...")
print("This may take several minutes depending on the number of sequences.")

! python mount/src/embeddings.py --fasta {FASTA_PATH} --output_dir {EMBEDDINGS_DIR} --truncation_seq_length 238

Generating embeddings for 1000 sequences...
This may take several minutes depending on the number of sequences.
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/models/esm2_t48_15B_UR50D.pt" to /root/.cache/torch/hub/checkpoints/esm2_t48_15B_UR50D.pt
Downloading: "https://dl.fbaipublicfiles.com/fair-esm/regression/esm2_t48_15B_UR50D-contact-regression.pt" to /root/.cache/torch/hub/checkpoints/esm2_t48_15B_UR50D-contact-regression.pt
Processing 1 of 500 batches (2 sequences)
  if data.storage().size() > 0:
Processing 2 of 500 batches (2 sequences)
Processing 3 of 500 batches (2 sequences)
Processing 4 of 500 batches (2 sequences)
Processing 5 of 500 batches (2 sequences)
Processing 6 of 500 batches (2 sequences)
Processing 7 of 500 batches (2 sequences)
Processing 8 of 500 batches (2 sequences)
Processing 9 of 500 batches (2 sequences)
Processing 10 of 500 batches (2 sequences)
Processing 11 of 500 batches (2 sequences)
Processing 12 of 500 batches (2 sequences)
Processing 13 of 50

# Predict Activity for New Sequences

In [14]:
# Load the embedding files and rebuild the sequence each one belongs to

# Index of the representation layer read from each embedding file.
# NOTE(review): this was previously described as the final transformer layer, but the
# embedding log shows an esm2_t48 checkpoint — confirm 47 matches the layer used in training.
EMBEDDING_LAYER = 47

# Sorted so rows are processed in a stable order regardless of filesystem listing order.
# Note that every *.pt file in EMBEDDINGS_DIR is picked up, including any left from earlier runs.
embedding_files = sorted(EMBEDDINGS_DIR.glob("*.pt"))
if not embedding_files:
    raise FileNotFoundError(
        f"No embedding files (*.pt) found in {EMBEDDINGS_DIR}. Run the embedding generation cell first."
    )

labels = []
embeddings = []
sequences = []

print(f"Processing {len(embedding_files)} embedding files from {EMBEDDINGS_DIR}...")

# Extract labels and embeddings from all files
for emb_file in tqdm(embedding_files):
    # Keep tensors on the CPU: they are converted through NumPy in the prediction cell
    data = torch.load(emb_file, map_location="cpu")
    label = data["label"]
    embedding = data["mean_representations"][EMBEDDING_LAYER]

    # Rebuild the full sequence from the label, which has the form
    # "<pos><aa>:<pos><aa>:..." with 0-based positions into wt_seq
    seq = list(wt_seq)
    for mutation in label.split(":"):
        pos = int(mutation[:-1])
        new_aa = mutation[-1]
        seq[pos] = new_aa
    sequence = "".join(seq)

    labels.append(label)
    embeddings.append(embedding)
    sequences.append(sequence)

print(f"Loaded embeddings for {len(embeddings)} sequences")

Processing 1000 embedding files from mount/outputs/new_seq_embeddings...


100%|██████████| 1000/1000 [00:06<00:00, 147.28it/s]

Loaded embeddings for 1000 sequences





In [15]:
# Stack the per-sequence embeddings into one float tensor on the model's device
embedding_matrix = np.array(embeddings)
embeddings_tensor = torch.from_numpy(embedding_matrix).float().to(device)

# Run the model without gradient tracking, then flatten the scores into a NumPy array
with torch.no_grad():
    raw_output = model(embeddings_tensor)
    predictions = raw_output.cpu().flatten().detach().numpy()

print(f"Generated predictions for {len(predictions)} sequences")
print(f"Score range: {predictions.min():.3f} to {predictions.max():.3f}")
print(f"Score range: {predictions.min():.3f} to {predictions.max():.3f}")

Generated predictions for 1000 sequences
Score range: 1.193 to 3.886


# Rank and Save Results

In [16]:
# Assemble one row per sequence, ordered from highest to lowest predicted score
results_df = (
    pd.DataFrame({
        "label": labels,
        "sequence": sequences,
        "predicted_score": predictions
    })
    .sort_values("predicted_score", ascending=False)
    .reset_index(drop=True)
)

# Rank 1 is the highest predicted score
n_ranked = len(results_df)
results_df['rank'] = range(1, n_ranked + 1)

print(f"Created ranked results for {len(results_df)} sequences")
print(f"\nTop 5 sequences:")
print(results_df[['rank', 'label', 'predicted_score']].head())

Created ranked results for 1000 sequences

Top 5 sequences:
   rank               label  predicted_score
0     1  156Y:63I:195L:128Y         3.885675
1     2   233E:189A:11A:12R         3.797362
2     3  116Y:46A:196T:143F         3.790655
3     4      86M:83Y:6V:28L         3.785476
4     5  219M:104Q:51Q:157A         3.778043


In [17]:
# Persist the ranked table as CSV (without the index column)
results_df.to_csv(RESULTS_CSV_PATH, index=False)
print(f"Results saved to: {RESULTS_CSV_PATH}")

# Also serialize the same DataFrame with torch.save; the .pt file holds the
# DataFrame object itself, not a tensor
torch.save(results_df, RESULTS_PT_PATH)
print(f"Results also saved as PyTorch object to: {RESULTS_PT_PATH}")

# Summary statistics of the predicted scores
predicted_scores = results_df['predicted_score']
print(f"\n=== Prediction Summary ===")
print(f"Total sequences predicted: {len(results_df)}")
print(f"Score range: {predicted_scores.min():.3f} to {predicted_scores.max():.3f}")
print(f"Mean predicted score: {predicted_scores.mean():.3f}")
print(f"Std predicted score: {predicted_scores.std():.3f}")

Results saved to: mount/outputs/all_sequences_predictions.csv
Results also saved as PyTorch object to: mount/outputs/all_sequences_predictions.pt

=== Prediction Summary ===
Total sequences predicted: 1000
Score range: 1.193 to 3.886
Mean predicted score: 1.852
Std predicted score: 0.662


In [18]:
# Write the five highest-ranked and, for comparison, the five lowest-ranked sequences
# to separate FASTA files. Each header carries the label, predicted score and rank.
fasta_exports = (
    (results_df.head(5), TOP_5_FASTA_PATH, f"Top 5 sequences saved to: {TOP_5_FASTA_PATH}"),
    (results_df.tail(5), BOTTOM_5_FASTA_PATH, f"Bottom 5 sequences saved to: {BOTTOM_5_FASTA_PATH}"),
)

for subset_df, subset_path, saved_message in fasta_exports:
    with open(subset_path, "w") as fh:
        for _, row in subset_df.iterrows():
            fh.write(f">{row['label']}|score:{row['predicted_score']:.3f}|rank:{row['rank']}\n")
            fh.write(f"{row['sequence']}\n")
    print(saved_message)

Top 5 sequences saved to: mount/outputs/top_5_sequences.fasta
Bottom 5 sequences saved to: mount/outputs/bottom_5_sequences.fasta


# Sequence Generation Complete

Successfully generated, filtered, and ranked protein sequences by predicted activity. The results are saved and ready for visualization in the figures notebook (`03_create_figures.ipynb`).

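## Files Created:
- `outputs/new_seqs.fasta` - Newly generated sequences that passed the duplicate filter
- `outputs/new_seq_embeddings/` - Embedding files (`*.pt`) for the new sequences
- `outputs/all_sequences_predictions.csv` - Full ranked results
- `outputs/all_sequences_predictions.pt` - The same ranked results saved with `torch.save`
- `outputs/top_5_sequences.fasta` - Top 5 predicted sequences
- `outputs/bottom_5_sequences.fasta` - Bottom 5 predicted sequences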