In [None]:
!pip install fair-esm -q

In [None]:
import sys
import os
import numpy as np
import pickle
from pathlib import Path

# Report whether CUDA is usable and, when it is, which device 0 is and how much memory it has
import torch

cuda_ready = torch.cuda.is_available()
print(f"GPU available: {cuda_ready}")
if cuda_ready:
    device_name = torch.cuda.get_device_name(0)
    device_bytes = torch.cuda.get_device_properties(0).total_memory
    print(f"GPU: {device_name}")
    print(f"Memory: {device_bytes / 1e9:.2f} GB")

In [None]:
# Create the scratch directories relative to the current working directory.
# pathlib replaces the `!mkdir -p` shell escape so the cell does not depend on a
# POSIX shell; parents=True + exist_ok=True gives the same semantics as `mkdir -p`.
for scratch_dir in ("src/data", "src/models", "src/utils"):
    Path(scratch_dir).mkdir(parents=True, exist_ok=True)

# We'll create minimal versions of required files inline
# (Alternative: upload as dataset and unzip)

print("✓ Directories created")
print("Next: Copy minimal code for preprocessing, models, and training")

In [None]:
# Source of the helper module written to /kaggle/working/preprocessingp.py below.
# It is kept as a string so the notebook can materialise an importable module
# without uploading extra files.
preprocessing_code = '''
"""Minimal preprocessing helpers: FASTA parsing and GO-term label matrices."""
import pandas as pd
import numpy as np
from pathlib import Path
import pickle


class ProteinDataPreprocessor:
    """Reads sequences and annotations under ``data_dir`` and builds label matrices."""

    def __init__(self, data_dir):
        self.data_dir = Path(data_dir)
        self.train_dir = self.data_dir / "Train"
        self.go_term_to_idx = {}
        self.idx_to_go_term = {}
        self.go_term_to_aspect = {}
        self.aspect_to_terms = {"C": [], "F": [], "P": []}

    def load_sequences(self, split="train"):
        """Parse a FASTA file into a ``{protein_id: sequence}`` dict.

        ``split="train"`` reads Train/train_sequences.fasta; any other value
        reads Test/testsuperset.fasta. The ID is the first whitespace-separated
        token of the header; when that token contains "|", the field after the
        first "|" is used instead.
        """
        if split == "train":
            fasta_file = self.train_dir / "train_sequences.fasta"
        else:
            fasta_file = self.data_dir / "Test" / "testsuperset.fasta"

        sequences = {}
        current_id = None
        current_seq = []

        with open(fasta_file, "r") as f:
            for line in f:
                line = line.strip()
                if line.startswith(">"):
                    # A new header closes the previous record.
                    if current_id:
                        sequences[current_id] = "".join(current_seq)
                    current_id = line[1:].split()[0]
                    if "|" in current_id:
                        current_id = current_id.split("|")[1]
                    current_seq = []
                else:
                    current_seq.append(line)
            # Flush the final record, which has no following header.
            if current_id:
                sequences[current_id] = "".join(current_seq)

        print(f"Loaded {len(sequences)} {split} sequences")
        return sequences

    def load_annotations(self):
        """Read Train/train_terms.tsv (tab separated) into a DataFrame."""
        annotations_file = self.train_dir / "train_terms.tsv"
        df = pd.read_csv(annotations_file, sep="\\t")
        print(f"Loaded {len(df)} annotations")
        return df

    def build_label_encodings(self, annotations_df):
        """Populate the term <-> index maps and the per-aspect term lists.

        Reads the "term" and "aspect" columns of ``annotations_df``. Terms are
        indexed in sorted order. Raises KeyError if an aspect value is not one
        of "C", "F" or "P".
        """
        unique_terms = sorted(annotations_df["term"].unique())
        self.go_term_to_idx = {term: idx for idx, term in enumerate(unique_terms)}
        self.idx_to_go_term = {idx: term for term, idx in self.go_term_to_idx.items()}

        term_aspect_map = annotations_df[["term", "aspect"]].drop_duplicates()
        self.go_term_to_aspect = dict(zip(term_aspect_map["term"], term_aspect_map["aspect"]))

        # Start from empty lists so a second call does not append duplicate terms.
        self.aspect_to_terms = {"C": [], "F": [], "P": []}
        for term, aspect in self.go_term_to_aspect.items():
            self.aspect_to_terms[aspect].append(term)

        print(f"Total GO terms: {len(self.go_term_to_idx)}")

    @staticmethod
    def _mark_annotations(labels, row_lookup, col_lookup):
        """Set ``labels[row, col] = 1.0`` for every annotation with a known row and column.

        ``row_lookup`` and ``col_lookup`` are the results of ``Series.map`` with
        an index dict, so they hold NaN wherever the protein or term is unknown.
        """
        known = row_lookup.notna().to_numpy() & col_lookup.notna().to_numpy()
        rows = row_lookup.to_numpy()[known].astype(np.int64)
        cols = col_lookup.to_numpy()[known].astype(np.int64)
        labels[rows, cols] = 1.0

    def create_multi_label_matrix(self, annotations_df, protein_ids):
        """Return a float32 ``(len(protein_ids), n_terms)`` 0/1 matrix.

        Row order follows ``protein_ids``; column order follows
        ``go_term_to_idx``. Annotations for proteins or terms that are not
        indexed are ignored.
        """
        n_proteins = len(protein_ids)
        n_terms = len(self.go_term_to_idx)
        labels = np.zeros((n_proteins, n_terms), dtype=np.float32)

        protein_to_idx = {pid: idx for idx, pid in enumerate(protein_ids)}

        # Vectorised lookup instead of a Python-level loop over every annotation row.
        self._mark_annotations(
            labels,
            annotations_df["EntryID"].map(protein_to_idx),
            annotations_df["term"].map(self.go_term_to_idx),
        )

        print(f"Created label matrix: {labels.shape}")
        return labels

    def create_aspect_specific_matrices(self, annotations_df, protein_ids):
        """Return ``{aspect: (labels, global_term_indices)}`` for "C", "F" and "P".

        ``labels`` is a float32 ``(len(protein_ids), n_aspect_terms)`` 0/1
        matrix whose columns follow ``aspect_to_terms[aspect]``;
        ``global_term_indices[j]`` is the ``go_term_to_idx`` index of column j.
        Annotation rows whose term is not listed under that aspect are skipped.
        """
        aspect_data = {}
        protein_to_idx = {pid: idx for idx, pid in enumerate(protein_ids)}

        for aspect in ["C", "F", "P"]:
            aspect_terms = self.aspect_to_terms[aspect]
            aspect_term_indices = [self.go_term_to_idx[term] for term in aspect_terms]
            term_to_local_idx = {term: local_idx
                                 for local_idx, term in enumerate(aspect_terms)}

            labels = np.zeros((len(protein_ids), len(aspect_term_indices)), dtype=np.float32)
            aspect_annotations = annotations_df[annotations_df["aspect"] == aspect]

            self._mark_annotations(
                labels,
                aspect_annotations["EntryID"].map(protein_to_idx),
                aspect_annotations["term"].map(term_to_local_idx),
            )

            aspect_data[aspect] = (labels, aspect_term_indices)
            print(f"Aspect {aspect}: {labels.shape}")

        return aspect_data
'''

# The module name on disk is `preprocessingp`; importing cells must use that name.
with open('/kaggle/working/preprocessingp.py', 'w') as f:
    f.write(preprocessing_code)

print("✓ Preprocessing code created")

In [None]:
import torch
import numpy as np
from tqdm import tqdm
import esm
import pickle
import gc

# Configuration for the embedding cells below.
# Alternative checkpoint, currently disabled:
# MODEL_NAME = 'esm2_t30_150M_UR50D'
MODEL_NAME = 'esm2_t6_8M_UR50D'  # checkpoint name passed to esm.pretrained.load_model_and_alphabet
BATCH_SIZE = 16  # number of sequences per forward pass; kept small for the Kaggle GPU
MAX_LENGTH = 1024  # longer sequences are shortened to this many residues by truncate_balanced
DATA_DIR = '/kaggle/input/cafa-6-protein-function-prediction'  # dataset root; expected to hold Train/ and Test/

In [None]:
# Truncation helper
def truncate_balanced(seq, max_len):
    """Shorten ``seq`` to at most ``max_len`` characters, keeping both ends.

    Sequences that already fit are returned unchanged. Otherwise the first
    ``max_len // 2`` characters and the last ``max_len - max_len // 2``
    characters are concatenated, so the middle is what gets dropped.

    Args:
        seq: The sequence string to shorten.
        max_len: Maximum length of the result; must be >= 0.

    Returns:
        A string of length ``min(len(seq), max_len)``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    if len(seq) <= max_len:
        return seq
    n_term = max_len // 2
    c_term = max_len - n_term
    # Use an explicit start index for the tail: seq[-0:] is the whole string,
    # so a negative-index slice would return everything when c_term is 0.
    return seq[:n_term] + seq[len(seq) - c_term:]

In [None]:
# --- Load the ESM-2 checkpoint onto the GPU in inference mode ---
print(f"Loading ESM-2 model: {MODEL_NAME}")
model, alphabet = esm.pretrained.load_model_and_alphabet(MODEL_NAME)
batch_converter = alphabet.get_batch_converter()
model = model.cuda().eval()
num_layers = model.num_layers

# --- Read the training sequences through the generated helper module ---
sys.path.append('/kaggle/working')
from preprocessingp import ProteinDataPreprocessor

preprocessor = ProteinDataPreprocessor(DATA_DIR)
sequences = preprocessor.load_sequences('train')
protein_ids = list(sequences)

print(f"\nGenerating embeddings for {len(protein_ids)} proteins...")


# --- Embed BATCH_SIZE sequences at a time ---
# One vector per protein is kept: the final-layer representation at token position 0.
all_embeddings = []
all_protein_ids = []

for batch_start in tqdm(range(0, len(protein_ids), BATCH_SIZE)):
    batch_ids = protein_ids[batch_start:batch_start + BATCH_SIZE]
    batch_data = []
    for pid in batch_ids:
        batch_data.append((pid, truncate_balanced(sequences[pid], MAX_LENGTH)))

    # The batch converter returns (labels, strings, tokens); only the tokens are needed.
    batch_tokens = batch_converter(batch_data)[2].cuda()

    with torch.no_grad():
        forward_out = model(batch_tokens, repr_layers=[num_layers], return_contacts=False)
        first_token_vectors = forward_out['representations'][num_layers][:, 0, :].cpu().numpy()

    all_embeddings.extend(first_token_vectors)
    all_protein_ids.extend(batch_ids)

    # Release cached GPU memory on every 100th batch (including the first)
    batch_number = batch_start // BATCH_SIZE
    if batch_number % 100 == 0:
        torch.cuda.empty_cache()
        gc.collect()

embeddings_array = np.array(all_embeddings)

# --- Persist the embeddings together with the IDs that label each row ---
embeddings_path = '/kaggle/working/train_embeddings.pkl'
embeddings_data = dict(
    embeddings=embeddings_array,
    protein_ids=all_protein_ids,
    model_name=MODEL_NAME,
    embedding_dim=model.embed_dim,
    num_proteins=len(all_protein_ids),
)

with open(embeddings_path, 'wb') as pkl_file:
    pickle.dump(embeddings_data, pkl_file)

print(f"\n✓ Embeddings generated: {embeddings_array.shape}")
print(f"✓ Saved to /kaggle/working/train_embeddings.pkl")
print(f"Size: {Path('/kaggle/working/train_embeddings.pkl').stat().st_size / 1e9:.2f} GB")

# --- Drop the language model so the classifier can use the GPU memory ---
del model
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Make the generated helper module importable without stacking duplicate
# sys.path entries when the cell is re-run.
if '/kaggle/working' not in sys.path:
    sys.path.append('/kaggle/working')
# The helper is written to /kaggle/working/preprocessingp.py, so it has to be
# imported under that name; there is no module called `preprocessing`.
from preprocessingp import ProteinDataPreprocessor

preprocessor = ProteinDataPreprocessor(DATA_DIR)
sequences = preprocessor.load_sequences('train')
annotations_df = preprocessor.load_annotations()
preprocessor.build_label_encodings(annotations_df)

# Keep only proteins that have both a sequence and at least one annotation.
# Sorting fixes the row order of every label matrix built below; this order is
# saved as 'protein_ids' so consumers can line rows up by ID.
protein_ids = sorted(set(sequences) & set(annotations_df['EntryID'].unique()))
print(f"Proteins with both sequence and annotations: {len(protein_ids)}")

# Create label matrices
full_labels = preprocessor.create_multi_label_matrix(annotations_df, protein_ids)
aspect_labels = preprocessor.create_aspect_specific_matrices(annotations_df, protein_ids)

# Save
preprocessed_data = {
    'protein_ids': protein_ids,
    'full_labels': full_labels,
    'aspect_labels': aspect_labels,
    'go_term_to_idx': preprocessor.go_term_to_idx,
    'idx_to_go_term': preprocessor.idx_to_go_term,
    'go_term_to_aspect': preprocessor.go_term_to_aspect
}

with open('/kaggle/working/preprocessed_labels.pkl', 'wb') as f:
    pickle.dump(preprocessed_data, f)

print("✓ Labels preprocessed and saved")

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split

# Simple model
class SimpleClassifier(nn.Module):
    """Feed-forward multi-label classifier over fixed-size embeddings.

    Each hidden stage is Linear -> ReLU -> BatchNorm1d -> Dropout, followed by a
    final Linear layer that emits one raw logit per class (no sigmoid applied).

    Args:
        input_dim: Size of each input embedding vector.
        num_classes: Number of output logits.
        hidden_dims: Width of each hidden stage, in order. The default
            ``(1024, 512)`` reproduces the original fixed architecture,
            including its ``state_dict`` keys.
        dropout: Dropout probability used after every hidden stage.
    """

    def __init__(self, input_dim, num_classes, hidden_dims=(1024, 512), dropout=0.3):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.ReLU(),
                nn.BatchNorm1d(hidden_dim),
                nn.Dropout(dropout),
            ])
            prev_dim = hidden_dim
        layers.append(nn.Linear(prev_dim, num_classes))
        # Keep the attribute name and layer order stable: saved checkpoints are
        # keyed as "classifier.<index>.<param>".
        self.classifier = nn.Sequential(*layers)

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for input ``(batch, input_dim)``."""
        return self.classifier(x)

# Simple dataset
class EmbeddingDataset(Dataset):
    """Pairs row ``i`` of ``embeddings`` with row ``i`` of ``labels``.

    Both inputs are converted to float32 tensors up front. The caller is
    responsible for putting the rows in the same order; this class can only
    verify that the two have the same number of rows.

    Args:
        embeddings: Array-like of per-sample feature vectors.
        labels: Array-like of per-sample label vectors.

    Raises:
        ValueError: If ``embeddings`` and ``labels`` differ in length.
    """

    def __init__(self, embeddings, labels):
        # A length mismatch means the rows cannot correspond one-to-one; fail
        # here rather than return wrong pairs or an IndexError mid-epoch.
        if len(embeddings) != len(labels):
            raise ValueError(
                f"embeddings and labels must have the same number of rows, "
                f"got {len(embeddings)} and {len(labels)}"
            )
        self.embeddings = torch.FloatTensor(embeddings)
        self.labels = torch.FloatTensor(labels)

    def __len__(self):
        return len(self.embeddings)

    def __getitem__(self, idx):
        """Return ``{'embedding': ..., 'labels': ...}`` for sample ``idx``."""
        return {
            'embedding': self.embeddings[idx],
            'labels': self.labels[idx]
        }

# Focal Loss (simplified)
class FocalLoss(nn.Module):
    """Element-wise binary focal loss on raw logits, averaged over all elements.

    Each element's BCE-with-logits term is scaled by ``(1 - p_t) ** gamma``,
    where ``p_t`` is the probability given to the target value, and by a class
    weight of ``alpha`` for positive targets and ``1 - alpha`` for negatives.
    """

    def __init__(self, alpha=0.25, gamma=2.0):
        super().__init__()
        self.alpha, self.gamma = alpha, gamma

    def forward(self, inputs, targets):
        """Return the scalar mean focal loss for ``inputs`` (logits) vs ``targets``."""
        negatives = 1 - targets
        prob_positive = torch.sigmoid(inputs)

        # Probability the model assigns to the value the target actually has
        prob_of_target = prob_positive * targets + (1 - prob_positive) * negatives
        modulating = (1 - prob_of_target) ** self.gamma
        class_weight = self.alpha * targets + (1 - self.alpha) * negatives

        elementwise_bce = nn.functional.binary_cross_entropy_with_logits(
            inputs, targets, reduction='none'
        )
        weighted = class_weight * modulating * elementwise_bce
        return weighted.mean()

In [None]:
from sklearn.metrics import f1_score

NUM_EPOCHS = 10
SPLIT_SEED = 42  # fixes the train/validation split so reruns are comparable

# Load data
with open('/kaggle/working/train_embeddings.pkl', 'rb') as f:
    emb_data = pickle.load(f)
with open('/kaggle/working/preprocessed_labels.pkl', 'rb') as f:
    label_data = pickle.load(f)

labels = label_data['full_labels']
label_protein_ids = label_data['protein_ids']

# The two pickles do not share a row order: the embedding matrix has one row per
# training sequence in FASTA order, while the label matrix has one row per
# annotated protein in sorted-ID order. Reorder (and subset) the embeddings by
# protein ID so that row i of both arrays describes the same protein.
embedding_row_of = {pid: row for row, pid in enumerate(emb_data['protein_ids'])}
unembedded = [pid for pid in label_protein_ids if pid not in embedding_row_of]
if unembedded:
    raise ValueError(
        f"{len(unembedded)} labelled proteins have no embedding, e.g. {unembedded[:5]}"
    )
row_order = np.asarray([embedding_row_of[pid] for pid in label_protein_ids], dtype=np.int64)
embeddings = emb_data['embeddings'][row_order]

print(f"Embeddings: {embeddings.shape}")
print(f"Labels: {labels.shape}")

# Create dataset
dataset = EmbeddingDataset(embeddings, labels)
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

# drop_last: BatchNorm1d cannot run in training mode on a batch of one sample,
# which is what a trailing partial batch can be.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Model
model = SimpleClassifier(
    input_dim=emb_data['embedding_dim'],
    num_classes=labels.shape[1]
).cuda()

criterion = FocalLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

print(f"\n{'='*60}")
print("Training baseline model...")
print(f"{'='*60}\n")

# Training loop
for epoch in range(NUM_EPOCHS):
    # Train
    model.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{NUM_EPOCHS}"):
        # Distinct names so the full `embeddings` array above is not overwritten
        batch_embeddings = batch['embedding'].cuda()
        labels_batch = batch['labels'].cuda()

        optimizer.zero_grad()
        outputs = model(batch_embeddings)
        loss = criterion(outputs, labels_batch)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # Validate
    model.eval()
    val_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in val_loader:
            batch_embeddings = batch['embedding'].cuda()
            labels_batch = batch['labels'].cuda()

            outputs = model(batch_embeddings)
            loss = criterion(outputs, labels_batch)
            val_loss += loss.item()

            preds = torch.sigmoid(outputs).cpu().numpy()
            all_preds.append(preds)
            all_labels.append(labels_batch.cpu().numpy())

    # Micro-averaged F1 over every (protein, term) cell at a 0.5 probability cut-off
    all_preds = np.vstack(all_preds)
    all_labels = np.vstack(all_labels)

    f1 = f1_score(all_labels, (all_preds > 0.5).astype(int),
                  average='micro', zero_division=0)

    print(f"Epoch {epoch+1}: Train Loss={train_loss/len(train_loader):.4f}, "
          f"Val Loss={val_loss/len(val_loader):.4f}, F1={f1:.4f}")

# Save model together with the dimensions needed to rebuild it
torch.save({
    'model_state_dict': model.state_dict(),
    'embedding_dim': emb_data['embedding_dim'],
    'num_classes': labels.shape[1]
}, '/kaggle/working/baseline_model.pt')

print(f"\n✓ Model trained and saved!")
print(f"Final F1: {f1:.4f}")

In [None]:
import torch
import esm
import pickle
import gc
from tqdm import tqdm
import sys

print("Loading ESM-2 model...")
# These must describe the same checkpoint used for the training embeddings: the
# classifier's input size is taken from the training embedding dimension.
MODEL_NAME = 'esm2_t6_8M_UR50D'  # Using 8M for speed
BATCH_SIZE = 16  # Small batch for Kaggle GPU
MAX_LENGTH = 1024
print(f"{MODEL_NAME}")
model, alphabet = esm.pretrained.load_model_and_alphabet(MODEL_NAME)
batch_converter = alphabet.get_batch_converter()
model = model.cuda()
model.eval()
num_layers = model.num_layers

# Load test sequences
print("Loading test sequences...")
if '/kaggle/working' not in sys.path:
    sys.path.append('/kaggle/working')
# The helper is written to /kaggle/working/preprocessingp.py, so it has to be
# imported under that name; there is no module called `preprocessing`.
from preprocessingp import ProteinDataPreprocessor

preprocessor = ProteinDataPreprocessor(DATA_DIR)
test_sequences = preprocessor.load_sequences('test')
test_protein_ids = list(test_sequences.keys())

print(f"Generating embeddings for {len(test_protein_ids)} test proteins...")

# Generate embeddings
test_embeddings = []
test_ids = []

for i in tqdm(range(0, len(test_protein_ids), BATCH_SIZE)):
    batch_ids = test_protein_ids[i:i + BATCH_SIZE]
    batch_data = [(pid, truncate_balanced(test_sequences[pid], MAX_LENGTH))
                  for pid in batch_ids]

    _, _, batch_tokens = batch_converter(batch_data)
    batch_tokens = batch_tokens.cuda()

    with torch.no_grad():
        results = model(batch_tokens, repr_layers=[num_layers], return_contacts=False)
        # One vector per protein: final-layer representation at token position 0
        embeddings = results['representations'][num_layers][:, 0, :].cpu().numpy()

    test_embeddings.extend(embeddings)
    test_ids.extend(batch_ids)

    # Release cached GPU memory on every 100th batch
    if (i // BATCH_SIZE) % 100 == 0:
        torch.cuda.empty_cache()
        gc.collect()

test_embeddings_array = np.array(test_embeddings)

# Save test embeddings; row i of 'embeddings' belongs to protein_ids[i]
test_emb_data = {
    'embeddings': test_embeddings_array,
    'protein_ids': test_ids,
    'model_name': MODEL_NAME,
    'embedding_dim': model.embed_dim
}

with open('/kaggle/working/test_embeddings.pkl', 'wb') as f:
    pickle.dump(test_emb_data, f)

print(f"\n✓ Test embeddings generated: {test_embeddings_array.shape}")

# Free memory
del model
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Kaggle Notebook Cell - Memory-Efficient Submission Creation (TSV format)

import torch
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm
import gc

print("Loading data...")

# Load test embeddings; row i of 'embeddings' belongs to protein_ids[i]
with open('/kaggle/working/test_embeddings.pkl', 'rb') as f:
    test_data = pickle.load(f)
test_embeddings = test_data['embeddings']
protein_ids = test_data['protein_ids']

# Rebuild the classifier from the dimensions stored next to its weights
checkpoint = torch.load('/kaggle/working/baseline_model.pt')
model = SimpleClassifier(
    input_dim=checkpoint['embedding_dim'],
    num_classes=checkpoint['num_classes']
).cuda()
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

# Load GO term mappings (output column index -> GO term ID)
with open('/kaggle/working/preprocessed_labels.pkl', 'rb') as f:
    label_data = pickle.load(f)
idx_to_go_term = label_data['idx_to_go_term']

print(f"Generating predictions for {len(protein_ids)} proteins...")

# ============================================================
# Memory-Efficient Submission Creation (TSV format)
# ============================================================

output_file = '/kaggle/working/submission.tsv'
threshold = 0.5
batch_size = 64  # Small batch to avoid OOM
fallback_top_k = 3  # terms emitted for a protein with nothing above the threshold

total_preds = 0
no_pred_count = 0

# Stream predictions to disk one batch at a time through a single file handle,
# so no more than one batch of rows is ever held in memory.
with open(output_file, 'w') as out, torch.no_grad():
    # Header (TSV format - tab separated)
    out.write("Protein ID\tGO Term\tConfidence\n")

    for start_idx in tqdm(range(0, len(protein_ids), batch_size)):
        end_idx = min(start_idx + batch_size, len(protein_ids))
        batch_ids = protein_ids[start_idx:end_idx]

        # Predict
        batch_tensor = torch.FloatTensor(test_embeddings[start_idx:end_idx]).cuda()
        probs = torch.sigmoid(model(batch_tensor)).cpu().numpy()

        rows = []
        for pid, protein_probs in zip(batch_ids, probs):
            # Terms above the threshold, in ascending column order
            pred_indices = np.where(protein_probs > threshold)[0]

            # Guarantee every protein appears: fall back to its highest-scoring terms
            if len(pred_indices) == 0:
                no_pred_count += 1
                pred_indices = np.argsort(protein_probs)[-fallback_top_k:]

            for idx in pred_indices:
                go_term = idx_to_go_term[idx]
                confidence = float(protein_probs[idx])
                rows.append(f"{pid}\t{go_term}\t{confidence:.6f}\n")

        out.writelines(rows)
        total_preds += len(rows)

# Release cached GPU memory once, after the whole pass
torch.cuda.empty_cache()
gc.collect()

print(f"\n{'='*60}")
print(f"✓ Submission created: {output_file}")
print(f"  Total predictions: {total_preds}")
print(f"  Proteins with no predictions: {no_pred_count}")
print(f"{'='*60}")

# Validate (TSV format - tab separated); parse the file once and reuse it
submission_full = pd.read_csv(output_file, sep='\t')
submission = submission_full.head(20)
print(f"\nFirst 20 rows:")
print(submission)

# Check stats
print(f"\nSubmission stats:")
print(f"  Total rows: {len(submission_full)}")
print(f"  Unique proteins: {submission_full['Protein ID'].nunique()}")
print(f"  Unique GO terms: {submission_full['GO Term'].nunique()}")

# Verify all proteins have predictions
submitted_proteins = submission_full['Protein ID'].nunique()
if submitted_proteins < len(protein_ids):
    print(f"⚠ Warning: {len(protein_ids) - submitted_proteins} proteins missing!")
else:
    print(f"✓ All {len(protein_ids)} proteins have predictions")

In [None]:
def validate_submission(submission_df, test_protein_ids):
    """Check a submission table and print a summary.

    Checks performed: required columns are present, every ID in
    ``test_protein_ids`` appears at least once, confidences lie in [0, 1],
    GO terms look like ``GO:`` followed by seven digits, and no cell is NaN.

    Args:
        submission_df: DataFrame expected to have the columns 'Protein ID',
            'GO Term' and 'Confidence'.
        test_protein_ids: Iterable of protein IDs that must all be present.

    Returns:
        True when no issue was found, otherwise False. Issues are printed,
        never raised.
    """

    def _report(found_issues):
        # Shared tail: print the verdict and turn it into the return value.
        if found_issues:
            print("\n❌ Validation Issues:")
            for issue in found_issues:
                print(f"  - {issue}")
            return False
        print("\n✅ Submission is valid!")
        return True

    issues = []

    # Check required columns
    required_cols = ['Protein ID', 'GO Term', 'Confidence']
    if not all(col in submission_df.columns for col in required_cols):
        issues.append(f"Missing required columns. Need: {required_cols}")
        # Every later check indexes these columns; stop here instead of
        # failing with a KeyError that hides the real problem.
        return _report(issues)

    # Check all test proteins have predictions
    submission_proteins = set(submission_df['Protein ID'].unique())
    missing_proteins = set(test_protein_ids) - submission_proteins
    if missing_proteins:
        issues.append(f"Missing predictions for {len(missing_proteins)} proteins")
        print(f"  First 5 missing: {list(missing_proteins)[:5]}")

    # Check confidence scores are in [0, 1]
    if submission_df['Confidence'].min() < 0 or submission_df['Confidence'].max() > 1:
        issues.append("Confidence scores must be in [0, 1]")

    # Check GO term format. Casting to str first means NaN or non-string cells
    # are counted as invalid rather than breaking the boolean negation.
    valid_format = submission_df['GO Term'].astype(str).str.match(r'^GO:\d{7}$')
    invalid_terms = submission_df[~valid_format]
    if len(invalid_terms) > 0:
        issues.append(f"Found {len(invalid_terms)} invalid GO term formats")

    # Check for NaN values
    if submission_df.isnull().any().any():
        issues.append("Found NaN values in submission")

    # Statistics
    n_rows = len(submission_df)
    n_proteins = submission_df['Protein ID'].nunique()
    # Guard the ratio so an empty submission is reported, not a ZeroDivisionError
    avg_per_protein = n_rows / n_proteins if n_proteins else 0.0
    print("Submission Statistics:")
    print(f"  Total predictions: {n_rows:,}")
    print(f"  Unique proteins: {n_proteins:,}")
    print(f"  Unique GO terms: {submission_df['GO Term'].nunique():,}")
    print(f"  Avg predictions per protein: {avg_per_protein:.1f}")
    print(f"  Min confidence: {submission_df['Confidence'].min():.4f}")
    print(f"  Max confidence: {submission_df['Confidence'].max():.4f}")
    print(f"  Avg confidence: {submission_df['Confidence'].mean():.4f}")

    return _report(issues)

# Validate the full submission table against the protein IDs saved with the test embeddings
submission_df = submission_full

with open('/kaggle/working/test_embeddings.pkl', 'rb') as pkl_file:
    test_data = pickle.load(pkl_file)
expected_ids = test_data['protein_ids']

print(f"{submission_df.columns}")
is_valid = validate_submission(submission_df, expected_ids)

verdict = "\n🎉 Ready to submit to Kaggle!" if is_valid else "\n⚠️ Please fix issues before submitting"
print(verdict)