In [1]:
import h5py

# Location of the HDF5 archive, relative to the notebook's working directory.
file_name = '../carbonara_compressed.h5'

# Opened read-only and deliberately left open: later cells index into `f`
# to read individual datasets.
f = h5py.File(name=file_name, mode='r')

In [2]:
import pandas as pd

# Load the training table and pull out the columns used further down.
df = pd.read_csv("../train_updated.csv")
protein_sequences = df["protein_sequence"].to_numpy()
seq_ids = df["seq_id"].to_numpy()
tm = df["tm"].to_numpy()

# Lookup table: protein sequence -> tm. When a sequence appears on several
# rows, the value from the last such row is the one that is kept.
output_tm = dict(zip(protein_sequences, tm))

In [3]:
import numpy as np

def compute_features(embeddings):
    """Collapse a stack of embedding rows into one flat feature vector.

    Five statistics are taken along axis 0 (mean, min, median, max, std) and
    concatenated in that order, followed by the first row and then the last
    row of `embeddings`.

    Args:
        embeddings: row-indexable array-like that supports reductions along
            axis 0 (assumed to be 2-D, rows x features - TODO confirm).

    Returns:
        1-D numpy array holding the seven pieces laid end to end.
    """
    reducers = (np.mean, np.min, np.median, np.max, np.std)
    pieces = [reduce_rows(embeddings, axis=0) for reduce_rows in reducers]
    # The boundary rows are appended verbatim after the pooled statistics.
    pieces.extend((embeddings[0], embeddings[-1]))
    return np.concatenate(pieces)

# Build the feature matrix (`input`) and the matching target list
# (`filtered_tm`) from the per-entry datasets stored in the HDF5 file.
sequences = f["sequences"]

# Number of entries to scan. Hard-coded in the original analysis; presumably
# the count of carbonara_z_<i> / plddt_<i> datasets in the file - TODO confirm.
N_ENTRIES = 19149

filtered_tm = []
feature_rows = []
for i in range(N_ENTRIES):
    # Read and decode the stored byte string once per entry.
    sequence = sequences[i].decode('utf-8')

    # `output_tm` is keyed by exactly the sequences present in
    # train_updated.csv, so a dict lookup gives the same answer as scanning
    # the `protein_sequences` array, without a linear search per entry.
    if sequence not in output_tm:
        # Skip sequences removed in the train_updated.csv
        continue

    z = np.array(f[f"carbonara_z_{i}"])
    # plddt is one value per row of z; make it a column so it can be
    # appended as an extra feature.
    plddt = np.array(f[f"plddt_{i}"])[:, None]
    per_row = np.concatenate((z, plddt), axis=1)

    feature_rows.append(compute_features(per_row))
    filtered_tm.append(output_tm[sequence])

# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because the following cell reads the feature matrix from it.
input = np.array(feature_rows)

In [4]:
from scipy.stats import spearmanr
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset, Subset
from sklearn.model_selection import KFold

# Convert features and targets to float32 tensors and pair them in a dataset.
X = torch.tensor(input, dtype=torch.float32)
# Targets get a trailing axis of size 1 so they line up with the model's
# (batch, 1) output when the loss is computed.
y = torch.tensor(filtered_tm, dtype=torch.float32)[:, None]
dataset = TensorDataset(X, y)

class NeuralNetwork(nn.Module):
    """Fully connected network: Linear + ReLU hidden layers, then a Linear output.

    Args:
        input_size: number of input features per sample.
        output_size: number of values produced per sample.
        hidden_sizes: widths of the hidden layers, in order. The default
            ``(64, 32)`` reproduces the original fixed architecture, including
            the layer order and the ``model.<index>`` parameter names.
    """

    def __init__(self, input_size, output_size, hidden_sizes=(64, 32)):
        super().__init__()
        layers = []
        in_features = input_size
        for width in hidden_sizes:
            # Each hidden layer is a Linear projection followed by ReLU.
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        # Output layer: no activation, raw values are returned.
        layers.append(nn.Linear(in_features, output_size))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the layer stack to `x` and return its output."""
        return self.model(x)

# ---------------------------------------------------------------------------
# K-fold cross-validated training of the regression network.
# ---------------------------------------------------------------------------

SEED = 42

# Define loss function
loss_fn = nn.MSELoss()  # Mean squared error for regression

# K-Fold Cross Validation
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=SEED)
input_size = X.shape[1]
output_size = 1  # Single output (regression)
epochs = 100

# Seed torch's global RNG so that weight initialisation and the DataLoader
# shuffling order are repeatable from run to run (KFold is seeded separately
# through `random_state`).
torch.manual_seed(SEED)


def evaluate_model(model, loader, loss_fn):
    """Run `model` over every batch of `loader` without tracking gradients.

    Args:
        model: module to evaluate; it is switched to eval mode.
        loader: iterable yielding (inputs, targets) batches.
        loss_fn: callable returning a scalar loss tensor for (outputs, targets).

    Returns:
        (predictions, labels, mean_loss): the concatenated model outputs, the
        concatenated targets, and the loss averaged over batches (a mean of
        per-batch losses, so a smaller final batch weighs the same as a full
        one).
    """
    model.eval()
    batch_outputs = []
    batch_targets = []
    total_loss = 0.0
    with torch.no_grad():
        for batch_X, batch_y in loader:
            outputs = model(batch_X)
            total_loss += loss_fn(outputs, batch_y).item()
            batch_outputs.append(outputs)
            batch_targets.append(batch_y)
    predictions = torch.cat(batch_outputs, dim=0)
    labels = torch.cat(batch_targets, dim=0)
    return predictions, labels, total_loss / len(loader)


predictions_per_fold = []
labels_per_fold = []

fold_results = []  # Store results for each fold

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"Fold {fold+1}/{k}")

    # Create subsets and data loaders for the current fold
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)
    train_loader = DataLoader(train_subset, batch_size=32, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=32, shuffle=False)

    # Fresh model and optimizer per fold so no state carries over
    model = NeuralNetwork(input_size, output_size)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    correlation_per_epoch = []
    loss_per_epoch = []

    for epoch in range(epochs):
        # --- Training pass ---
        model.train()
        train_loss = 0.0
        for batch_X, batch_y in train_loader:
            predictions = model(batch_X)
            loss = loss_fn(predictions, batch_y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # --- Validation pass ---
        val_predictions, val_labels, val_loss = evaluate_model(model, val_loader, loss_fn)

        # Spearman rank correlation between predictions and targets
        val_corr = spearmanr(
            val_predictions.flatten().numpy(),
            val_labels.flatten().numpy(),
        ).correlation
        correlation_per_epoch.append(val_corr)
        loss_per_epoch.append(val_loss)

        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Validation Correlation: {val_corr:.4f}, Validation Loss: {val_loss:.4f}")

    # The model does not change after the last epoch's validation pass, so its
    # outputs are reused here instead of running the validation set again.
    # `.tolist()` on the (N, 1) tensors yields a list of one-element lists.
    all_predictions = val_predictions.tolist()
    all_labels = val_labels.tolist()

    predictions_per_fold.append(all_predictions)
    labels_per_fold.append(all_labels)

    # Store fold results
    fold_results.append({
        "fold": fold + 1,
        "validation_correlation": correlation_per_epoch[-1],  # Last epoch correlation
        "validation_loss": loss_per_epoch[-1]  # Last epoch loss
    })

# Aggregate fold results
avg_corr = sum(result["validation_correlation"] for result in fold_results) / len(fold_results)
avg_loss = sum(result["validation_loss"] for result in fold_results) / len(fold_results)

print(f"\nAverage Correlation: {avg_corr:.4f}\nAverage Loss: {avg_loss:.4f}")

Fold 1/5
Epoch 1/100, Train Loss: 232.5184, Validation Correlation: 0.2606, Validation Loss: 147.0817
Epoch 2/100, Train Loss: 155.3156, Validation Correlation: 0.2903, Validation Loss: 143.5556
Epoch 3/100, Train Loss: 150.4214, Validation Correlation: 0.3031, Validation Loss: 141.2074
Epoch 4/100, Train Loss: 149.0435, Validation Correlation: 0.3156, Validation Loss: 147.8382
Epoch 5/100, Train Loss: 147.9808, Validation Correlation: 0.3256, Validation Loss: 140.3937
Epoch 6/100, Train Loss: 146.1269, Validation Correlation: 0.3232, Validation Loss: 139.8370
Epoch 7/100, Train Loss: 144.8172, Validation Correlation: 0.3293, Validation Loss: 139.9011
Epoch 8/100, Train Loss: 143.6289, Validation Correlation: 0.3303, Validation Loss: 140.7195
Epoch 9/100, Train Loss: 142.8776, Validation Correlation: 0.3287, Validation Loss: 149.0514
Epoch 10/100, Train Loss: 141.4655, Validation Correlation: 0.3434, Validation Loss: 137.4073
Epoch 11/100, Train Loss: 139.1206, Validation Correlation: 

In [5]:
# Export each fold's validation targets and predictions as paired columns
# (tm_<fold>, preds_<fold>) in one CSV file.
results = {}
for i in range(k):
    # Every stored item is a one-element list; unwrap it to a plain number.
    results[f"tm_{i}"] = [item[0] for item in labels_per_fold[i]]
    results[f"preds_{i}"] = [item[0] for item in predictions_per_fold[i]]

# Folds can differ in length by one sample when the dataset size is not a
# multiple of k, and building a DataFrame from unequal-length lists raises
# ValueError. Wrapping each column in a Series pads the shorter ones with NaN;
# when all folds have the same length the output is unchanged.
results_frame = pd.DataFrame(
    {name: pd.Series(values, dtype=float) for name, values in results.items()}
)
results_frame.to_csv("../predictions/carbonara_simple_z_plddt.csv", index=False)