In [10]:
import h5py

# Path of the HDF5 archive, relative to the notebook's working directory.
file_name = '../carbonara_compressed.h5'

# Open read-only. The handle is deliberately left open: later cells index
# into `f` directly to read individual datasets.
f = h5py.File(name=file_name, mode='r')

In [11]:
import pandas as pd

# Load the training table and expose the three columns used below as arrays.
df = pd.read_csv("../train_updated.csv")
protein_sequences = df["protein_sequence"].values
seq_ids = df["seq_id"].values
tm = df["tm"].values

# Lookup table: protein sequence string -> its `tm` value.
# If a sequence occurs more than once in the CSV, the last row wins.
output_tm = dict(zip(protein_sequences, tm))

In [12]:
import numpy as np

# Pair every HDF5 record whose sequence is still present in train_updated.csv
# with its `tm` target.
sequences = f["sequences"]
filtered_tm = []
input = []  # NOTE: shadows the builtin input(); the name is kept because later cells read it
# 19149 is a hard-coded record count -- presumably the number of entries in
# the HDF5 file; TODO confirm against len(sequences).
for i in range(19149):
    # Decode once and reuse for both the membership test and the lookup.
    sequence = sequences[i].decode('utf-8')
    # `output_tm` is keyed by every protein_sequence in the CSV, so this is the
    # same test as `not in protein_sequences` but an O(1) dict lookup instead
    # of a linear scan over the whole NumPy array on each iteration.
    if sequence not in output_tm:
        # Skip sequences removed in the train_updated.csv
        continue
    input.append(np.array(f[f"carbonara_z_{i}"]))
    filtered_tm.append(output_tm[sequence])

In [13]:
import torch

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [14]:
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

class SequenceDataset(Dataset):
    """Dataset of variable-length sequences, each paired with one scalar target."""

    def __init__(self, sequences, targets):
        """
        :param sequences: Iterable of array-likes, one per sample; each is converted
            to its own float32 tensor (documented by the caller as
            [sequence_length, features]).
        :param targets: Sequence of target values, one per sample; stored as a
            single float32 tensor.
        """
        converted = []
        for seq in sequences:
            converted.append(torch.tensor(seq, dtype=torch.float32))
        self.sequences = converted
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Number of samples (one per stored sequence)."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(sequence_tensor, target)`` pair stored at ``idx``."""
        features = self.sequences[idx]
        target = self.targets[idx]
        return features, target
    
# Wrap the filtered feature matrices and their tm targets in a Dataset.
# `input` here is the list built earlier in the notebook, not the builtin.
dataset = SequenceDataset(sequences=input, targets=filtered_tm)

def collate_fn(batch):
    """Collate ``(sequence, target)`` pairs into a padded batch.

    NOTE(review): a later cell rebinds ``collate_fn`` to a different
    implementation (targets stay 1-D, nothing is moved to ``device``), so this
    version is not the one the DataLoaders built afterwards receive.

    :param batch: Iterable of ``(sequence_tensor, target)`` pairs.
    :return: ``(padded_sequences, lengths, targets)`` -- the sequences
        zero-padded along their first axis and moved to ``device``, the
        original first-axis lengths, and the targets as a float32 tensor with
        a trailing axis of size 1.
    """
    seqs, labels = zip(*batch)
    # Record the true lengths before padding hides them.
    seq_lengths = torch.tensor([s.shape[0] for s in seqs])
    padded = pad_sequence(seqs, batch_first=True)
    padded = padded.to(device)
    label_column = torch.tensor(labels, dtype=torch.float32).unsqueeze(1)
    return padded, seq_lengths, label_column

In [15]:
from sklearn.model_selection import KFold
from torch.nn.utils.rnn import pack_padded_sequence
import torch
from torch import nn
from torch.utils.data import DataLoader, Subset, TensorDataset

def collate_fn(batch):
    """Collate ``(sequence, target)`` pairs into a padded batch.

    Written as a named function rather than a lambda bound to a name, so it
    carries a docstring and shows up as ``collate_fn`` in tracebacks.
    This definition replaces the ``collate_fn`` defined earlier in the notebook.

    :param batch: List of ``(sequence_tensor, target)`` pairs.
    :return: Tuple ``(padded_sequences, lengths, targets)``:
        the sequences zero-padded to the longest one in the batch (batch axis
        first), the original length of each sequence, and the targets as a 1-D
        float32 tensor. No tensor is moved to another device here.
    """
    sequences = [item[0] for item in batch]
    return (
        nn.utils.rnn.pad_sequence(sequences, batch_first=True),
        torch.tensor([len(seq) for seq in sequences]),
        torch.tensor([item[1] for item in batch], dtype=torch.float32),
    )

class RNNModel(nn.Module):
    """Single LSTM followed by a linear head applied to the final hidden state."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        :param input_size: Number of features per time step.
        :param hidden_size: Width of the LSTM hidden state.
        :param output_size: Number of values produced per sequence.
        """
        super().__init__()
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x, lengths):
        """
        :param x: Padded batch with the batch axis first.
        :param lengths: True (unpadded) length of each sequence in ``x``; the
            batch does not need to be sorted by length.
        :return: Tensor with one row per sequence and ``output_size`` columns.
        """
        # Packing makes the LSTM stop at each sequence's real end, so padded
        # positions never contribute to the final hidden state.
        packed = pack_padded_sequence(
            x, lengths, batch_first=True, enforce_sorted=False
        )
        _, (h_n, _c_n) = self.rnn(packed)

        # h_n is indexed by layer first; [-1] selects the last layer's state.
        last_layer_state = h_n[-1]
        return self.fc(last_layer_state)

# Model hyperparameters
input_size = input[0].shape[1]  # Number of features (second axis of the first array)
hidden_size = 64
output_size = 1  # Regression target
loss_fn = nn.MSELoss()

# Training configuration, hoisted out of the fold loop so every fold
# visibly uses the same settings.
epochs = 100
batch_size = 32
learning_rate = 0.001
seed = 42  # Used for the fold split and, offset by fold index, for torch's RNG

# K-Fold Cross Validation
k = 5  # Number of folds
kf = KFold(n_splits=k, shuffle=True, random_state=seed)

predictions_per_fold = []
labels_per_fold = []
fold_results = []


def _evaluate(model, loader, collect_outputs=False):
    """Run ``model`` over ``loader`` in eval mode without tracking gradients.

    :param model: Module called as ``model(batch_X, lengths)``.
    :param loader: Iterable yielding ``(batch_X, lengths, batch_y)`` with 1-D ``batch_y``.
    :param collect_outputs: When True, also gather per-batch predictions and
        labels as CPU tensors.
    :return: ``(mean_loss, predictions, labels)``. ``mean_loss`` is the average
        of the per-batch losses (every batch weighs the same, whatever its
        size); the two lists are empty unless ``collect_outputs`` is True.
    """
    model.eval()
    running_loss = 0.0
    collected_predictions = []
    collected_labels = []
    with torch.no_grad():
        for batch_X, lengths, batch_y in loader:
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            predictions = model(batch_X, lengths)
            # The model emits a trailing axis of size 1; give the targets the same shape.
            running_loss += loss_fn(predictions, batch_y.unsqueeze(1)).item()
            if collect_outputs:
                collected_predictions.append(predictions.cpu())
                collected_labels.append(batch_y.cpu())
    return running_loss / len(loader), collected_predictions, collected_labels


for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"Fold {fold+1}/{k}")

    # Seed torch per fold so weight initialisation and batch shuffling are
    # repeatable across runs (some CUDA kernels may still be non-deterministic).
    torch.manual_seed(seed + fold)

    # Create train and validation subsets
    train_subset = Subset(dataset, train_idx)
    val_subset = Subset(dataset, val_idx)

    # DataLoaders
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    # Fresh model and optimizer for every fold, so no weights leak between folds
    model = RNNModel(input_size, hidden_size, output_size).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train and validate
    val_losses = []

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0

        for batch_X, lengths, batch_y in train_loader:
            # `lengths` is left where the collate function put it; only data and targets move.
            batch_X, batch_y = batch_X.to(device), batch_y.to(device)
            predictions = model(batch_X, lengths)
            loss = loss_fn(predictions, batch_y.unsqueeze(1))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        val_loss, _, _ = _evaluate(model, val_loader)
        val_losses.append(val_loss)
        print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Validation Loss: {val_loss:.4f}")

    # Final pass over the validation fold with the trained model, this time
    # keeping the predictions and labels for export.
    _, fold_predictions, fold_labels = _evaluate(model, val_loader, collect_outputs=True)

    # Concatenate all predictions and true labels.
    # Predictions become a list of one-element lists; labels a flat list of floats.
    all_predictions = torch.cat(fold_predictions, dim=0).tolist()
    all_labels = torch.cat(fold_labels, dim=0).tolist()

    predictions_per_fold.append(all_predictions)
    labels_per_fold.append(all_labels)

    # Save results for this fold
    fold_results.append({
        "fold": fold + 1,
        "final_validation_loss": val_losses[-1]
    })

# Aggregate fold results
avg_loss = sum(result["final_validation_loss"] for result in fold_results) / k
print(f"\nK-Fold Cross-Validation Results:\nAverage Validation Loss: {avg_loss:.4f}")


Fold 1/5
Epoch 1/100, Train Loss: 1679.7865, Validation Loss: 862.3233
Epoch 2/100, Train Loss: 469.3542, Validation Loss: 249.8520
Epoch 3/100, Train Loss: 197.9593, Validation Loss: 167.7333
Epoch 4/100, Train Loss: 170.1092, Validation Loss: 163.9583
Epoch 5/100, Train Loss: 168.8528, Validation Loss: 163.9257
Epoch 6/100, Train Loss: 169.0677, Validation Loss: 163.9106
Epoch 7/100, Train Loss: 168.6483, Validation Loss: 163.9128
Epoch 8/100, Train Loss: 168.8901, Validation Loss: 162.5688
Epoch 9/100, Train Loss: 167.5064, Validation Loss: 161.8030
Epoch 10/100, Train Loss: 166.5852, Validation Loss: 161.0312
Epoch 11/100, Train Loss: 165.9932, Validation Loss: 160.1927
Epoch 12/100, Train Loss: 165.5084, Validation Loss: 160.3042
Epoch 13/100, Train Loss: 164.8084, Validation Loss: 159.2007
Epoch 14/100, Train Loss: 164.0223, Validation Loss: 158.5847
Epoch 15/100, Train Loss: 163.3085, Validation Loss: 158.3607
Epoch 16/100, Train Loss: 162.8345, Validation Loss: 157.6304
Epoch 1

In [None]:
from pathlib import Path

# One pair of columns per fold: the true tm values and the model's predictions
# on that fold's validation split.
results = {}
for i in range(k):
    results[f"tm_{i}"] = labels_per_fold[i]
    # Each stored prediction is a one-element list; unwrap it to a scalar.
    results[f"preds_{i}"] = [item[0] for item in predictions_per_fold[i]]

# KFold yields folds whose sizes differ by one whenever the sample count is not
# divisible by k, and pd.DataFrame raises on a dict of unequal-length lists.
# Wrapping each column in a Series lets pandas align on the row index and pad
# the shorter folds with NaN (written as empty fields in the CSV).
results_frame = pd.DataFrame(
    {column: pd.Series(values) for column, values in results.items()}
)

# Create the output directory if needed so a long training run is not lost to
# a missing folder at write time.
output_path = Path("../predictions/carbonara_rnn_z.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
results_frame.to_csv(output_path, index=False)