In [3]:
import h5py

# Location of the HDF5 archive, given relative to the notebook's working directory.
file_name = '../carbonara_compressed.h5'

# Open read-only. The handle is intentionally left open: datasets are read from `f` afterwards.
f = h5py.File(name=file_name, mode='r')

In [4]:
import numpy as np

# Materialise the target dataset stored under the key 'output_tm' as an in-memory array.
output_tm = np.array(f['output_tm'])

# Read the 19149 datasets named carbonara_z_0 ... carbonara_z_19148, one array per entry.
# NOTE(review): the count is hard-coded; presumably it matches len(output_tm) -- confirm.
input = [np.array(f[f"carbonara_z_{i}"]) for i in range(19149)]

In [5]:
import torch

# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [6]:
from torch import nn
from torch.utils.data import DataLoader, Dataset
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

class SequenceDataset(Dataset):
    """Map-style dataset that pairs each sequence with its target value."""

    def __init__(self, sequences, targets):
        """
        Convert the inputs to float32 tensors and keep them in memory.

        :param sequences: Iterable of array-likes, one per sample; each is converted
            to its own tensor (the original contract is a list of NumPy matrices of
            shape [sequence_length, features]).
        :param targets: Array-like of target values, one per sequence; converted to a
            single float32 tensor.
        """
        as_tensors = []
        for raw_sequence in sequences:
            as_tensors.append(torch.tensor(raw_sequence, dtype=torch.float32))
        self.sequences = as_tensors
        self.targets = torch.tensor(targets, dtype=torch.float32)

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return the ``(sequence, target)`` pair stored at position ``idx``."""
        sequence = self.sequences[idx]
        target = self.targets[idx]
        return sequence, target
    
# Wrap the loaded sequences and their targets in a single indexable dataset.
dataset = SequenceDataset(sequences=input, targets=output_tm)

def collate_fn(batch):
    """
    Collate ``(sequence, target)`` pairs into one padded batch.

    :param batch: List of ``(sequence_tensor, target)`` pairs as yielded by the dataset.
    :return: Tuple ``(padded_sequences, lengths, targets)`` where
        ``padded_sequences`` is zero-padded along the first (time) axis, batch-first,
        and already moved to ``device``; ``lengths`` holds each sequence's original
        first-axis size; ``targets`` is a float32 tensor with a trailing axis of size 1.
    """
    sequences, targets = zip(*batch)

    # Targets get an extra trailing axis so they line up with a one-column model output.
    target_column = torch.tensor(targets, dtype=torch.float32).unsqueeze(1)

    # Record the true lengths before padding; they are built without moving to `device`.
    original_lengths = torch.tensor([seq.size(0) for seq in sequences])

    # Pad every sequence up to the longest one in this batch, then move to the target device.
    padded = pad_sequence(sequences, batch_first=True)
    padded = padded.to(device)

    return padded, original_lengths, target_column

# Split / batching configuration, named so it can be tuned in one place.
TRAIN_FRACTION = 0.8  # share of samples used for training; the rest is validation
BATCH_SIZE = 32
SPLIT_SEED = 42

train_size = int(TRAIN_FRACTION * len(dataset))
val_size = len(dataset) - train_size

# Use a dedicated, seeded generator for the split. Without it, every re-run of this cell
# draws a different partition, so a model trained earlier could later be evaluated on
# samples it has already seen (validation leakage) and results are not reproducible.
split_generator = torch.Generator().manual_seed(SPLIT_SEED)
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset, [train_size, val_size], generator=split_generator
)

# Training batches are reshuffled every epoch; validation order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

In [7]:
class RNNModel(nn.Module):
    """LSTM encoder followed by a linear head applied to the final hidden state."""

    def __init__(self, input_size, hidden_size, output_size):
        """
        :param input_size: Number of features per time step.
        :param hidden_size: Size of the LSTM hidden state.
        :param output_size: Number of values produced per sequence.
        """
        super().__init__()
        # Batch-first LSTM: inputs are laid out as (batch, time, features).
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        # Linear projection from the hidden state to the output.
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x, lengths):
        """
        Run the LSTM over padded sequences and project the last layer's final hidden state.

        :param x: Padded batch of sequences, batch-first.
        :param lengths: Unpadded length of each sequence in ``x``; any order is accepted
            because packing is done with ``enforce_sorted=False``.
        :return: Output of the linear head for each sequence in the batch.
        """
        # Packing lets the LSTM skip the padded time steps.
        packed = pack_padded_sequence(x, lengths, batch_first=True, enforce_sorted=False)
        _, (final_hidden, _) = self.rnn(packed)

        # final_hidden is indexed by layer first; [-1] selects the last layer.
        last_layer_state = final_hidden[-1]
        return self.fc(last_layer_state)
    
# Model dimensions: feature count is read from the first sequence's second axis.
input_size = input[0].shape[1]
hidden_size = 64
output_size = 1  # single regression value per sequence

model = RNNModel(input_size, hidden_size, output_size)
model = model.to(device)

# Mean-squared-error objective optimised with Adam.
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# One pass over the training loader followed by one pass over the validation loader, per epoch.
epochs = 200
for epoch in range(epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0.0

    for batch_X, lengths, batch_y in train_loader:
        # Clear gradients left over from the previous step before computing new ones.
        optimizer.zero_grad()

        predictions = model(batch_X, lengths)
        loss = loss_fn(predictions, batch_y.to(device))

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

    # ---- validation pass (no gradient tracking) ----
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch_X, lengths, batch_y in val_loader:
            predictions = model(batch_X, lengths)
            loss = loss_fn(predictions, batch_y.to(device))
            val_loss += loss.item()

    # Reported values are the per-batch losses averaged over the number of batches.
    print(f"Epoch {epoch+1}/{epochs}, Train Loss: {train_loss/len(train_loader):.4f}, Validation Loss: {val_loss/len(val_loader):.4f}")

Epoch 1/200, Train Loss: 1435.6221, Validation Loss: 730.0390
Epoch 2/200, Train Loss: 458.5718, Validation Loss: 301.6448
Epoch 3/200, Train Loss: 264.8930, Validation Loss: 244.1045
Epoch 4/200, Train Loss: 245.2759, Validation Loss: 239.8449
Epoch 5/200, Train Loss: 241.6103, Validation Loss: 236.8275
Epoch 6/200, Train Loss: 239.5471, Validation Loss: 234.9616
Epoch 7/200, Train Loss: 237.5834, Validation Loss: 234.3267
Epoch 8/200, Train Loss: 235.2414, Validation Loss: 229.7176
Epoch 9/200, Train Loss: 232.8836, Validation Loss: 228.0556
Epoch 10/200, Train Loss: 231.4047, Validation Loss: 226.4322
Epoch 11/200, Train Loss: 229.4878, Validation Loss: 223.4454
Epoch 12/200, Train Loss: 226.4268, Validation Loss: 220.4719
Epoch 13/200, Train Loss: 224.5106, Validation Loss: 219.6126
Epoch 14/200, Train Loss: 223.0211, Validation Loss: 220.5961
Epoch 15/200, Train Loss: 221.9643, Validation Loss: 216.6128
Epoch 16/200, Train Loss: 220.4563, Validation Loss: 216.0991
Epoch 17/200, Tr

In [11]:
from scipy.stats import spearmanr

# Spearman rank correlation between model predictions and targets on the validation set.
model.eval()  # evaluation mode for inference

all_predictions, all_labels = [], []
with torch.no_grad():
    for batch_X, lengths, batch_y in val_loader:
        predictions = model(batch_X, lengths)
        all_predictions.append(predictions)
        all_labels.append(batch_y.to(device))

# Stitch the per-batch tensors together along the batch axis.
all_predictions = torch.cat(all_predictions, dim=0)
all_labels = torch.cat(all_labels, dim=0)

# SciPy works on host memory, so both tensors are copied to the CPU first.
spearman_result = spearmanr(all_predictions.cpu(), all_labels.cpu())
val_corr = spearman_result.correlation
print(f"Validation Correlation: {val_corr:.4f}")

Validation Correlation: 0.3139
