In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.preprocessing import StandardScaler
import pandas as pd

In [7]:
train_data = pd.read_csv("./data/train_sequences.csv")
train_labels = pd.read_csv("./data/train_labels.csv")
val_data = pd.read_csv("./data/validation_sequences.csv")
val_labels = pd.read_csv("./data/validation_labels.csv")


In [8]:
def one_hot_encode(sequence):
    # Map nucleotides to index (A: 0, C: 1, G: 2, U: 3)
    mapping = {'A': 0, 'C': 1, 'G': 2, 'U': 3}
    encoded = np.zeros((len(sequence), 4), dtype=np.float32)
    
    for i, nucleotide in enumerate(sequence):
        if nucleotide in mapping:
            encoded[i, mapping[nucleotide]] = 1.0
    
    return encoded

In [9]:
class RNA3DNN(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(RNA3DNN, self).__init__()
        # Define layers
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)  # output 3 coordinates for each residue

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [10]:
def train_model(model, train_data, train_labels, epochs=100, lr=0.001):
    model.train()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        optimizer.zero_grad()
        
        # Get the input data and labels
        sequences = [seq for seq in train_data['sequence']]
        true_coords = train_labels[['x_1', 'y_1', 'z_1']].values
        
        # One-hot encode the sequences
        inputs = np.array([one_hot_encode(seq) for seq in sequences])
        inputs = torch.tensor(inputs, dtype=torch.float32)
        true_coords = torch.tensor(true_coords, dtype=torch.float32)
        
        # Flatten inputs to (batch_size, sequence_length * 4)
        inputs = inputs.view(inputs.shape[0], -1)
        
        # Forward pass
        preds = model(inputs)
        
        # Compute loss
        loss = criterion(preds, true_coords)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        if epoch % 10 == 0:
            print(f'Epoch {epoch}, Loss: {loss.item()}')

In [11]:
def compute_tm_score(pred_coords, true_coords):
    """
    Compute a TM-score between two structures based on the provided formula.
    """
    L_ref = true_coords.shape[0]
    L_align = pred_coords.shape[0]
    
    # Compute d0 (scaling factor)
    if L_ref >= 30:
        d0 = 0.6 * np.sqrt(L_ref - 0.5) - 2.5
    else:
        d0_values = {12: 0.3, 15: 0.4, 19: 0.5, 23: 0.6, 29: 0.7}
        d0 = d0_values.get(L_ref, 0.7)
    
    # Compute pairwise distances between corresponding residues
    distances = np.linalg.norm(pred_coords - true_coords, axis=1)

    # Compute the TM-score
    tm_score = np.sum(1 / (1 + (distances / d0)**2)) / L_ref
    return tm_score

# Evaluate the model on the validation set
def evaluate_tm_score(model, val_data, val_labels):
    model.eval()
    tm_scores = []
    
    with torch.no_grad():
        # Get the input data and labels
        sequences = [seq for seq in val_data['sequence']]
        true_coords = val_labels[['x_1', 'y_1', 'z_1']].values
        
        # One-hot encode the sequences
        inputs = np.array([one_hot_encode(seq) for seq in sequences])
        inputs = torch.tensor(inputs, dtype=torch.float32)
        true_coords = torch.tensor(true_coords, dtype=torch.float32)
        
        # Flatten inputs
        inputs = inputs.view(inputs.shape[0], -1)
        
        # Predict
        preds = model(inputs)
        
        # Compute TM-score
        pred_coords = preds.cpu().numpy()
        true_coords = true_coords.cpu().numpy()
        tm = compute_tm_score(pred_coords, true_coords)
        tm_scores.append(tm)
    
    avg_tm_score = np.mean(tm_scores)
    return avg_tm_score

In [12]:
input_dim = 4 * 30  # For example, 30 nucleotide-long sequences with 4 possible nucleotides (A, C, G, U)
hidden_dim = 256
output_dim = 3  # x, y, z coordinates
model = RNA3DNN(input_dim, hidden_dim, output_dim)

In [13]:
train_model(model, train_data, train_labels, epochs=100, lr=0.001)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (844,) + inhomogeneous part.