The third model is a Recurrent Neural Network (RNN) implemented in PyTorch. An RNN was chosen because it has a couple of advantages when searching through a sequence for binding sites or other information that the sequence can tell us. By design, RNNs handle temporal dependencies (the effect of previous inputs on the current decision), which is vital when looking at sequences, because the surrounding base pairs change the information conveyed by the base pair being observed.

In [7]:
import torch
import torch.nn as nn
from sklearn.metrics import roc_auc_score
import pickle

In [8]:
def load_data(filename):
    """Unpickle and return the object stored in the file at ``filename``.

    The file is opened in binary mode and closed again before returning.
    """
    with open(filename, mode='rb') as handle:
        return pickle.load(handle)

In [12]:
# Load the pickled table, then pull out the one-hot encoded column (X)
# and the label column (y).
pkl_file = "test_1_df.pkl"
test_df = load_data(pkl_file)
X, y = test_df['one_hot_data'], test_df['label']

Index(['chromosome', 'start', 'end', 'size', 'peak_seq', 'label',
       'one_hot_data'],
      dtype='object')


In [14]:
# Number of entries in X (the 'one_hot_data' column of test_df).
len(X)

11987

In [13]:
# Number of entries in y (the 'label' column of test_df); should match len(X).
len(y)

11987

In [27]:
class RNN(nn.Module):
    """Stacked ``nn.RNN`` followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of each recurrent layer.
        num_layers: Number of stacked recurrent layers.
        output_size: Number of values produced per input sequence.
    """

    def __init__(self, input_size: int, hidden_size: int, num_layers: int,
                 output_size: int) -> None:
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # batch_first=True: tensors are laid out as (batch, seq_len, features).
        self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run a batch of sequences through the network.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)`` holding raw (un-activated)
            outputs computed from the top layer's state at the last time step.
        """
        # Zero initial hidden state, allocated directly with the input's dtype
        # and device so the module also works after .double()/.to(device)
        # without a dtype mismatch or an extra host-to-device copy.
        h0 = torch.zeros(
            self.num_layers,
            x.size(0),
            self.hidden_size,
            dtype=x.dtype,
            device=x.device,
        )
        # out: (batch, seq_len, hidden_size) — top-layer output at every step.
        out, _ = self.rnn(x, h0)
        # Decode only the final time step.
        return self.fc(out[:, -1, :])


def train_model(model, criterion, optimizer, input_data, labels, num_epochs=10,
                log_every=1):
    """Train ``model`` with full-batch gradient steps and print the loss.

    Every epoch performs exactly one optimizer step on the whole of
    ``input_data``; there is no mini-batching or shuffling.

    Args:
        model: Module to train; it is switched to training mode.
        criterion: Callable ``criterion(outputs, labels)`` returning a scalar loss.
        optimizer: Optimizer built over ``model``'s parameters.
        input_data: Input tensor passed to ``model`` as a single batch.
        labels: Targets passed to ``criterion`` alongside the model outputs.
        num_epochs: Number of optimizer steps to take.
        log_every: Print the loss every ``log_every`` epochs (default: every epoch).

    Raises:
        ValueError: If ``log_every`` is smaller than 1.
    """
    if log_every < 1:
        raise ValueError("log_every must be a positive integer")

    # Make sure layers such as dropout / batch-norm behave in training mode,
    # even if the model was previously switched to eval().
    model.train()

    for epoch in range(1, num_epochs + 1):
        optimizer.zero_grad()
        outputs = model(input_data)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        if epoch % log_every == 0:
            print(f'Epoch [{epoch}/{num_epochs}], Loss: {loss.item():.4f}')

def calculate_auc(model, input_data, labels):
    """Compute, print and return the ROC AUC of ``model`` on ``input_data``.

    The model's raw outputs are passed through a sigmoid and scored against
    ``labels`` with ``sklearn.metrics.roc_auc_score``.

    Args:
        model: Module producing raw (pre-sigmoid) outputs.
        input_data: Input tensor passed to ``model`` as a single batch.
        labels: Ground-truth tensor with the same shape as the model outputs.

    Returns:
        The AUC value returned by ``roc_auc_score``.
    """
    # Evaluate in eval mode (disables dropout etc.), then restore whatever
    # mode the caller had so further training is unaffected.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # .cpu() is required before .numpy() when tensors live on a GPU.
            predictions = torch.sigmoid(model(input_data)).cpu().numpy()
    finally:
        model.train(was_training)

    auc = roc_auc_score(labels.detach().cpu().numpy(), predictions)
    print(f'AUC: {auc:.4f}')
    return auc

# Example usage: exercise the RNN / train_model / calculate_auc pipeline on
# random tensors. NOTE(review): no random seed is set, so the printed loss
# and AUC values will differ from run to run.
input_size = 100  # features per time step
hidden_size = 64  # hidden-state size of each recurrent layer
num_layers = 50  # NOTE(review): 50 stacked recurrent layers is unusually deep — confirm intent
output_size = 37  # number of outputs (one binary label each) per sequence

model = RNN(input_size, hidden_size, num_layers, output_size)
# BCEWithLogitsLoss expects raw model outputs; calculate_auc applies the
# sigmoid itself before scoring.
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Dummy input data and labels (replace with your actual data)
# NOTE(review): the X / y loaded from the pickle earlier are not used here.
input_data = torch.randn(32, 5, input_size)  # Batch size 32, sequence length 5
labels = torch.randint(0, 2, (32, output_size)).float()  # random 0/1 targets, one per output

# Train the model
train_model(model, criterion, optimizer, input_data, labels)

# Calculate AUC
# NOTE(review): this scores the same tensors that were used for training, and
# the labels are random, so the value says nothing about real performance.
auc = calculate_auc(model, input_data, labels)

Epoch [1/10], Loss: 0.6956
Epoch [2/10], Loss: 0.6917
Epoch [3/10], Loss: 0.6887
Epoch [4/10], Loss: 0.6861
Epoch [5/10], Loss: 0.6840
Epoch [6/10], Loss: 0.6826
Epoch [7/10], Loss: 0.6817
Epoch [8/10], Loss: 0.6813
Epoch [9/10], Loss: 0.6810
Epoch [10/10], Loss: 0.6808
AUC: 0.5087
