# Replication of NetSurfP 2.0 with PyTorch

The purpose of this notebook is to replicate the current version of the NetSurfP 2.0 model in PyTorch.

**Load libraries**

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision

from torch.utils.data import Dataset, DataLoader

## 1. Preparation of the data

**Load datasets**

In [2]:
# Directory holding the NetSurfP 2.0 .npz archives; change it here to relocate the data.
DATA_DIR = "../data/nsp2/training_data"

# Archives built from HHblits profiles
train_hhblits = np.load(f"{DATA_DIR}/Train_HHblits.npz")
CB513_hhblits = np.load(f"{DATA_DIR}/CB513_HHblits.npz")
TS115_hhblits = np.load(f"{DATA_DIR}/TS115_HHblits.npz")
CASP12_hhblits = np.load(f"{DATA_DIR}/CASP12_HHblits.npz")

# Archives built from MMseqs profiles
train_mmseqs = np.load(f"{DATA_DIR}/Train_MMseqs.npz")
CB513_mmseqs = np.load(f"{DATA_DIR}/CB513_MMseqs.npz")
TS115_mmseqs = np.load(f"{DATA_DIR}/TS115_MMseqs.npz")
CASP12_mmseqs = np.load(f"{DATA_DIR}/CASP12_MMseqs.npz")

**Data loader class**

In [3]:
class NSPData(Dataset):
    """In-memory dataset pairing feature tensors with their target tensors."""

    def __init__(self, X, y):
        """Convert both arrays to float tensors with their last two axes swapped.

        Args:
            X (np.array): Input features, indexed along the first axis by sample
            y (np.array): Targets belonging to the samples in ``X``
        """
        features = torch.tensor(X).float()
        labels = torch.tensor(y).float()

        # Swap axes 1 and 2 so the last axis of the input ends up in the middle
        self.data = features.transpose(1, 2)
        self.targets = labels.transpose(1, 2)

    def __getitem__(self, index):
        """Return the (features, targets) pair stored at ``index``.

        Args:
            index (int): Position along the first axis
        """
        sample = self.data[index]
        target = self.targets[index]
        return sample, target

    def __len__(self):
        """Number of samples, i.e. the size of the first axis of the data."""
        return self.data.shape[0]

**Instantiate the data loader with the training data**

In [4]:
batch_size = 15

# Slice bounds used below (previously inline magic numbers)
n_samples = 1000      # leading entries kept along the first axis
max_len = 1000        # leading entries kept along the second axis
n_inputs = 50         # columns [0, 50) are fed to the model
n_columns = 68        # columns [50, 68) are used as targets

# Read the array once: an .npz archive re-reads the array from disk on every
# key access, so indexing train_hhblits['data'] twice loads it twice.
train_array = train_hhblits['data'][:n_samples, :max_len, :n_columns]

# NOTE: this rebinds `train_hhblits` from the loaded archive to a DataLoader,
# so this cell cannot be re-run without re-running the loading cell first.
train_hhblits = DataLoader(
    NSPData(X=train_array[:, :, :n_inputs], y=train_array[:, :, n_inputs:]),
    batch_size=batch_size,
)

## 2. The NSP Classifier

In [5]:
class NSP_Classifier(nn.Module):
    """Two parallel 1D convolutions whose outputs are concatenated with the
    input channels, followed by a 2-layer bidirectional LSTM and a linear
    output layer applied at every sequence position."""

    def __init__(self, n_init_channels, n_class, n_channels, n_hidden, filter_size=(129, 257)):
        """
        Args:
            n_init_channels (int): The size of the incoming feature vector
            n_class (int): The size of the output prediction vector
            n_channels (int): The number of output channels of each convolution
            n_hidden (int): The amount of hidden neurons in the bidirectional lstm
            filter_size (tuple n=2): The kernel size of each of the two convolutions
        """
        super(NSP_Classifier, self).__init__()

        # Pad by half the kernel so each convolution yields one output per
        # input position. Without padding the outputs are shorter than the
        # input and cannot be aligned with it position by position.
        self.conv1 = nn.Conv1d(in_channels=n_init_channels, out_channels=n_channels,
                               kernel_size=filter_size[0], padding=filter_size[0] // 2)
        self.conv2 = nn.Conv1d(in_channels=n_init_channels, out_channels=n_channels,
                               kernel_size=filter_size[1], padding=filter_size[1] // 2)

        # The LSTM consumes the original channels plus both convolution outputs
        self.lstm = nn.LSTM(input_size=n_init_channels + n_channels * 2, hidden_size=n_hidden,
                            num_layers=2, batch_first=True, bidirectional=True)

        # Output layer (bidirectional LSTM => 2 * n_hidden features per position)
        self.fc = nn.Linear(in_features=n_hidden * 2, out_features=n_class)

    def concatenate_channels(self, x, conv1, conv2):
        """ Concatenates the convolution outputs with the input along the channel axis.

        Each sequence position keeps its own convolution features (previously only
        the last convolution position was kept and repeated over the whole sequence).

        Args:
            x (torch.Tensor): input data, shape (batch, channels, length)
            conv1 (torch.Tensor): convolutional 1D output data, at least as long as x
            conv2 (torch.Tensor): convolutional 1D output data, at least as long as x
        Returns:
            torch.Tensor: shape (batch, channels_x + channels_conv1 + channels_conv2, length)
        """
        seq_len = x.shape[2]
        # An even kernel size with padding k // 2 yields one extra position; trim it.
        return torch.cat([x, conv1[:, :, :seq_len], conv2[:, :, :seq_len]], dim=1)

    def forward(self, x):
        """ Forwarding of the classifier input
        Args:
            x (torch.Tensor): input data, shape (batch, n_init_channels, length)
        Returns:
            torch.Tensor: per-position scores, shape (batch, length, n_class)
        """
        conv1 = F.relu(self.conv1(x))
        conv2 = F.relu(self.conv2(x))

        # Concatenate, then move channels last: the LSTM is batch_first and
        # expects (batch, length, features)
        x = self.concatenate_channels(x, conv1, conv2)
        x = x.permute(0, 2, 1)

        x, _ = self.lstm(x)
        x = self.fc(x)

        return x

## 3. Loss function and optimizer

In [6]:
# Network: 50 input channels, 18 outputs, 32 channels per convolution,
# 1024 hidden units in the LSTM
nsp_classifier = NSP_Classifier(
    n_init_channels=50,
    n_class=18,
    n_channels=32,
    n_hidden=1024,
)

# Cross-entropy objective, optimised with Adam
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(
    nsp_classifier.parameters(),
    lr=0.001,
    betas=(0.85, 0.95),
    weight_decay=1e-6,
)

## 4. Training

In [None]:
# Number of batches between two progress reports (kept separate from batch_size,
# which is the number of samples per batch, not a logging period).
log_interval = 15

for epoch in range(50):  # loop over the dataset multiple times

    running_loss = 0.0

    for i, (inputs, labels) in enumerate(train_hhblits):
        # The dataset yields targets as (batch, n_class, length), so the class
        # axis is dim 1. Taking argmax over dim 2 would pick a sequence position
        # instead of a class.
        # NOTE(review): this treats all 18 target columns as one one-hot class
        # vector; confirm that matches the dataset's column layout.
        labels = torch.argmax(labels, dim=1)  # -> (batch, length)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = nsp_classifier(inputs)  # (batch, length, n_class)

        # CrossEntropyLoss expects the class axis at dim 1: (batch, n_class, length)
        loss = criterion(outputs.permute(0, 2, 1), labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Report the mean loss over the last `log_interval` batches
        if (i + 1) % log_interval == 0:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / log_interval))
            running_loss = 0.0

print('Finished Training')

> [0;32m<ipython-input-7-c52dc79422b2>[0m(8)[0;36m<module>[0;34m()[0m
[0;32m      6 [0;31m    [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      7 [0;31m[0;34m[0m[0m
[0m[0;32m----> 8 [0;31m    [0;32mfor[0m [0mi[0m[0;34m,[0m [0mdata[0m [0;32min[0m [0menumerate[0m[0;34m([0m[0mtrain_hhblits[0m[0;34m,[0m [0;36m0[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m      9 [0;31m        [0minputs[0m[0;34m,[0m [0mlabels[0m [0;34m=[0m [0mdata[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     10 [0;31m[0;34m[0m[0m
[0m


ipdb>  len(train_hhblits)


67
--KeyboardInterrupt--

KeyboardInterrupt: Interrupted by user
