In [1]:
##Author: Simona
#References
#https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html
#https://www.analyticsvidhya.com/blog/2020/01/first-text-classification-in-pytorch/
#https://towardsdatascience.com/media/935b97e7b4c541849529cf3b40e4e5ac

import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.nn.utils.rnn as rnn_utils
from numpy import load
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader



# Pick the compute device once: CUDA when torch reports a usable GPU, otherwise the CPU.
# `device` is meant to be reused by later code when placing tensors / modules.
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")
print("GPU is available" if is_cuda else "GPU not available, CPU used")

# List of all x and y files in the database.
# os.listdir returns entries in arbitrary order, so the listing is sorted: the
# "<stem>-x-.npy" / "<stem>-y-.npy" files of one sample then sit next to each other
# and the slice below selects the same, complete pairs on every run.
path = sorted(os.listdir("../../MachineLearning SourceCode/Dataset-vectorized/itasserSx9"))
x_List = []
y_List = []
xyfileTuple = []

for names in path[:200]:                  ####choose here the files to process
    if names.endswith("-x-.npy"):
        x_List.append(names)
    elif names.endswith("-y-.npy"):
        # Only genuine target files are kept; unrelated directory entries are ignored
        y_List.append(names)

# Index the target files by their stem so each input file is matched in O(1).
# setdefault keeps the first file seen for a stem, like the original first-match search.
y_by_stem = {}
for yfile in y_List:
    y_by_stem.setdefault(yfile.replace('-y-.npy', ''), yfile)

# Build (x file, y file) pairs; x files without a matching y file are skipped
for xfile in x_List:
    yfile = y_by_stem.get(xfile.replace('-x-.npy', ''))
    if yfile is not None:
        xyfileTuple.append((xfile, yfile))
            
## Define dataset x_data, y_data
## lists of N_batches tensors, one per file pair, each of shape (N_sequence, features)
data_dir = "../../MachineLearning SourceCode/Dataset-vectorized/itasserSx9"
x_data = []           # x as list of input arrays from different files
y_data = []           # y as list of output arrays from different files

# Count the matched (x, y) pairs rather than the x files: an x file without a
# partner has no entry in xyfileTuple and must not be indexed or padded for.
N_batches = len(xyfileTuple)
for xfile, yfile in xyfileTuple:
    # os.path.join inserts the separator between directory and file name
    # (plain string concatenation produced a path that does not exist)
    x_i = load(os.path.join(data_dir, xfile))
    y_i = load(os.path.join(data_dir, yfile))
    # Convert the numpy arrays to float32 tensors
    x_i, y_i = torch.from_numpy(x_i).type(torch.FloatTensor), torch.from_numpy(y_i).type(torch.FloatTensor)
    x_data.append(x_i)
    y_data.append(y_i)
    
# Zero-pad every sequence to the length of the longest one so that all inputs
# (and all targets) fit into a single 3-D tensor.
D_in = x_data[0].shape[1]
D_out = y_data[0].shape[1]
batch_size = len(x_data)
lengths = [len(seq) for seq in x_data]
longest_sequence = max(lengths)
padded_x = torch.zeros((N_batches, longest_sequence, D_in))
padded_y = torch.zeros((N_batches, longest_sequence, D_out))

# Copy each real sequence into the leading rows of its slot; the tail stays zero.
# Targets are copied using the length of the matching input sequence.
for i, x_len in enumerate(lengths):
    x_i, y_i = x_data[i], y_data[i]
    padded_x[i, :x_len, :] = x_i[:x_len, :]
    padded_y[i, :x_len, :] = y_i[:x_len, :]
    
def makedirs(dirname):
    """Create directory `dirname` (including missing parents) if it does not exist.

    Calling it on an already existing directory is a no-op. `exist_ok=True`
    lets os.makedirs handle the "already there" case atomically instead of a
    separate exists-check that can race with another process creating the
    directory in between.

    Raises:
        FileExistsError: if `dirname` exists but is not a directory.
    """
    os.makedirs(dirname, exist_ok=True)
# Make sure the output directory exists; the trained model is saved into it later in this notebook.
makedirs('./Basic_LSTM_outputs')

## padded_x and padded_y are tensors holding N_batches zero-padded sequences,
## of shape (N_batches, longest_sequence, D_in) and (N_batches, longest_sequence, D_out)

# Split the loaded samples into training (80%) and validation (20%) sets.
# A fixed random_state makes the split -- and therefore the normalization
# statistics and every reported loss -- reproducible across kernel restarts.
SPLIT_SEED = 42
x_train, x_valid, y_train, y_valid = train_test_split(
    padded_x, padded_y, test_size=0.2, random_state=SPLIT_SEED
)

GPU not available, CPU used


In [13]:
## Normalize the data to improve the model's predictions
## (only the output angles are normalized)
def normalize(x, m, s):
    """Standardize the first two channels of `x` sample by sample.

    Args:
        x: tensor of shape (n_samples, seq_len, channels) with at least 2 channels.
        m: 2-D tensor of means; row i, columns 0 and 1 are used for sample i.
        s: 2-D tensor of standard deviations, laid out like `m`.
           `m` and `s` may hold more rows than `x` has samples; extra rows are ignored.

    Returns:
        A new tensor shaped like `x` in which channels 0 and 1 hold
        (x - m) / s; any further channels are copied unchanged.

    The input is never modified. (Writing through an alias of `x` would also
    overwrite the tensor that `x` is a view of, e.g. a slice of the targets.)
    """
    n = len(x)
    x_n = x.clone()
    # m[:n, k:k+1] has shape (n, 1) and broadcasts over the sequence axis
    x_n[:, :, 0] = (x[:, :, 0] - m[:n, 0:1]) / s[:n, 0:1]
    x_n[:, :, 1] = (x[:, :, 1] - m[:n, 1:2]) / s[:n, 1:2]
    return x_n

def unnormalize(x_n, m, s):
    """Invert `normalize`: map standardized channels 0 and 1 back to original units.

    Args:
        x_n: tensor of shape (n_samples, seq_len, channels) with at least 2 channels.
        m: 2-D tensor of means; row i, columns 0 and 1 are used for sample i.
        s: 2-D tensor of standard deviations, laid out like `m`.
           `m` and `s` may hold more rows than `x_n` has samples; extra rows are ignored.

    Returns:
        A new tensor shaped like `x_n` in which channels 0 and 1 hold
        x_n * s + m; any further channels are copied unchanged.

    The input is never modified, so the normalized predictions stay available
    to the caller after this call.
    """
    n = len(x_n)
    x = x_n.clone()
    # s[:n, k:k+1] / m[:n, k:k+1] have shape (n, 1) and broadcast over the sequence axis
    x[:, :, 0] = x_n[:, :, 0] * s[:n, 0:1] + m[:n, 0:1]
    x[:, :, 1] = x_n[:, :, 1] * s[:n, 1:2] + m[:n, 1:2]
    return x

# Normalization statistics of the angle channels (columns 4 onward of y), computed
# over every position of every TRAINING sequence, i.e. one mean / std per channel.
# Per-sample statistics cannot be used here: validation sample i would be
# (un)normalized with the statistics of the unrelated training sample i.
# NOTE(review): zero-padded positions are included in these statistics, as they were
# in the original per-sample computation -- consider masking the padding.
angles_train = y_train[:, :, 4:]
n_train = angles_train.shape[0]
angles_flat = angles_train.reshape(-1, angles_train.shape[-1])

# normalize / unnormalize read row i of the statistics for sample i, so the shared
# statistics are repeated once per training sample (every row is identical).
mean = angles_flat.mean(dim=0).repeat(n_train, 1)
std_dev = angles_flat.std(dim=0).repeat(n_train, 1)

# Pass copies: the slices are views of y_train / y_valid, and the raw targets are
# needed again later to compute the loss in original units.
y_train_norm = normalize(angles_train.clone(), mean, std_dev)
y_valid_norm = normalize(y_valid[:, :, 4:].clone(), mean, std_dev)

In [None]:
## Set parameters
# Data params: sizes are read off the training / validation input tensors
N_batch_train, N_seq, D_in = x_train.size()
N_batch_valid = x_valid.size(0)
D_out = 2  # the last 2 columns contain the angle values

# Network params
hidden_dim, num_layers = 50, 2
learning_rate = 1e-3
dtype = torch.float
    
# Build the model that predicts the secondary-structure angles
# The recurrent network is defined as an nn.Module subclass
class LSTMangles(nn.Module):
    """Stacked LSTM followed by a linear read-out applied at every sequence position."""

    def __init__(self, input_size, hidden_dim, num_layers, output_size):
        """Store the hyper-parameters and build the LSTM and the linear output layer."""
        super().__init__()

        # Hyper-parameters kept on the instance
        self.input_size, self.output_size = input_size, output_size
        self.num_layers, self.hidden_dim = num_layers, hidden_dim

        # Recurrent layers first, then the per-position output projection
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_dim, num_layers=num_layers)
        self.linear = nn.Linear(in_features=hidden_dim, out_features=output_size)

    def forward(self, x):
        """Run one sequence through the network.

        `x` is viewed as (len(x), 1, -1), i.e. a single sequence fed to the LSTM
        with a batch dimension of 1; the result has one row per sequence position.
        """
        seq_len = len(x)
        lstm_out, _ = self.lstm(x.view(seq_len, 1, -1))
        # Drop the singleton batch axis before the linear layer
        flat_out = lstm_out.view(len(lstm_out), -1)
        return self.linear(flat_out)
    
# Instantiate the network, the mean-squared-error criterion and the Adam optimiser
model = LSTMangles(
    input_size=D_in,
    hidden_dim=hidden_dim,
    num_layers=num_layers,
    output_size=2,
)
loss_function = nn.MSELoss()
optimiser = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

In [None]:
# Train model
epoch = 400
train_loss = torch.zeros((epoch))   # mean training loss of every epoch
# The names below are not used by the loop in this cell (no validation pass or
# early stopping is performed here); they are kept for code outside this cell.
valid_loss = torch.zeros((epoch))
best_val_loss = 1000
best_t = 0
patient = 0
t=0

# loop to learn the neural network: one optimiser step per training sequence
for t in range(epoch):

    sum_loss = 0.0
    for i in range(N_batch_train):
        x = x_train[i, :, :]        # inputs of sequence i
        # Targets of the SAME sequence. Passing the whole y_train_norm tensor made
        # MSELoss broadcast this prediction against the targets of every sample.
        y = y_train_norm[i, :, :]

        model.zero_grad()
        y_pred = model(x)
        loss = loss_function(y_pred, y)
        loss.backward()
        optimiser.step()
        # .item() detaches the value; summing the loss tensors themselves keeps
        # every step's autograd graph alive for the whole epoch.
        sum_loss += loss.item()

    train_loss[t] = sum_loss / N_batch_train

    if t % 100 == 0:
        print("epoch ", t, "Loss: ", sum_loss)

# Persist the trained model (the whole module object is pickled)
torch.save(model, './Basic_LSTM_outputs/mytraining_angles_norm.pt')

In [None]:
# Losses in original (non-normalized) units
def _predict_sequences(net, inputs, out_dim=2):
    """Run `net` on every sequence of `inputs` without tracking gradients.

    `inputs` is indexed along its first axis; each item is passed to `net` and the
    result is stored in row k of a zero-initialised tensor of shape
    (inputs.shape[0], inputs.shape[1], out_dim), which is returned.
    """
    n_samples, seq_len = inputs.shape[0], inputs.shape[1]
    preds = torch.zeros((n_samples, seq_len, out_dim))
    with torch.no_grad():
        for k in range(n_samples):
            preds[k, :, :] = net(inputs[k, :, :])
    return preds

# MSELoss (default reduction) already averages over every element of the tensors,
# so the result is reported as is; dividing it again by the number of sequences
# under-reported the loss by that factor.
y_pred_train = _predict_sequences(model, x_train)
y_unnorm_train = unnormalize(y_pred_train, mean, std_dev)
loss = loss_function(y_unnorm_train, y_train[:, :, 4:])
print(loss, 'train_loss')

y_pred_valid = _predict_sequences(model, x_valid)
y_unnorm_valid = unnormalize(y_pred_valid, mean, std_dev)
loss = loss_function(y_unnorm_valid, y_valid[:, :, 4:])
print(loss, 'valid_loss')