In [3]:
import numpy as np
import pandas as pd
import scipy.io as sio
from Bio import SeqIO
import collections
import sys
import random
from sklearn import metrics
import matplotlib.pyplot as plt
sys.path.append("..")
import data_processing as dp
import evaluation_metrics as em
%matplotlib inline

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import random

### Load Data and Create Train/Dev/Test Sets

#### Human sequences

In [None]:
# Read all human protein sequences together with the temporal-holdout split
# (row indices, label matrices and the GO-term list).
human_sequences, human_protein_names = dp.load_FASTA('../data/human_sequences.fasta')
(human_train_idx, human_valid_idx, human_test_idx,
 human_train_labels, human_valid_labels, human_test_labels,
 human_GO_terms) = dp.load_test_sets('../data/human_annotations_temporal_holdout.mat')

# Select the sequences that belong to each split from the full protein list.
human_train_sequences = [human_sequences[idx] for idx in human_train_idx]
human_valid_sequences = [human_sequences[idx] for idx in human_valid_idx]
human_test_sequences = [human_sequences[idx] for idx in human_test_idx]

# The label matrices are numpy arrays; turn them into LongTensors.
human_train_labels = torch.from_numpy(human_train_labels).type(torch.LongTensor)
human_valid_labels = torch.from_numpy(human_valid_labels).type(torch.LongTensor)
human_test_labels = torch.from_numpy(human_test_labels).type(torch.LongTensor)

# Encode each amino-acid string as a tensor of integer codes.  The second
# argument is the length the sequences are truncated to.
human_train_tensors = dp.TransformAAsToTensor(human_train_sequences, 1000)
human_valid_tensors = dp.TransformAAsToTensor(human_valid_sequences, 1000)
human_test_tensors = dp.TransformAAsToTensor(human_test_sequences, 1000)

#### Yeast sequences

In [3]:
# Read all yeast protein sequences together with the temporal-holdout split
# (row indices, label matrices and the GO-term list).
yeast_sequences, yeast_protein_names = dp.load_FASTA('../data/yeast_sequences.fasta')
(yeast_train_idx, yeast_valid_idx, yeast_test_idx,
 yeast_train_labels, yeast_valid_labels, yeast_test_labels,
 yeast_GO_terms) = dp.load_test_sets('../data/yeast_MF_temporal_holdout.mat')

# Select the sequences that belong to each split from the full protein list.
yeast_train_sequences = [yeast_sequences[idx] for idx in yeast_train_idx]
yeast_valid_sequences = [yeast_sequences[idx] for idx in yeast_valid_idx]
yeast_test_sequences = [yeast_sequences[idx] for idx in yeast_test_idx]

# The label matrices are numpy arrays; turn them into LongTensors.
yeast_train_labels = torch.from_numpy(yeast_train_labels).type(torch.LongTensor)
yeast_valid_labels = torch.from_numpy(yeast_valid_labels).type(torch.LongTensor)
yeast_test_labels = torch.from_numpy(yeast_test_labels).type(torch.LongTensor)

# Encode each amino-acid string as a tensor of integer codes.  The second
# argument is the length the sequences are truncated to.
yeast_train_tensors = dp.TransformAAsToTensor(yeast_train_sequences, 500)
yeast_valid_tensors = dp.TransformAAsToTensor(yeast_valid_sequences, 500)
yeast_test_tensors = dp.TransformAAsToTensor(yeast_test_sequences, 500)

## Model

### LSTM class:  

In [5]:
class LSTM(nn.Module):
    """Hand-written single-layer LSTM over embedded token sequences.

    The input tokens are embedded, the LSTM cell is unrolled over the sequence
    axis, and the final hidden state is passed through a linear decoder that
    produces one raw score per label.

    @param vocab_size: number of rows in the embedding table
    @param embedding_dim: size of each embedding vector
    @param hidden_size: size of the hidden and cell states
    @param num_labels: number of outputs produced by the decoder
    @param batch_size: default batch size used by init_hidden()
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_labels, batch_size):
        super(LSTM, self).__init__()
        self.embed = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        # Use the constructor argument; the previous code read the notebook
        # global `emb_dim`, which fails (or is silently wrong) outside that cell.
        self.embedding_size = embedding_dim
        self.hidden_size = hidden_size
        self.output_size = num_labels
        self.batch_size = batch_size

        # One linear map per gate, each applied to the concatenation [hidden, input].
        self.linear_f = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_i = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_ctilde = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.linear_o = nn.Linear(embedding_dim + hidden_size, hidden_size)
        self.decoder = nn.Linear(hidden_size, num_labels)

        self.init_weights()

    def forward(self, data, hidden, c):
        """
        Run the LSTM over every position of `data` and decode the last hidden state.

        @param data: integer tensor of size (batch_size, sequence_length) holding
            embedding indices; index 0 is the padding index.
        @param hidden: initial hidden state of size (batch_size, hidden_size)
        @param c: initial cell state of size (batch_size, hidden_size)
        @return: (output, hidden) where output has size (batch_size, num_labels)
            and hidden is the hidden state after the last position.
        """
        emb = self.embed(data)  # (batch_size, sequence_length, embedding_dim)

        # Index the time axis directly instead of chunk()+squeeze(): a bare
        # squeeze() also removed the batch axis when batch_size == 1.
        for t in range(emb.size(1)):
            hidden, c = self._step(emb[:, t, :], hidden, c)

        output = self.decoder(hidden)
        return output, hidden

    def _step(self, emb_t, hid, c_t):
        """Apply one LSTM cell update and return the new (hidden, cell) states."""
        combined = torch.cat((hid, emb_t), 1)
        f = torch.sigmoid(self.linear_f(combined))           # forget gate
        i = torch.sigmoid(self.linear_i(combined))           # input gate
        c_tilde = torch.tanh(self.linear_ctilde(combined))   # candidate cell state
        c_t = f * c_t + i * c_tilde
        o = torch.sigmoid(self.linear_o(combined))           # output gate
        hid = o * torch.tanh(c_t)
        return hid, c_t

    def init_hidden(self, batch_size=None):
        """
        Create zero-filled initial hidden and cell states.

        @param batch_size: number of rows to allocate; defaults to the batch size
            given at construction so existing callers are unaffected.
        @return: (h0, c0), each of size (batch_size, hidden_size)
        """
        if batch_size is None:
            batch_size = self.batch_size
        h0 = Variable(torch.zeros(batch_size, self.hidden_size))
        c0 = Variable(torch.zeros(batch_size, self.hidden_size))
        return h0, c0

    def init_weights(self):
        """Initialise gate and embedding weights uniformly in [-0.1, 0.1]; gate biases to 0.

        The decoder keeps its default initialisation.
        """
        initrange = 0.1
        for layer in (self.linear_f, self.linear_i, self.linear_ctilde, self.linear_o):
            layer.weight.data.uniform_(-initrange, initrange)
            layer.bias.data.fill_(0)

        self.embed.weight.data.uniform_(-initrange, initrange)
        # uniform_() also overwrote the padding row that nn.Embedding zeroes at
        # construction; restore it so padding tokens embed to the zero vector.
        if self.embed.padding_idx is not None:
            self.embed.weight.data[self.embed.padding_idx].fill_(0)
                

### Early stop condition and training stage

In [6]:
def early_stop(val_loss_history, t=10, required_progress=0.001):
    """
    Decide whether training should stop because the validation loss has stalled.

    Walks backwards from the newest entry and counts consecutive steps whose
    improvement (previous loss minus current loss) is below `required_progress`.
    @param val_loss_history: list of historical validation losses, oldest first
    @param t: number of most recent consecutive steps that must all show
        insufficient improvement
    @param required_progress: minimum decrease in loss between two consecutive
        entries for the step to count as progress
    @return: True if the last `t` steps all failed to improve enough, else False
    """
    stalled_steps = 0
    newest = len(val_loss_history) - 1

    for offset in range(t):
        current = newest - offset
        if current < 1:
            # No earlier entry left to compare against.
            break
        improvement = val_loss_history[current - 1] - val_loss_history[current]
        if not (improvement < required_progress):
            # A step with real progress ends the stalled streak.
            break
        stalled_steps += 1

    return stalled_steps == t

### Evaluation Metric

In [17]:
from sklearn.metrics import precision_score,recall_score,average_precision_score

def round_manual(data, threshold):
    """Binarise `data`: 1 where the value is at least `threshold`, otherwise 0."""
    at_or_above = data >= threshold
    return at_or_above.astype(int)

def calculate_accuracy(predicted, actuals, num_labels):
    """
    Fraction of individual label entries where the rounded prediction equals the target.

    @param predicted: data type = Variable; values are rounded with np.round
    @param actuals: data type = Variable; its first dimension is the number of rows
    @param num_labels: no of go terms (labels per row)
    @return: matching entries divided by (rows * num_labels)
    """
    rounded = np.round(predicted.data.numpy())
    targets = actuals.data.numpy()
    n_matching = np.sum(rounded == targets)
    n_entries = actuals.size()[0] * num_labels
    return n_matching / n_entries

def m_tau(predictions):
    """Count the rows of `predictions` whose entries do not sum to zero."""
    non_empty_rows = 0
    for row in predictions:
        if np.sum(row) != 0:
            non_empty_rows += 1
    return non_empty_rows

def n_e(predictions):
    """Return the size of the first dimension of `predictions` (its row count)."""
    row_count = predictions.shape[0]
    return row_count

def calculate_recall_precision(predicted, actual):
    '''
    Calculate dataset-level recall and precision from per-protein (per-row) values.

    Precision and recall are first computed for each individual row, then averaged:
      * recall is averaged over ALL rows (a row with no positives contributes 0);
      * precision is averaged only over rows with at least one positive prediction.

    @param predicted: binary numpy array of size (rows, labels)
    @param actual: binary torch tensor of the same size (anything exposing .numpy());
        plain array-likes are accepted as well
    @return: (RecallScore, PrecisionScore); both are 0 for an empty batch
    '''
    truth = actual.numpy() if hasattr(actual, "numpy") else np.asarray(actual)
    predicted = np.asarray(predicted)

    # An empty batch previously raised ZeroDivisionError when averaging recall.
    if truth.shape[0] == 0:
        return 0, 0

    positives_per_row = truth.sum(axis=1)        # number of functions for each protein
    predictions_per_row = predicted.sum(axis=1)  # number of predictions for each protein
    # Element-wise product is 1 only where both prediction and truth are 1.
    tps_per_row = np.multiply(truth, predicted).sum(axis=1).astype(float)

    # Masked division: rows with a zero denominator keep the 0 from `out`.
    # (np.where(mask, 0, a / b) evaluated a / b for every row and emitted a
    # divide-by-zero RuntimeWarning on each call.)
    precision_per_row = np.divide(tps_per_row, predictions_per_row,
                                  out=np.zeros_like(tps_per_row),
                                  where=predictions_per_row != 0)
    recall_per_row = np.divide(tps_per_row, positives_per_row,
                               out=np.zeros_like(tps_per_row),
                               where=positives_per_row != 0)

    # Recall: mean over every row.
    RecallScore = recall_per_row.sum() / len(recall_per_row)

    # Precision: mean over rows that made at least one prediction.  Such rows
    # may still have precision 0; rows with no predictions are excluded.
    rows_with_predictions = np.count_nonzero(predictions_per_row)
    if rows_with_predictions > 0:
        PrecisionScore = precision_per_row.sum() / rows_with_predictions
    else:
        PrecisionScore = 0
    return RecallScore, PrecisionScore
    
    
def F_score(predicted, actuals):
    """
    Sweep the decision threshold tau over 0.01, 0.02, ..., 0.99 and keep the best F score.

    @param predicted: data type = Variable; its values are compared against tau
    @param actuals: label tensor handed to calculate_recall_precision
    @return: (maximum f score, tau at which it occurs, precision at that tau,
        recall at that tau); all zeros if no threshold gives a positive f score
    """
    scores = predicted.data.numpy()
    best_f, best_tau, best_precision, best_recall = 0, 0, 0, 0

    for step in range(1, 100):
        tau = step / 100
        binarised = round_manual(scores, tau)
        recall, precision = calculate_recall_precision(binarised, actuals)

        # Harmonic mean of precision and recall; defined as 0 when both are 0.
        if recall == 0 and precision == 0:
            f_value = 0
        else:
            f_value = np.true_divide((2 * precision * recall), (precision + recall))

        if f_value > best_f:
            best_f, best_tau = f_value, tau
            best_precision, best_recall = precision, recall

    return best_f, best_tau, best_precision, best_recall

In [18]:
def _loss_to_float(loss_value):
    """Convert a scalar loss (Variable or tensor) to a Python float on old and new PyTorch."""
    data = loss_value.data if hasattr(loss_value, "data") else loss_value
    if hasattr(data, "item"):
        return data.item()
    return float(data[0])  # older releases: 1-element tensor


def _should_stop_early(valid_loss_history):
    """Call the module-level early_stop(); inside train_test that name is shadowed by a flag."""
    return early_stop(valid_loss_history)


def train_test(batch_size, num_epochs, model, loss, optimizer, train_batch, test_eval_batch,
               num_labels, lstm=True, early_stop=False, batches_per_epoch=None, save_path=None):
    """
    Train `model` for `num_epochs` epochs, validating on one held-out batch every 10 steps.

    @param batch_size: training batch size; only used to derive the default
        `batches_per_epoch`
    @param num_epochs: number of epochs to train
    @param model: network to train; when `lstm` is True it must provide
        init_hidden() and accept (data, hidden, cell)
    @param loss: loss function called as loss(outputs, float_labels)
    @param optimizer: optimizer whose step() is called after every batch
    @param train_batch: iterator yielding (train_data, train_labels) batches
    @param test_eval_batch: indexable collection of (inputs, labels) validation
        batches; batches are reused cyclically
    @param num_labels: number of labels; only used by the legacy
        `batches_per_epoch` fallback
    @param lstm: whether `model` follows the LSTM calling convention
    @param early_stop: if True, stop once the module-level early_stop() reports
        that validation loss has stalled
    @param batches_per_epoch: training steps per epoch.  Defaults to the legacy
        lookup of the notebook globals human_train_tensors / yeast_train_tensors.
    @param save_path: checkpoint file written after every epoch; defaults to the
        notebook global PATH
    """
    criterion = loss  # use the argument rather than a notebook global

    if batches_per_epoch is None:
        # Legacy behaviour: pick the dataset from the label count.
        if num_labels == 147:
            n_train = len(human_train_tensors)
        else:
            n_train = len(yeast_train_tensors)
        batches_per_epoch = int(n_train / batch_size)
    # Guard against modulo-by-zero when the dataset is smaller than one batch.
    batches_per_epoch = max(1, batches_per_epoch)
    if save_path is None:
        save_path = PATH

    eval_step = 0
    train_step = 0
    epoch = 1
    losses = []
    valid_loss_history = []

    while epoch <= num_epochs:
        train_data, train_labels = next(train_batch)
        model.train()
        model.zero_grad()

        if lstm:
            hidden, c_t = model.init_hidden()
            outputs, hidden = model(train_data, hidden, c_t)
        else:
            # NOTE(review): `length_batch` is not defined anywhere in this
            # notebook; this branch fails unless it exists as a global.
            outputs = model(train_data, length_batch)

        batch_loss = criterion(outputs, train_labels.float())
        losses.append(_loss_to_float(batch_loss))
        batch_loss.backward()
        optimizer.step()

        # Count the step just taken BEFORE the periodic checks; testing
        # `train_step % n == 0` at step 0 ended the first epoch after one batch.
        train_step += 1

        if train_step % 10 == 0 and len(test_eval_batch) > 0:
            model.eval()
            # Cycle through the validation batches; the bare counter ran past
            # the end of test_eval_batch on long runs.
            eval_batch = test_eval_batch[eval_step % len(test_eval_batch)]
            eval_inputs, eval_labels = eval_batch[0], eval_batch[1]
            if lstm:
                hidden, c_t = model.init_hidden()
                valid_outputs, hidden = model(Variable(eval_inputs), hidden, c_t)
            else:
                # NOTE(review): `valid_sequences` is not defined anywhere in
                # this notebook; confirm the intended input for non-LSTM models.
                valid_outputs = model(Variable(valid_sequences))

            f_score, _, _, _ = F_score(valid_outputs, eval_labels)

            valid_loss = _loss_to_float(criterion(valid_outputs, Variable(eval_labels.float())))
            valid_loss_history.append(valid_loss)

            print('Epoch: [{}/{}], Train loss: {}, Validation Loss:{},Valid F_Score: {}'\
                  .format(epoch, num_epochs, np.mean(losses), valid_loss_history[-1], f_score))
            eval_step += 1

            # The history only changes here, so this is the only place the
            # early-stop decision can change.
            if early_stop and _should_stop_early(valid_loss_history):
                print("earily stop triggered")
                break

        if train_step % batches_per_epoch == 0:
            torch.save(model.state_dict(), save_path)  # Saves model after every epoch
            epoch += 1

## Train and Evaluate Model

#### Hyperparameters 

In [9]:
# Optimisation settings
learning_rate = 0.05  # step size handed to the Adam optimizer
num_epochs = 50       # number epoch to train
batch_size = 100

# Model dimensions
vocab_size = 23       # number of rows in the embedding table
emb_dim = 8           # size of each embedding vector
hidden_dim = 12       # size of the LSTM hidden and cell states

# File the model's state_dict is written to during training
PATH = 'saved_model'

### Human Results

In [None]:
# One output per human GO term.
num_labels = human_GO_terms.shape[0]  # 147

# Batch iterators over the human training and validation splits.
data_iter = dp.batch_iter(batch_size, human_train_tensors, human_train_labels)
valid_eval = dp.eval_iter(batch_size, human_valid_tensors, human_valid_labels)

# Model, multi-label loss and optimizer.
lstm = LSTM(vocab_size=vocab_size,
            embedding_dim=emb_dim,
            hidden_size=hidden_dim,
            num_labels=num_labels,
            batch_size=batch_size)
criterion = nn.MultiLabelSoftMarginLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# Model Training
train_test(batch_size, num_epochs, lstm, criterion, optimizer,
           data_iter, valid_eval, num_labels)

### Yeast Results

In [None]:
# One output per yeast GO term.
num_labels = yeast_GO_terms.shape[0]

lstm = LSTM(vocab_size, emb_dim, hidden_dim, num_labels, batch_size)
criterion = nn.MultiLabelSoftMarginLoss()
# Optimise the model built above.  This previously referenced an undefined
# name `model`, so the cell raised NameError on a fresh kernel.
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

# batch_iter lives in the data_processing module (imported as dp), as in the
# human cell; the bare name is not imported in this notebook.
data_iter = dp.batch_iter(batch_size, yeast_train_tensors, yeast_train_labels)
valid_eval = dp.eval_iter(batch_size, yeast_valid_tensors, yeast_valid_labels)

# Model Training
train_test(batch_size, num_epochs, lstm, criterion, optimizer, data_iter, valid_eval, num_labels)