In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
from Bio import SeqIO
import collections
import sys

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import random

sys.path.append("..")
from evaluation_metrics import *
from data_processing import *

#Yeast sequences
fasta = '../../../data/yeast_sequences.fasta'
test_set_file = '../../../data/yeast_MF_temporal_holdout.mat'

sequences, names = load_FASTA(fasta)
train_inds, valid_inds, test_inds, y_trainYeast, y_validYeast, y_testYeast, go_termsYeast = load_test_sets(test_set_file)

train_seqsYeast = [sequences[i] for i in train_inds]
print('Number of training prots: ' + str(len(train_seqsYeast)))
valid_seqsYeast = [sequences[i] for i in valid_inds]
print('Number of validation prots: ' + str(len(valid_seqsYeast)))
test_seqsYeast = [sequences[i] for i in test_inds]
print('Number of testing prots: ' + str(len(test_seqsYeast)))

Number of training prots: 3447
Number of validation prots: 963
Number of testing prots: 206


In [2]:
yTrainYeast = torch.from_numpy(y_trainYeast).type(torch.LongTensor)
yValidYeast = torch.from_numpy(y_validYeast).type(torch.LongTensor)
yTestYeast = torch.from_numpy(y_testYeast).type(torch.LongTensor)

# yTrainYeast = torch.from_numpy(y_trainYeast).type(torch.LongTensor)
# yValidYeast = torch.from_numpy(y_validYeast).type(torch.LongTensor)
# yTestYeast = torch.from_numpy(y_testYeast).type(torch.LongTensor)

#yTrainYeast

# Calculating the Length of train, valid and test data 

In [3]:
k = 1 #value for kmers

# train_seqsYeast_length = sequence_lengths_with_kmers(train_seqsYeast, k) 
# valid_seqsYeast_length = sequence_lengths_with_kmers(valid_seqsYeast, k)
# test_seqsYeast_length = sequence_lengths_with_kmers(test_seqsYeast, k)
train_seqsYeast_length = sequence_lengths_with_kmers(train_seqsYeast, k)
valid_seqsYeast_length = sequence_lengths_with_kmers(valid_seqsYeast, k)
test_seqsYeast_length = sequence_lengths_with_kmers(test_seqsYeast, k)

## Vectorize all amino-acid chains in the list 
#### Each amino-acid string becomes one row in a tensor object.
#### This tensor object has dimension NxD, where N is the number of amino-acid strings and D is the length of the longest chain in the set. 

## Get kmers (Can take upto 3-4 mins)

In [4]:

if k==1:
    k_mers_yeast = None
else:
    k_mers_yeast = get_k_mers(train_seqsYeast, valid_seqsYeast, test_seqsYeast, k, org="yeast")


### Runs quickly for Yeast, about 2 minutes for Yeast data. 

In [6]:

TrainSeqsYeast = TransformAAsToTensor_with_kmers(train_seqsYeast, k, k_mers_yeast, acid_dict_yeast)
ValidSeqsYeast = TransformAAsToTensor_with_kmers(valid_seqsYeast, k, k_mers_yeast, acid_dict_yeast)
TestSeqsYeast = TransformAAsToTensor_with_kmers(test_seqsYeast, k, k_mers_yeast, acid_dict_yeast)

# Train Model

## 0) Hyperparameter setting

In [7]:
learning_rate = 0.001
#vocab_size = 26 # number words in the vocabulary base
num_labels = go_termsYeast.shape[0] 
if k == 1:
    vocab_size = len(acid_dict_yeast) + 1
else:
    vocab_size = max(list(k_mers_yeast.values())) + 1
emb_dim = 50 # dimension for n-gram embedding
num_epochs = 5 # number epoch to train
batch_size = 26

## 1) Get batch data method:


## 3) GRU Implementation

In [64]:
class CNN(nn.Module):
    """
    CNN model
    """
       
    def __init__(self, vocab_size, emb_dim, num_labels, hidden_size, n_layers=1, dropout=0.1, is_bidirectional = False):
       
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(CNN,self).__init__()
        Ci = 1
        Co = 2
        self.embed = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.conv13 = nn.Conv2d(Ci, Co, (3, emb_dim))
        self.conv14 = nn.Conv2d(Ci, Co, (4, emb_dim))
        self.conv15 = nn.Conv2d(Ci, Co, (5, emb_dim))
        self.convs = [self.conv13, self.conv14, self.conv15]
        self.dropout = nn.Dropout(dropout)
        self.fc1 = nn.Linear(3*Co, num_labels) 
    
    def forward(self, x):
        x = self.embed(x) 
        x = x.unsqueeze(1) # (N,Ci,W,D)
        x = [F.relu(conv(x)).squeeze(3) for conv in self.convs] #[(N,Co,W), ...]*len(Ks)
        x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] #[(N,Co), ...]*len(Ks)
        x = torch.cat(x, 1)
        x = self.dropout(x) # (N,len(Ks)*Co)
        logit = self.fc1(x) # (N,C)
        #return logit
        return torch.nn.functional.softmax(logit)
        

                

## 4) Training Stage

In [88]:
def early_stop(val_acc_history, t=2, required_progress=0.00001):
    """
    Stop the training if there is no non-trivial progress in k steps
    @param val_acc_history: a list contains all the historical validation acc
    @param required_progress: the next acc should be higher than the previous by 
        at least required_progress amount to be non-trivial
    @param t: number of training steps 
    @return: a boolean indicates if the model should earily stop
    """
    # TODO: add your code here
    
    cnt = 0 # initialize the count --> to store count of cases where difference in
                                    #  accuracy is less than required progress.
    
    if(len(val_acc_history) > 0): # if list has size > 0 
        for i in range(t): # start the loop
            index = len(val_acc_history) - (i+1) # start from the last term in list and move to the left
            if (index >= 1): # to check if index != 0 --> else we can't compare to previous value
                if (abs(val_acc_history[index] - val_acc_history[index-1]) < required_progress):
                    cnt += 1 # increase the count value
                else:
                    break # break if difference is grea-ter 
    
    if(cnt != t): # if count is equal to t, return True
        return False
    else:
        return True
    
    
def train(valid_sequences, valid_label, num_epochs, optimizer, data_iter, model, training_length, threshold):
    losses = []
    total_batches = int(training_length/ batch_size) #375

    eval_every = 10
    print_every = 10
    validate_every = int((eval_every/100)*total_batches)
    show_every = int((print_every/100)*total_batches)
    val_outputs = None
    eval_loss = None
    valid_label = None
    validation_losses = []
    
    for epoch in range(1, num_epochs+1):
        stop_training = False
        for i, (train_data, train_labels, length_batch) in enumerate(data_iter):
                                                # train_data size: (26, 34350) ; train_label size: (26, 147)
                                                # This needs to be modified. Max length is batch specific !!!!!
            model.train(True)
            model.zero_grad()
            outputs = model(train_data)
            loss = criterion(outputs, train_labels.float())
            losses.append(loss.data[0])
            loss.backward()


            clipped = torch.nn.utils.clip_grad_norm(model.parameters(), 0.5)
            # clip gradients because RNN
            for pr in model.parameters():
                pr.data.add_(-clipped, pr.grad.data)

            optimizer.step()

            if (i+1)%validate_every == 0:
                # Erly stop using validation loss
                valid_sequences, valid_label = reduced_set(ValidSeqsYeast, valid_seqsYeast_length, yValidYeast, 100)

                model.eval()
                val_outputs = model(Variable((valid_sequences).type(torch.LongTensor), volatile=True))
                eval_loss = criterion(val_outputs.cpu().data, valid_label.float())
                print(eval_loss.data[0])
                validation_losses.append(eval_loss.data[0])
                stop_training = early_stop(validation_losses, 5)

            # Print statements
            if stop_training:
                print("earily stop triggered")
                break
            if (i+1) % show_every == 0:
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Train loss: {4}, Validation loss:{5}'.format(
                           epoch, num_epochs, i+1, total_batches, np.mean(losses)/(total_batches*epoch), np.mean(np.array(validation_losses))))
                
        #evaluate_and_save(model, opt.out_dir, val_outputs, valid_label, losses, eval_loss.data[0], "yeast", epoch)
        if stop_training == True:
            break

## 5) Training the model

## Training stage

In [89]:
data_size = len(train_seqsYeast) #9751
num_labels = go_termsYeast.shape[0] #147
hidden_size = 50
eval_every = 2
print_every = 5

model = CNN(vocab_size, 50, num_labels, hidden_size, n_layers=1, dropout=0.1, is_bidirectional=False)
criterion = nn.MultiLabelSoftMarginLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [90]:
data_iter = batch_iter(batch_size, TrainSeqsYeast, yTrainYeast, train_seqsYeast_length)

num_epochs = 1
threshold = [0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1]

# Model Training
ValidSeqsYeast_small, yValidYeast_small = reduced_set(ValidSeqsYeast, valid_seqsYeast_length, yValidYeast, 100)
train(ValidSeqsYeast_small, yValidYeast_small, num_epochs, optimizer, data_iter, model, data_size, threshold)

0.7043743133544922
Epoch: [1/1], Step: [13/132], Train loss: 0.005372096921180512, Validation loss:0.7043743133544922
0.7025437951087952
Epoch: [1/1], Step: [26/132], Train loss: 0.005367826297010853, Validation loss:0.7034590542316437
0.7013904452323914
Epoch: [1/1], Step: [39/132], Train loss: 0.005365214911929934, Validation loss:0.7027695178985596
0.7007626891136169
Epoch: [1/1], Step: [52/132], Train loss: 0.0053632017615791805, Validation loss:0.7022678107023239
0.7012298107147217
Epoch: [1/1], Step: [65/132], Train loss: 0.005363278747438551, Validation loss:0.7020602107048035
0.7013167142868042
Epoch: [1/1], Step: [78/132], Train loss: 0.005362049937711359, Validation loss:0.701936294635137
0.701301097869873
Epoch: [1/1], Step: [91/132], Train loss: 0.005360435817267869, Validation loss:0.7018455522400993
0.7010495662689209
Epoch: [1/1], Step: [104/132], Train loss: 0.005358951535327729, Validation loss:0.7017460539937019
0.7006999850273132
Epoch: [1/1], Step: [117/132], Train 

## 6) Testing performance on test data

In [None]:
def test_set_predictions(model, test_input_seq):
    model.eval()
    test_input_seq = Variable(test_input_seq)
    predicted = model(test_input_seq.transpose(0,1).type(torch.LongTensor))
    return predicted

In [None]:
test_predictions = test_set_predictions(model, TestSeqsYeast)
#FScore, Threshold, Precision, Recall = F_score(test_set_predictions, yTestYeast)

In [None]:
plot_AUPR_curve(test_predictions.data.numpy(), yTestYeast.numpy(), label='GRU', org='Yeast')

In [None]:
plt.show()

In [None]:
plot_AUC_curve(test_predictions.data.numpy(), yTestYeast.numpy(), label='GRU', org='Yeast')
plt.show()