In [1]:
import numpy as np
import pandas as pd
import scipy.io as sio
from Bio import SeqIO
import collections

import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import random


pdir = '/Users/Brenton/Documents/Capstone/'

def load_test_sets(filename):
    print("### Loading *.mat file...")
    go_data = sio.loadmat(filename, squeeze_me=True)
    go_terms = go_data['goTerm_labels'] # names of gene ontology function terms
    train_annotations = np.asarray(go_data['trainProts_label'].todense()) # training set of function annotations
    valid_annotations = np.asarray(go_data['validProts_label'].todense()) # valid "" ""
    test_annotations = np.asarray(go_data['testProts_label'].todense()) # test "" ""
    train_inds = go_data['trainProts']
    train_inds = train_inds - 1
    valid_inds = go_data['validProts']
    valid_inds = valid_inds - 1
    test_inds = go_data['testProts']
    test_inds = test_inds - 1 # subtract 1 for matlab index conversion into python

    return train_inds, valid_inds, test_inds, train_annotations, valid_annotations, test_annotations, go_terms

def load_FASTA(filename):
    """ Loads fasta file and returns a list of the Bio SeqIO records """
    print("### Loading fasta file...")
    infile = open(filename, 'rU')
    full_entries = list(SeqIO.parse(infile, 'fasta'))
    sequences = [str(entry.seq) for entry in full_entries]
    names = [str(entry.id) for entry in full_entries]

    return sequences, names

#Human Sequences
fasta = '../../data/human_sequences.fasta'
test_set_file = '../../data/human_annotations_temporal_holdout.mat'

sequences, names = load_FASTA(fasta)
train_inds, valid_inds, test_inds, y_trainHuman, y_validHuman, y_testHuman, go_termsHuman = load_test_sets(test_set_file)

train_seqsHuman = [sequences[i] for i in train_inds]
print('Number of training prots: ' + str(len(train_seqsHuman)))
valid_seqsHuman = [sequences[i] for i in valid_inds]
print('Number of validation prots: ' + str(len(valid_seqsHuman)))
test_seqsHuman = [sequences[i] for i in test_inds]
print('Number of testing prots: ' + str(len(test_seqsHuman)))

#Yeast sequences
fasta = '../../data/yeast_sequences.fasta'
test_set_file = '../../data/yeast_MF_temporal_holdout.mat'

sequences, names = load_FASTA(fasta)
train_inds, valid_inds, test_inds, y_trainYeast, y_validYeast, y_testYeast, go_termsYeast = load_test_sets(test_set_file)

train_seqsYeast = [sequences[i] for i in train_inds]
print('Number of training prots: ' + str(len(train_seqsYeast)))
valid_seqsYeast = [sequences[i] for i in valid_inds]
print('Number of validation prots: ' + str(len(valid_seqsYeast)))
test_seqsYeast = [sequences[i] for i in test_inds]
print('Number of testing prots: ' + str(len(test_seqsYeast)))

### Loading fasta file...




### Loading *.mat file...
Number of training prots: 9751
Number of validation prots: 3871
Number of testing prots: 1647
### Loading fasta file...
### Loading *.mat file...
Number of training prots: 3447
Number of validation prots: 963
Number of testing prots: 206


In [2]:
yTrainYeast = torch.from_numpy(y_trainYeast).type(torch.LongTensor)
yValidYeast = torch.from_numpy(y_validYeast).type(torch.LongTensor)
yTestYeast = torch.from_numpy(y_testYeast).type(torch.LongTensor)

yTrainHuman = torch.from_numpy(y_trainHuman).type(torch.LongTensor)
yValidHuman = torch.from_numpy(y_validHuman).type(torch.LongTensor)
yTestHuman = torch.from_numpy(y_testHuman).type(torch.LongTensor)

yTrainYeast


    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      1     0     1
       ...          ⋱          ...       
    0     0     0  ...      0     0     0
    0     0     0  ...      0     0     0
    0     0     0  ...      1     0     1
[torch.LongTensor of size 3447x26]

# Calculating the Length of train, valid and test data 

In [3]:
# Human data:

train_seqsHuman_length = []
[train_seqsHuman_length.append(len(train_seqsHuman[i])) for i in range(len(train_seqsHuman))]
train_seqsHuman_length = torch.LongTensor(train_seqsHuman_length)

valid_seqsHuman_length = []
[valid_seqsHuman_length.append(len(valid_seqsHuman[i])) for i in range(len(valid_seqsHuman))]
valid_seqsHuman_length = torch.LongTensor(valid_seqsHuman_length)

test_seqsHuman_length = []
[test_seqsHuman_length.append(len(test_seqsHuman[i])) for i in range(len(test_seqsHuman))]
test_seqsHuman_length = torch.LongTensor(test_seqsHuman_length)

print(len(train_seqsHuman_length), len(valid_seqsHuman_length), len(test_seqsHuman_length))

# Yeast data:

train_seqsYeast_length = []
[train_seqsYeast_length.append(len(train_seqsYeast[i])) for i in range(len(train_seqsYeast))]
train_seqsYeast_length = torch.LongTensor(train_seqsYeast_length)

valid_seqsYeast_length = []
[valid_seqsYeast_length.append(len(valid_seqsYeast[i])) for i in range(len(valid_seqsYeast))]
valid_seqsYeast_length = torch.LongTensor(valid_seqsYeast_length)

test_seqsYeast_length = []
[test_seqsYeast_length.append(len(test_seqsYeast[i])) for i in range(len(test_seqsYeast))]
test_seqsYeast_length = torch.LongTensor(test_seqsYeast_length)

print(len(train_seqsYeast_length), len(valid_seqsYeast_length), len(test_seqsYeast_length))

9751 3871 1647
3447 963 206


## Vectorize all amino-acid chains in the list 
#### Each amino-acid string becomes one row in a tensor object.
#### This tensor object has dimension NxD, where N is the number of amino-acid strings and D is the length of the longest chain in the set. 

In [4]:
ConvertCharToInt = {'A':1, 'B':2, 'C':3, 'D':4, 'E':5, 'F':6, 'G':7, 'H':8, 'I':9, 'J':10,
                   'K':11, 'L':12, 'M':13, 'N':14, 'O':15, 'P':16, 'Q':17, 'R':18, 'S':19,
                   'T':20, 'U':21, 'V':22, 'W':23, 'X':24, 'Y':25, 'Z':26}

def vectorize_AAs(string):
    '''This function takes an amino-acid string as input and outputs a vector of integers, with each
    integer representing one amino acid.
    
    For example, 'BACEA' is converted to [2, 1, 3, 5, 1]
    '''
    character_list = list(string) #converts 'BACEA' to ['B','A','C','E','A]
    for i in range(len(character_list)):
        character_list[i] = ConvertCharToInt[character_list[i]] #convert the character to a number
    return character_list

def AddZeros(vector, max_length):
    '''This function adds the necessary number of zeros and returns an array'''
    #max_length = length of longest vector in the batch
    #oldvector = initial vector for that amino-acid chain (in integers)
    diff = max_length - len(vector)
    if diff>0:
        ZerosToAdd = np.zeros(diff)
        vector.extend(ZerosToAdd)
    return vector 

def TransformAAsToTensor(ListOfSequences):
    '''This function takes as input a list of amino acid strings and creates a tensor matrix
    of dimension NxD, where N is the number of strings and D is the length of the longest AA chain
    
    "ListOfSequences" can be training, validation, or test sets
    '''
    #find longest amino-acid sequence
    max_length = len(max(ListOfSequences, key=len))
    Sequences = ListOfSequences.copy() 
    for AA in range(len(Sequences)): #for each amino-acid sequence
        Sequences[AA] = vectorize_AAs(Sequences[AA])
        Sequences[AA] = AddZeros(Sequences[AA], max_length)
    NewTensor = torch.from_numpy(np.array(Sequences))
    return NewTensor

### Runs quickly for Yeast, about 2 minutes for Human data. 

In [38]:
TrainSeqsYeast = TransformAAsToTensor(train_seqsYeast)
ValidSeqsYeast = TransformAAsToTensor(valid_seqsYeast)
TestSeqsYeast = TransformAAsToTensor(test_seqsYeast)
TestSeqsYeast


   13     3     7  ...      0     0     0
   13     4     1  ...      0     0     0
   13     9    11  ...      0     0     0
       ...          ⋱          ...       
   13    12     4  ...      0     0     0
   13    22    19  ...      0     0     0
   13    12    13  ...      0     0     0
[torch.DoubleTensor of size 206x1592]

In [39]:
TrainSeqsHuman = TransformAAsToTensor(train_seqsHuman)
ValidSeqsHuman = TransformAAsToTensor(valid_seqsHuman)
TestSeqsHuman = TransformAAsToTensor(test_seqsHuman)
TestSeqsHuman


   13    18    12  ...      0     0     0
   13    13     5  ...      0     0     0
   13    11     8  ...      0     0     0
       ...          ⋱          ...       
   13    18     1  ...      0     0     0
   24     1    18  ...      0     0     0
   13     1    12  ...      0     0     0
[torch.DoubleTensor of size 1647x5090]

# Train Model

## 0) Hyperparameter setting

In [40]:
learning_rate = 0.001
vocab_size = 26 # number words in the vocabulary base
emb_dim = 50 # dimension for n-gram embedding
num_epochs = 5 # number epoch to train
batch_size = 26

## 1) Get batch data method:

In [41]:
import random

def batch_iter(TrainSeqs, yTrain, TrainSeqsLength, batch_size):
    start = -1 * batch_size
    dataset_size = TrainSeqs.size()[0]
    order = list(range(dataset_size))
    random.shuffle(order)

    while True:
        start += batch_size
        if start > dataset_size - batch_size:
            # Start another epoch.
            start = 0
            random.shuffle(order)
        batch_indices = order[start:start + batch_size]
        batch_indices_tensor = torch.LongTensor(batch_indices)
        batch_train = TrainSeqs[batch_indices_tensor].type(torch.LongTensor)
        batch_train_labels = yTrain[batch_indices_tensor]
        length_batch = TrainSeqsLength[batch_indices_tensor]
        yield [Variable(batch_train), Variable(batch_train_labels), Variable(length_batch)]
        

## 2) FastText class:

In [42]:
class FastText(nn.Module):
    """
    FastText model
    """
       
    def __init__(self, vocab_size, emb_dim, num_labels):
       
        """
        @param vocab_size: size of the vocabulary. 
        @param emb_dim: size of the word embedding
        """
        super(FastText, self).__init__()

        self.num_labels = num_labels
        self.embed = nn.Embedding(vocab_size+1, emb_dim, padding_idx=0)
        self.linear = nn.Linear(emb_dim,num_labels)
        self.init_weights()
    
    def forward(self, data, length):
        """
        @param data: matrix of size (batch_size, max_sentence_length). Each row in data represents a 
            review that is represented using n-gram index. Note that they are padded to have same length.
        @param length: an int tensor of size (batch_size), which represents the non-trivial (excludes padding)
            length of each sentences in the data.
        """
        data = data.type(torch.LongTensor)
        out = self.embed(data)
        out = torch.sum(out, dim=1)
        out = out / length.view(-1,1).float()
            
        out = self.linear(out)
        return nn.functional.sigmoid(out)
    
    def init_weights(self):
        initrange = 0.1
        lin_layers = [self.linear]
        em_layer = [self.embed]
     
        for layer in lin_layers+em_layer:
            layer.weight.data.uniform_(-initrange, initrange)
            if layer in lin_layers:
                layer.bias.data.fill_(0)
                

## 3) Evaluation Metric

In [44]:
from sklearn.metrics import precision_score,recall_score,average_precision_score

def round_manual(data, threshold):
    return (data >= threshold).astype(int)

def calculate_accuracy(predicted, actuals, num_labels, threshold):
    """
    @param predicted: data type = Variable
    @param actuals: data type = Variable
    @param num_labels: no of go terms
    @return: accuracy measure
    """
    predicted = round_manual(predicted.data.numpy(), threshold)
    total_predictions = actuals.size()[0]
    accuracy = np.sum(predicted==actuals.data.numpy())/(total_predictions*num_labels)
    return accuracy

def average_precision(predicted, actuals, threshold):
    """
    @param predicted: data type = Variable
    @param actuals: data type = Variable
    @param num_labels: no of go terms
    @return: precision
    """
    actuals = actuals.data.numpy()
    predicted = round_manual(predicted.data.numpy(), threshold)
    non_zero_go_terms = np.count_nonzero((np.sum(actuals, axis=0)!=0).astype(int))
    return np.sum(precision_score(actuals, predicted, average=None))/non_zero_go_terms
    
def average_recall(predicted, actuals, threshold):
    """
    @param predicted: data type = Variable
    @param actuals: data type = Variable
    @param num_labels: no of go terms
    @return: recall
    """
    actuals = actuals.data.numpy()
    predicted = round_manual(predicted.data.numpy(), threshold)
    non_zero_go_terms = np.count_nonzero((np.sum(actuals, axis=0)!=0).astype(int))
    return np.sum(recall_score(actuals, predicted, average=None))/non_zero_go_terms

def F_score(precision_score, recall_score):
    return ((2*precision_score*recall_score) / (precision_score + recall_score))
    

## 4) Early stop condition and training stage

In [56]:
def early_stop(val_acc_history, t=2, required_progress=0.001):
    """
    Stop the training if there is no non-trivial progress in k steps
    @param val_acc_history: a list contains all the historical validation acc
    @param required_progress: the next acc should be higher than the previous by 
        at least required_progress amount to be non-trivial
    @param t: number of training steps 
    @return: a boolean indicates if the model should earily stop
    """
    # TODO: add your code here
    
    cnt = 0 # initialize the count --> to store count of cases where difference in
                                    #  accuracy is less than required progress.
    
    if(len(val_acc_history) > 0): # if list has size > 0 
        for i in range(t): # start the loop
            index = len(val_acc_history) - (i+1) # start from the last term in list and move to the left
            if (index >= 1): # to check if index != 0 --> else we can't compare to previous value
                if ((val_acc_history[index] - val_acc_history[index-1]) < required_progress):
                    cnt += 1 # increase the count value
                else:
                    break # break if difference is grea-ter 
    
    if(cnt != t): # if count is equal to t, return True
        return False
    else:
        return True

    
def train_test(valid_sequences, valid_label, valid_length, num_epochs, optimizer, data_iter, model, training_length, threshold):
    losses = []
    total_batches = int(training_length/ batch_size) #375
    validation_acc_history = []
    calculated_f_score = None
    
    for epoch in range(1, num_epochs+1):
        stop_training = False
        for i, (train_data, train_labels, length_batch) in enumerate(data_iter):
            model.train()
            model.zero_grad()
            outputs = model(train_data, length_batch)
            loss = criterion(outputs, train_labels.float())
            losses.append(loss.data[0])
            loss.backward()
            optimizer.step()

            model.eval()
            val_outputs = model(Variable(valid_sequences), Variable(valid_length))
            val_accuracy = calculate_accuracy(val_outputs, Variable(valid_label), num_labels, threshold)
            validation_acc_history.append(val_accuracy)
            
            # calculating precision and recall based on CAFA's definition
            ave_precision = average_precision(val_outputs, Variable(valid_label), threshold)
            ave_recall = average_recall(val_outputs, Variable(valid_label), threshold)
            f_score = F_score(ave_precision, ave_recall)
            calculated_f_score = f_score
            #stop_training = early_stop(validation_acc_history)
            
            if stop_training:
                print("earily stop triggered")
                break
            if (i+1) % 80 == 0:
                print('Epoch: [{0}/{1}], Step: [{2}/{3}], Train loss: {4}, F_Score: {5}, Validation Acc:{6}'.format( 
                           epoch, num_epochs, i+1, total_batches, np.mean(losses)/(total_batches*epoch), f_score, val_accuracy))
        if stop_training == True:
            break
            
    return calculated_f_score

## 5) Prediction Test Data performance

In [57]:
def accuracy_on_test_set(model, test_input_seq, test_seq_length, test_output_labels, num_labels, threshold):
    test_input_seq = Variable(test_input_seq)
    test_seq_length = Variable(test_seq_length)
    test_output_labels = Variable(test_output_labels)
    predicted = model(test_input_seq, test_seq_length)
    accuracy_on_test_set = calculate_accuracy(predicted, test_output_labels, num_labels, threshold)
    return accuracy_on_test_set

## 6) Model training and test performance

### PART I - Human Data:

In [54]:
data_size = len(train_seqsHuman) #9751
num_labels = go_termsHuman.shape[0] #147

model = FastText(vocab_size, emb_dim, num_labels)
criterion = nn.MultiLabelSoftMarginLoss()  
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate) 

In [62]:
data_iter = batch_iter(TrainSeqsHuman, yTrainHuman, train_seqsHuman_length, batch_size)

num_epochs = 1
threshold = [0.1, 0.2, 0.3, 0.4, 0.5, 0,6, 0.7, 0.8, 0.9]
f_score = []

# Model Training
for i in threshold:
    model = FastText(vocab_size, emb_dim, num_labels)
    f = train_test(ValidSeqsHuman, yValidHuman, valid_seqsHuman_length, num_epochs, optimizer, data_iter, model, data_size, i)
    f_score.append(f)
    
# Prediction on test set
print("Test Data accuracy for human protein prediction is", accuracy_on_test_set(model, TestSeqsHuman, test_seqsHuman_length, yTestHuman, num_labels, threshold))

Epoch: [1/1], Step: [80/375], Train loss: 0.0026038429776827493, F_Score: 0.08474049982457527, Validation Acc:0.05384535627735982
Epoch: [1/1], Step: [160/375], Train loss: 0.0026046042690674462, F_Score: 0.08474049982457527, Validation Acc:0.05384535627735982
Epoch: [1/1], Step: [240/375], Train loss: 0.0026044170008765327, F_Score: 0.08474049982457527, Validation Acc:0.05384535627735982
Epoch: [1/1], Step: [320/375], Train loss: 0.0026046884228785832, F_Score: 0.08474049982457527, Validation Acc:0.05384535627735982
Epoch: [1/1], Step: [400/375], Train loss: 0.002604691073894501, F_Score: 0.08474049982457527, Validation Acc:0.05384535627735982
Epoch: [1/1], Step: [480/375], Train loss: 0.0026048032104969026, F_Score: 0.08474049982457527, Validation Acc:0.05384535627735982


KeyboardInterrupt: 

In [63]:
plt.plot(threshold, f_score)
plt.xlabel('threshold')
plt.ylabel('F_Score')
plt.show()

NameError: name 'plt' is not defined

### PART II - Yeast Data:

In [51]:
data_size = len(train_seqsYeast) #3447
num_labels = go_termsYeast.shape[0] #26

num_epochs = 1
threshold = [0.1, 0.2, 0.3, 0.4, 0.5, 0,6, 0.7, 0.8, 0.9]

model = FastText(vocab_size, emb_dim, num_labels)
data_iter = batch_iter(TrainSeqsYeast, yTrainYeast, train_seqsYeast_length, batch_size)
f_score = []

# Model Training
for i in threshold:
    model = FastText(vocab_size, emb_dim, num_labels)
    f = train_test(ValidSeqsYeast, yValidYeast, valid_seqsYeast_length, num_epochs, optimizer, data_iter, model, data_size, threshold)
    f_score.append(f)
    

# Prediction on test set
print("Test Data accuracy for yeast protein prediction is", accuracy_on_test_set(model, TestSeqsYeast, test_seqsYeast_length, yTestYeast, num_labels, threshold))

earily stop triggered
Test Data accuracy for yeast protein prediction is 0.0905526512323
