In [None]:
!pip install pandas
!pip install seaborn

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import random
import seaborn as sns
from scipy.stats import binom_test
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset
import torch.utils.data

from sklearn.manifold import TSNE

from OneHotLSTM import *

# Seed every random number generator imported by this notebook, not only torch,
# so that any sampling done through `random` or numpy is repeatable as well.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fb83877dd70>

In [2]:
#Load data and embeddings - remember to set the paths
datDir_ligands = './dat/network_input/'
datDir_embedding = './dat/embedding/'

#Ligand tables: tab-separated, no header row, two columns (peptide, allele)
train_ligs = pd.read_csv(
    os.path.join(datDir_ligands,'train.txt'),
    sep='\t',
    names=['Peptide','Allele'],
)
test_ligs = pd.read_csv(
    os.path.join(datDir_ligands,'test.txt'),
    sep='\t',
    names=['Peptide','Allele'],
)

#Embedding table: column 0 is skipped, the remaining columns become the embedding matrix
df_embedding = pd.read_csv(
    os.path.join(datDir_embedding,'aa_embedding_window5_dim100.txt'),
    header=None,
    comment='#',
)
embeddingTensor = torch.tensor(df_embedding.loc[:,1:].to_numpy())

#Hyper-parameters and dataset sizes
vocab_size, EMBEDDING_DIM = embeddingTensor.shape
HIDDEN_DIM = 50
BATCH_SIZE = 10
trainDatNum = 1000
testDatNum = 500

#Include all alleles with more than 300 ligands associated
datAlleles = [allele for allele, ligs in train_ligs.groupby('Allele') if len(ligs)>300]

#Upsample the predefined number of ligands from each allele to build balanced datasets
train_ligs_select_sample = selectXnumOfClass(train_ligs,'Allele',datAlleles,trainDatNum)
test_ligs_select_sample = selectXnumOfClass(test_ligs,'Allele',datAlleles,testDatNum)

training_data,AA2IDX,MHC2IDX,IDX2MHC = initDataWrapper(train_ligs_select_sample,df_embedding,'Allele')
test_data = DF2Dat(test_ligs_select_sample)

#Create data iterators
#NOTE(review): both loaders iterate in dataset order (no shuffle=True) - confirm that
#selectXnumOfClass already randomises the row order, otherwise batches may be class-sorted.
train_iter = torch.utils.data.DataLoader(training_data, batch_size=BATCH_SIZE)
test_iter = torch.utils.data.DataLoader(test_data, batch_size=BATCH_SIZE)


FileNotFoundError: File b'./dat/network_input_uniq_allele_All/train_EL1.txt' does not exist

In [None]:
#Initialize and train models

model = lig2allele(EMBEDDING_DIM,HIDDEN_DIM,len(AA2IDX),len(MHC2IDX),BATCH_SIZE,bidirect=True)
print({p[0]: p[1].requires_grad for p in model.named_parameters()})
loss_function = nn.NLLLoss()
#Only hand the trainable parameters to the optimizer
optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()),lr=0.01)

#Sanity check: push the first batch through the untrained model without tracking gradients
with torch.no_grad():
    for i in train_iter:
        inputs = torch.stack([prepare_sequence(x,AA2IDX) for x in i[0]])
        tag_scores = model(inputs)
        break

numEpochs=300
epochCount = 5 #Evaluate and report every epochCount epochs

train_loss = []
test_loss = []
train_acc = []
test_acc = []
testLoss = []

for epoch in range(numEpochs):
    if epoch%epochCount==0:
        print('################Epoch%d/%d#################'%(epoch,numEpochs))
    for sentence,alleles in train_iter:
        #Step 1. Remember that Pytorch accumulates gradients.
        #We need to clear them out before each instance.
        model.zero_grad()
        #Also, we need to clear out the hidden state of the LSTM,
        #detaching it from its history on the last instance.
        model.hidden = model.init_hidden()
        #Step 2. Get our inputs ready for the network, that is, turn them into
        #tensors of word indices.
        sentence_in = torch.stack([prepare_sequence(x,AA2IDX) for x in sentence])
        #Stop at an incomplete batch (dimension 1 is compared against BATCH_SIZE);
        #the DataLoader only ever yields one as the final batch of an epoch.
        if sentence_in.size()[1]!=BATCH_SIZE:
            break
        targets = prepare_sequence(alleles[0],MHC2IDX)
        #Step 3. Run our forward pass.
        tag_scores=model(sentence_in)
        #Step 4. Compute the loss and gradients, then update the parameters
        #by calling optimizer.step().
        #(The per-batch argmax/IDX2MHC decoding that used to sit here was never
        #read afterwards and has been removed.)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

    if epoch%epochCount==0:
        #NOTE(review): evaluateModel is not passed `model` here - confirm that it
        #evaluates the model trained in this cell.
        train_loss_val,train_targPred,train_accuracy = evaluateModel(train_iter)
        test_loss_val,test_targPred,test_accuracy = evaluateModel(test_iter)
        train_loss.append(train_loss_val)
        test_loss.append(test_loss_val)
        train_acc.append(train_accuracy)
        test_acc.append(test_accuracy)
        print("Train Loss: %.3f, Test Loss: %.3f,Train Accuracy: %.3f, Test Accuracy: %.3f"
              %(train_loss_val,test_loss_val,train_accuracy,test_accuracy))
    


In [None]:
#Print training curve

#Metrics are appended after training epoch indices 0, epochCount, 2*epochCount, ...
#(0-based), i.e. after 1, epochCount+1, ... completed epochs. Building the axis with
#arange gives exactly one x value per recorded point (also when numEpochs is not a
#multiple of epochCount) and avoids passing a float `num` to np.linspace, which
#raises a TypeError on current NumPy.
epochX = np.arange(0,numEpochs,epochCount)+1
#Relative output directory instead of a machine-specific absolute path
figDir_deepLearning = './'

plt.scatter(epochX,train_loss,c='b',label='Train')
plt.scatter(epochX,test_loss,c='r',label='Test')
plt.xlabel('Epochs')
plt.ylabel('NLLLoss')
plt.legend()
plt.show()

plt.scatter(epochX,train_acc,c='b',label='Train')
plt.scatter(epochX,test_acc,c='xkcd:light orange',label='Test')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()

In [None]:
#Get Confusion matrix at different levels of resolution for allele assignment

#Output locations for the saved test-set figures
figDir_deepLearning = './'
eps_fullRes = os.path.join(figDir_deepLearning,'confMat_fullRes.eps')
eps_superType = os.path.join(figDir_deepLearning,'confMat_superType.eps')
eps_locus = os.path.join(figDir_deepLearning,'confMat_locus.eps')

#Coarser views of the (target, prediction) pairs
#(presumably targPredCut trims the allele labels at `idx` - verify against OneHotLSTM)
train_targPred_superType = targPredCut(train_targPred, idx=-3)
test_targPred_superType = targPredCut(test_targPred, idx=-3)

train_targPred_locus = targPredCut(train_targPred, idx=-6)
test_targPred_locus = targPredCut(test_targPred, idx=-6)

#Full resolution
conMat_train_full = confusionMatrix(train_targPred)
conMat_test_full = confusionMatrix(
    test_targPred,
    labels=True,
    ticker=5,
    titleFont=40,
    saveFig=eps_fullRes,
)

#Supertype resolution
conMat_train_super = confusionMatrix(train_targPred_superType)
conMat_test_super = confusionMatrix(
    test_targPred_superType,
    labels=True,
    titleFont=40,
    ticker=5,
    saveFig=eps_superType,
)

#Locus resolution
conMat_train_locus = confusionMatrix(train_targPred_locus)
conMat_test_locus = confusionMatrix(
    test_targPred_locus,
    labels=True,
    titleFont=30,
    saveFig=eps_locus,
)
