In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial


In [2]:
#%% Load dataset and cuda
# Read the full KIBA table; later cells slice the columns they need from it.
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)

# `cuda` is an unconditional CUDA handle; `device` falls back to the CPU when
# no GPU is visible.
cuda = tc.device('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # torch.cuda.memory_cached is deprecated in favour of memory_reserved;
    # prefer the new name and only fall back on PyTorch builds that lack it.
    reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached

    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(reserved_bytes(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Keep rows labelled 0 .. 2**16 + 2**15 + 2**13 - 1 of the table.
# `.loc` slicing is label-based and inclusive of the end label.
# NOTE(review): this yields exactly 106,496 rows only if `dataset` still has
# its default 0..n-1 index -- confirm.
last_label = (2**16+2**15)+(2**13)-1
protein, ligand = (
    dataset.loc[:last_label, column] for column in ("uniprotID", "chemblID")
)
kiba = list(dataset.loc[:last_label, 'KIBA'])

# The full frame is no longer needed once the three columns are extracted.
del dataset


In [4]:
#%% protein sequence load
# Load the protein vocabulary: a pickled 2-tuple whose first element maps a
# uniprot ID to its encoded sequence (the second element is discarded).
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# it on dictionaries produced by a trusted source.
with open('datasets/dictionaries/prt_lstm.txt', 'rb') as f:
    seq_voc, _ = pickle.load(f)

# One row per sample, 4128 columns, filled from the vocabulary in sample order.
sequence = np.zeros(((2**16+2**15)+(2**13), 4128))
for i, s in enumerate(protein):
    sequence[i] = seq_voc[s]

# Only the first 1400 positions of every encoded protein are kept.
sequence = sequence[:, :1400]


In [5]:
#%% ligand smiles load
# Load the ligand dictionary, which maps a ChEMBL ID to its encoded SMILES.
# The context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load executes arbitrary code from the file -- only use
# it on dictionaries produced by a trusted source.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:
    smi_dic = pickle.load(f)

# One row per sample, 590 columns, filled from the dictionary in sample order.
smileseq = np.zeros(((2**16+2**15)+(2**13), 590))
for i, e in enumerate(ligand):
    smileseq[i] = smi_dic[e]

# Only the first 100 positions of every encoded SMILES string are kept.
smileseq = smileseq[:, :100]


In [16]:
#%% dataset zip
# Sizes of the two splits taken from the shuffled sample list.
n_train = 2**16+2**15
n_valid = 2**13

# Pair every protein row with its ligand row and KIBA label, then shuffle once.
# NOTE(review): no random_state is passed to shuffle, so the split differs
# between runs unless a seed is set elsewhere.
shuffled_dataset = shuffle(list(zip(sequence, smileseq, kiba)))

trainset = shuffled_dataset[:n_train]
validset = shuffled_dataset[n_train:n_train + n_valid]

del shuffled_dataset


In [17]:
#%% Make collate func.
def collate(samples):
    """Batch ``(protein_seq, smiles_seq, label)`` samples for a DataLoader.

    Parameters
    ----------
    samples : list of 3-tuples
        Each tuple holds an encoded protein sequence, an encoded SMILES
        sequence and a label. All sequences of one kind must share a length.

    Returns
    -------
    tuple of three tensors
        ``(proteins, smiles, labels)``. The two sequence batches are int64;
        the labels keep the dtype ``tc.tensor`` infers. All three are on the
        GPU when CUDA is available and on the CPU otherwise.
    """
    sequences, smileseq, labels = map(list, zip(*samples))

    # Use the GPU when present, but keep the function usable on CPU-only hosts.
    target = tc.device('cuda' if tc.cuda.is_available() else 'cpu')

    # Stack the rows with numpy first: building a tensor straight from a
    # Python list of ndarrays converts element by element and is very slow.
    seq_batch = tc.as_tensor(np.asarray(sequences), dtype=tc.long)
    smi_batch = tc.as_tensor(np.asarray(smileseq), dtype=tc.long)
    label_batch = tc.tensor(labels)

    return seq_batch.to(target), smi_batch.to(target), label_batch.to(target)

In [18]:
#%%
class Conv1d(nn.Module):
    """Bias-free 1-D convolution followed by batch normalisation.

    NOTE(review): ``kernel_size`` is accepted but never used -- the
    convolution is always built with a kernel of width 9. Callers in this
    notebook pass other values (8, 10, 12) and still get width 9. Confirm the
    intent before honouring the argument, because downstream pooling sizes
    were chosen against the width-9 behaviour.

    Parameters
    ----------
    in_planes, out_planes : int
        Input / output channel counts.
    kernel_size : int
        Currently ignored (see note above).
    stride : int
        Convolution stride.
    padding : int, default 0
        Zero padding added to both ends of the sequence.
    """

    def __init__(self, in_planes, out_planes, kernel_size, stride, padding=0):
        super().__init__()

        self.conv = nn.Conv1d(
            in_channels=in_planes,
            out_channels=out_planes,
            kernel_size=9,      # fixed width; the `kernel_size` argument is ignored
            stride=stride,
            padding=padding,
            bias=False,         # verify bias false
        )
        self.bn = nn.BatchNorm1d(
            num_features=out_planes,
            eps=0.001,          # value found in tensorflow
            momentum=0.1,       # default pytorch value
            affine=True,
        )

    def forward(self, x):
        """Apply the convolution, then batch-normalise its output."""
        return self.bn(self.conv(x))
    
    
class Block1(nn.Module):
    """Stem block: ``Conv1d`` (32 -> 32 channels), ReLU, then ``MaxPool1d(2)``.

    NOTE(review): ``kernel_size=10`` is passed below, but the ``Conv1d``
    wrapper in this notebook builds its convolution with a fixed width of 9,
    so the value 10 has no effect as the code stands.
    """

    def __init__(self):
        super().__init__()

        stages = [
            Conv1d(32, 32, kernel_size=10, stride=1),
            nn.ReLU(),
            nn.MaxPool1d(2),
        ]
        self.conv = nn.Sequential(*stages)

    def forward(self, x):
        """Run the input through the conv / ReLU / pooling stack."""
        return self.conv(x)
    
    
class IBlock(nn.Module):
    """Two-branch block whose branch outputs are summed and passed through ReLU.

    ``branch0`` is a single unpadded ``Conv1d``. ``branch1`` is an unpadded
    ``Conv1d``, a ReLU, and a second ``Conv1d`` with ``padding=4``. With the
    width-9 kernel both branches shorten the sequence by 8 positions, so
    their outputs have matching shapes and can be added element-wise.

    Parameters
    ----------
    in_planes : int
        Number of input channels.
    out_planes : int
        Number of output channels produced by both branches.
    """

    def __init__(self, in_planes, out_planes):
        super(IBlock, self).__init__()

        self.branch0 = nn.Sequential(
            Conv1d(in_planes, out_planes, kernel_size=9, stride=1)
            )

        self.branch1 = nn.Sequential(
            # The comma after this layer was missing, which made the whole
            # class definition a SyntaxError.
            Conv1d(in_planes, in_planes, kernel_size=9, stride=1),
            nn.ReLU(),
            Conv1d(in_planes, out_planes, kernel_size=9, stride=1, padding=4)
            )

        self.relu = nn.ReLU()

    def forward(self, x):
        """Return ``relu(branch0(x) + branch1(x))``."""
        x0 = self.branch0(x)
        x1 = self.branch1(x)
        x2 = x0 + x1
        out = self.relu(x2)

        return out

In [28]:
#%% Declare the learning module
class Regressor(nn.Module):
    """Two-branch CNN that regresses one score from a protein/ligand pair.

    Each input is embedded into 10 dimensions. A ``Conv2d`` whose kernel spans
    the whole embedding width collapses that axis, and a stack of 1-D
    convolutions with pooling produces 96 features per branch. The two
    96-vectors are concatenated (192) and fed to an MLP with a final linear
    layer of width 1.

    NOTE(review): the pooling sizes (``MaxPool1d(339)`` and ``MaxPool1d(10)``)
    reduce each branch to length 1 only for the input lengths prepared earlier
    in this notebook (1400 protein positions, 100 SMILES positions) combined
    with the fixed width-9 kernel of ``Conv1d``. The ``kernel_size`` values
    8 and 12 passed below are ignored by that wrapper.
    NOTE(review): ``Conv1d`` already ends with a BatchNorm1d, so each
    ``nn.BatchNorm1d`` that follows it in ``prt_cv1dlayers`` normalises a
    second time -- confirm this is intended.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # set up nn.Module before adding layers

        # ---- protein branch ------------------------------------------------
        self.prt_emlayer = nn.Embedding(21, 10)

        self.prt_cv2dlayer = nn.Sequential(
                        nn.Conv2d(1, 32, kernel_size = (4, 10)),
                        nn.BatchNorm2d(num_features = 32),
                        nn.ReLU()    # batch, channel, input_len, embedding
                        )

        self.prt_cv1dlayers = nn.Sequential(
                        Conv1d(32, 64, kernel_size = 8, stride=1),
                        nn.BatchNorm1d(64),
                        nn.ReLU(),
                        nn.MaxPool1d(4),
                        Conv1d(64, 96, kernel_size = 12, stride=1),
                        nn.BatchNorm1d(96),
                        nn.ReLU(),
                        nn.MaxPool1d(339)
                        )

        # ---- ligand branch -------------------------------------------------
        self.lgn_emlayer = nn.Embedding(64, 10)

        self.lgn_cv2dlayer = nn.Sequential(
                        nn.Conv2d(1, 32, kernel_size = (2, 10)),
                        nn.BatchNorm2d(num_features = 32),
                        nn.ReLU()
                        )

        self.lgn_cv1dlayers = nn.Sequential(
                        Block1(),
                        IBlock(32, 64),
                        nn.MaxPool1d(2),
                        IBlock(64, 96),
                        nn.MaxPool1d(10)
                        )

        # ---- joint MLP head ------------------------------------------------
        self.mlplayers = nn.Sequential(
                        nn.Linear(192, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
                        nn.Dropout(0.2),
                        nn.Linear(512, 768),
                        nn.BatchNorm1d(768),
                        nn.ReLU(),
                        nn.Dropout(0.2),
                        nn.Linear(768, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU()
                        )

        self.regress = nn.Linear(512, 1)    # regression

    def forward(self, prt_seq, lgn_seq):
        """Predict one value per (protein, ligand) pair.

        Parameters
        ----------
        prt_seq : integer tensor, (batch, protein_len)
            Indices into the 21-entry protein embedding.
        lgn_seq : integer tensor, (batch, smiles_len)
            Indices into the 64-entry ligand embedding.

        Returns
        -------
        Tensor of shape (batch, 1), on the same device as the model.
        """
        # Only the known size-1 axis is squeezed each time. A bare .squeeze()
        # would also drop the batch axis when the batch holds one sample.
        p = self.prt_emlayer(prt_seq)
        p = p.unsqueeze(1)             # add the channel axis Conv2d expects
        p = self.prt_cv2dlayer(p)      # kernel spans the embedding -> last axis is 1
        p = p.squeeze(3)
        p = self.prt_cv1dlayers(p)     # batch, channel(->input_size), seq_len
        p = p.squeeze(2)

        l = self.lgn_emlayer(lgn_seq)
        l = l.unsqueeze(1)
        l = self.lgn_cv2dlayer(l)
        l = l.squeeze(3)
        l = self.lgn_cv1dlayers(l)
        l = l.squeeze(2)

        # Activations already live on the model's device, so no explicit
        # .cuda() transfer is needed and the module also runs on the CPU.
        cat = tc.cat((p, l), dim=1)
        out = self.mlplayers(cat)

        return self.regress(out)

In [29]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
hp_d = {
    'batch_size': 256,
    'num_epochs': 300,

    # Adam settings, written as powers of ten.
    'init_learning_rate': 10 ** -3.70183,
    'eps': 10 ** -8.39981,
    'weight_decay': 10 ** -3.59967,
}

In [30]:
#%% training and validation
# NOTE(review): shuffle=False means every epoch sees the training batches in
# the same order (the sample list was shuffled once when it was built).
# Confirm whether per-epoch reshuffling is wanted.
tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(validset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

# Use the device selected when the dataset was loaded instead of a hard-coded
# 'cuda:0', so the model and the collated batches agree on where they live.
model = Regressor().to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
loss_func = nn.MSELoss(reduction='mean')   # holds no parameters; needs no device move
optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
    weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

# Label variance is the MSE of always predicting the mean, i.e. a baseline
# against which the losses printed below can be read.
print('tr_var:', np.var(np.array([s[2] for s in trainset])))
print('va_var:', np.var(np.array([s[2] for s in validset])))
print('total params:', total_params)

tr_epoch_losses = []
va_epoch_losses = []

start = time.time()

for epoch in range(hp_d['num_epochs']):                          #!! epoch-loop
    # ---- training session --------------------------------------------------
    model.train()
    tr_epoch_loss = 0

    for seq, smi, label in tr_data_loader:                       #!! batch-loop
        prediction = model(seq, smi).view(-1)
        loss = loss_func(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_epoch_loss += loss.item()

    # Mean of the per-batch losses. len(loader) is the batch count, which
    # avoids reusing a loop counter named after the builtin `iter`.
    tr_epoch_loss /= len(tr_data_loader)
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
    tr_epoch_losses.append(tr_epoch_loss)

    # ---- validation session ------------------------------------------------
    model.eval()
    va_epoch_loss = 0

    # No gradients are needed for evaluation. Disabling autograd avoids
    # building a graph for every validation batch.
    with torch.no_grad():
        for seq, smi, label in va_data_loader:                   # batch-loop
            prediction = model(seq, smi).view(-1)
            loss = loss_func(prediction, label)

            va_epoch_loss += loss.item()

    va_epoch_loss /= len(va_data_loader)
    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
    va_epoch_losses.append(va_epoch_loss)

end = time.time()
print('time elapsed:', end-start)

tr_var: 0.6973572077706937
va_var: 0.7063622712020676
total params: 1172179
Training epoch 0, loss 32.1507
Validation epoch 0, loss 1.3180
Training epoch 1, loss 0.9798
Validation epoch 1, loss 0.5799
Training epoch 2, loss 0.7246
Validation epoch 2, loss 0.4902
Training epoch 3, loss 0.5531
Validation epoch 3, loss 0.4416
Training epoch 4, loss 0.4382
Validation epoch 4, loss 0.3714
Training epoch 5, loss 0.3707
Validation epoch 5, loss 0.3367
Training epoch 6, loss 0.3385
Validation epoch 6, loss 0.3696
Training epoch 7, loss 0.3202
Validation epoch 7, loss 0.3385
Training epoch 8, loss 0.3056
Validation epoch 8, loss 0.3563
Training epoch 9, loss 0.2953
Validation epoch 9, loss 0.3316
Training epoch 10, loss 0.2867
Validation epoch 10, loss 0.3216
Training epoch 11, loss 0.2749
Validation epoch 11, loss 0.3060
Training epoch 12, loss 0.2659
Validation epoch 12, loss 0.3150
Training epoch 13, loss 0.2564
Validation epoch 13, loss 0.2818
Training epoch 14, loss 0.2471
Validation epoch

Validation epoch 126, loss 0.2127
Training epoch 127, loss 0.0664
Validation epoch 127, loss 0.2012
Training epoch 128, loss 0.0640
Validation epoch 128, loss 0.2029
Training epoch 129, loss 0.0629
Validation epoch 129, loss 0.2065
Training epoch 130, loss 0.0643
Validation epoch 130, loss 0.1990
Training epoch 131, loss 0.0624
Validation epoch 131, loss 0.2080
Training epoch 132, loss 0.0638
Validation epoch 132, loss 0.2091
Training epoch 133, loss 0.0629
Validation epoch 133, loss 0.2030
Training epoch 134, loss 0.0622
Validation epoch 134, loss 0.2123
Training epoch 135, loss 0.0627
Validation epoch 135, loss 0.2190
Training epoch 136, loss 0.0619
Validation epoch 136, loss 0.2129
Training epoch 137, loss 0.0628
Validation epoch 137, loss 0.2151
Training epoch 138, loss 0.0640
Validation epoch 138, loss 0.1969
Training epoch 139, loss 0.0615
Validation epoch 139, loss 0.2039
Training epoch 140, loss 0.0619
Validation epoch 140, loss 0.1985
Training epoch 141, loss 0.0606
Validation

Training epoch 251, loss 0.0495
Validation epoch 251, loss 0.1969
Training epoch 252, loss 0.0480
Validation epoch 252, loss 0.2062
Training epoch 253, loss 0.0481
Validation epoch 253, loss 0.1922
Training epoch 254, loss 0.0483
Validation epoch 254, loss 0.1960
Training epoch 255, loss 0.0480
Validation epoch 255, loss 0.1936
Training epoch 256, loss 0.0483
Validation epoch 256, loss 0.2061
Training epoch 257, loss 0.0505
Validation epoch 257, loss 0.1986
Training epoch 258, loss 0.0488
Validation epoch 258, loss 0.1880
Training epoch 259, loss 0.0484
Validation epoch 259, loss 0.1982
Training epoch 260, loss 0.0481
Validation epoch 260, loss 0.1993
Training epoch 261, loss 0.0483
Validation epoch 261, loss 0.1999
Training epoch 262, loss 0.0506
Validation epoch 262, loss 0.1973
Training epoch 263, loss 0.0491
Validation epoch 263, loss 0.1965
Training epoch 264, loss 0.0494
Validation epoch 264, loss 0.1972
Training epoch 265, loss 0.0493
Validation epoch 265, loss 0.2013
Training e

In [33]:
#%%
# Persist both per-epoch loss histories, one array file per history.
for out_name, history in (
    ('ModifiedDeepDTA_v5_tr_losses', tr_epoch_losses),
    ('ModifiedDeepDTA_v5_va_losses', va_epoch_losses),
):
    np.save(out_name, history)

In [32]:
# Lowest per-epoch validation loss recorded in va_epoch_losses; shown as the cell output.
min(va_epoch_losses)

0.1879951625596732