In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial


In [2]:
#%% Load dataset and cuda
# Read the KIBA table and pick the compute device; report GPU memory when available.
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)
cuda = tc.device('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # torch.cuda.memory_cached is the deprecated name of memory_reserved;
    # prefer the current name and fall back only on releases that lack it.
    _memory_reserved = getattr(torch.cuda, 'memory_reserved', None)
    if _memory_reserved is None:
        _memory_reserved = torch.cuda.memory_cached

    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(_memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Keep the first 2**16 + 2**15 + 2**13 rows of the table and pull out the
# three columns used downstream. `.loc` label slices include the end label,
# hence the "- 1".
_last_label = (2**16 + 2**15) + (2**13) - 1
_head = dataset.loc[:_last_label]

protein = _head["uniprotID"]
ligand = _head["chemblID"]
kiba = list(_head['KIBA'])

# The full table is no longer needed once the columns are extracted.
del dataset, _head, _last_label


In [4]:
#%% protein sequence load
# Build the (n_samples, 1400) matrix of encoded protein sequences.
#
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# dictionaries that come from a trusted source.
with open('datasets/dictionaries/prt_lstm.txt', 'rb') as f:
    # The context manager closes the file even if unpickling fails.
    seq_voc, _ = pickle.load(f)

# Only the first 1400 positions of each encoding are kept, so allocate just
# those columns. Allocating all 4128 columns and slicing afterwards returns a
# view that keeps the whole (much larger) buffer alive.
sequence = np.zeros(((2**16+2**15)+(2**13), 1400))
for i, s in enumerate(protein):
    # np.ravel flattens the stored encoding before truncation.
    # Presumably each entry is a 1-D encoding of length 4128 - TODO confirm.
    sequence[i] = np.ravel(seq_voc[s])[:1400]


In [5]:
#%% ligand smiles load
# Build the (n_samples, 100) matrix of encoded ligand SMILES strings.
#
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# dictionaries that come from a trusted source.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:
    # The context manager closes the file even if unpickling fails.
    smi_dic = pickle.load(f)

# Only the first 100 positions of each encoding are kept, so allocate just
# those columns instead of 590 and slicing (the slice would be a view that
# keeps the full buffer alive).
smileseq = np.zeros(((2**16+2**15)+(2**13), 100))
for i, e in enumerate(ligand):
    # np.ravel flattens the stored encoding before truncation.
    # Presumably each entry is a 1-D encoding of length 590 - TODO confirm.
    smileseq[i] = np.ravel(smi_dic[e])[:100]


In [6]:
#%% dataset zip
# Pair every (protein encoding, ligand encoding, KIBA score) triple, shuffle
# once, then cut the result into a training part and a validation part.
n_train = 2**16 + 2**15     # training samples
n_valid = 2**13             # validation samples
split_seed = 0              # fixed seed so the train/validation split is reproducible

revised_dataset = list(zip(sequence, smileseq, kiba))
# Without random_state every kernel restart produced a different split, so
# losses from separate runs were not comparable.
shuffled_dataset = shuffle(revised_dataset, random_state=split_seed)
del revised_dataset

trainset = shuffled_dataset[:n_train]
validset = shuffled_dataset[n_train:n_train + n_valid]

del shuffled_dataset


In [7]:
#%% Make collate func.
def collate(samples, device=None):
    """Batch a list of (protein_encoding, ligand_encoding, label) samples.

    Args:
        samples: sequence of 3-tuples; the first two items are equal-length
            1-D index encodings (lists or arrays), the third is a scalar label.
        device: target device for the returned tensors. Defaults to CUDA when
            it is available and to the CPU otherwise, so the function no
            longer crashes on machines without a GPU.

    Returns:
        Tuple ``(proteins, ligands, labels)``: two int64 tensors of shape
        (batch, length) and one float32 tensor of shape (batch,), all on
        ``device``.
    """
    if device is None:
        device = tc.device('cuda' if tc.cuda.is_available() else 'cpu')

    sequences, smileseq, labels = zip(*samples)

    # Stack into one ndarray first: building a tensor from a Python list of
    # ndarrays converts element by element and is very slow.
    proteins = tc.tensor(np.asarray(sequences), dtype=tc.long, device=device)
    ligands = tc.tensor(np.asarray(smileseq), dtype=tc.long, device=device)
    # float32 is stated explicitly so the labels match the model output dtype
    # regardless of how the label scalars are typed.
    targets = tc.tensor(labels, dtype=tc.float32, device=device)
    return proteins, ligands, targets

In [19]:
#%% learning module declaration
class Regressor(nn.Module):
    """Two-branch 1-D CNN that regresses one score from a protein/ligand pair.

    Each branch embeds integer token indices, applies three Conv1d/BatchNorm/
    ReLU stages and max-pools over the whole remaining length, yielding 96
    features. The two 96-d vectors are concatenated (192 features), passed
    through a five-layer MLP and projected to a single output value.

    The global pooling uses ``AdaptiveMaxPool1d(1)``. For the lengths the
    notebook uses (1400 protein / 100 ligand tokens) this equals the former
    fixed kernels 1379 (= 1400 - 3 - 7 - 11) and 85 (= 100 - 3 - 5 - 7), and it
    also accepts other input lengths. Pooling layers hold no parameters, so
    parameter names and count are unchanged.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # initialise nn.Module before registering sub-modules

        # ---- protein branch: indices in [0, 21) -> 10-d embeddings ----
        self.prt_emlayer = nn.Embedding(21, 10)

        self.prt_cv1dlayers = nn.Sequential(
            nn.Conv1d(10, 32, kernel_size=4),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=8),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.Conv1d(64, 96, kernel_size=12),
            nn.BatchNorm1d(num_features=96),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),     # global max over the sequence axis
        )

        # ---- ligand branch: indices in [0, 64) -> 10-d embeddings ----
        self.lgn_emlayer = nn.Embedding(64, 10)

        self.lgn_cv1dlayers = nn.Sequential(
            nn.Conv1d(10, 32, kernel_size=4),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=6),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.Conv1d(64, 96, kernel_size=8),
            nn.BatchNorm1d(num_features=96),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(1),     # global max over the sequence axis
        )

        # ---- joint MLP over the concatenated 96 + 96 features ----
        self.mlplayers = nn.Sequential(
            nn.Linear(192, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
        )

        self.regress = nn.Linear(512, 1)    # regression head

    def forward(self, prt_seq, lgn_seq):
        """Predict one value per sample.

        Args:
            prt_seq: integer tensor (batch, protein_length) with values in
                [0, 21); protein_length must be at least 22.
            lgn_seq: integer tensor (batch, ligand_length) with values in
                [0, 64); ligand_length must be at least 16.

        Returns:
            Float tensor of shape (batch, 1) on the same device as the model.
        """
        # Embedding gives (batch, length, channels); Conv1d wants channels first.
        p = self.prt_emlayer(prt_seq).permute(0, 2, 1)
        # Squeeze only the pooled length axis: a bare squeeze() would also
        # drop the batch axis when batch == 1 and break the concatenation.
        p = self.prt_cv1dlayers(p).squeeze(-1)

        l = self.lgn_emlayer(lgn_seq).permute(0, 2, 1)
        l = self.lgn_cv1dlayers(l).squeeze(-1)

        # No explicit .cuda(): tensors stay on whatever device the module and
        # its inputs already live on, so the model also runs on the CPU.
        cat = tc.cat((p, l), dim=1)
        out = self.mlplayers(cat)

        return self.regress(out)

In [20]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
hp_d = {
    'batch_size': 256,
    'num_epochs': 300,
    'init_learning_rate': 10 ** -3.70183,
    'eps': 10 ** -8.39981,
    'weight_decay': 10 ** -3.59967,
}

In [22]:
#%% training and validation
# Train for hp_d['num_epochs'] epochs and, after every epoch, record the mean
# per-batch MSE on the training set and on the validation set.
#
# NOTE(review): shuffle=False means the training batches are identical in every
# epoch (the data was shuffled once when the split was made). Left unchanged to
# preserve the existing training behavior - confirm this is intended.
tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(validset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

# `device` is chosen in the dataset/CUDA cell; using it avoids a hard-coded 'cuda:0'.
model = Regressor().to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
loss_func = nn.MSELoss(reduction='mean')

# The tuned learning rate / eps / weight decay stored in hp_d are currently NOT
# used; the alternative optimizer is kept here for reference:
# optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
#     weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])
optimizer = optim.Adam(model.parameters(), lr=0.001)

print('tr_var:', np.var(np.array([s[2] for s in trainset])))
print('va_var:', np.var(np.array([s[2] for s in validset])))
print('total params:', total_params)

tr_epoch_losses = []
va_epoch_losses = []

start = time.time()

for epoch in range(hp_d['num_epochs']):
    # ---- training pass ----
    model.train()
    tr_epoch_loss = 0

    for seq, smi, label in tr_data_loader:
        # No-op when collate already placed the batch on `device`.
        seq, smi, label = seq.to(device), smi.to(device), label.to(device)

        prediction = model(seq, smi).view(-1)
        loss = loss_func(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_epoch_loss += loss.item()

    tr_epoch_loss /= len(tr_data_loader)    # mean over batches
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
    tr_epoch_losses.append(tr_epoch_loss)

    # ---- validation pass ----
    model.eval()
    va_epoch_loss = 0

    # Gradients are not needed for evaluation; disabling them avoids building
    # an autograd graph for every validation batch.
    with torch.no_grad():
        for seq, smi, label in va_data_loader:
            seq, smi, label = seq.to(device), smi.to(device), label.to(device)

            prediction = model(seq, smi).view(-1)
            va_epoch_loss += loss_func(prediction, label).item()

    va_epoch_loss /= len(va_data_loader)    # mean over batches
    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
    va_epoch_losses.append(va_epoch_loss)

end = time.time()
print('time elapsed:', end-start)

tr_var: 0.6985213481988329
va_var: 0.692295664014448
total params: 3249107
Training epoch 0, loss 7.4863
Validation epoch 0, loss 0.6014
Training epoch 1, loss 0.5044
Validation epoch 1, loss 0.4548
Training epoch 2, loss 0.4014
Validation epoch 2, loss 0.3718
Training epoch 3, loss 0.3624
Validation epoch 3, loss 0.3475
Training epoch 4, loss 0.3451
Validation epoch 4, loss 0.3411
Training epoch 5, loss 0.3221
Validation epoch 5, loss 0.4122
Training epoch 6, loss 0.3136
Validation epoch 6, loss 0.3289
Training epoch 7, loss 0.3023
Validation epoch 7, loss 0.3048
Training epoch 8, loss 0.2870
Validation epoch 8, loss 0.2950
Training epoch 9, loss 0.2701
Validation epoch 9, loss 0.2999
Training epoch 10, loss 0.2545
Validation epoch 10, loss 0.2621
Training epoch 11, loss 0.2437
Validation epoch 11, loss 0.3270
Training epoch 12, loss 0.2373
Validation epoch 12, loss 0.2529
Training epoch 13, loss 0.2272
Validation epoch 13, loss 0.2616
Training epoch 14, loss 0.2218
Validation epoch 1

Validation epoch 126, loss 0.1641
Training epoch 127, loss 0.0593
Validation epoch 127, loss 0.1623
Training epoch 128, loss 0.0631
Validation epoch 128, loss 0.1550
Training epoch 129, loss 0.0655
Validation epoch 129, loss 0.1580
Training epoch 130, loss 0.0608
Validation epoch 130, loss 0.1672
Training epoch 131, loss 0.0592
Validation epoch 131, loss 0.1668
Training epoch 132, loss 0.0549
Validation epoch 132, loss 0.1587
Training epoch 133, loss 0.0534
Validation epoch 133, loss 0.1669
Training epoch 134, loss 0.0531
Validation epoch 134, loss 0.1755
Training epoch 135, loss 0.0561
Validation epoch 135, loss 0.1649
Training epoch 136, loss 0.0551
Validation epoch 136, loss 0.1809
Training epoch 137, loss 0.0553
Validation epoch 137, loss 0.1896
Training epoch 138, loss 0.0555
Validation epoch 138, loss 0.1890
Training epoch 139, loss 0.0567
Validation epoch 139, loss 0.1912
Training epoch 140, loss 0.0538
Validation epoch 140, loss 0.1996
Training epoch 141, loss 0.0508
Validation

Training epoch 251, loss 0.0225
Validation epoch 251, loss 0.1756
Training epoch 252, loss 0.0254
Validation epoch 252, loss 0.1593
Training epoch 253, loss 0.0278
Validation epoch 253, loss 0.1679
Training epoch 254, loss 0.0328
Validation epoch 254, loss 0.2071
Training epoch 255, loss 0.0374
Validation epoch 255, loss 0.1852
Training epoch 256, loss 0.0342
Validation epoch 256, loss 0.1653
Training epoch 257, loss 0.0275
Validation epoch 257, loss 0.1783
Training epoch 258, loss 0.0233
Validation epoch 258, loss 0.1596
Training epoch 259, loss 0.0213
Validation epoch 259, loss 0.1518
Training epoch 260, loss 0.0208
Validation epoch 260, loss 0.1627
Training epoch 261, loss 0.0221
Validation epoch 261, loss 0.1824
Training epoch 262, loss 0.0257
Validation epoch 262, loss 0.1959
Training epoch 263, loss 0.0328
Validation epoch 263, loss 0.1737
Training epoch 264, loss 0.0365
Validation epoch 264, loss 0.1629
Training epoch 265, loss 0.0340
Validation epoch 265, loss 0.1656
Training e

In [11]:
#%%
# Persist both per-epoch loss curves as NumPy files.
for _stem, _curve in (
    ('ModifiedDeepDTA_v6_2_tr_losses', tr_epoch_losses),
    ('ModifiedDeepDTA_v6_2_va_losses', va_epoch_losses),
):
    np.save(_stem, _curve)

In [23]:
# Lowest per-epoch mean validation loss recorded in va_epoch_losses.
min(va_epoch_losses)

0.15121634560637176