In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial
# from bayes_opt import BayesianOptimization


In [2]:
#%% Load dataset and cuda
# Read the KIBA table and remember its row count; later cells size their
# feature matrices and the train/valid split from `datalen`.
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)
# NOTE(review): `cuda` is not used by any cell visible here; it is kept only
# in case other cells rely on it. Prefer `device`, which falls back to CPU.
cuda = tc.device('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # torch.cuda.memory_cached was renamed to memory_reserved; use the new
    # name when the installed PyTorch provides it and fall back otherwise.
    _reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or getattr(torch.cuda, 'memory_cached')
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(_reserved_bytes(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Pull the two identifier columns out as Series and the KIBA scores out as a
# plain list, then release the DataFrame since nothing below reads it again.
protein, ligand = (dataset.loc[:, column] for column in ("uniprotID", "chemblID"))
kiba = list(dataset.loc[:, 'KIBA'])
del dataset


In [4]:
#%% protein sequence load
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load dictionaries that come from a trusted source.
# The context manager guarantees the handle is closed even if unpickling fails.
with open('datasets/dictionaries/prt_tfidf.txt', 'rb') as f:
    tfidf_dic = pickle.load(f)

# One row per dataset entry, looked up by protein ID.
# Presumably each dictionary value is a length-400 vector -- TODO confirm.
tfidf = np.zeros((datalen, 400))
for i, s in enumerate(protein):
    tfidf[i] = tfidf_dic[s]

In [5]:
#%% ligand smiles load
# SECURITY NOTE: pickle.load can execute arbitrary code embedded in the file;
# only load dictionaries that come from a trusted source.
# The context manager guarantees the handle is closed even if unpickling fails.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:
    smi_dic = pickle.load(f)

# One row per dataset entry, looked up by ligand ID.
# Presumably each dictionary value is a length-590 encoding -- TODO confirm.
smileseq = np.zeros((datalen, 590))
for i, e in enumerate(ligand):
    smileseq[i] = smi_dic[e]

# Keep only the first 100 positions of every encoding.
smileseq = smileseq[:, :100]


In [7]:
#%% dataset zip
# Bundle each sample as (protein features, ligand encoding, KIBA label),
# shuffle once, and split 90% / 10% into training and validation sets.
SPLIT_SEED = 42  # fixed seed so Restart & Run All reproduces the same split
revised_dataset = list(zip(tfidf, smileseq, kiba))
shuffled_dataset = shuffle(revised_dataset, random_state=SPLIT_SEED)
del revised_dataset
# Compute the boundary once so both slices are guaranteed to agree.
split_at = int((9/10)*datalen)
trainset = shuffled_dataset[:split_at]
validset = shuffled_dataset[split_at:]


In [8]:
#%% Make collate func.
def collate(samples):
    """Stack a list of (tfidf, smileseq, label) samples into batch tensors.

    Args:
        samples: sequence of 3-tuples as produced by zipping the protein
            feature rows, the ligand encoding rows and the labels.

    Returns:
        A tuple ``(tfidf, smileseq, labels)`` of tensors with dtypes float32,
        int64 and float32 respectively, placed on the GPU when CUDA is
        available and on the CPU otherwise.
    """
    tfidf, smileseq, labels = zip(*samples)
    # Fall back to CPU instead of crashing when no GPU is present.
    target = tc.device('cuda' if tc.cuda.is_available() else 'cpu')
    # Stack into one ndarray first: building a tensor from a Python list of
    # ndarrays converts element by element and is very slow.
    tfidf_batch = tc.tensor(np.asarray(tfidf), dtype=tc.float)
    smiles_batch = tc.tensor(np.asarray(smileseq), dtype=tc.long)
    # Explicit float32 so the labels always match the model output dtype.
    label_batch = tc.tensor(np.asarray(labels), dtype=tc.float)
    return tfidf_batch.to(target), smiles_batch.to(target), label_batch.to(target)

In [9]:
#%% learning module declaration
class Regressor(nn.Module):
    """Two-branch network that regresses one score from a protein/ligand pair.

    Protein branch: a single-channel Conv1d (kernel 32) + BatchNorm + ReLU,
    followed by a linear projection to 256 features. The projection expects
    369 inputs, i.e. a protein vector of length 400 (400 - 32 + 1 = 369).

    Ligand branch: a 64-entry embedding of width 10, three Conv1d stages
    (kernels 4, 6, 8) and a MaxPool1d with window 85, which collapses a
    length-100 sequence (100 - 3 - 5 - 7 = 85) to one value per channel,
    giving 96 features.

    The 256 + 96 = 352 concatenated features go through an MLP and a final
    linear layer with one output.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # inherit nn.Module's methods; layers are set up here

        self.prt_cv1dlayers = nn.Sequential(
            nn.Conv1d(1, 1, kernel_size=32),
            nn.BatchNorm1d(num_features=1),
            nn.ReLU(),
        )

        # NOTE(review): the original passed F.relu as the third positional
        # argument of nn.Linear. That parameter is `bias`, so the function was
        # only used as a truthy flag and no activation was ever applied here.
        # The misleading argument is removed and numerics are unchanged; if a
        # ReLU was intended, add it in forward() and retrain.
        self.fc = nn.Linear(369, 256)

        ######################################################################
        ######################################################################
        self.lgn_emlayer = nn.Embedding(64, 10)

        self.lgn_cv1dlayers = nn.Sequential(
            nn.Conv1d(10, 32, kernel_size=4),
            nn.BatchNorm1d(num_features=32),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=6),
            nn.BatchNorm1d(num_features=64),
            nn.ReLU(),
            nn.Conv1d(64, 96, kernel_size=8),
            nn.BatchNorm1d(num_features=96),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=85),
        )

        self.mlplayers = nn.Sequential(
            nn.Linear(352, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 1024),
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(1024, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
        )

        self.regress = nn.Linear(512, 1)    # regression

    def forward(self, prt_seq, lgn_seq):
        """Predict one value per sample.

        Args:
            prt_seq: float tensor of shape (batch, 400).
            lgn_seq: integer tensor of shape (batch, 100) with indices in
                [0, 64).

        Returns:
            Tensor of shape (batch, 1), on the same device as the inputs.
        """
        p = prt_seq.unsqueeze(1)          # add the channel axis for Conv1d
        p = self.prt_cv1dlayers(p)
        # Squeeze only the channel axis: a bare squeeze() would also drop the
        # batch axis when the batch holds a single sample.
        p = p.squeeze(1)
        p = self.fc(p)

        l = self.lgn_emlayer(lgn_seq)
        l = l.permute(0, 2, 1)            # (batch, positions, emb) -> (batch, emb, positions)
        l = self.lgn_cv1dlayers(l)
        l = l.squeeze(2)                  # drop the pooled length-1 axis only

        # No explicit .cuda(): tensors stay on whatever device the module and
        # its inputs live on, so the model also runs on CPU.
        cat = tc.cat((p, l), dim=1)
        out = self.mlplayers(cat)

        return self.regress(out)

In [10]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
# Batch size and epoch count drive the data loaders and the epoch loop;
# the remaining three entries are passed to the Adam optimizer.
hp_d = {
    'batch_size': 256,
    'num_epochs': 150,
    'init_learning_rate': 10 ** -3.70183,
    'eps': 10 ** -8.39981,
    'weight_decay': 10 ** -3.59967,
}

In [11]:
#%% training and validation
# Trains the Regressor with Adam + MSE for hp_d['num_epochs'] epochs and, after
# every epoch, evaluates it on the validation set. Per-epoch mean batch losses
# are collected in tr_epoch_losses / va_epoch_losses.
tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(validset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

# Use the device chosen in the setup cell instead of a hard-coded 'cuda:0'.
model = Regressor().to(device)
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
loss_func = nn.MSELoss(reduction='mean')
optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
    weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

# Label variance is the MSE of always predicting the mean: a baseline for the losses below.
print('tr_var:', np.var(np.array([s[2] for s in trainset])))
print('va_var:', np.var(np.array([s[2] for s in validset])))
print('total params:', total_params)

tr_epoch_losses = []
va_epoch_losses = []

start = time.time()

for epoch in range(hp_d['num_epochs']):                          #!! epoch-loop
    # training session
    model.train()
    tr_epoch_loss = 0.0

    # Batch names deliberately differ from the module-level `tfidf` array so
    # that re-running earlier cells after this one still works; the builtin
    # `iter` is no longer shadowed either.
    for prt_batch, smi_batch, label_batch in tr_data_loader:     #!! batch-loop
        # The original passed an undefined name `seq` here (NameError on a
        # fresh kernel); the protein features of the batch are what is meant.
        prediction = model(prt_batch, smi_batch).view(-1)
        loss = loss_func(prediction, label_batch)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_epoch_loss += loss.item()

    # Mean over batches; max() guards against an empty loader.
    tr_epoch_loss /= max(len(tr_data_loader), 1)
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
    tr_epoch_losses.append(tr_epoch_loss)

    # ===========================================================================
    # validation session
    model.eval()
    va_epoch_loss = 0.0

    # No gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time.
    with torch.no_grad():
        for prt_batch, smi_batch, label_batch in va_data_loader:  # batch-loop
            prediction = model(prt_batch, smi_batch).view(-1)
            loss = loss_func(prediction, label_batch)

            va_epoch_loss += loss.item()

    va_epoch_loss /= max(len(va_data_loader), 1)
    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
    va_epoch_losses.append(va_epoch_loss)

end = time.time()
print('time elapsed:', end-start)

tr_var: 0.7006010478506419
va_var: 0.6993066206316961
total params: 3415524
Training epoch 0, loss 30.4930
Validation epoch 0, loss 0.9261
Training epoch 1, loss 0.9884
Validation epoch 1, loss 0.4753
Training epoch 2, loss 0.7259
Validation epoch 2, loss 0.3645
Training epoch 3, loss 0.4343
Validation epoch 3, loss 0.3350
Training epoch 4, loss 0.3483
Validation epoch 4, loss 0.3133
Training epoch 5, loss 0.3157
Validation epoch 5, loss 0.2973
Training epoch 6, loss 0.2906
Validation epoch 6, loss 0.2815
Training epoch 7, loss 0.2690
Validation epoch 7, loss 0.2769
Training epoch 8, loss 0.2617
Validation epoch 8, loss 0.2620
Training epoch 9, loss 0.2420
Validation epoch 9, loss 0.2603
Training epoch 10, loss 0.2309
Validation epoch 10, loss 0.2594
Training epoch 11, loss 0.2177
Validation epoch 11, loss 0.2572
Training epoch 12, loss 0.2101
Validation epoch 12, loss 0.2814
Training epoch 13, loss 0.2044
Validation epoch 13, loss 0.2617
Training epoch 14, loss 0.1961
Validation epoch

Validation epoch 126, loss 0.1738
Training epoch 127, loss 0.0528
Validation epoch 127, loss 0.1632
Training epoch 128, loss 0.0517
Validation epoch 128, loss 0.1779
Training epoch 129, loss 0.0508
Validation epoch 129, loss 0.1685
Training epoch 130, loss 0.0437
Validation epoch 130, loss 0.1675
Training epoch 131, loss 0.0468
Validation epoch 131, loss 0.1673
Training epoch 132, loss 0.0525
Validation epoch 132, loss 0.1727
Training epoch 133, loss 0.0481
Validation epoch 133, loss 0.1738
Training epoch 134, loss 0.0457
Validation epoch 134, loss 0.1730
Training epoch 135, loss 0.0445
Validation epoch 135, loss 0.1718
Training epoch 136, loss 0.0471
Validation epoch 136, loss 0.1808
Training epoch 137, loss 0.0564
Validation epoch 137, loss 0.1714
Training epoch 138, loss 0.0515
Validation epoch 138, loss 0.1718
Training epoch 139, loss 0.0484
Validation epoch 139, loss 0.1655
Training epoch 140, loss 0.0474
Validation epoch 140, loss 0.1636
Training epoch 141, loss 0.0442
Validation

In [12]:
#%%
# Persist both per-epoch loss histories as .npy files in the working directory.
loss_histories = {
    'ModifiedDeepDTA_v6_tr_losses': tr_epoch_losses,
    'ModifiedDeepDTA_v6_va_losses': va_epoch_losses,
}
for out_name, history in loss_histories.items():
    np.save(out_name, history)

In [67]:
# Lowest per-epoch mean validation loss collected by the training cell.
# NOTE(review): this cell was last run at execution count 67 and its recorded
# output (0.2131) is higher than validation losses printed in the training log
# (values below 0.17 appear), so the stored output looks stale -- re-run after
# Restart & Run All before quoting it.
min(va_epoch_losses)

0.21312199137173593