In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial
# from bayes_opt import BayesianOptimization


In [2]:
#%% Load dataset and cuda
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)
cuda = tc.device('cuda')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_cached(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
protein = dataset.loc[:, "uniprotID"]    # 5
ligand = dataset.loc[:, "chemblID"]
kiba = list(dataset.loc[:, 'KIBA'])
del dataset


In [4]:
#%% protein sequence load
f = open('datasets/dictionaries/prt_tfidf.txt', 'rb')
tfidf_dic = pickle.load(f)
f.close()

tfidf = np.zeros((datalen, 400))
for i, s in enumerate(protein):
    tfidf[i] = tfidf_dic[s]

In [5]:
#%% protein sequence load
f = open('datasets/dictionaries/prt_lstm.txt', 'rb')
seq_voc, _ = pickle.load(f)
f.close()

sequence = np.zeros((datalen, 4128))
for i, s in enumerate(protein):
    sequence[i] = seq_voc[s]

sequence = sequence[:, :1400]

In [6]:
#%% ligand smiles load
f = open('datasets/dictionaries/lgn_smiecoding.txt', 'rb')
smi_dic = pickle.load(f)
f.close()

smileseq = np.zeros((datalen, 590))
for i, e in enumerate(ligand):
    smileseq[i] = smi_dic[e]

smileseq = smileseq[:, :100]


In [7]:
#%% dataset zip
revised_dataset = list(zip(tfidf, sequence, smileseq, kiba))
shuffled_dataset = shuffle(revised_dataset); del revised_dataset
trainset = shuffled_dataset[:int((9/10)*datalen)]
validset = shuffled_dataset[int((9/10)*datalen):]


In [8]:
#%% Make collate func.
def collate(samples):
    tfidf, sequence, smileseq, labels = map(list, zip(*samples))
    return tc.tensor(tfidf, dtype=tc.float).cuda(), tc.LongTensor(sequence).cuda(), tc.LongTensor(smileseq).cuda(), tc.tensor(labels).cuda()

In [12]:
#%% learning module 선언
class Regressor(nn.Module):
    def __init__(self):
        super(Regressor, self).__init__()    # method 상속받고 __init__()은 여기서 하겠다.
        
        self.prt_cv1dlayers0 = nn.Sequential(
                        nn.Conv1d(1, 1, kernel_size = 32),
                        nn.BatchNorm1d(num_features = 1),
                        nn.ReLU()
                        )
        
        self.fc = nn.Linear(369, 256, F.relu)
        
        self.prt_emlayer = nn.Embedding(21, 10)
        
        self.prt_cv1dlayers1 = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 12),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size=1379)
                        )
        
        ######################################################################
        ######################################################################
        self.lgn_emlayer = nn.Embedding(64, 10)
        
        self.lgn_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 6),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size = 85)
                        )
        
        
        self.mlplayers = nn.Sequential(
                        nn.Linear(448, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(512, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU()
                        )

        self.regress = nn.Linear(512, 1)    # regression

    def forward(self, tfidf, prt_seq, lgn_seq):
        p0 = tfidf.unsqueeze(1)
        p0 = self.prt_cv1dlayers0(p0)
        p0 = p0.squeeze()
        p0 = self.fc(p0)
        
        p1 = self.prt_emlayer(prt_seq)
        p1 = p1.permute(0, 2, 1)
        p1 = self.prt_cv1dlayers1(p1)
        p1 = p1.squeeze()
        
        l = self.lgn_emlayer(lgn_seq)
        l = l.permute(0, 2, 1)
        l = self.lgn_cv1dlayers(l)
        l = l.squeeze()
        
        cat = tc.cat((p0, p1, l), axis=1).cuda()
        out = self.mlplayers(cat)
        out = out.squeeze()
        
        return self.regress(out).cuda()

In [13]:
#%% Set hyperparameter
hp_d = {}

# FIXME: 학습 관련 하이퍼파라미터
hp_d['batch_size'] = 256
hp_d['num_epochs'] = 150

hp_d['init_learning_rate'] = 10 ** -3.70183
hp_d['eps'] = 10 ** -8.39981
hp_d['weight_decay'] = 10 ** -3.59967

In [14]:
#%% training and validation
tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(validset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

model = Regressor().to(torch.device('cuda:0'))
total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
loss_func = nn.MSELoss(reduction='mean').cuda()
optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'], 
    weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

print('tr_var:', np.var(np.array([s[2] for s in trainset])))
print('va_var:', np.var(np.array([s[2] for s in validset])))
print('total params:', total_params)

tr_epoch_losses = []
va_epoch_losses = []

start = time.time()

for epoch in range(hp_d['num_epochs']):                          #!! epoch-loop
    # training session
    model.train()
    tr_epoch_loss = 0

    for iter, (tfidf, seq, smi, label) in enumerate(tr_data_loader):  #!! batch-loop
        prediction = model(tfidf, seq, smi).view(-1).cuda()
        loss = loss_func(prediction, label).cuda()
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        tr_epoch_loss += loss.detach().item()
    
    tr_epoch_loss /= (iter + 1)
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
    tr_epoch_losses.append(tr_epoch_loss)

# ===========================================================================
    # validation session
    model.eval()
    va_epoch_loss = 0

    for iter, (tfidf, seq, smi, label) in enumerate(va_data_loader):  # batch-loop
        prediction = model(tfidf, seq, smi).view(-1).cuda()
        loss = loss_func(prediction, label).cuda()
        
        va_epoch_loss += loss.detach().item()
        
    va_epoch_loss /= (iter + 1)
    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
    va_epoch_losses.append(va_epoch_loss)
    
end = time.time()
print('time elapsed:', end-start)

tr_var: 372.91571770685937
va_var: 372.9321843858667
total params: 3606006
Training epoch 0, loss 29.3025
Validation epoch 0, loss 0.8949
Training epoch 1, loss 0.7238
Validation epoch 1, loss 0.5001
Training epoch 2, loss 0.4616
Validation epoch 2, loss 0.3576
Training epoch 3, loss 0.3777
Validation epoch 3, loss 0.3346
Training epoch 4, loss 0.3403
Validation epoch 4, loss 0.3313
Training epoch 5, loss 0.3112
Validation epoch 5, loss 0.3159
Training epoch 6, loss 0.2892
Validation epoch 6, loss 0.2849
Training epoch 7, loss 0.2687
Validation epoch 7, loss 0.2754
Training epoch 8, loss 0.2502
Validation epoch 8, loss 0.2777
Training epoch 9, loss 0.2340
Validation epoch 9, loss 0.2618
Training epoch 10, loss 0.2193
Validation epoch 10, loss 0.2582
Training epoch 11, loss 0.2068
Validation epoch 11, loss 0.2535
Training epoch 12, loss 0.2009
Validation epoch 12, loss 0.2540
Training epoch 13, loss 0.1923
Validation epoch 13, loss 0.2668
Training epoch 14, loss 0.1823
Validation epoch 

Validation epoch 126, loss 0.1833
Training epoch 127, loss 0.0537
Validation epoch 127, loss 0.1797
Training epoch 128, loss 0.0467
Validation epoch 128, loss 0.1776
Training epoch 129, loss 0.0465
Validation epoch 129, loss 0.1828
Training epoch 130, loss 0.0457
Validation epoch 130, loss 0.1791
Training epoch 131, loss 0.0527
Validation epoch 131, loss 0.1849
Training epoch 132, loss 0.0515
Validation epoch 132, loss 0.1836
Training epoch 133, loss 0.0483
Validation epoch 133, loss 0.1738
Training epoch 134, loss 0.0487
Validation epoch 134, loss 0.1707
Training epoch 135, loss 0.0472
Validation epoch 135, loss 0.1705
Training epoch 136, loss 0.0469
Validation epoch 136, loss 0.1725
Training epoch 137, loss 0.0468
Validation epoch 137, loss 0.1755
Training epoch 138, loss 0.0478
Validation epoch 138, loss 0.1744
Training epoch 139, loss 0.0454
Validation epoch 139, loss 0.1785
Training epoch 140, loss 0.0458
Validation epoch 140, loss 0.1818
Training epoch 141, loss 0.0454
Validation

In [12]:
#%%
np.save('ModifiedDeepDTA_v6_tr_losses', tr_epoch_losses)
np.save('ModifiedDeepDTA_v6_va_losses', va_epoch_losses)

In [67]:
min(va_epoch_losses)

0.21312199137173593