In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time
import copy

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial
from sklearn.model_selection import KFold

In [2]:
#%% Load dataset and cuda
# Load the interaction table; `datalen` is reused later to size the feature arrays.
dataset = pd.read_csv("datasets/mini-dataset.csv")
datalen = len(dataset)

# Kept so that any cell still referring to `cuda` keeps working.
cuda = tc.device('cuda')
# Preferred handle: falls back to the CPU when CUDA is unavailable.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # `torch.cuda.memory_cached` was renamed to `memory_reserved` and the old
    # name is deprecated; use the old one only where the new one is missing.
    reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(reserved_bytes(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Pull the three columns used downstream out of the frame, then release it.
protein = dataset["uniprotID"]   # Series looked up in the protein dictionary later
ligand = dataset["chemblID"]     # Series looked up in the ligand dictionary later
kiba = list(dataset["KIBA"])     # regression targets as a plain Python list
del dataset


In [4]:
#%% protein sequence load
# Load the protein dictionary (the second pickled item is not used here).
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# point this at dictionaries from a trusted source.
with open('datasets/dictionaries/prt_lstm.txt', 'rb') as f:   # closed even if unpickling fails
    seq_voc, _ = pickle.load(f)

# One row per sample, filled from the encoding stored under that sample's protein ID.
sequence = np.zeros((datalen, 4128))
for i, s in enumerate(protein):
    sequence[i] = seq_voc[s]

# Keep only the first 1400 positions. Copying makes the result own its memory,
# so the 4128-wide buffer can be freed instead of being pinned by a view.
sequence = sequence[:, :1400].copy()


In [5]:
#%% ligand smiles load
# Load the ligand dictionary.
# NOTE(security): pickle.load can execute arbitrary code from the file, so only
# point this at dictionaries from a trusted source.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:   # closed even if unpickling fails
    smi_dic = pickle.load(f)

# One row per sample, filled from the encoding stored under that sample's ligand ID.
smileseq = np.zeros((datalen, 590))
for i, e in enumerate(ligand):
    smileseq[i] = smi_dic[e]

# Keep only the first 100 positions. Copying makes the result own its memory,
# so the 590-wide buffer can be freed instead of being pinned by a view.
smileseq = smileseq[:, :100].copy()


In [6]:
#%% dataset zip
# Bundle (protein encoding, ligand encoding, KIBA score) per sample.
revised_dataset = list(zip(sequence, smileseq, kiba))
# Each row mixes two arrays of different length with a scalar, so the object
# dtype must be requested explicitly: recent NumPy no longer infers it for
# ragged input and raises ValueError instead.
shuffled_dataset = np.array(shuffle(revised_dataset), dtype=object); del revised_dataset

# 90/10 train/test split; computed once, in integer arithmetic.
split_idx = (9 * datalen) // 10
trainset = shuffled_dataset[:split_idx]
testset = shuffled_dataset[split_idx:]

In [7]:
#%% Make collate func.
def collate(samples, device=None):
    """Batch a list of (protein, ligand, label) samples into tensors.

    Args:
        samples: sequence of 3-item samples ``(protein_tokens, ligand_tokens, label)``.
            Within one batch all protein sequences must share a length, and
            likewise all ligand sequences, so that they can be stacked.
        device: device to place the tensors on. Defaults to CUDA when it is
            available and to the CPU otherwise.

    Returns:
        ``(proteins, ligands, labels)``: two int64 tensors of shape
        ``(batch, length)`` and one float32 tensor of shape ``(batch,)``.
    """
    if device is None:
        device = tc.device('cuda' if tc.cuda.is_available() else 'cpu')

    sequences, smileseq, labels = zip(*samples)

    # Stack into a single ndarray first: building a tensor straight from a
    # list of ndarrays is very slow. The cast truncates like LongTensor does.
    prt = tc.from_numpy(np.stack(sequences).astype(np.int64))
    lgn = tc.from_numpy(np.stack(smileseq).astype(np.int64))
    # Pin float32 so the targets match the dtype of the model's predictions.
    tgt = tc.tensor([float(v) for v in labels], dtype=tc.float32)

    return prt.to(device), lgn.to(device), tgt.to(device)

In [8]:
#%% Declare the learning module
class Regressor(nn.Module):
    """Two-branch 1-D CNN that regresses one score from a protein/ligand pair.

    Each branch embeds integer tokens, applies three Conv1d/BatchNorm/ReLU
    stages and max-pools over the whole remaining length, giving 96 features
    per branch. The two 96-vectors are concatenated (192 features) and passed
    through an MLP followed by a single-output linear layer.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # set up nn.Module before registering submodules

        # Protein branch: token ids must lie in [0, 21).
        self.prt_emlayer = nn.Embedding(21, 10)

        self.prt_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 12),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        # Global max over the remaining length. For a 1400-long
                        # input that length is 1400-3-7-11 = 1379, so this equals
                        # the former MaxPool1d(kernel_size=1379) while also
                        # accepting other input lengths (at least 22).
                        nn.AdaptiveMaxPool1d(1)
                        )

        ######################################################################
        ######################################################################

        # Ligand branch: token ids must lie in [0, 64).
        self.lgn_emlayer = nn.Embedding(64, 10)

        self.lgn_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 6),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        # Global max over the remaining length. For a 100-long
                        # input that length is 100-3-5-7 = 85, so this equals the
                        # former MaxPool1d(kernel_size=85) while also accepting
                        # other input lengths (at least 16).
                        nn.AdaptiveMaxPool1d(1)
                        )

        # MLP head over the concatenated 96 + 96 branch features.
        self.mlplayers = nn.Sequential(
                        nn.Linear(192, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(512, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU()
                        )

        self.regress = nn.Linear(512, 1)    # regression

    def forward(self, prt_seq, lgn_seq):
        """Predict one value per (protein, ligand) pair.

        Args:
            prt_seq: integer tensor of shape ``(batch, protein_len)``.
            lgn_seq: integer tensor of shape ``(batch, ligand_len)``.

        Returns:
            Tensor of shape ``(batch, 1)`` on the same device as the module.
        """
        p = self.prt_emlayer(prt_seq)      # (batch, protein_len, 10)
        p = p.permute(0, 2, 1)             # Conv1d expects (batch, channels, length)
        p = self.prt_cv1dlayers(p)         # (batch, 96, 1)
        # Drop only the pooled length axis; a bare squeeze() would also drop
        # the batch axis when batch == 1 and break the concatenation below.
        p = p.squeeze(-1)

        l = self.lgn_emlayer(lgn_seq)      # (batch, ligand_len, 10)
        l = l.permute(0, 2, 1)
        l = self.lgn_cv1dlayers(l)         # (batch, 96, 1)
        l = l.squeeze(-1)

        # No explicit .cuda(): the result follows the device of the inputs and
        # parameters, so the model also runs on the CPU.
        cat = tc.cat((p, l), dim=1)        # (batch, 192)
        out = self.mlplayers(cat)

        return self.regress(out)

In [9]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
# Gathered in one dict: the first two control the loop, the rest go to the optimizer.
hp_d = {
    'batch_size': 512,
    'num_epochs': 300,
    'init_learning_rate': 10 ** -3.563,
    'eps': 10 ** -8,
    'weight_decay': 10 ** -5,
}

In [10]:
# Peek at a couple of held-out samples rather than dumping the whole array.
testset[:2]

array([[array([13.,  1.,  8., ...,  0.,  0.,  0.]),
        array([42., 42., 35., 40., 14., 42.,  4., 40., 42.,  1., 42.,  1., 40.,
       48., 31., 48., 35., 31., 49., 42., 36., 40., 42.,  1., 42., 40.,
       42., 42.,  1., 40., 42.,  4., 36., 31., 42., 25., 31.,  9., 27.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.]),
        11.20000013],
       [array([13.,  1., 12., ...,  0.,  0.,  0.]),
        array([42., 35., 40., 42., 42., 40., 42.,  1., 42., 40., 42., 35., 31.,
       14., 42.,  4., 40., 14., 42.,  1., 40., 42., 49.,  4., 31., 42.,
       36., 40., 42., 42.,  5., 40., 42.,  1., 42., 40., 42., 36., 31.,
       14., 14., 40., 42.,  5.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0., 

In [18]:
#%% training and validation
# Loaders yield (protein tokens, ligand tokens, label) batches built by `collate`.
# NOTE(review): the training loader is not reshuffled between epochs
# (shuffle=False); the data is shuffled once before the split. Confirm this is intended.
tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(testset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

# `device` is the handle selected in the dataset/device setup cell.
model = Regressor().to(device)
loss_func = nn.MSELoss(reduction='mean')

optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
    weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

tr_epoch_losses = []
va_epoch_losses = []

# Best validation loss seen so far. It has to live outside the epoch loop:
# resetting it every epoch made the "best" checkpoint simply the latest one.
best_loss = float('inf')

for epoch in range(hp_d['num_epochs']):                             #!! epoch-loop
    # training session
    model.train()
    tr_epoch_loss = 0

    for seq, smi, label in tr_data_loader:                          #!! batch-loop
        # No-ops when `collate` already placed the batch on `device`.
        seq, smi, label = seq.to(device), smi.to(device), label.to(device)

        prediction = model(seq, smi).view(-1)
        loss = loss_func(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_epoch_loss += loss.item()

    # Mean of the per-batch mean losses.
    tr_epoch_loss /= len(tr_data_loader)
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
    tr_epoch_losses.append(tr_epoch_loss)

# ===========================================================================
    # validation session
    model.eval()
    va_epoch_loss = 0

    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for seq, smi, label in va_data_loader:                      # batch-loop
            seq, smi, label = seq.to(device), smi.to(device), label.to(device)

            prediction = model(seq, smi).view(-1)
            loss = loss_func(prediction, label)

            va_epoch_loss += loss.item()

    va_epoch_loss /= len(va_data_loader)

    # Checkpoint only when the validation loss actually improves.
    if va_epoch_loss < best_loss:
        best_loss = va_epoch_loss
        checkpoint = {'epoch': epoch,
              'state_dict': model.state_dict(),
              'optimizer' : optimizer.state_dict(),
              'loss': best_loss}
        torch.save(checkpoint, 'checkpoint.pth')

    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
    va_epoch_losses.append(va_epoch_loss)
    


Training epoch 0, loss 40.4039
Validation epoch 0, loss 2.4978
Training epoch 1, loss 1.0788
Validation epoch 1, loss 0.4205
Training epoch 2, loss 0.6899
Validation epoch 2, loss 0.3607
Training epoch 3, loss 0.5934
Validation epoch 3, loss 0.3124
Training epoch 4, loss 0.4832
Validation epoch 4, loss 0.3050
Training epoch 5, loss 0.3873
Validation epoch 5, loss 0.2870
Training epoch 6, loss 0.3138
Validation epoch 6, loss 0.2669
Training epoch 7, loss 0.2682
Validation epoch 7, loss 0.2551
Training epoch 8, loss 0.2418
Validation epoch 8, loss 0.2599
Training epoch 9, loss 0.2256
Validation epoch 9, loss 0.2448
Training epoch 10, loss 0.2149
Validation epoch 10, loss 0.2310
Training epoch 11, loss 0.2026
Validation epoch 11, loss 0.2313
Training epoch 12, loss 0.1914
Validation epoch 12, loss 0.2593
Training epoch 13, loss 0.1856
Validation epoch 13, loss 0.2717
Training epoch 14, loss 0.1763
Validation epoch 14, loss 0.2278
Training epoch 15, loss 0.1670
Validation epoch 15, loss 0.

Validation epoch 127, loss 0.1902
Training epoch 128, loss 0.0353
Validation epoch 128, loss 0.1913
Training epoch 129, loss 0.0310
Validation epoch 129, loss 0.1731
Training epoch 130, loss 0.0290
Validation epoch 130, loss 0.1647
Training epoch 131, loss 0.0286
Validation epoch 131, loss 0.1633
Training epoch 132, loss 0.0275
Validation epoch 132, loss 0.1559
Training epoch 133, loss 0.0272
Validation epoch 133, loss 0.1593
Training epoch 134, loss 0.0259
Validation epoch 134, loss 0.1528
Training epoch 135, loss 0.0262
Validation epoch 135, loss 0.1558
Training epoch 136, loss 0.0271
Validation epoch 136, loss 0.1561
Training epoch 137, loss 0.0287
Validation epoch 137, loss 0.1530
Training epoch 138, loss 0.0319
Validation epoch 138, loss 0.1557
Training epoch 139, loss 0.0307
Validation epoch 139, loss 0.1523
Training epoch 140, loss 0.0296
Validation epoch 140, loss 0.1571
Training epoch 141, loss 0.0313
Validation epoch 141, loss 0.1606
Training epoch 142, loss 0.0317
Validation

Training epoch 252, loss 0.0173
Validation epoch 252, loss 0.1453
Training epoch 253, loss 0.0178
Validation epoch 253, loss 0.1489
Training epoch 254, loss 0.0190
Validation epoch 254, loss 0.1503
Training epoch 255, loss 0.0222
Validation epoch 255, loss 0.1796
Training epoch 256, loss 0.0325
Validation epoch 256, loss 0.1626
Training epoch 257, loss 0.0252
Validation epoch 257, loss 0.1573
Training epoch 258, loss 0.0212
Validation epoch 258, loss 0.1507
Training epoch 259, loss 0.0199
Validation epoch 259, loss 0.1579
Training epoch 260, loss 0.0207
Validation epoch 260, loss 0.1806
Training epoch 261, loss 0.0231
Validation epoch 261, loss 0.1814
Training epoch 262, loss 0.0262
Validation epoch 262, loss 0.1549
Training epoch 263, loss 0.0239
Validation epoch 263, loss 0.1483
Training epoch 264, loss 0.0194
Validation epoch 264, loss 0.1497
Training epoch 265, loss 0.0172
Validation epoch 265, loss 0.1460
Training epoch 266, loss 0.0163
Validation epoch 266, loss 0.1490
Training e