In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial
from sklearn.model_selection import KFold

In [2]:
#%% Load dataset and cuda
# Read the KIBA table and pick the compute device for the rest of the notebook.
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)  # row count is cached here because a later cell deletes `dataset`
cuda = tc.device('cuda')  # not used in the cells visible here; kept in case other cells rely on it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # torch.cuda.memory_cached is a deprecated alias of memory_reserved;
    # prefer the current name and fall back only on old PyTorch builds.
    _reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(_reserved_bytes(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Keep only the three columns the later cells read, then drop the DataFrame.
kiba = dataset['KIBA'].tolist()      # regression targets as a plain Python list
ligand = dataset['chemblID']         # ligand identifiers (pandas Series)
protein = dataset['uniprotID']       # protein identifiers (pandas Series)
del dataset


In [4]:
#%% protein TF-IDF feature load
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load dictionary files that come from a trusted source.
# `with` guarantees the handle is closed even if unpickling raises.
with open('datasets/dictionaries/prt_tfidf.txt', 'rb') as f:
    tfidf_dic = pickle.load(f)

# One 400-wide row per dataset record, looked up by that record's uniprotID.
tfidf = np.zeros((datalen, 400))
for i, s in enumerate(protein):
    tfidf[i] = tfidf_dic[s]


In [5]:
#%% protein sequence load
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load dictionary files that come from a trusted source.
# The pickle holds a 2-tuple; only the first element (id -> encoded sequence) is used.
with open('datasets/dictionaries/prt_lstm.txt', 'rb') as f:
    seq_voc, _ = pickle.load(f)

# Only the first 1400 positions of each encoded sequence are kept, so the
# matrix is allocated at that width directly instead of building the full
# 4128-wide matrix and slicing it afterwards (about a third of the memory).
# assumes every seq_voc entry is a sliceable 1-D sequence of length 4128 — TODO confirm
sequence = np.zeros((datalen, 1400))
for i, s in enumerate(protein):
    sequence[i] = seq_voc[s][:1400]

In [6]:
#%% ligand smiles load
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load dictionary files that come from a trusted source.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:
    smi_dic = pickle.load(f)

# Only the first 100 positions of each encoded SMILES string are kept, so the
# matrix is allocated at that width directly rather than 590 wide and sliced.
# assumes every smi_dic entry is a sliceable 1-D sequence of length 590 — TODO confirm
smileseq = np.zeros((datalen, 100))
for i, e in enumerate(ligand):
    smileseq[i] = smi_dic[e][:100]


In [7]:
#%% dataset zip
# Bundle the per-record features and the target into (tfidf, sequence, smileseq, kiba) tuples.
revised_dataset = list(zip(tfidf, sequence, smileseq, kiba))
# Each tuple mixes arrays of different lengths with a scalar, so the container
# must be an object array. Without dtype=object, NumPy >= 1.24 raises
# ValueError on such ragged input (older releases only emitted a warning).
# NOTE(review): the shuffle is unseeded, so fold membership differs between runs.
shuffled_dataset = np.array(shuffle(revised_dataset), dtype=object)
del revised_dataset


In [8]:
#%% Make collate func.
def collate(samples):
    """Merge a list of dataset records into batched tensors.

    Parameters
    ----------
    samples : sequence of 4-item records
        Each record unpacks as (tfidf, sequence, smileseq, label).

    Returns
    -------
    tuple of four tensors
        (tfidf as float32, sequence as int64, smileseq as int64,
        labels as float32), all placed on the GPU when one is available
        and on the CPU otherwise.
    """
    tfidf, sequence, smileseq, labels = map(list, zip(*samples))
    # Fall back to CPU instead of crashing on machines without CUDA.
    target = tc.device('cuda' if tc.cuda.is_available() else 'cpu')
    # Stack into a single ndarray first: building a tensor straight from a
    # list of ndarrays converts element by element and is much slower.
    tfidf_t = tc.as_tensor(np.asarray(tfidf, dtype=np.float32))
    sequence_t = tc.as_tensor(np.asarray(sequence).astype(np.int64))
    smileseq_t = tc.as_tensor(np.asarray(smileseq).astype(np.int64))
    # Explicit float32 so the labels always match the model output dtype.
    labels_t = tc.tensor(labels, dtype=tc.float)
    return tfidf_t.to(target), sequence_t.to(target), smileseq_t.to(target), labels_t.to(target)

In [9]:
#%% Declare the learning module
class Regressor(nn.Module):
    """Three-branch network that regresses one value per (protein, ligand) pair.

    Branches:
      * TF-IDF branch: a 1-D convolution over the 400-wide TF-IDF vector
        followed by a linear projection to 256 features.
      * Protein-sequence branch: embedding -> three 1-D convolutions ->
        global max-pool, producing 96 features.
      * Ligand-sequence branch: embedding -> three 1-D convolutions ->
        global max-pool, producing 96 features.
    The 256 + 96 + 96 = 448 features are concatenated and passed through an
    MLP and a final linear layer with a single output.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # initialise nn.Module before registering sub-modules

        # TF-IDF branch: (B, 1, 400) -> (B, 1, 369) because 400 - 32 + 1 = 369.
        self.prt_cv1dlayers0 = nn.Sequential(
                        nn.Conv1d(1, 1, kernel_size = 32),
                        nn.BatchNorm1d(num_features = 1),
                        nn.ReLU()
                        )

        # The original passed F.relu as the third positional argument, which
        # nn.Linear interprets as `bias` (truthy -> bias enabled); it never
        # applied an activation. The layer is declared plainly so the code
        # says what it does; numerical behaviour is unchanged.
        self.fc = nn.Linear(369, 256)

        # Protein tokens: vocabulary of 21 ids embedded into 10 dimensions.
        self.prt_emlayer = nn.Embedding(21, 10)

        # Length shrinks 1400 -> 1397 -> 1390 -> 1379 through the three
        # convolutions, so a pool of width 1379 acts as a global max-pool
        # for inputs of length 1400.
        self.prt_cv1dlayers1 = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 12),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size=1379)
                        )

        ######################################################################
        ######################################################################

        # Ligand tokens: vocabulary of 64 ids embedded into 10 dimensions.
        self.lgn_emlayer = nn.Embedding(64, 10)

        # Length shrinks 100 -> 97 -> 92 -> 85, so a pool of width 85 acts as
        # a global max-pool for inputs of length 100.
        self.lgn_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 6),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size = 85)
                        )

        # Joint head over the concatenated 448 features.
        self.mlplayers = nn.Sequential(
                        nn.Linear(448, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(512, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU()
                        )

        self.regress = nn.Linear(512, 1)    # regression

    def forward(self, tfidf, prt_seq, lgn_seq):
        """Compute one prediction per batch row.

        Parameters
        ----------
        tfidf : float tensor of shape (B, 400)
        prt_seq : integer tensor of shape (B, 1400), token ids in [0, 21)
        lgn_seq : integer tensor of shape (B, 100), token ids in [0, 64)

        Returns
        -------
        Tensor of shape (B, 1), on the same device as the inputs.
        """
        # TF-IDF branch. Squeeze only the channel axis: a bare squeeze() would
        # also drop the batch axis when B == 1 and break the concatenation.
        p0 = tfidf.unsqueeze(1)
        p0 = self.prt_cv1dlayers0(p0)
        p0 = p0.squeeze(1)
        p0 = self.fc(p0)

        # Protein branch: (B, L, 10) -> (B, 10, L) because Conv1d expects channels first.
        p1 = self.prt_emlayer(prt_seq)
        p1 = p1.permute(0, 2, 1)
        p1 = self.prt_cv1dlayers1(p1)
        p1 = p1.squeeze(2)              # drop the pooled length axis only

        # Ligand branch, same layout as the protein branch.
        lgn = self.lgn_emlayer(lgn_seq)
        lgn = lgn.permute(0, 2, 1)
        lgn = self.lgn_cv1dlayers(lgn)
        lgn = lgn.squeeze(2)

        # No explicit .cuda(): every tensor already lives on the device of the
        # inputs and module parameters, so the model also runs on CPU.
        cat = tc.cat((p0, p1, lgn), dim=1)
        out = self.mlplayers(cat)

        return self.regress(out)

In [10]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
hp_d = {
    # batching / schedule
    'batch_size': 512,
    'num_epochs': 100,
    # Adam optimiser settings, expressed as powers of ten
    'init_learning_rate': 10 ** -3.163,
    'eps': 10 ** -7.32,
    'weight_decay': 10 ** -5.639,
}

In [11]:
#%% training and validation
# K-fold cross-validation: a fresh model is trained on each fold and the best
# (lowest) per-epoch validation loss of every fold is averaged at the end.
kf = KFold(n_splits=5)
n_folds = kf.get_n_splits()
val_err = 0
for tr_idx, ts_idx in kf.split(shuffled_dataset):
    trainset, testset = shuffled_dataset[tr_idx], shuffled_dataset[ts_idx]
    trainset = tuple(trainset); testset = tuple(testset)

    tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
    va_data_loader = DataLoader(testset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

    # Use the device chosen in the setup cell instead of a hard-coded 'cuda:0'.
    model = Regressor().to(device)
    loss_func = nn.MSELoss(reduction='mean')

    optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
        weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

    tr_epoch_losses = []
    va_epoch_losses = []

    for epoch in range(hp_d['num_epochs']):                             #!! epoch-loop
        # training session
        model.train()
        tr_epoch_loss = 0

        # The batch is named tfidf_batch so it does not overwrite the
        # module-level `tfidf` feature matrix built in an earlier cell.
        for tfidf_batch, seq, smi, label in tr_data_loader:             #!! batch-loop
            # No-op when collate already placed the tensors on `device`.
            tfidf_batch, seq, smi, label = (t.to(device) for t in (tfidf_batch, seq, smi, label))
            prediction = model(tfidf_batch, seq, smi).view(-1)
            loss = loss_func(prediction, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tr_epoch_loss += loss.item()

        # Mean of the per-batch mean losses (a smaller final batch carries
        # the same weight as a full one).
        tr_epoch_loss /= len(tr_data_loader)
        print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
        tr_epoch_losses.append(tr_epoch_loss)

    # ===========================================================================
        # validation session
        model.eval()
        va_epoch_loss = 0

        # Gradients are not needed for evaluation; no_grad avoids building the
        # autograd graph and saves memory and time.
        with torch.no_grad():
            for tfidf_batch, seq, smi, label in va_data_loader:         # batch-loop
                tfidf_batch, seq, smi, label = (t.to(device) for t in (tfidf_batch, seq, smi, label))
                prediction = model(tfidf_batch, seq, smi).view(-1)
                loss = loss_func(prediction, label)

                va_epoch_loss += loss.item()

        va_epoch_loss /= len(va_data_loader)
        print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
        va_epoch_losses.append(va_epoch_loss)
    # NOTE(review): the best epoch is picked using the same validation fold
    # that is reported, which makes the score optimistic — confirm intended.
    val_err += min(va_epoch_losses)
print('{}-fold CVMSE:'.format(n_folds), val_err/n_folds)


Training epoch 0, loss 19.2520
Validation epoch 0, loss 0.5148
Training epoch 1, loss 0.7038
Validation epoch 1, loss 0.3840
Training epoch 2, loss 0.5891
Validation epoch 2, loss 0.3635
Training epoch 3, loss 0.4719
Validation epoch 3, loss 0.3161
Training epoch 4, loss 0.3692
Validation epoch 4, loss 0.2944
Training epoch 5, loss 0.3073
Validation epoch 5, loss 0.2774
Training epoch 6, loss 0.2786
Validation epoch 6, loss 0.2743
Training epoch 7, loss 0.2528
Validation epoch 7, loss 0.2939
Training epoch 8, loss 0.2411
Validation epoch 8, loss 0.2876
Training epoch 9, loss 0.2226
Validation epoch 9, loss 0.2884
Training epoch 10, loss 0.2106
Validation epoch 10, loss 0.2855
Training epoch 11, loss 0.2012
Validation epoch 11, loss 0.2625
Training epoch 12, loss 0.1900
Validation epoch 12, loss 0.3425
Training epoch 13, loss 0.1827
Validation epoch 13, loss 0.3407
Training epoch 14, loss 0.1737
Validation epoch 14, loss 0.3351
Training epoch 15, loss 0.1663
Validation epoch 15, loss 0.

Training epoch 29, loss 0.1026
Validation epoch 29, loss 0.2166
Training epoch 30, loss 0.1038
Validation epoch 30, loss 0.2255
Training epoch 31, loss 0.1084
Validation epoch 31, loss 0.2904
Training epoch 32, loss 0.1064
Validation epoch 32, loss 0.2210
Training epoch 33, loss 0.1008
Validation epoch 33, loss 0.2038
Training epoch 34, loss 0.0967
Validation epoch 34, loss 0.2152
Training epoch 35, loss 0.0907
Validation epoch 35, loss 0.2144
Training epoch 36, loss 0.0896
Validation epoch 36, loss 0.2004
Training epoch 37, loss 0.0907
Validation epoch 37, loss 0.2393
Training epoch 38, loss 0.0948
Validation epoch 38, loss 0.2068
Training epoch 39, loss 0.0882
Validation epoch 39, loss 0.2194
Training epoch 40, loss 0.0842
Validation epoch 40, loss 0.2012
Training epoch 41, loss 0.0831
Validation epoch 41, loss 0.2092
Training epoch 42, loss 0.0784
Validation epoch 42, loss 0.2547
Training epoch 43, loss 0.0775
Validation epoch 43, loss 0.2095
Training epoch 44, loss 0.0746
Validatio

Training epoch 58, loss 0.0663
Validation epoch 58, loss 0.1811
Training epoch 59, loss 0.0605
Validation epoch 59, loss 0.1756
Training epoch 60, loss 0.0553
Validation epoch 60, loss 0.1774
Training epoch 61, loss 0.0522
Validation epoch 61, loss 0.1737
Training epoch 62, loss 0.0487
Validation epoch 62, loss 0.1743
Training epoch 63, loss 0.0482
Validation epoch 63, loss 0.1769
Training epoch 64, loss 0.0461
Validation epoch 64, loss 0.1714
Training epoch 65, loss 0.0447
Validation epoch 65, loss 0.1754
Training epoch 66, loss 0.0445
Validation epoch 66, loss 0.1759
Training epoch 67, loss 0.0450
Validation epoch 67, loss 0.1815
Training epoch 68, loss 0.0438
Validation epoch 68, loss 0.1773
Training epoch 69, loss 0.0444
Validation epoch 69, loss 0.1740
Training epoch 70, loss 0.0442
Validation epoch 70, loss 0.1799
Training epoch 71, loss 0.0455
Validation epoch 71, loss 0.1838
Training epoch 72, loss 0.0463
Validation epoch 72, loss 0.1764
Training epoch 73, loss 0.0490
Validatio

Validation epoch 86, loss 0.1885
Training epoch 87, loss 0.0584
Validation epoch 87, loss 0.2005
Training epoch 88, loss 0.0582
Validation epoch 88, loss 0.1968
Training epoch 89, loss 0.0560
Validation epoch 89, loss 0.1851
Training epoch 90, loss 0.0540
Validation epoch 90, loss 0.1718
Training epoch 91, loss 0.0521
Validation epoch 91, loss 0.1758
Training epoch 92, loss 0.0523
Validation epoch 92, loss 0.1929
Training epoch 93, loss 0.0503
Validation epoch 93, loss 0.2134
Training epoch 94, loss 0.0479
Validation epoch 94, loss 0.2092
Training epoch 95, loss 0.0455
Validation epoch 95, loss 0.2004
Training epoch 96, loss 0.0494
Validation epoch 96, loss 0.1926
Training epoch 97, loss 0.0538
Validation epoch 97, loss 0.1889
Training epoch 98, loss 0.0597
Validation epoch 98, loss 0.1762
Training epoch 99, loss 0.0612
Validation epoch 99, loss 0.1760
Training epoch 0, loss 16.3652
Validation epoch 0, loss 0.5928
Training epoch 1, loss 0.7139
Validation epoch 1, loss 0.4046
Training e