In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial
from sklearn.model_selection import KFold

In [2]:
#%% Load dataset and cuda
# Read the KIBA interaction table and remember its row count.
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)
cuda = tc.device('cuda')
# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    # memory_reserved() is the current name of the deprecated memory_cached().
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Keep only the three columns used later, then drop the full frame.
protein, ligand = dataset["uniprotID"], dataset["chemblID"]
kiba = list(dataset["KIBA"])
del dataset


In [4]:
#%% protein sequence load
# Load the protein-ID -> feature-vector dictionary. The context manager closes
# the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('datasets/dictionaries/prt_tfidf.txt', 'rb') as f:
    tfidf_dic = pickle.load(f)

# One 400-wide row per dataset entry, looked up by that entry's protein ID.
tfidf = np.zeros((datalen, 400))
for i, s in enumerate(protein):
    tfidf[i] = tfidf_dic[s]


In [5]:
#%% ligand smiles load
# Load the ligand-ID -> encoded-sequence dictionary. The context manager closes
# the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:
    smi_dic = pickle.load(f)

# One 590-wide row per dataset entry, looked up by that entry's ligand ID.
smileseq = np.zeros((datalen, 590))
for i, e in enumerate(ligand):
    smileseq[i] = smi_dic[e]

# Only the first 100 positions of each encoded sequence are kept.
smileseq = smileseq[:, :100]


In [6]:
#%% dataset zip
# Bundle the two feature rows and the label of each sample, then shuffle once.
revised_dataset = list(zip(tfidf, smileseq, kiba))
# The three fields of a sample have different lengths, so the rows are ragged.
# dtype=object is required on NumPy >= 1.24, which refuses to build ragged
# arrays implicitly; older versions produced the same object array with a warning.
shuffled_dataset = np.array(shuffle(revised_dataset), dtype=object)
del revised_dataset


In [7]:
#%% Make collate func.
def collate(samples):
    """Merge a list of ``(tfidf, smileseq, label)`` samples into batch tensors.

    Args:
        samples: sequence of 3-item samples as handed over by the DataLoader.

    Returns:
        Tuple ``(tfidf, smileseq, labels)`` of tensors with dtypes float32,
        int64 and float32. They are placed on the GPU when CUDA is available
        and on the CPU otherwise.
    """
    tfidf, smileseq, labels = map(list, zip(*samples))
    target = tc.device('cuda' if tc.cuda.is_available() else 'cpu')
    # Stack into a single ndarray first: building a tensor directly from a
    # Python list of arrays converts element by element and is very slow.
    tfidf_batch = tc.tensor(np.asarray(tfidf), dtype=tc.float)
    smile_batch = tc.tensor(np.asarray(smileseq), dtype=tc.long)
    # Explicit float32 so the labels match the model output inside the loss.
    label_batch = tc.tensor(labels, dtype=tc.float)
    return tfidf_batch.to(target), smile_batch.to(target), label_batch.to(target)

In [8]:
#%% learning module declaration
class Regressor(nn.Module):
    """Two-branch network that regresses one value from a protein/ligand pair.

    The protein branch applies a 1-D convolution and a linear layer to the
    protein feature vector. The ligand branch embeds integer tokens and applies
    three 1-D convolutions followed by a max-pool. Both branch outputs are
    concatenated and passed through an MLP with a single-output linear head.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # initialise nn.Module before registering sub-modules

        # Protein branch. A width-32 kernel shortens the sequence by 31, so the
        # 369 inputs expected by `fc` correspond to a 400-wide protein vector.
        self.prt_cv1dlayers = nn.Sequential(
                        nn.Conv1d(1, 1, kernel_size = 32),
                        nn.BatchNorm1d(num_features = 1),
                        nn.ReLU()
                        )

        # NOTE(review): the original passed F.relu as the third positional
        # argument, which nn.Linear interprets as `bias`; it only enabled the
        # bias and never applied an activation. Behaviour is kept as-is (bias
        # on, no ReLU after this layer) — confirm whether a ReLU was intended.
        self.fc = nn.Linear(369, 256, bias=True)

        # Ligand branch: 64-entry vocabulary embedded into 10 dimensions.
        self.lgn_emlayer = nn.Embedding(64, 10)

        # Kernels 4, 6 and 8 shorten a length-100 sequence to 97, 92 and 85;
        # the final pool of width 85 then leaves one value per channel.
        self.lgn_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 6),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        nn.MaxPool1d(kernel_size = 85)
                        )

        # Joint MLP over the concatenated features (256 protein + 96 ligand = 352).
        self.mlplayers = nn.Sequential(
                        nn.Linear(352, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(512, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU()
                        )

        self.regress = nn.Linear(512, 1)    # regression head

    def forward(self, prt_seq, lgn_seq):
        """Predict one value per sample.

        Args:
            prt_seq: float tensor of protein features, one row per sample.
            lgn_seq: integer tensor of ligand token indices, one row per sample.

        Returns:
            Tensor of shape ``(batch, 1)`` on the same device as the inputs.
        """
        p = prt_seq.unsqueeze(1)    # add the channel axis Conv1d expects
        p = self.prt_cv1dlayers(p)
        # Squeeze only the channel axis: a bare squeeze() would also drop the
        # batch axis for a batch of one and break the concatenation below.
        p = p.squeeze(1)
        p = self.fc(p)

        l = self.lgn_emlayer(lgn_seq)
        l = l.permute(0, 2, 1)      # (batch, length, embed) -> (batch, embed, length)
        l = self.lgn_cv1dlayers(l)
        l = l.squeeze(2)            # drop the pooled length axis only

        # Outputs stay on the device of the inputs and parameters, so no
        # explicit .cuda() transfer is needed (and CPU execution keeps working).
        cat = tc.cat((p, l), dim=1)
        out = self.mlplayers(cat)

        return self.regress(out)

In [9]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
hp_d = {
    'batch_size': 512,
    'num_epochs': 100,
    'init_learning_rate': 10 ** -3.493,
    'eps': 10 ** -8.493,
    'weight_decay': 10 ** -5.972,
}

In [10]:
#%% training and validation
# 5-fold cross-validation: train a fresh model on each fold and record the
# lowest validation MSE reached during that fold's epochs.
# NOTE(review): taking min() over epochs on the same fold that is used for
# evaluation gives an optimistic estimate — confirm this is intended.
kf = KFold(n_splits=5)
val_err = 0
for tr_idx, ts_idx in kf.split(shuffled_dataset):
    trainset, testset = shuffled_dataset[tr_idx], shuffled_dataset[ts_idx]
    trainset = tuple(trainset); testset = tuple(testset)

    tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
    va_data_loader = DataLoader(testset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

    # `device` is the torch.device selected when the dataset was loaded.
    model = Regressor().to(device)
    loss_func = nn.MSELoss(reduction='mean')

    optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
        weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

    tr_epoch_losses = []
    va_epoch_losses = []

    for epoch in range(hp_d['num_epochs']):                             #!! epoch-loop
        # training session
        model.train()
        tr_loss_sum = 0.0

        # Batch-local names: reusing `tfidf` here would overwrite the global
        # feature matrix and break re-running the earlier cells.
        for batch_tfidf, batch_smi, batch_label in tr_data_loader:      #!! batch-loop
            batch_tfidf = batch_tfidf.to(device)
            batch_smi = batch_smi.to(device)
            batch_label = batch_label.to(device)

            prediction = model(batch_tfidf, batch_smi).view(-1)
            loss = loss_func(prediction, batch_label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # Weight by batch size so a short final batch is not over-counted.
            tr_loss_sum += loss.item() * batch_label.size(0)

        tr_epoch_loss = tr_loss_sum / len(trainset)
        print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
        tr_epoch_losses.append(tr_epoch_loss)

        # validation session
        model.eval()
        va_loss_sum = 0.0

        # No gradients are needed for evaluation; skipping the autograd graph
        # saves memory and time.
        with torch.no_grad():
            for batch_tfidf, batch_smi, batch_label in va_data_loader:  # batch-loop
                batch_tfidf = batch_tfidf.to(device)
                batch_smi = batch_smi.to(device)
                batch_label = batch_label.to(device)

                prediction = model(batch_tfidf, batch_smi).view(-1)
                loss = loss_func(prediction, batch_label)

                va_loss_sum += loss.item() * batch_label.size(0)

        va_epoch_loss = va_loss_sum / len(testset)
        print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
        va_epoch_losses.append(va_epoch_loss)
    val_err += min(va_epoch_losses)
# Average over the configured number of folds instead of a hard-coded 5.
print('5-fold CVMSE:', val_err / kf.get_n_splits())


Training epoch 0, loss 35.7098
Validation epoch 0, loss 2.0925
Training epoch 1, loss 0.9075
Validation epoch 1, loss 0.5333
Training epoch 2, loss 0.6346
Validation epoch 2, loss 0.5028
Training epoch 3, loss 0.5493
Validation epoch 3, loss 0.4240
Training epoch 4, loss 0.4763
Validation epoch 4, loss 0.3912
Training epoch 5, loss 0.3952
Validation epoch 5, loss 0.3447
Training epoch 6, loss 0.3361
Validation epoch 6, loss 0.3271
Training epoch 7, loss 0.3029
Validation epoch 7, loss 0.3117
Training epoch 8, loss 0.2800
Validation epoch 8, loss 0.2916
Training epoch 9, loss 0.2666
Validation epoch 9, loss 0.2805
Training epoch 10, loss 0.2516
Validation epoch 10, loss 0.2869
Training epoch 11, loss 0.2427
Validation epoch 11, loss 0.2631
Training epoch 12, loss 0.2422
Validation epoch 12, loss 0.2555
Training epoch 13, loss 0.2389
Validation epoch 13, loss 0.2834
Training epoch 14, loss 0.2278
Validation epoch 14, loss 0.2738
Training epoch 15, loss 0.2225
Validation epoch 15, loss 0.

Training epoch 29, loss 0.1348
Validation epoch 29, loss 0.2409
Training epoch 30, loss 0.1373
Validation epoch 30, loss 0.2238
Training epoch 31, loss 0.1400
Validation epoch 31, loss 0.2441
Training epoch 32, loss 0.1419
Validation epoch 32, loss 0.3018
Training epoch 33, loss 0.1367
Validation epoch 33, loss 0.2435
Training epoch 34, loss 0.1292
Validation epoch 34, loss 0.2253
Training epoch 35, loss 0.1168
Validation epoch 35, loss 0.2214
Training epoch 36, loss 0.1221
Validation epoch 36, loss 0.2300
Training epoch 37, loss 0.1227
Validation epoch 37, loss 0.2844
Training epoch 38, loss 0.1359
Validation epoch 38, loss 0.2193
Training epoch 39, loss 0.1183
Validation epoch 39, loss 0.2363
Training epoch 40, loss 0.1204
Validation epoch 40, loss 0.2369
Training epoch 41, loss 0.1198
Validation epoch 41, loss 0.2285
Training epoch 42, loss 0.1163
Validation epoch 42, loss 0.2309
Training epoch 43, loss 0.1253
Validation epoch 43, loss 0.2619
Training epoch 44, loss 0.1215
Validatio

Validation epoch 57, loss 0.1925
Training epoch 58, loss 0.0900
Validation epoch 58, loss 0.1984
Training epoch 59, loss 0.0891
Validation epoch 59, loss 0.1874
Training epoch 60, loss 0.0963
Validation epoch 60, loss 0.1961
Training epoch 61, loss 0.0932
Validation epoch 61, loss 0.2031
Training epoch 62, loss 0.0853
Validation epoch 62, loss 0.2075
Training epoch 63, loss 0.0847
Validation epoch 63, loss 0.2129
Training epoch 64, loss 0.0866
Validation epoch 64, loss 0.2159
Training epoch 65, loss 0.0863
Validation epoch 65, loss 0.2171
Training epoch 66, loss 0.0826
Validation epoch 66, loss 0.2598
Training epoch 67, loss 0.0840
Validation epoch 67, loss 0.2274
Training epoch 68, loss 0.0861
Validation epoch 68, loss 0.2306
Training epoch 69, loss 0.0842
Validation epoch 69, loss 0.1921
Training epoch 70, loss 0.0777
Validation epoch 70, loss 0.1934
Training epoch 71, loss 0.0776
Validation epoch 71, loss 0.1876
Training epoch 72, loss 0.0737
Validation epoch 72, loss 0.1797
Trainin

Training epoch 86, loss 0.0701
Validation epoch 86, loss 0.1939
Training epoch 87, loss 0.0735
Validation epoch 87, loss 0.1808
Training epoch 88, loss 0.0734
Validation epoch 88, loss 0.1728
Training epoch 89, loss 0.0704
Validation epoch 89, loss 0.1765
Training epoch 90, loss 0.0739
Validation epoch 90, loss 0.1832
Training epoch 91, loss 0.0726
Validation epoch 91, loss 0.1993
Training epoch 92, loss 0.0740
Validation epoch 92, loss 0.2134
Training epoch 93, loss 0.0699
Validation epoch 93, loss 0.1915
Training epoch 94, loss 0.0791
Validation epoch 94, loss 0.2059
Training epoch 95, loss 0.0810
Validation epoch 95, loss 0.2435
Training epoch 96, loss 0.0775
Validation epoch 96, loss 0.2907
Training epoch 97, loss 0.0810
Validation epoch 97, loss 0.2487
Training epoch 98, loss 0.0735
Validation epoch 98, loss 0.2159
Training epoch 99, loss 0.0730
Validation epoch 99, loss 0.1744
Training epoch 0, loss 32.6399
Validation epoch 0, loss 1.6621
Training epoch 1, loss 0.8323
Validation 