In [1]:
#%% library import
import numpy as np
import pandas as pd
import networkx as nx
import torch as tc
import torch
import pprint
import pickle
import time

from torch.autograd import Variable
from sklearn.utils import shuffle
from torch import nn
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader
from functools import partial
from sklearn.model_selection import KFold

In [2]:
#%% Load dataset and cuda
# Load the KIBA table; `datalen` (its row count) sizes the encoding matrices built in later cells.
dataset = pd.read_csv("datasets/KIBA.csv")
datalen = len(dataset)

# Kept so that any code referring to `cuda` keeps working.
cuda = tc.device('cuda')
# Preferred handle: falls back to the CPU when no CUDA device is available.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
if device.type == 'cuda':
    # memory_reserved() is the current name of the deprecated memory_cached();
    # fall back to the old name on PyTorch builds that predate the rename.
    reserved_bytes = getattr(torch.cuda, 'memory_reserved', None) or torch.cuda.memory_cached
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(reserved_bytes(0)/1024**3,1), 'GB')

Using device: cuda
GeForce RTX 2080 Ti
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [3]:
#%% protein-ligand-kiba split
# Pull out the three columns used downstream, then release the full table.
protein = dataset["uniprotID"]
ligand = dataset["chemblID"]
kiba = dataset["KIBA"].tolist()   # plain Python list of the KIBA scores
del dataset


In [4]:
#%% protein sequence load
# Width of each encoded protein vector as the original code allocated it, and
# the number of leading positions that are actually kept for the model.
PRT_ENCODED_LEN = 4128
PRT_KEPT_LEN = 1400

# NOTE(review): pickle.load can execute arbitrary code; only load dictionary
# files that come from a trusted source.
with open('datasets/dictionaries/prt_lstm.txt', 'rb') as f:
    # The pickle holds a 2-tuple; only the first item (ID -> encoding) is used.
    seq_voc, _ = pickle.load(f)

# Allocate only the kept columns. Slicing a (datalen, 4128) matrix afterwards
# would return a view that keeps the whole oversized buffer alive.
sequence = np.zeros((datalen, PRT_KEPT_LEN))
for i, s in enumerate(protein):
    # broadcast_to enforces the same shape rule as assigning into a
    # 4128-wide row, so a malformed dictionary entry still raises.
    sequence[i] = np.broadcast_to(seq_voc[s], (PRT_ENCODED_LEN,))[:PRT_KEPT_LEN]


In [5]:
#%% ligand smiles load
# Width of each encoded SMILES vector as the original code allocated it, and
# the number of leading positions that are actually kept for the model.
LGN_ENCODED_LEN = 590
LGN_KEPT_LEN = 100

# NOTE(review): pickle.load can execute arbitrary code; only load dictionary
# files that come from a trusted source.
with open('datasets/dictionaries/lgn_smiecoding.txt', 'rb') as f:
    smi_dic = pickle.load(f)   # ligand ID -> encoded SMILES vector

# Allocate only the kept columns. Slicing a (datalen, 590) matrix afterwards
# would return a view that keeps the whole oversized buffer alive.
smileseq = np.zeros((datalen, LGN_KEPT_LEN))
for i, e in enumerate(ligand):
    # broadcast_to enforces the same shape rule as assigning into a
    # 590-wide row, so a malformed dictionary entry still raises.
    smileseq[i] = np.broadcast_to(smi_dic[e], (LGN_ENCODED_LEN,))[:LGN_KEPT_LEN]


In [6]:
#%% dataset zip
# One record per sample: (protein encoding, ligand encoding, KIBA score).
revised_dataset = list(zip(sequence, smileseq, kiba))
# The three fields of a record have different shapes, so the array is ragged.
# NumPy >= 1.24 raises ValueError for ragged input unless dtype=object is
# given; older versions produced the same (n_samples, 3) object array implicitly.
shuffled_dataset = np.array(shuffle(revised_dataset, random_state=0), dtype=object)
del revised_dataset


In [7]:
#%% Make collate func.
def collate(samples):
    """Batch (protein encoding, ligand encoding, label) records into tensors.

    Args:
        samples: iterable of 3-item records, as yielded by the DataLoader.

    Returns:
        Tuple ``(sequences, smileseq, labels)``: two int64 tensors of shape
        (batch, length) and one float32 tensor of shape (batch,). All three are
        placed on the CUDA device when one is available, otherwise on the CPU.
    """
    sequences, smileseq, labels = map(list, zip(*samples))
    # Use the GPU when present instead of failing on CPU-only machines.
    target = tc.device('cuda' if tc.cuda.is_available() else 'cpu')
    # Stack into one ndarray first: building a tensor from a list of ndarrays
    # converts element by element and is very slow. The int64 cast truncates
    # float-encoded indices exactly like tc.LongTensor did.
    seq_batch = tc.from_numpy(np.asarray(sequences, dtype=np.int64)).to(target)
    smi_batch = tc.from_numpy(np.asarray(smileseq, dtype=np.int64)).to(target)
    # Pin float32 so labels match the model output dtype whatever scalar type
    # the records carry.
    label_batch = tc.tensor(labels, dtype=tc.float32).to(target)
    return seq_batch, smi_batch, label_batch

In [8]:
#%% Declare the learning module
class Regressor(nn.Module):
    """Two-branch 1-D CNN that regresses one value from a protein/ligand pair.

    Each branch embeds integer-encoded tokens, applies three un-padded
    Conv1d + BatchNorm1d + ReLU stages and max-pools over the remaining length.
    The two 96-channel results are concatenated (192 features) and passed
    through a five-layer MLP followed by a single-output linear layer.

    The pooling kernel sizes are hard-coded for inputs of 1400 protein
    positions and 100 ligand positions; other lengths will not reduce to a
    single pooled position.
    """

    def __init__(self):
        super(Regressor, self).__init__()    # initialise nn.Module before registering sub-modules

        # Protein branch: indices in [0, 21) are embedded into 10 dimensions.
        self.prt_emlayer = nn.Embedding(21, 10)

        self.prt_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 12),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        # 1400 - 3 - 7 - 11 = 1379 positions remain after the
                        # convolutions, so this pools over the whole length.
                        nn.MaxPool1d(kernel_size=1379)
                        )

        ######################################################################
        ######################################################################

        # Ligand branch: indices in [0, 64) are embedded into 10 dimensions.
        self.lgn_emlayer = nn.Embedding(64, 10)

        self.lgn_cv1dlayers = nn.Sequential(
                        nn.Conv1d(10, 32, kernel_size = 4),
                        nn.BatchNorm1d(num_features = 32),
                        nn.ReLU(),
                        nn.Conv1d(32, 64, kernel_size = 6),
                        nn.BatchNorm1d(num_features = 64),
                        nn.ReLU(),
                        nn.Conv1d(64, 96, kernel_size = 8),
                        nn.BatchNorm1d(num_features = 96),
                        nn.ReLU(),
                        # 100 - 3 - 5 - 7 = 85 positions remain after the
                        # convolutions, so this pools over the whole length.
                        nn.MaxPool1d(kernel_size = 85)
                        )

        # MLP head over the concatenated 96 + 96 branch features.
        self.mlplayers = nn.Sequential(
                        nn.Linear(192, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 1024),
                        nn.BatchNorm1d(1024),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(1024, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU(),
                        nn.Dropout(0.1),
                        nn.Linear(512, 512),
                        nn.BatchNorm1d(512),
                        nn.ReLU()
                        )

        self.regress = nn.Linear(512, 1)    # regression output

    def forward(self, prt_seq, lgn_seq):
        """Predict one value per (protein, ligand) pair.

        Args:
            prt_seq: integer tensor of protein token indices, (batch, length).
            lgn_seq: integer tensor of ligand token indices, (batch, length).

        Returns:
            Tensor of shape (batch, 1), on the same device as the module.
        """
        p = self.prt_emlayer(prt_seq)
        p = p.permute(0, 2, 1)          # Conv1d expects (batch, channels, length)
        p = self.prt_cv1dlayers(p)
        # Drop only the pooled length axis. A bare squeeze() would also remove
        # the batch axis for a batch of one and break the concatenation below.
        p = p.squeeze(-1)

        l = self.lgn_emlayer(lgn_seq)
        l = l.permute(0, 2, 1)
        l = self.lgn_cv1dlayers(l)
        l = l.squeeze(-1)

        # No .cuda() here: activations follow the device of the inputs and
        # parameters, so the module also runs on CPU-only machines.
        cat = tc.cat((p, l), dim=1)
        out = self.mlplayers(cat)

        return self.regress(out)

In [9]:
#%% Set hyperparameter
# FIXME: training-related hyperparameters
hp_d = dict(
    batch_size=512,
    num_epochs=400,
    # Adam settings
    init_learning_rate=10 ** -3.563,
    eps=10 ** -8,
    weight_decay=10 ** -5,
)

In [None]:
#%% training 80%, validation 10%, test 10%
# Split sizes: 80% training, 10% validation, 10% test (each rounded down).
data_len = len(shuffled_dataset)
train_len = int(data_len * (8/10))
valid_len = int(data_len * (1/10))
test_len = int(data_len * (1/10))

trainset = shuffled_dataset[:train_len]
validset = shuffled_dataset[train_len:train_len+valid_len]
# Index from the front: with test_len == 0, [-0:] would select the whole array.
testset = shuffled_dataset[data_len - test_len:]

# Persist the held-out rows to disk.
with open('testset.npy', 'wb') as f:
    np.save(f, testset)

trainset = tuple(trainset); validset = tuple(validset); testset = tuple(testset)

tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(validset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
ts_data_loader = DataLoader(testset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

# `device` is chosen in the dataset/device cell (CUDA when available).
model = Regressor().to(device)
loss_func = nn.MSELoss(reduction='mean')

optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
    weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

# One path for both saving and re-loading the best checkpoint. The original
# saved to this name but loaded 'results/model9_wheight.pt', which this cell
# never writes.
checkpoint_path = 'results/model9_weight1.pt'
best_loss = float('inf')

for epoch in range(hp_d['num_epochs']):                             #!! epoch-loop
    # =======================================================================
    # training session
    model.train()
    tr_epoch_loss = 0

    for seq, smi, label in tr_data_loader:                          #!! batch-loop
        prediction = model(seq, smi).view(-1)
        loss = loss_func(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_epoch_loss += loss.item()

    # Mean of the per-batch mean losses.
    tr_epoch_loss /= len(tr_data_loader)
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))

    # =======================================================================
    # validation session
    model.eval()
    va_epoch_loss = 0

    # no_grad: evaluation needs no autograd graph, which saves memory and time.
    with torch.no_grad():
        for seq, smi, label in va_data_loader:                      # batch-loop
            prediction = model(seq, smi).view(-1)
            loss = loss_func(prediction, label)

            va_epoch_loss += loss.item()

    va_epoch_loss /= len(va_data_loader)
    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))

    # Keep only the checkpoint with the lowest validation loss so far.
    if va_epoch_loss < best_loss:
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': va_epoch_loss
                }, checkpoint_path)
        best_loss = va_epoch_loss

# ===========================================================================
# test session: evaluate the best checkpoint on the held-out test set
checkpoint = torch.load(checkpoint_path, map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
epoch = checkpoint['epoch']
loss = checkpoint['loss']

model.eval()
ts_epoch_loss = 0
test_predictions = []
test_labels = []

with torch.no_grad():
    for seq, smi, label in ts_data_loader:                          # batch-loop
        prediction = model(seq, smi).view(-1)
        loss = loss_func(prediction, label)

        ts_epoch_loss += loss.item()
        # Collect every batch so the correlations below cover the whole test
        # set, not just the final batch.
        test_predictions.append(prediction.cpu())
        test_labels.append(label.cpu())

ts_epoch_loss /= len(ts_data_loader)
# The original passed (epoch, ts_epoch_loss) to a single placeholder and so
# printed the epoch number instead of the loss.
print('Test loss {:.4f}'.format(ts_epoch_loss))


prediction = torch.cat(test_predictions).numpy()
label = torch.cat(test_labels).numpy()
print('prediction-label corr coef:', np.corrcoef(prediction, label))

# Rank of each element: position i of the sorted order receives rank i.
temp_pre = prediction.argsort()
rank_pre = np.empty_like(temp_pre)
rank_pre[temp_pre] = np.arange(len(prediction))

temp_lab = label.argsort()
rank_lab = np.empty_like(temp_lab)
rank_lab[temp_lab] = np.arange(len(label))

# Pearson correlation of the ranks.
print('rank-order corr coef:',np.corrcoef(rank_pre, rank_lab))


In [10]:
#%% training 90%, validation 10%
# Split sizes: 90% training, 10% validation (each rounded down).
data_len = len(shuffled_dataset)
train_len = int(data_len * (9/10))
valid_len = int(data_len * (1/10))

trainset = shuffled_dataset[:train_len]
# Index from the front: with valid_len == 0, [-0:] would select the whole array.
validset = shuffled_dataset[data_len - valid_len:]

trainset = tuple(trainset); validset = tuple(validset)

tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
va_data_loader = DataLoader(validset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

# `device` is chosen in the dataset/device cell (CUDA when available).
model = Regressor().to(device)
loss_func = nn.MSELoss(reduction='mean')

optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
    weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

# Start from +inf so the first epoch always writes a checkpoint, whatever the
# scale of the loss.
best_loss = float('inf')

for epoch in range(hp_d['num_epochs']):                             #!! epoch-loop
    # =======================================================================
    # training session
    model.train()
    tr_epoch_loss = 0

    for seq, smi, label in tr_data_loader:                          #!! batch-loop
        prediction = model(seq, smi).view(-1)
        loss = loss_func(prediction, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        tr_epoch_loss += loss.item()

    # Mean of the per-batch mean losses.
    tr_epoch_loss /= len(tr_data_loader)
    print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))

    # =======================================================================
    # validation session
    model.eval()
    va_epoch_loss = 0

    # no_grad: evaluation needs no autograd graph, which saves memory and time.
    with torch.no_grad():
        for seq, smi, label in va_data_loader:                      # batch-loop
            prediction = model(seq, smi).view(-1)
            loss = loss_func(prediction, label)

            va_epoch_loss += loss.item()

    va_epoch_loss /= len(va_data_loader)
    print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))

    # Keep only the checkpoint with the lowest validation loss so far.
    if va_epoch_loss < best_loss:
        torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': va_epoch_loss
                }, 'results/model9_weight2.pt')
        best_loss = va_epoch_loss


Training epoch 0, loss 40.7391
Validation epoch 0, loss 2.7955
Training epoch 1, loss 1.1354
Validation epoch 1, loss 0.4640
Training epoch 2, loss 0.6872
Validation epoch 2, loss 0.3659
Training epoch 3, loss 0.5870
Validation epoch 3, loss 0.3225
Training epoch 4, loss 0.4929
Validation epoch 4, loss 0.3249
Training epoch 5, loss 0.4042
Validation epoch 5, loss 0.3504
Training epoch 6, loss 0.3302
Validation epoch 6, loss 0.3057
Training epoch 7, loss 0.2835
Validation epoch 7, loss 0.2717
Training epoch 8, loss 0.2544
Validation epoch 8, loss 0.2517
Training epoch 9, loss 0.2325
Validation epoch 9, loss 0.2569
Training epoch 10, loss 0.2200
Validation epoch 10, loss 0.2345
Training epoch 11, loss 0.2080
Validation epoch 11, loss 0.2268
Training epoch 12, loss 0.1984
Validation epoch 12, loss 0.2222
Training epoch 13, loss 0.1881
Validation epoch 13, loss 0.2250
Training epoch 14, loss 0.1815
Validation epoch 14, loss 0.2248
Training epoch 15, loss 0.1727
Validation epoch 15, loss 0.

Validation epoch 127, loss 0.1672
Training epoch 128, loss 0.0373
Validation epoch 128, loss 0.1675
Training epoch 129, loss 0.0335
Validation epoch 129, loss 0.1743
Training epoch 130, loss 0.0327
Validation epoch 130, loss 0.1800
Training epoch 131, loss 0.0327
Validation epoch 131, loss 0.1746
Training epoch 132, loss 0.0322
Validation epoch 132, loss 0.1682
Training epoch 133, loss 0.0353
Validation epoch 133, loss 0.1665
Training epoch 134, loss 0.0425
Validation epoch 134, loss 0.1733
Training epoch 135, loss 0.0416
Validation epoch 135, loss 0.1892
Training epoch 136, loss 0.0411
Validation epoch 136, loss 0.1755
Training epoch 137, loss 0.0412
Validation epoch 137, loss 0.1900
Training epoch 138, loss 0.0401
Validation epoch 138, loss 0.1900
Training epoch 139, loss 0.0374
Validation epoch 139, loss 0.1666
Training epoch 140, loss 0.0355
Validation epoch 140, loss 0.1608
Training epoch 141, loss 0.0329
Validation epoch 141, loss 0.1573
Training epoch 142, loss 0.0314
Validation

Training epoch 252, loss 0.0191
Validation epoch 252, loss 0.1564
Training epoch 253, loss 0.0188
Validation epoch 253, loss 0.1611
Training epoch 254, loss 0.0182
Validation epoch 254, loss 0.1802
Training epoch 255, loss 0.0183
Validation epoch 255, loss 0.1871
Training epoch 256, loss 0.0231
Validation epoch 256, loss 0.1923
Training epoch 257, loss 0.0203
Validation epoch 257, loss 0.1726
Training epoch 258, loss 0.0204
Validation epoch 258, loss 0.1922
Training epoch 259, loss 0.0194
Validation epoch 259, loss 0.1767
Training epoch 260, loss 0.0188
Validation epoch 260, loss 0.1779
Training epoch 261, loss 0.0187
Validation epoch 261, loss 0.1586
Training epoch 262, loss 0.0203
Validation epoch 262, loss 0.1544
Training epoch 263, loss 0.0226
Validation epoch 263, loss 0.1507
Training epoch 264, loss 0.0239
Validation epoch 264, loss 0.1825
Training epoch 265, loss 0.0245
Validation epoch 265, loss 0.1597
Training epoch 266, loss 0.0244
Validation epoch 266, loss 0.1543
Training e

Validation epoch 376, loss 0.1693
Training epoch 377, loss 0.0245
Validation epoch 377, loss 0.1741
Training epoch 378, loss 0.0206
Validation epoch 378, loss 0.1729
Training epoch 379, loss 0.0180
Validation epoch 379, loss 0.1737
Training epoch 380, loss 0.0164
Validation epoch 380, loss 0.1633
Training epoch 381, loss 0.0161
Validation epoch 381, loss 0.1544
Training epoch 382, loss 0.0159
Validation epoch 382, loss 0.1565
Training epoch 383, loss 0.0172
Validation epoch 383, loss 0.1618
Training epoch 384, loss 0.0171
Validation epoch 384, loss 0.1636
Training epoch 385, loss 0.0180
Validation epoch 385, loss 0.1632
Training epoch 386, loss 0.0203
Validation epoch 386, loss 0.1635
Training epoch 387, loss 0.0255
Validation epoch 387, loss 0.1634
Training epoch 388, loss 0.0249
Validation epoch 388, loss 0.1653
Training epoch 389, loss 0.0225
Validation epoch 389, loss 0.1574
Training epoch 390, loss 0.0206
Validation epoch 390, loss 0.1570
Training epoch 391, loss 0.0176
Validation

In [None]:
#%% K-fold training, validation and test
# Number of folds. The original built 5 folds but reported "10-fold" and
# divided by 10, which halved the reported error.
# NOTE(review): set this to 10 if a 10-fold run was actually intended.
n_splits = 5
kf = KFold(n_splits=n_splits)
val_err = 0
for tr_idx, ts_idx in kf.split(shuffled_dataset):
    # The held-out fold is used as the validation set of this run.
    trainset, testset = shuffled_dataset[tr_idx], shuffled_dataset[ts_idx]
    trainset = tuple(trainset); testset = tuple(testset)

    tr_data_loader = DataLoader(trainset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)
    va_data_loader = DataLoader(testset, batch_size=hp_d['batch_size'], shuffle=False, collate_fn=collate)

    # A fresh model and optimizer per fold; `device` is chosen in the
    # dataset/device cell (CUDA when available).
    model = Regressor().to(device)
    loss_func = nn.MSELoss(reduction='mean')

    optimizer = optim.Adam(model.parameters(), lr=hp_d['init_learning_rate'],
        weight_decay=hp_d['weight_decay'], eps=hp_d['eps'])

    tr_epoch_losses = []
    va_epoch_losses = []

    for epoch in range(hp_d['num_epochs']):                             #!! epoch-loop
        # training session
        model.train()
        tr_epoch_loss = 0

        for seq, smi, label in tr_data_loader:                          #!! batch-loop
            prediction = model(seq, smi).view(-1)
            loss = loss_func(prediction, label)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            tr_epoch_loss += loss.item()

        # Mean of the per-batch mean losses.
        tr_epoch_loss /= len(tr_data_loader)
        print('Training epoch {}, loss {:.4f}'.format(epoch, tr_epoch_loss))
        tr_epoch_losses.append(tr_epoch_loss)

        # =======================================================================
        # validation session
        model.eval()
        va_epoch_loss = 0

        # no_grad: evaluation needs no autograd graph, which saves memory and time.
        with torch.no_grad():
            for seq, smi, label in va_data_loader:                      # batch-loop
                prediction = model(seq, smi).view(-1)
                loss = loss_func(prediction, label)

                va_epoch_loss += loss.item()

        va_epoch_loss /= len(va_data_loader)
        print('Validation epoch {}, loss {:.4f}'.format(epoch, va_epoch_loss))
        va_epoch_losses.append(va_epoch_loss)
    # Each fold contributes its best (lowest) validation loss over all epochs.
    # NOTE(review): picking the epoch on the held-out fold makes this estimate
    # optimistic.
    val_err += min(va_epoch_losses)
# Average over the number of folds that were actually run.
print('{}-fold CVMSE:'.format(n_splits), val_err / n_splits)
