In [1]:
import numpy as np 
import pandas as pd 

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors

In [23]:
import warnings
warnings.simplefilter('ignore')

In [4]:
seq_rdic = ['A','I','L','V','F','W','Y','N','C','Q','M','S','T','D','E','R','H','K','G','P','O','U','X','B','Z']
seq_dic = {w: i+1 for i, w in enumerate(seq_rdic)}

In [5]:
def encodeSeq(seq, seq_dic):
    if pd.isnull(seq):
        return [0]
    else:
        return [seq_dic[aa] for aa in seq]

In [6]:
def padding_seq(x, max_len=2500):
    if len(x) == 0:
        return x
    elif len(x) >= max_len:
        return x[:max_len]
    else:
        return x + [0]*(max_len-len(x))

In [7]:
def parse_data(dti_dir, prot_len=2500, drug_len=2048, is_train=True):

    protein_col = "protein"
    drug_col = "compound"
    label_col = "label"
    weight_col = "weight"
    col_names = [protein_col, drug_col, label_col, weight_col]
    
    dti_df = pd.read_csv(dti_dir, header=0)

    dti_df[protein_col] = dti_df[protein_col].map(lambda a: encodeSeq(a, seq_dic))
    
    drug_feature = np.stack(dti_df[drug_col].map(
        lambda sm: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(sm),2,nBits=drug_len)
    ))
    
    protein_feature = np.stack(dti_df[protein_col].map(padding_seq))
    label = dti_df[label_col].values
    if is_train:
        weight = dti_df[weight_col].values
    
    print("\tPositive data : %d" %(sum(dti_df[label_col])))
    print("\tNegative data : %d" %(dti_df.shape[0] - sum(dti_df[label_col])))
    
    if is_train:
        return {"protein_feature": protein_feature, 
            "drug_feature": drug_feature, 
            "label": label,
            "weight": weight,
            }
    else:
        return {"protein_feature": protein_feature, 
            "drug_feature": drug_feature, 
            "label": label,
            }

In [9]:
train_datas = parse_data("../../data/maked/bias/train_celegans.csv")
test_datas = parse_data("../../data/maked/bias/test_celegans.csv", is_train=False)



	Positive data : 2000
	Negative data : 1949
	Positive data : 401
	Negative data : 377


In [10]:
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc_drug = nn.Linear(2048, 64)
        
        self.conv1D = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=128, stride=1, padding=0)
        self.batch = nn.BatchNorm1d(64)
        
        self.fc_out = nn.Linear(128, 128)
        self.fc_interaction = nn.Linear(128, 2)

    def forward(self, drug, protein):
        compound_vector = self.fc_drug(drug)
        compound_vector = torch.relu(compound_vector)

        protein = torch.unsqueeze(protein, 1)
        protein_vector = self.conv1D(protein)
        protein_vector = self.batch(protein_vector)
        protein_vector = torch.relu(protein_vector)
        
        protein_vector = F.max_pool1d(protein_vector, kernel_size=2373)
        protein_vector = torch.squeeze(protein_vector, 2)
        
        cat_vector = torch.cat((compound_vector, protein_vector), 1)
        
        for j in range(2):
            cat_vector = torch.relu(self.fc_out(cat_vector))
        out = self.fc_interaction(cat_vector)
        return out

In [11]:
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print('The code uses GPU...')
else:
    device = torch.device('cpu')
    print('The code uses CPU!!!')

The code uses GPU...


In [12]:
model= Net().to(device)

In [13]:
train_drug_dataset = torch.FloatTensor(train_datas["drug_feature"])
train_protein_dataset = torch.FloatTensor(train_datas["protein_feature"])
train_weight_dataset = torch.FloatTensor(train_datas["weight"])
train_target_dataset = torch.LongTensor(train_datas["label"])

In [14]:
test_drug_dataset = torch.FloatTensor(test_datas["drug_feature"])
test_protein_dataset = torch.FloatTensor(test_datas["protein_feature"])
test_target_dataset = torch.LongTensor(test_datas["label"])

In [15]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [16]:
train_dataset = torch.utils.data.TensorDataset(train_drug_dataset, train_protein_dataset, train_weight_dataset, train_target_dataset)
test_dataset = torch.utils.data.TensorDataset(test_drug_dataset, test_protein_dataset, test_target_dataset)

N = len(train_dataset)
train_size = int(0.8 * N)
val_size = N - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size]
)

In [17]:

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

In [19]:
from sklearn.metrics import roc_auc_score
scores = []
for epoch in range(1000):
    loss_total = 0
    for data in train_loader:
        d, p, w, true_label = data
        d, p, w, true_label  = d.to(device), p.to(device), w.to(device), true_label.to(device)

        optimizer.zero_grad()
        output = model(d, p)
        predicted_label = nn.Softmax(dim=1)(output)
        loss = torch.mean(F.cross_entropy( predicted_label, true_label, reduce='none'))
        loss.backward()
        optimizer.step()
        loss_total += loss.to('cpu').data.numpy()
    pred_y, true_y = [], []
    for data in val_loader:
        d, p, w, true_label = data
        d, p, w, true_label  = d.to(device), p.to(device), w.to(device), true_label.to(device)
        output = model(d, p)
        predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
        pred_y += predicted_label
        true_y += true_label.to('cpu').data.numpy().tolist()
    score = roc_auc_score(true_y, pred_y)
    if epoch > 10:
        if np.mean(scores[-10:]) > score:
            scores.append(score)
            break
    scores.append(score)

    print("epoch: {}, loss: {}, AUC: {}".format(epoch, loss_total, score))

epoch: 0, loss: 68.1805129647255, AUC: 0.6529832537114091
epoch: 1, loss: 67.67079412937164, AUC: 0.6763946678995222
epoch: 2, loss: 66.34617406129837, AUC: 0.6967303642060924
epoch: 3, loss: 64.98169642686844, AUC: 0.7162024194791184
epoch: 4, loss: 63.827765345573425, AUC: 0.7381209225869421
epoch: 5, loss: 62.479497253894806, AUC: 0.7539297272306982
epoch: 6, loss: 60.3669136762619, AUC: 0.7716777109980993
epoch: 7, loss: 58.74269127845764, AUC: 0.7864077669902912
epoch: 8, loss: 57.180183589458466, AUC: 0.7970668310474135
epoch: 9, loss: 54.302863001823425, AUC: 0.8075653670313865
epoch: 10, loss: 53.19386637210846, AUC: 0.8219229978938717
epoch: 11, loss: 51.370805501937866, AUC: 0.8311052036780191
epoch: 12, loss: 48.71482867002487, AUC: 0.8410836287049879
epoch: 13, loss: 47.22610658407211, AUC: 0.849527405352648
epoch: 14, loss: 46.749104380607605, AUC: 0.8601222581805106
epoch: 15, loss: 45.11350464820862, AUC: 0.8629732367596444
epoch: 16, loss: 43.529558420181274, AUC: 0.873

In [20]:
pred_y, true_y = [], []
for data in train_loader:
    d, p, w, true_label = data
    d, p, w, true_label  = d.to(device), p.to(device), w.to(device), true_label.to(device)

    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.9854820309873912


In [21]:
pred_y, true_y = [], []
for data in val_loader:
    d, p, w, true_label = data
    d, p, w, true_label  = d.to(device), p.to(device), w.to(device), true_label.to(device)

    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.9486823855755895


In [22]:
pred_y, true_y = [], []
for data in test_loader:
    d, p, true_label = data
    d, p, true_label  = d.to(device), p.to(device), true_label.to(device)
    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.934037585082387
