In [43]:
import numpy as np 
import pandas as pd 

In [44]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [45]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors

In [46]:
seq_rdic = ['A','I','L','V','F','W','Y','N','C','Q','M','S','T','D','E','R','H','K','G','P','O','U','X','B','Z']
seq_dic = {w: i+1 for i, w in enumerate(seq_rdic)}

In [47]:
def encodeSeq(seq, seq_dic):
    if pd.isnull(seq):
        return [0]
    else:
        return [seq_dic[aa] for aa in seq]

In [48]:
def padding_seq(x, max_len=2500):
    if len(x) == 0:
        return x
    elif len(x) >= max_len:
        return x[:max_len]
    else:
        return x + [0]*(max_len-len(x))

In [49]:
def parse_data(dti_dir, prot_len=2500, drug_len=2048):

    protein_col = "protein"
    drug_col = "compound"
    label_col = "label"
    col_names = [protein_col, drug_col, label_col]
    
    dti_df = pd.read_csv(dti_dir, header=0)

    dti_df[protein_col] = dti_df[protein_col].map(lambda a: encodeSeq(a, seq_dic))
    
    drug_feature = np.stack(dti_df[drug_col].map(
        lambda sm: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(sm),2,nBits=drug_len)
    ))
    
    protein_feature = np.stack(dti_df[protein_col].map(padding_seq))
    label = dti_df[label_col].values
    
    print("\tPositive data : %d" %(sum(dti_df[label_col])))
    print("\tNegative data : %d" %(dti_df.shape[0] - sum(dti_df[label_col])))
    
    return {"protein_feature": protein_feature, 
            "drug_feature": drug_feature, 
            "label": label,
            }

In [50]:
train_datas = parse_data("../../data/maked/bias/train_human.csv")
test_datas = parse_data("../../data/maked/bias/test_human.csv")



	Positive data : 1876
	Negative data : 1496




	Positive data : 331
	Negative data : 341


In [51]:
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc_drug = nn.Linear(2048, 64)
        
        self.conv1D = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=128, stride=1, padding=0)
        self.batch = nn.BatchNorm1d(64)
        
        self.fc_out = nn.Linear(128, 128)
        self.fc_interaction = nn.Linear(128, 2)

    def forward(self, drug, protein):
        compound_vector = self.fc_drug(drug)
        compound_vector = torch.relu(compound_vector)

        protein = torch.unsqueeze(protein, 1)
        protein_vector = self.conv1D(protein)
        protein_vector = self.batch(protein_vector)
        protein_vector = torch.relu(protein_vector)
        
        protein_vector = F.max_pool1d(protein_vector, kernel_size=2373)
        protein_vector = torch.squeeze(protein_vector, 2)
        
        cat_vector = torch.cat((compound_vector, protein_vector), 1)
        
        for j in range(2):
            cat_vector = torch.relu(self.fc_out(cat_vector))
        out = self.fc_interaction(cat_vector)
        return out

In [52]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('The code uses GPU...')
else:
    device = torch.device('cpu')
    print('The code uses CPU!!!')

The code uses GPU...


In [53]:
model= Net().to(device)

In [54]:
train_drug_dataset = torch.FloatTensor(train_datas["drug_feature"])
train_protein_dataset = torch.FloatTensor(train_datas["protein_feature"])
train_target_dataset = torch.LongTensor(train_datas["label"])

In [55]:
test_drug_dataset = torch.FloatTensor(test_datas["drug_feature"])
test_protein_dataset = torch.FloatTensor(test_datas["protein_feature"])
test_target_dataset = torch.LongTensor(test_datas["label"])

In [56]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

In [57]:
train_dataset = torch.utils.data.TensorDataset(train_drug_dataset, train_protein_dataset, train_target_dataset)
test_dataset = torch.utils.data.TensorDataset(test_drug_dataset, test_protein_dataset, test_target_dataset)

N = len(train_dataset)
train_size = int(0.8 * N)
val_size = N - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    train_dataset, [train_size, val_size]
)

In [58]:

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

In [59]:
from sklearn.metrics import roc_auc_score
scores = []
for epoch in range(1000):
    loss_total = 0
    for data in train_loader:
        d, p, true_label = data
        d, p, true_label  = d.to(device), p.to(device), true_label.to(device)

        optimizer.zero_grad()
        output = model(d, p)
        predicted_label = nn.Softmax(dim=1)(output)
        loss = F.cross_entropy( predicted_label, true_label)
        loss.backward()
        optimizer.step()
        loss_total += loss.to('cpu').data.numpy()
    pred_y, true_y = [], []
    for data in val_loader:
        d, p, true_label = data
        d, p, true_label  = d.to(device), p.to(device), true_label.to(device)
        output = model(d, p)
        predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
        pred_y += predicted_label
        true_y += true_label.to('cpu').data.numpy().tolist()
    score = roc_auc_score(true_y, pred_y)
    if epoch > 10:
        if np.mean(scores[-10:]) > score:
            scores.append(score)
            break
    scores.append(score)

    print("epoch: {}, loss: {}, AUC: {}".format(epoch, loss_total, score))

epoch: 0, loss: 58.47306215763092, AUC: 0.5752908171470437
epoch: 1, loss: 58.173059940338135, AUC: 0.602817933277344
epoch: 2, loss: 58.01193618774414, AUC: 0.6202580276253953
epoch: 3, loss: 57.61207467317581, AUC: 0.65713060414917
epoch: 4, loss: 57.278115808963776, AUC: 0.6744634848024588
epoch: 5, loss: 56.851521611213684, AUC: 0.6897861086789486
epoch: 6, loss: 55.882786989212036, AUC: 0.691885710201383
epoch: 7, loss: 55.11020576953888, AUC: 0.6998150563765345
epoch: 8, loss: 53.79955035448074, AUC: 0.7301341958079445
epoch: 9, loss: 52.53967607021332, AUC: 0.7333506066508229
epoch: 10, loss: 51.19662469625473, AUC: 0.7599038650536962
epoch: 11, loss: 49.8689700961113, AUC: 0.7684898951092687
epoch: 12, loss: 47.85645377635956, AUC: 0.7915765773814842
epoch: 13, loss: 45.79874449968338, AUC: 0.8145203080606829
epoch: 14, loss: 44.47428673505783, AUC: 0.8249111019780927
epoch: 15, loss: 42.28650778532028, AUC: 0.8296195700730838
epoch: 16, loss: 40.795512080192566, AUC: 0.8376337

In [60]:
pred_y, true_y = [], []
for data in train_loader:
    d, p, true_label = data
    d, p, true_label  = d.to(device), p.to(device), true_label.to(device)
    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.9808477467642807


In [61]:
pred_y, true_y = [], []
for data in val_loader:
    d, p, true_label = data
    d, p, true_label  = d.to(device), p.to(device), true_label.to(device)

    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.8690652752711614


In [62]:
pred_y, true_y = [], []
for data in test_loader:
    d, p, true_label = data
    d, p, true_label  = d.to(device), p.to(device), true_label.to(device)
    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.8690008948268377
