In [2]:
import numpy as np 
import pandas as pd 

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import rdMolDescriptors

In [5]:
seq_rdic = ['A','I','L','V','F','W','Y','N','C','Q','M','S','T','D','E','R','H','K','G','P','O','U','X','B','Z']
seq_dic = {w: i+1 for i, w in enumerate(seq_rdic)}

In [6]:
def encodeSeq(seq, seq_dic):
    if pd.isnull(seq):
        return [0]
    else:
        return [seq_dic[aa] for aa in seq]

In [7]:
def padding_seq(x, max_len=2500):
    if len(x) == 0:
        return x
    elif len(x) >= max_len:
        return x[:max_len]
    else:
        return x + [0]*(max_len-len(x))

In [8]:
def parse_data(dti_dir, prot_len=2500, drug_len=2048):

    protein_col = "protein"
    drug_col = "compound"
    label_col = "label"
    col_names = [protein_col, drug_col, label_col]
    
    dti_df = pd.read_csv(dti_dir, header=0)

    dti_df[protein_col] = dti_df[protein_col].map(lambda a: encodeSeq(a, seq_dic))
    
    drug_feature = np.stack(dti_df[drug_col].map(
        lambda sm: AllChem.GetMorganFingerprintAsBitVect(Chem.MolFromSmiles(sm),2,nBits=drug_len)
    ))
    drug_weight = np.stack(dti_df[drug_col].map(
        lambda sm: rdMolDescriptors._CalcMolWt(Chem.MolFromSmiles(sm))
    ))
    
    protein_feature = np.stack(dti_df[protein_col].map(padding_seq))
    label = dti_df[label_col].values
    
    print("\tPositive data : %d" %(sum(dti_df[label_col])))
    print("\tNegative data : %d" %(dti_df.shape[0] - sum(dti_df[label_col])))
    
    return {"protein_feature": protein_feature, 
            "drug_feature": drug_feature, 
            "label": label,
            "weight": drug_weight
            }

In [9]:
datas = parse_data(
    "../../data/maked/default/human.csv",
)



	Positive data : 3364
	Negative data : 3364


In [10]:
class Net(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.fc_drug = nn.Linear(2048, 64)
        
        self.conv1D = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=128, stride=1, padding=0)
        self.batch = nn.BatchNorm1d(64)
        
        self.fc_out = nn.Linear(128, 128)
        self.fc_interaction = nn.Linear(128, 2)

    def forward(self, drug, protein):
        compound_vector = self.fc_drug(drug)
        compound_vector = torch.relu(compound_vector)

        protein = torch.unsqueeze(protein, 1)
        protein_vector = self.conv1D(protein)
        protein_vector = self.batch(protein_vector)
        protein_vector = torch.relu(protein_vector)
        
        protein_vector = F.max_pool1d(protein_vector, kernel_size=2373)
        protein_vector = torch.squeeze(protein_vector, 2)
        
        cat_vector = torch.cat((compound_vector, protein_vector), 1)
        
        for j in range(2):
            cat_vector = torch.relu(self.fc_out(cat_vector))
        out = self.fc_interaction(cat_vector)
        return out

In [11]:
if torch.cuda.is_available():
    device = torch.device('cuda')
    print('The code uses GPU...')
else:
    device = torch.device('cpu')
    print('The code uses CPU!!!')

The code uses GPU...


In [12]:
model= Net().to(device)

In [13]:
drug_dataset = torch.FloatTensor(datas["drug_feature"])
protein_dataset = torch.FloatTensor(datas["protein_feature"])
target_dataset = torch.LongTensor(datas["label"])

In [14]:
import torch.optim as optim
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

In [15]:
dataset = torch.utils.data.TensorDataset(drug_dataset, protein_dataset, target_dataset)
N = len(dataset)
train_size = int(0.9 * N)
test_size = N - train_size
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset, [train_size, test_size]
)

In [16]:

train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=True)

In [17]:

for epoch in range(30):
    loss_total = 0
    for data in train_loader:
        d, p, true_label = data
        d, p, true_label  = d.to(device), p.to(device), true_label.to(device)

        optimizer.zero_grad()
        output = model(d, p)
        predicted_label = nn.Softmax(dim=1)(output)
        loss = F.cross_entropy( predicted_label, true_label)
        loss.backward()
        optimizer.step()
        loss_total += loss.to('cpu').data.numpy()
    print(loss_total)

124.48527079820633
116.2722233235836
99.06899937987328
86.06469029188156
80.87032479047775
76.97328686714172
75.85367009043694
74.31280264258385
69.64795199036598
69.13546362519264
68.1599238216877
67.45757111907005
67.17061084508896
67.78545519709587
67.32498994469643
67.99825528264046
65.85328212380409
67.18486088514328
65.06361949443817
64.9163729250431
65.6147093474865
64.13174310326576
64.0649505853653
64.95881590247154
63.35362073779106
63.27229779958725
65.34714275598526
66.23399412631989
63.56633269786835
64.42056247591972


In [20]:
from sklearn.metrics import roc_auc_score
pred_y, true_y = [], []
for data in train_loader:
    d, p, true_label = data
    d, p, true_label  = d.to(device), p.to(device), true_label.to(device)
    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.9880679986986187


In [22]:
pred_y, true_y = [], []
for data in test_loader:
    d, p, true_label = data
    d, p, true_label  = d.to(device), p.to(device), true_label.to(device)
    output = model(d, p)
    predicted_label = nn.Softmax(dim=1)(output)[:,1].to('cpu').data.numpy().tolist()
    pred_y += predicted_label
    true_y += true_label.to('cpu').data.numpy().tolist()
print(roc_auc_score(true_y, pred_y))

0.968127331553544
