# リンク予測(DrugBank)
## ECFP -> ニューラルネットワーク

## データについて
compound.csvは、drugbankのapprovedな化合物の構造情報の入ったstructure.sdfからDB番号とその化合物のSMILESをECFPで512bitのベクトルにしたデータを持っている。drugbankのfulldatabase.xmlからdrug interactionの部分を抜き出し、関係ある組みをdata_pare.csvで保存している。
####  このファイルで扱うlink_pare.pklとnolink_pare.pklについては、それぞれ化合物のリンクありのペア、なしのペアの512bit×2のベクトルと有無の{0,1}を含むデータ

In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, Draw

In [2]:
import torch
import torch.utils.data
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # 最適化アルゴリズム実装のためのライブラリ

In [3]:
df_link = pd.read_pickle('link_pare.pkl')

In [4]:
df_nolink = pd.read_pickle('nolink_pare.pkl')

In [5]:
df_link.tail()

Unnamed: 0,smiles_1,smiles_2,target
650632,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
650633,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
650634,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
650635,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
650636,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ...",1


In [6]:
df_nolink.tail()

Unnamed: 0,smiles_1,smiles_2,target
2582384,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0
2582385,"[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0
2582386,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...",0
2582387,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2582388,"[0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [7]:
class Net(nn.Module):  # 多層ニューラルネットワークの構築
    def __init__(self):
        super(Net, self).__init__()
        self.share1 = nn.Linear(512,1024)
        self.share2 = nn.Linear(1024,512)
        self.fc1 = nn.Linear(1024, 128)  # 一つ目の隠れ層のユニット数は512
        self.fc2 = nn.Linear(128, 128)  # 二つ目の隠れ層のユニット数は128
        self.fc3 = nn.Linear(128, 1)  # 出力層のユニット数は1


    def forward(self, x, y):
        a = F.relu(self.share1(x))
        a = self.share2(a)
        b = F.relu(self.share1(y))
        b = self.share2(b)
        x = torch.cat([a,b], dim=1)
        x = F.relu(self.fc1(x))  # 活性化関数にはReLUを使用
        x = F.relu(self.fc2(x))  # 活性化関数にはReLUを使用
        x = F.sigmoid(self.fc3(x))
        return x

net = Net()

In [8]:
net.train()

Net(
  (share1): Linear(in_features=512, out_features=1024, bias=True)
  (share2): Linear(in_features=1024, out_features=512, bias=True)
  (fc1): Linear(in_features=1024, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=1, bias=True)
)

In [9]:
df = pd.concat([df_link, df_nolink])
df = df.sample(frac=1)
df = df.reset_index(drop=True)

In [10]:
df_sample = df.sample(n=10000)
df_sample = df_sample.reset_index(drop=True)

In [11]:
df_sample.head()

Unnamed: 0,smiles_1,smiles_2,target
0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",0
3,"[1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [12]:
A = df_sample["smiles_1"]
B = df_sample["smiles_2"]
target = df_sample["target"]

In [13]:
A = torch.Tensor(A)
B = torch.Tensor(B)
target = torch.Tensor(target)

In [16]:
train_tensor = torch.utils.data.TensorDataset(A,B,target)
train_dataset, test_dataset = torch.utils.data.random_split(train_tensor, [9000, 1000])

In [1]:
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

NameError: name 'torch' is not defined

In [18]:
criterion = nn.BCELoss() 
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
#optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)

In [19]:
for epoch in range(100):
    total_loss = 0.0
    for i,data in enumerate(train_loader):
        inputA,inputB, labels = data
        optimizer.zero_grad()  
        outputs = net(inputA,inputB)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(total_loss)
print('\n学習が終了しました。')

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


165.65033343434334
143.28736525774002
141.56463953852654
141.21573001146317
141.2772894501686
140.70640560984612
140.62453815340996
140.49379408359528
140.4364049732685
139.76800972223282
139.4693266749382
138.41875125467777
137.3926550745964
135.9909774363041
134.3025318980217
132.18200606107712
129.836967125535
127.14128413796425
124.63915520906448
121.75282882153988
118.38367913663387
114.96248944103718
111.15059807896614
106.65535575151443
103.12684381753206
98.38952718675137
93.98743283748627
89.25970967113972
85.56924080103636
80.39787148684263
77.16557044535875
72.40935228019953
68.66813292354345
67.97179218754172
62.82472752034664
60.46506704390049
59.51380102708936
56.39119489863515
53.69862088561058
52.14755528792739
53.8911803252995
49.628217458724976
47.4805322997272
45.94580235145986
49.04339971393347
44.276848370209336
43.56863881088793
43.98586699925363
42.26857727020979
40.9021244905889
41.139714013785124
39.06892497744411
39.10173236951232
36.71541526541114
36.74071266

In [27]:
correct = 0.0  # 正答数を表す
total = 0.0  # テストデータの総数を表す
link = 0.0

In [28]:
y_label = np.empty(0)
y_predict = np.empty(0)

In [30]:
for data in test_loader:
    inputA, inputB, labels = data
    outputs = net(inputA,inputB)
    _, predicted = torch.max(outputs.data, 0)
    total += labels.size(0)  # テストデータの総数を計算
    correct += (predicted == labels).sum().item()  # 正答数を計算
    link += sum(labels.numpy())
    y_label = np.concatenate([y_label, np.array(labels.numpy())])
    y_predict = np.concatenate([y_predict, np.array(net(inputA,inputB).detach().numpy()).ravel()])

In [31]:
pred = []
for i in y_predict:
    if i >= 0.5:
        pred.append(1)
    else:
        pred.append(0)

In [32]:
count = 0
for i in range(1000):
    if y_label[i] == pred[i]:
        count += 1
count

837

In [33]:
print('テストデータに対する正答率： %d / %d = %f' % (count, total, count / total) + '\n')
print('テストデータに含まれるリンクありの割合： %d / %d = %f' % (link, total, link / total) + '\n')
print('テストデータに含まれるリンクなしの割合： %d / %d = %f' % (total - link, total, (total- link) / total) + '\n')

for i in range(10):  # テストデータの一部を10行に分けて可視化
    print("ラベル：" + "".join('%d ' % y_label[i*8 + j] for j in range(8)))  # ラベルの値を表示
    print("　予測：" + "".join('%d ' % pred[i*8 + j] for j in range(8)) + "\n")  # 予測結果を表示

テストデータに対する正答率： 837 / 1000 = 0.837000

テストデータに含まれるリンクありの割合： 221 / 1000 = 0.221000

テストデータに含まれるリンクなしの割合： 779 / 1000 = 0.779000

ラベル：1 0 1 0 1 0 0 0 
　予測：0 0 1 0 1 0 0 0 

ラベル：0 0 0 0 0 0 0 0 
　予測：1 0 0 0 0 0 0 0 

ラベル：0 0 0 1 0 0 0 0 
　予測：0 0 0 1 1 0 0 0 

ラベル：0 0 1 0 0 1 0 0 
　予測：0 0 1 0 0 1 1 0 

ラベル：0 0 1 0 1 0 0 0 
　予測：0 0 0 0 0 0 0 0 

ラベル：0 0 0 1 0 0 0 0 
　予測：0 0 1 1 0 0 0 0 

ラベル：0 0 1 0 0 0 0 0 
　予測：1 0 1 0 0 0 0 0 

ラベル：0 0 1 0 0 0 0 0 
　予測：0 1 1 0 0 1 0 0 

ラベル：0 0 0 0 0 0 1 0 
　予測：0 0 0 0 0 1 1 0 

ラベル：1 0 0 0 0 0 0 1 
　予測：0 0 0 0 0 0 0 1 



In [34]:
from sklearn.metrics import roc_auc_score

In [36]:
roc_auc_score(y_label, y_predict)

0.8541580748029438