# リンク予測 (Indication)
## ECFP -> ニューラルネットワーク

In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, Draw

In [2]:
import torch
import torch.utils.data
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # 最適化アルゴリズム実装のためのライブラリ

In [3]:
# Load the prepared table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle("data_side_effect.pkl")

In [4]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,smiles1,smiles2,side_effect
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.039056
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.009857
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.032003
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.023612
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...",0.024511


In [5]:
# Median of the numeric columns only. numeric_only=True is required on
# pandas >= 2.0, where the default no longer silently drops non-numeric
# (object) columns and the call would raise instead.
df.median(numeric_only=True)

side_effect    0.034465
dtype: float64

In [6]:
def f(x, lower=0.03, upper=1):
    """Binarize a score: 1 if ``lower < x <= upper``, otherwise 0.

    Args:
        x: Score to binarize.
        lower: Exclusive lower bound of the positive range (default 0.03).
        upper: Inclusive upper bound of the positive range (default 1).

    Returns:
        int: 1 when ``x`` lies in ``(lower, upper]``, else 0. Values for
        which the comparison is false (e.g. NaN) map to 0.
    """
    return 1 if lower < x <= upper else 0

In [7]:
# Replace every side_effect score with the 0/1 label returned by f
# (overwrites the column).
df["side_effect"] = df["side_effect"].map(f)

In [8]:
class Net(nn.Module):
    """Two-input network with a shared encoder and a binary output head.

    Both inputs pass through the same two linear layers (``share1`` then
    ``share2``). The two encodings are concatenated along the feature axis
    and fed through three further linear layers ending in a sigmoid, so the
    output is one value in [0, 1] per sample.

    Args:
        input_dim: Size of each input vector (default 512).
        shared_dim: Width of the shared encoder's hidden layer (default 1024).
        embed_dim: Size of each encoded input (default 512).
        hidden_dim: Width of the two hidden layers of the head (default 128).
    """

    def __init__(self, input_dim=512, shared_dim=1024, embed_dim=512, hidden_dim=128):
        super().__init__()
        # Encoder layers, applied with the same weights to both inputs.
        self.share1 = nn.Linear(input_dim, shared_dim)
        self.share2 = nn.Linear(shared_dim, embed_dim)
        # The head receives both encodings side by side: 2 * embed_dim inputs.
        self.fc1 = nn.Linear(2 * embed_dim, hidden_dim)  # first hidden layer
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)  # second hidden layer
        self.fc3 = nn.Linear(hidden_dim, 1)  # single output unit

    def _encode(self, v):
        """Apply the shared encoder (linear -> ReLU -> linear) to one input."""
        return self.share2(F.relu(self.share1(v)))

    def forward(self, x, y):
        """Return a tensor of shape (batch, 1) with values in [0, 1].

        Args:
            x: First input batch, shape (batch, input_dim).
            y: Second input batch, shape (batch, input_dim).
        """
        a = self._encode(x)
        b = self._encode(y)
        h = torch.cat([a, b], dim=1)
        h = F.relu(self.fc1(h))
        h = F.relu(self.fc2(h))
        # torch.sigmoid is the supported spelling of the legacy F.sigmoid.
        return torch.sigmoid(self.fc3(h))


net = Net()

In [9]:
# Draw 10,000 rows at random (no fixed seed) and renumber them from 0.
df_sample = df.sample(n=10000).reset_index(drop=True)

In [10]:
# Preview the first rows of the sampled, re-indexed table.
df_sample.head()

Unnamed: 0,smiles1,smiles2,side_effect
0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, ...",1
1,"[1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, ...",1
2,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
4,"[1, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


In [11]:
# Pull the two input columns and the label column out as separate Series.
A, B, target = (
    df_sample[column] for column in ("smiles1", "smiles2", "side_effect")
)

In [12]:
# Convert to float32 tensors via dense NumPy arrays. Stacking the per-row
# vectors first avoids handing a pandas object Series to the legacy
# torch.Tensor constructor, which parses it element by element through the
# Series index.
A = torch.tensor(np.asarray(A.tolist(), dtype=np.float32))
B = torch.tensor(np.asarray(B.tolist(), dtype=np.float32))
target = torch.tensor(np.asarray(target, dtype=np.float32))

In [13]:
# Bundle the two input tensors and the labels so that every sample is an
# (A, B, target) triple.
train_tensor = torch.utils.data.TensorDataset(A,B,target)
# Random 9000/1000 train/test split; the two sizes must add up to the
# dataset length (the 10,000 sampled rows).
train_dataset, test_dataset = torch.utils.data.random_split(train_tensor, [9000, 1000])

In [14]:
# Mini-batch loaders (batch size 32) over the two splits; both reshuffle on
# every pass.
# NOTE(review): shuffling the test loader is not needed for evaluation —
# consider shuffle=False there.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

In [15]:
# Binary cross-entropy on probabilities (the network already applies a sigmoid).
criterion = nn.BCELoss() 
# SGD with momentum over all parameters of net; the Adam line below is a
# disabled alternative.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
#optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)

In [16]:
# Train for 100 epochs, printing the summed mini-batch loss after each epoch.
net.train()
for epoch in range(100):
    total_loss = 0.0
    for inputA, inputB, labels in train_loader:
        optimizer.zero_grad()
        outputs = net(inputA, inputB)
        # net returns shape (batch, 1) while labels are (batch,). BCELoss
        # needs matching shapes: the mismatch only warns on old PyTorch but
        # raises ValueError on newer releases, so drop the trailing axis.
        loss = criterion(outputs.squeeze(1), labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(total_loss)
print('\n学習が終了しました。')

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


195.3426150083542
193.6116989850998
193.37617582082748
193.38556998968124
193.25361043214798
193.1447623372078
193.06200051307678
193.07367104291916
192.98464858531952
192.78636318445206
192.72203022241592
192.47545689344406
192.2884345650673
191.99032098054886
191.4787793159485
190.9117791056633
190.00746661424637
188.81767636537552
187.02255433797836
184.5559812784195
181.29213535785675
177.46491849422455
172.17524245381355
166.3661023080349
160.6258963048458
153.44904482364655
147.9443572461605
141.12190729379654
134.82055795192719
129.47833833098412
126.24787949025631
122.49762415885925
119.49750055372715
117.31455132365227
114.46781253814697
112.45002466440201
112.57549278438091
108.71586526930332
110.0341265052557
107.48325683176517
106.6518672555685
104.67224828898907
103.86283667385578
104.84498839080334
102.4812478646636
102.03561854362488
101.91147230565548
100.97072997689247
99.84156838059425
99.16636177897453
98.03230719268322
97.60158112645149
96.50024175643921
98.43519309

In [17]:
correct = 0.0  # number of correct predictions
total = 0.0  # number of test samples seen
link = 0.0  # running sum of the test labels (i.e. count of positive samples)

In [18]:
# Empty accumulators that the evaluation loop extends batch by batch.
y_label = np.empty(0)  # ground-truth labels
y_predict = np.empty(0)  # raw model outputs

In [19]:
# Evaluate on the held-out split: collect labels and predicted probabilities,
# and tally accuracy at a 0.5 threshold.
net.eval()
with torch.no_grad():  # gradients are not needed for evaluation
    for inputA, inputB, labels in test_loader:
        # One forward pass per batch; flatten (batch, 1) -> (batch,) so the
        # predictions line up element-wise with the labels.
        probs = net(inputA, inputB).squeeze(1)
        # Per-sample decision by thresholding the probability. A max over
        # dim 0 would instead pick a single index across the whole batch,
        # which is not a class prediction.
        predicted = (probs >= 0.5).float()
        total += labels.size(0)  # number of test samples seen so far
        correct += (predicted == labels).sum().item()  # correct predictions
        link += labels.sum().item()  # positives in this batch
        y_label = np.concatenate([y_label, labels.numpy()])
        y_predict = np.concatenate([y_predict, probs.numpy()])

In [20]:
# Turn each predicted probability into a hard 0/1 label at the 0.5 cut-off.
pred = [1 if prob >= 0.5 else 0 for prob in y_predict]

In [21]:
# Count the test samples whose thresholded prediction equals the label.
# Iterating over the data itself (rather than a fixed 1000) keeps this
# correct for any test-set size.
count = sum(1 for truth, guess in zip(y_label, pred) if truth == guess)
count

755

In [22]:
# Summary lines: accuracy, then the share of positive and negative labels in
# the test data. total and link are floats; %d prints them truncated to integers.
print('テストデータに対する正答率： %d / %d = %f' % (count, total, count / total) + '\n')
print('テストデータに含まれるリンクありの割合： %d / %d = %f' % (link, total, link / total) + '\n')
print('テストデータに含まれるリンクなしの割合： %d / %d = %f' % (total - link, total, (total- link) / total) + '\n')

for i in range(10):  # show the first 80 test samples as 10 rows of 8
    print("ラベル：" + "".join('%d ' % y_label[i*8 + j] for j in range(8)))  # ground-truth labels
    print("　予測：" + "".join('%d ' % pred[i*8 + j] for j in range(8)) + "\n")  # thresholded predictions

テストデータに対する正答率： 755 / 1000 = 0.755000

テストデータに含まれるリンクありの割合： 558 / 1000 = 0.558000

テストデータに含まれるリンクなしの割合： 442 / 1000 = 0.442000

ラベル：1 1 0 1 1 1 1 1 
　予測：0 1 0 0 1 1 1 1 

ラベル：1 0 1 1 1 0 1 1 
　予測：1 0 1 1 0 0 0 1 

ラベル：1 1 1 1 1 0 1 1 
　予測：1 0 1 1 0 0 0 1 

ラベル：0 0 0 0 1 1 0 0 
　予測：0 0 0 0 0 1 1 0 

ラベル：1 1 1 0 1 1 0 1 
　予測：0 1 1 1 1 1 0 1 

ラベル：1 0 1 0 0 0 1 1 
　予測：1 1 1 0 1 0 0 1 

ラベル：1 1 1 1 1 1 0 1 
　予測：1 0 1 0 1 1 1 1 

ラベル：1 1 0 1 1 0 1 1 
　予測：1 1 0 1 1 1 1 0 

ラベル：0 0 1 1 1 1 1 0 
　予測：0 1 1 1 1 1 1 0 

ラベル：0 1 0 1 1 0 0 0 
　予測：0 0 1 0 1 0 0 0 



In [23]:
from sklearn.metrics import roc_auc_score

In [24]:
# ROC AUC computed from the raw model outputs (not the thresholded labels).
roc_auc_score(y_label, y_predict)

0.8256134546457126