# リンク予測 (Indication)
## ECFP -> ニューラルネットワーク

In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, Draw

In [2]:
import torch
import torch.utils.data
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # 最適化アルゴリズム実装のためのライブラリ

In [3]:
# Load the pre-computed pair table from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df = pd.read_pickle("data_substruction.pkl")

In [4]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,smiles1,smiles2,substructure
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.69
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, ...",0.22
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.27
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.36
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, ...",0.44


In [5]:
# Median of the numeric columns only. The smiles1/smiles2 columns appear to
# hold list-like objects; recent pandas versions raise on such columns
# instead of silently skipping them unless numeric_only=True is passed.
df.median(numeric_only=True)

substructure    0.33
dtype: float64

In [6]:
def f(x):
    """Binarize a score.

    Returns 1 when ``x`` lies in the half-open interval (0.3, 1],
    otherwise 0.
    """
    return 1 if 0.3 < x <= 1 else 0

In [7]:
# Overwrite the continuous substructure score with the 0/1 label produced by f.
df["substructure"] = df["substructure"].map(f)

In [8]:
class Net(nn.Module):
    """Shared-weight (siamese-style) binary classifier over a pair of vectors.

    Both inputs go through the same encoder (``share1`` -> ReLU -> ``share2``),
    the two encodings are concatenated along the feature axis, and a
    three-layer head maps the result to one sigmoid probability per pair.

    Args:
        in_features: Length of each input vector (default 512).
        shared_hidden: Width of the encoder's hidden layer (default 1024).
        embed_dim: Size of the encoding produced for each input (default 512).
        head_hidden: Width of the head's two hidden layers (default 128).

    The default values reproduce the original fixed architecture, and the
    layer attribute names are unchanged.
    """

    def __init__(self, in_features=512, shared_hidden=1024, embed_dim=512,
                 head_hidden=128):
        super().__init__()
        # Encoder applied to both inputs with the same weights.
        self.share1 = nn.Linear(in_features, shared_hidden)
        self.share2 = nn.Linear(shared_hidden, embed_dim)
        # The head consumes the two encodings side by side: 2 * embed_dim.
        self.fc1 = nn.Linear(2 * embed_dim, head_hidden)  # first hidden layer
        self.fc2 = nn.Linear(head_hidden, head_hidden)  # second hidden layer
        self.fc3 = nn.Linear(head_hidden, 1)  # single output unit

    def _encode(self, v):
        """Encode one input; no activation is applied after ``share2``."""
        return self.share2(F.relu(self.share1(v)))

    def forward(self, x, y):
        """Return a (batch, 1) tensor of probabilities for the pairs (x, y).

        ``x`` and ``y`` are 2-D tensors of shape (batch, in_features).
        """
        pair = torch.cat([self._encode(x), self._encode(y)], dim=1)
        hidden = F.relu(self.fc1(pair))
        hidden = F.relu(self.fc2(hidden))
        return torch.sigmoid(self.fc3(hidden))

net = Net()

In [9]:
# Draw 10,000 rows at random (no seed is given, so the subset differs between
# runs) and discard the old index in favour of a fresh default one.
df_sample = df.sample(n=10000).reset_index(drop=True)

In [10]:
# Preview the sampled rows; substructure now holds the 0/1 label.
df_sample.head()

Unnamed: 0,smiles1,smiles2,substructure
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, ...",0
2,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
3,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, ...",1
4,"[0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


In [11]:
# Pull out the two input columns and the label column of the sampled frame.
A, B, target = (
    df_sample[column] for column in ("smiles1", "smiles2", "substructure")
)

In [12]:
# Convert each pandas column to a torch tensor. torch.Tensor(...) builds a
# tensor of the default dtype (float32 unless the default was changed).
# NOTE(review): assumes every smiles1/smiles2 entry is an equal-length numeric
# sequence (length 512 to match Net.share1) — TODO confirm against the pickle.
A = torch.Tensor(A)
B = torch.Tensor(B)
target = torch.Tensor(target)

In [13]:
# Bundle the three tensors so each item is an (A_i, B_i, target_i) triple,
# then split at random into 9000 training and 1000 test items. The sizes are
# hard-coded and rely on the dataset holding exactly 10000 rows.
train_tensor = torch.utils.data.TensorDataset(A,B,target)
train_dataset, test_dataset = torch.utils.data.random_split(train_tensor, [9000, 1000])

In [14]:
# Mini-batch iterators: 32 items per batch, reshuffled on every pass.
# NOTE(review): shuffling the test loader is not needed for evaluation; it
# only changes the order in which the test items are visited.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

In [15]:
# Binary cross-entropy on probabilities (Net.forward already applies sigmoid).
criterion = nn.BCELoss() 
# SGD with momentum over all parameters of the network.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# Alternative optimizer, kept for reference:
#optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)

In [16]:
# Train for 100 epochs, printing the summed mini-batch loss after each epoch.
for epoch in range(100):
    total_loss = 0.0
    for inputA, inputB, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left from the previous step
        outputs = net(inputA, inputB)
        # The network returns shape (batch, 1) while the labels are (batch,).
        # BCELoss requires identical shapes: older PyTorch only warned about
        # the mismatch, current releases raise ValueError. Reshape the labels
        # to the output shape so the loss is computed element by element.
        loss = criterion(outputs, labels.reshape(outputs.shape))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(total_loss)
print('\n学習が終了しました。')

  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
  return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)


194.27072232961655
193.50788915157318
192.95556277036667
191.93902665376663
189.87885981798172
186.09910702705383
178.0148402452469
160.68422615528107
134.71387448906898
114.43247525393963
103.1713736653328
96.57614215835929
92.72435890883207
89.51819478720427
86.71247202903032
84.3325884193182
81.73652877658606
79.61334620416164
77.60657557100058
76.77976509183645
73.93292443454266
71.82342028990388
70.57596333324909
68.13880552351475
65.69090553745627
63.80363579839468
62.94597828015685
61.212806917726994
59.32810387015343
58.62578305602074
57.575908333063126
56.62351609393954
55.169795759022236
53.826446164399385
52.26783712953329
52.31801034882665
51.51084143295884
49.58783462271094
48.842662669718266
47.686126723885536
46.62941251322627
45.429741360247135
44.23699692264199
42.48587587289512
42.00279350532219
40.757057439535856
39.50053837150335
37.70166852325201
36.63966316357255
35.202766455709934
34.26619231980294
31.878463247790933
31.834991484880447
29.46985045168549
27.417455

In [17]:
correct = 0.0  # number of correct predictions
total = 0.0  # total number of test samples
link = 0.0  # running sum of the test labels, i.e. number of samples labelled 1

In [18]:
# Empty arrays that the evaluation loop extends batch by batch:
# y_label collects the true labels, y_predict the network outputs.
y_label = np.empty(0)
y_predict = np.empty(0)

In [19]:
# Evaluate on the test set. Gradients are not needed here, so the forward
# passes run under no_grad, and the network is called once per batch.
with torch.no_grad():
    for inputA, inputB, labels in test_loader:
        # (batch, 1) probabilities flattened to (batch,) to line up with labels.
        probs = net(inputA, inputB).reshape(-1)
        # Hard prediction by thresholding at 0.5. The previous
        # torch.max(outputs, 0) returned the index of the largest output
        # within the batch, which is not a class prediction.
        predicted = (probs >= 0.5).to(labels.dtype)
        total += labels.size(0)  # count the test samples
        correct += (predicted == labels).sum().item()  # count correct answers
        link += labels.sum().item()  # labels are 0/1, so this counts the 1s
        y_label = np.concatenate([y_label, labels.numpy()])
        y_predict = np.concatenate([y_predict, probs.numpy()])

In [20]:
# Turn each predicted score into a hard 0/1 prediction (1 when score >= 0.5).
pred = [1 if score >= 0.5 else 0 for score in y_predict]

In [21]:
# Count the test samples whose thresholded prediction equals the label.
# Pairing the two sequences with zip covers every evaluated sample instead of
# assuming that the test set holds exactly 1000 items.
count = sum(1 for label, guess in zip(y_label, pred) if label == guess)
count

898

In [22]:
# Summary: accuracy on the test set, then the share of positive (label 1)
# and negative (label 0) samples. total and link are floats; %d truncates
# them to integers for display.
print('テストデータに対する正答率： %d / %d = %f' % (count, total, count / total) + '\n')
print('テストデータに含まれるリンクありの割合： %d / %d = %f' % (link, total, link / total) + '\n')
print('テストデータに含まれるリンクなしの割合： %d / %d = %f' % (total - link, total, (total- link) / total) + '\n')

for i in range(10):  # show the first 100 test samples as 10 rows of 10
    print("ラベル：" + "".join('%d ' % y_label[i*10 + j] for j in range(10)))  # true labels
    print("　予測：" + "".join('%d ' % pred[i*10 + j] for j in range(10)) + "\n")  # predictions

テストデータに対する正答率： 898 / 1000 = 0.898000

テストデータに含まれるリンクありの割合： 565 / 1000 = 0.565000

テストデータに含まれるリンクなしの割合： 435 / 1000 = 0.435000

ラベル：0 1 1 1 1 0 1 1 
　予測：0 1 1 1 1 0 1 1 

ラベル：0 0 1 1 1 0 1 0 
　予測：0 0 1 1 1 0 1 0 

ラベル：1 1 0 1 0 0 1 0 
　予測：1 1 0 1 0 0 1 1 

ラベル：0 0 1 1 0 1 0 0 
　予測：0 0 1 1 0 1 0 0 

ラベル：0 1 0 1 0 1 1 1 
　予測：0 1 0 1 0 1 1 1 

ラベル：1 1 1 0 1 0 1 1 
　予測：1 1 1 0 0 0 1 1 

ラベル：0 1 0 0 1 1 1 0 
　予測：1 1 0 0 1 1 1 0 

ラベル：0 0 1 1 1 0 1 1 
　予測：0 1 0 0 1 0 1 1 

ラベル：0 0 1 1 1 1 1 1 
　予測：0 0 1 1 1 1 1 1 

ラベル：1 0 0 0 0 0 1 0 
　予測：1 0 0 0 0 0 1 0 



In [23]:
from sklearn.metrics import roc_auc_score

In [24]:
# ROC AUC computed from the raw network outputs in y_predict, not from the
# thresholded 0/1 predictions.
roc_auc_score(y_label, y_predict)

0.9625389075373818