# リンク予測 (Indication)
## ECFP -> ニューラルネットワーク

In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, Draw

In [2]:
import torch
import torch.utils.data
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # 最適化アルゴリズム実装のためのライブラリ

In [3]:
# Load the indication dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
df = pd.read_pickle("data_indication.pkl")

In [4]:
# Display the first rows to inspect the available columns.
df.head()

Unnamed: 0,smiles,indication
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.042896
5,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.0


In [5]:
# Median of the numeric columns only. The "smiles" column is displayed as
# list values in the preview output, and pandas >= 2.0 no longer skips
# non-numeric columns by default (it raises a TypeError), so request the
# numeric-only behaviour explicitly.
df.median(numeric_only=True)

indication    0.0
dtype: float64

In [6]:
def f(x):
    """Binarize a score: return 1 when ``0 < x <= 1``, otherwise 0."""
    return 1 if 0 < x <= 1 else 0

In [8]:
# Binarize the indication score in place using f: values in (0, 1] become 1,
# every other value becomes 0.
df["indication"] = df["indication"].map(f)

In [9]:
# Work on a 10,000-row random subset of the data. A fixed seed keeps the
# subset, and therefore every score computed from it, reproducible between
# runs.
df1 = df.sample(n=10000, random_state=0)

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error

In [11]:
# Baseline classifier: an MLP with two hidden layers of 128 units, trained with SGD.
clf = MLPClassifier(solver="sgd", hidden_layer_sizes=(128, 128))
# Features ("smiles" column) and labels ("indication" column) as plain Python lists.
X, y = (df1[column].values.tolist() for column in ("smiles", "indication"))

In [12]:
# Hold out 10% of the sample as a test set (seeded split), then fit the MLP.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
clf.fit(X_train, y_train)
correct = y_test  # ground-truth labels, compared against `predict` in a later cell
predict = clf.predict(X_test)
print(clf.score(X_test, y_test))  # accuracy on the held-out set



0.873


In [28]:
# Fraction of test labels equal to 0, i.e. the accuracy obtained by always
# predicting "no link". Labels are 0/1, so the mean is the fraction of 1s.
x = 1 - sum(y_test) / len(y_test)
x
### Author's note (not verified in this notebook): over the whole dataset the link / no-link ratio is 0.5-0.6

0.889

In [14]:
# Recompute the accuracy by hand: count matching predictions, divide by the test-set size.
per_correct = np.sum(predict == correct)
print(per_correct / len(correct))

0.873


In [15]:
# Rename the label column to "target" and renumber the rows 0..n-1.
df1 = df1.rename(columns={"indication": 'target'}).reset_index(drop=True)

In [16]:
# Display the last rows to check the renamed column and the reset index.
df1.tail()

Unnamed: 0,smiles,target
9995,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
9996,"[0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, ...",0
9997,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",0
9998,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
9999,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0


In [17]:
class Net(nn.Module):
    """Fully connected network with two hidden ReLU layers.

    Args:
        in_features: Length of each input vector (default 1024).
        hidden_features: Number of units in each hidden layer (default 128).
        out_features: Number of output scores per sample (default 2).

    The defaults reproduce the original fixed 1024 -> 128 -> 128 -> 2 layout.
    """

    def __init__(self, in_features=1024, hidden_features=128, out_features=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)  # first hidden layer
        self.fc2 = nn.Linear(hidden_features, hidden_features)  # second hidden layer
        self.fc3 = nn.Linear(hidden_features, out_features)  # output layer

    def forward(self, x):
        """Return raw output scores for ``x``; no softmax is applied here."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


net = Net()

In [18]:
# Convert the columns to tensors through NumPy / plain lists instead of handing
# the pandas Series to torch.tensor directly: that path indexes the Series by
# label (so it only works on a 0..n-1 index) and converts element by element.
train_label = torch.tensor(df1["target"].to_numpy()).long()
# Stack the per-row vectors into one 2-D array before building the float tensor.
train_data = torch.tensor(np.array(df1["smiles"].tolist())).float()

In [19]:
# Pair every feature vector with its label; despite its name, `train_tensor`
# holds the full sample, which is then randomly split 9000 / 1000.
train_tensor = torch.utils.data.TensorDataset(train_data, train_label)
train_dataset, test_dataset = torch.utils.data.random_split(
    dataset=train_tensor,
    lengths=[9000, 1000],
)

In [20]:
# Mini-batch loaders (32 samples per batch, reshuffled on every pass) for both splits.
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=True,
)

In [21]:
# Cross-entropy loss over the two raw output scores of the network
# (this is a classification loss, not a least-squares error).
criterion = nn.CrossEntropyLoss()
# SGD with momentum over all network parameters.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# Alternative optimizer kept for experimentation:
#optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)

In [22]:
# NOTE(review): these two values are not referenced by any cell visible here;
# presumably regularization coefficients. TODO confirm or remove.
lambda1, lambda2 = 0.5, 0.0001

In [23]:
# Train for 100 epochs of mini-batch SGD, printing one loss figure per epoch.
net.train()  # make sure the model is in training mode before optimizing
for epoch in range(100):
    total_loss = 0.0
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    # Sum (not mean) of the per-batch losses for this epoch.
    print(total_loss)
print('\n学習が終了しました。')

115.32384523749352
98.70598226040602
97.97722867876291
97.22696796059608
96.52189153432846
95.79260742664337
94.89333991706371
94.262950129807
93.31104197353125
92.44531328231096
91.66202490776777
90.73823633790016
89.6761543750763
88.63763192296028
87.32702108472586
86.40506365150213
84.84988825023174
83.82181926816702
82.63593293726444
81.4243832975626
80.15539877861738
78.7974187657237
77.38388867676258
76.07086355239153
74.67263784259558
73.30400858074427
72.37302307784557
70.55763204023242
69.35960641503334
67.70105252414942
66.21586446464062
64.11016288399696
63.01103410497308
60.85582009702921
58.95844313874841
56.755978148430586
55.4339728243649
52.69982463121414
50.833171758800745
47.7439811937511
45.25706660374999
42.666120145469904
40.249695079401135
37.52501508034766
34.70586138404906
31.722696667537093
28.943962777033448
26.336751386523247
23.833859637379646
22.142979633063078
19.209776086732745
17.04463448934257
15.51378674339503
13.730569343082607
12.10634732618928
10.61

In [32]:
# Count correct predictions over the whole test loader.
correct = 0.0  # number of correct predictions
total = 0.0  # number of test samples seen
net.eval()  # switch the model to evaluation mode
with torch.no_grad():  # inference only: do not record operations for autograd
    for inputs, labels in test_loader:
        outputs = net(inputs)
        # The index of the larger output score is the predicted class.
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

In [33]:
# Report the overall test accuracy computed by the evaluation loop.
print('テストデータに対する正答率： %d / %d = %f' % (correct, total, correct / total) + '\n')

# Show labels next to predictions for the first 8 samples of 4 test batches.
dataiter = iter(test_loader)
with torch.no_grad():  # inference only: do not record operations for autograd
    for i in range(4):
        # Use the built-in next(); the iterator's .next() method is Python 2
        # style and is not available on current PyTorch DataLoader iterators.
        images, labels = next(dataiter)
        print("ラベル：" + "".join('%5s' % labels[j].item() for j in range(8)))  # true labels

        _, predict = torch.max(net(images), 1)  # predicted class per sample
        print("　予測：" + "".join('%5s' % predict[j].item() for j in range(8)) + "\n")  # predictions

テストデータに対する正答率： 883 / 1000 = 0.883000

ラベル：    0    0    0    0    0    0    0    0
　予測：    0    0    0    0    0    0    0    0

ラベル：    0    0    0    0    0    1    0    0
　予測：    0    0    0    0    0    0    0    0

ラベル：    0    0    0    0    0    0    0    0
　予測：    0    0    0    0    0    0    0    0

ラベル：    0    0    0    0    0    0    0    0
　予測：    0    0    0    0    0    0    0    0

