# リンク予測 (Substructure)
## ECFP -> ニューラルネットワーク

In [1]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import AllChem, Draw

In [2]:
import torch
import torch.utils.data
from torchvision import datasets
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim  # 最適化アルゴリズム実装のためのライブラリ

In [3]:
# Load the dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if it comes from a trusted source.
df = pd.read_pickle("data_substruction.pkl")

In [4]:
# Preview the first rows: a list-valued `smiles` column and a float
# `substructure` score column.
df.head()

Unnamed: 0,smiles,substructure
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.69
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.22
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.27
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.36
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0.44


In [5]:
# Median of the numeric columns only. The list-valued `smiles` column cannot
# be aggregated; recent pandas versions raise instead of silently dropping it,
# so it is excluded explicitly.
df.median(numeric_only=True)

substructure    0.33
dtype: float64

In [6]:
def f(x, lower=0.3, upper=1):
    """Binarize a score into a 0/1 label.

    Args:
        x: Score to classify.
        lower: Exclusive lower bound of the positive range (default 0.3).
        upper: Inclusive upper bound of the positive range (default 1).

    Returns:
        1 if ``lower < x <= upper``, otherwise 0. NaN compares false against
        both bounds and therefore maps to 0.
    """
    # int() normalizes both Python and NumPy booleans to a plain int.
    return int(lower < x <= upper)

In [7]:
# Replace each continuous score with its 0/1 label by applying f element-wise.
df["substructure"] = df["substructure"].map(f)

In [8]:
# Work on a random subset of 10,000 rows. The seed makes the subset (and
# every number derived from it below) reproducible across runs.
df1 = df.sample(n=10000, random_state=0)

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import mean_squared_error

In [10]:
# Features are the per-row `smiles` vectors, targets the 0/1 labels.
X = df1["smiles"].tolist()
y = df1["substructure"].tolist()

# Baseline: scikit-learn MLP with two hidden layers of 128 units, trained by SGD.
clf = MLPClassifier(solver="sgd", hidden_layer_sizes=(128, 128))

In [11]:
# Hold out 10% of the samples for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=0)
clf.fit(X_train, y_train)
correct = y_test  # ground-truth labels, reused by the manual accuracy check
predict = clf.predict(X_test)  # predicted labels for the held-out samples
print(clf.score(X_test, y_test))  # mean accuracy on the held-out split

0.897




In [12]:
# Fraction of positive (label 1) samples in the test split, i.e. the accuracy
# a constant "always 1" predictor would reach.
x = sum(y_test)/len(y_test)
x
### Overall, the share of link / no-link samples is roughly 0.5 to 0.6

0.573

In [13]:
# Recompute the accuracy by hand: `per_correct` is the number of matching
# predictions, which is then divided by the number of test samples.
per_correct = np.sum(predict == correct)
print(per_correct / len(correct))

0.897


In [18]:
# Rename the label column to `target` and renumber the rows from 0.
df1 = df1.rename(columns={"substructure": "target"}).reset_index(drop=True)

In [19]:
# Show the last rows; after the index reset they are numbered up to 9999.
df1.tail()

Unnamed: 0,smiles,target
9995,"[0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
9996,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
9997,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
9998,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
9999,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


In [20]:
class Net(nn.Module):
    """Fully connected classifier: input -> hidden -> hidden -> class scores.

    Args:
        in_features: Length of each input vector (default 1024).
        hidden_size: Number of units in each of the two hidden layers
            (default 128).
        num_classes: Number of output scores, one per class (default 2).
    """

    def __init__(self, in_features=1024, hidden_size=128, num_classes=2):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_size)  # first hidden layer
        self.fc2 = nn.Linear(hidden_size, hidden_size)  # second hidden layer
        self.fc3 = nn.Linear(hidden_size, num_classes)  # output layer

    def forward(self, x):
        """Return the raw (unnormalized) class scores for a batch ``x``."""
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        # No activation on the output: the scores are returned as-is.
        return self.fc3(x)


net = Net()

In [21]:
# Convert the columns to tensors via explicit arrays instead of handing the
# pandas Series to torch.tensor directly: the dtypes are stated up front and
# the list-valued feature column is stacked into one 2-D array first.
# NOTE: despite the names, these hold ALL samples; the train/test split
# happens afterwards.
train_label = torch.tensor(df1["target"].to_numpy(), dtype=torch.long)
train_data = torch.tensor(
    np.asarray(df1["smiles"].tolist(), dtype=np.float32), dtype=torch.float32
)

In [22]:
# Pair every feature vector with its label, then randomly hold out 10% of the
# samples for testing. The split sizes are derived from the dataset length so
# the cell keeps working if the sample size changes (10000 -> 9000 / 1000).
train_tensor = torch.utils.data.TensorDataset(train_data, train_label)
n_test = len(train_tensor) // 10
train_dataset, test_dataset = torch.utils.data.random_split(
    train_tensor, [len(train_tensor) - n_test, n_test]
)

In [23]:
# Mini-batches of 32. Only the training data is reshuffled each epoch; the
# evaluation loader keeps a fixed order so that the printed test samples are
# the same on every pass (accuracy does not depend on the order).
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False)

In [24]:
criterion = nn.CrossEntropyLoss()  # training objective: cross-entropy on the network outputs (not least-squares error)
# SGD with momentum over all network parameters.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
# Alternative optimizer, currently disabled:
#optimizer = optim.Adam(net.parameters(), lr=1e-3, weight_decay=1e-5)

In [25]:
# NOTE(review): these two values are not referenced by any visible cell;
# presumably regularization coefficients -- TODO confirm or remove.
lambda1, lambda2 = 0.5, 0.0001

In [26]:
# Train for 100 epochs and print the accumulated loss after each one.
net.train()  # make the training mode explicit
for epoch in range(100):
    total_loss = 0.0  # sum of the per-batch loss values of this epoch
    for inputs, labels in train_loader:
        optimizer.zero_grad()  # clear gradients left over from the previous step
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(total_loss)
print('\n学習が終了しました。')

191.38271588087082
182.36783701181412
164.03551352024078
141.20961660146713
124.86920946836472
113.6546559035778
104.68566870689392
97.34722439944744
90.86371178925037
85.3422576636076
80.16580504924059
75.39986357092857
71.00874122977257
66.77464891970158
62.7355011254549
58.2844525128603
54.318564511835575
50.24644086137414
47.04618593305349
42.81893677264452
39.84659123420715
36.52378098294139
33.71934621408582
30.90499841608107
28.307689601555467
25.681905023753643
23.589059120975435
21.070155335590243
19.167776562273502
18.014620043337345
16.708372766152024
14.34826163481921
13.050960530526936
11.600035508396104
10.43411864596419
9.547302709892392
8.517433557193726
7.882603817153722
6.948002800811082
6.362106829881668
5.771528614219278
5.400520575931296
5.127148684114218
4.551238630898297
4.145092810387723
3.7520071913604625
3.50961572653614
3.2624092597980052
3.062319735181518
2.8447226408170536
2.68935720610898
2.516029544058256
2.3652603824739344
2.258648425573483
2.07863529090

In [27]:
# Count correct predictions over the whole test loader.
correct = 0  # number of correctly classified test samples
total = 0  # number of test samples seen
net.eval()  # inference mode for the network
with torch.no_grad():  # no gradients are needed for evaluation
    for inputs, labels in test_loader:
        outputs = net(inputs)
        predicted = outputs.argmax(dim=1)  # index of the highest score per row
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

In [29]:
# Report the test accuracy computed by the evaluation loop.
print('テストデータに対する正答率： %d / %d = %f' % (correct, total, correct / total) + '\n')

# Print labels next to predictions for up to 8 samples from each of the first
# 10 test batches (fewer if the loader runs out of batches).
dataiter = iter(test_loader)
with torch.no_grad():  # inference only
    # zip() stops cleanly if fewer than 10 batches exist; iterating the
    # iterator directly replaces the `.next()` method, which recent PyTorch
    # loader iterators no longer provide.
    for i, (images, labels) in zip(range(10), dataiter):
        # `images` holds the input feature vectors (name kept from the original).
        n_show = min(8, labels.size(0))  # guard against a short batch
        print("ラベル：" + "".join('%5s' % labels[j].item() for j in range(n_show)))  # true labels

        predict = net(images).argmax(dim=1)  # predicted class per sample
        print("　予測：" + "".join('%5s' % predict[j].item() for j in range(n_show)) + "\n")  # predictions

テストデータに対する正答率： 901 / 1000 = 0.901000

ラベル：    0    1    1    1    0    0    1    1
　予測：    1    1    1    1    0    0    1    1

ラベル：    1    0    0    0    1    1    1    1
　予測：    0    0    0    0    0    1    0    1

ラベル：    1    0    1    0    1    0    0    1
　予測：    1    0    1    0    1    0    1    1

ラベル：    0    1    1    0    1    0    0    1
　予測：    0    0    1    0    1    0    0    1

ラベル：    1    1    0    1    0    1    1    0
　予測：    1    1    0    0    0    1    0    0

ラベル：    0    1    1    0    1    1    1    1
　予測：    0    1    1    0    1    1    1    1

ラベル：    1    1    0    0    1    0    1    0
　予測：    0    1    1    0    1    0    1    0

ラベル：    1    1    1    1    1    0    1    1
　予測：    1    1    1    1    1    0    1    1

ラベル：    0    1    0    0    0    1    1    0
　予測：    0    1    0    0    0    1    1    0

ラベル：    1    1    1    1    1    0    0    1
　予測：    1    1    1    0    1    0    0    0

