# 3.10 多层感知机的简洁实现

In [106]:
import torch
from torch import nn
from torch.nn import init
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator,AllChem
import torch.utils.data as Data
import sys
# Put the parent directory on the module search path before importing
# d2lzh_pytorch (presumably the helper package lives one level up -- confirm).
sys.path.append("..") 
import d2lzh_pytorch as d2l


# Print the installed PyTorch version.
print(torch.__version__)

1.4.0


## 3.10.1 定义模型

In [117]:
# Layer widths. The input width equals the 2048-bit fingerprint length
# produced by read_raw_data.
num_inputs = 2048
num_hidden_1 = 128
num_hidden_2 = 64
num_outputs = 4

# Flatten -> Linear -> ReLU -> Linear -> ReLU -> Linear.  The final layer
# emits raw scores; no softmax is applied inside the network.
net = nn.Sequential(
    d2l.FlattenLayer(),
    nn.Linear(num_inputs, num_hidden_1),
    nn.ReLU(),
    nn.Linear(num_hidden_1, num_hidden_2),
    nn.ReLU(),
    nn.Linear(num_hidden_2, num_outputs),
)

# Draw every parameter tensor (weights and biases alike) from N(0, 0.01^2).
for params in net.parameters():
    init.normal_(params, mean=0.0, std=0.01)

In [108]:
def read_raw_data(fname):
    """Load SMILES/label pairs from a tab-separated file as NumPy arrays.

    The file must provide the columns ``Canonical_QSARr`` (SMILES strings)
    and ``EPA_category`` (labels).  Each molecule is encoded as a 2048-bit
    Morgan fingerprint of radius 2, and each label is converted to ``int``
    and shifted down by one.

    Rows are skipped, not treated as fatal, when the SMILES cell is not a
    string, when RDKit cannot parse it, or when the label or fingerprint
    cannot be computed.  If any rows were skipped a single ``UserWarning``
    reports how many.

    Args:
        fname: Path of the tab-separated input file.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays: ``x`` holds one row of 0.0/1.0
        floats per kept molecule, ``y`` the matching shifted labels.
    """
    import warnings

    raw_df = pd.read_csv(fname, sep='\t')
    fps, ys = [], []
    skipped = 0
    for smi, label in zip(raw_df['Canonical_QSARr'], raw_df['EPA_category']):
        # Empty cells come back from pandas as NaN (a float), and
        # MolFromSmiles returns None for SMILES it cannot parse; reject both
        # explicitly instead of relying on a downstream exception.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            skipped += 1
            continue
        try:
            y = int(label) - 1
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)
        except (TypeError, ValueError, OverflowError, RuntimeError):
            # Best-effort loading: a bad label or a fingerprinting failure
            # drops only this row.
            skipped += 1
            continue
        fps.append([float(bit) for bit in fp.ToBitString()])
        ys.append(y)
    if skipped:
        warnings.warn(
            f"read_raw_data: skipped {skipped} of {len(raw_df)} rows in "
            f"{fname!r} (unusable SMILES or label)",
            stacklevel=2,
        )
    x = np.array(fps)
    y = np.array(ys)
    return x, y

In [113]:
# Mini-batch size shared by both DataLoaders and passed to d2l.train_ch3.
batch_size = 256

In [114]:
# Training data: featurise the file, wrap the arrays as tensors and serve
# them in reshuffled mini-batches.
fname = './trainingset_171130.txt'
x, y = read_raw_data(fname)
features = torch.tensor(x, dtype=torch.float32)
labels = torch.tensor(y, dtype=torch.int64)
dataset = Data.TensorDataset(features, labels)
train_iter = Data.DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [112]:
# Notebook inspection: display the shape of the label array `y`.
y.shape

(8890,)

In [115]:
# Validation data, used as the evaluation set during training.
# NOTE: this rebinds fname, x, y, features, labels and dataset, which held
# the training data before this cell runs.
fname = './validationset.txt'
x,y = read_raw_data(fname)
features = torch.tensor(x , dtype= torch.float)
labels = torch.tensor(y, dtype= torch.long)
dataset = Data.TensorDataset(features, labels)
# Evaluation gains nothing from shuffling; a fixed order keeps evaluation
# deterministic and avoids drawing a random permutation on every pass.
test_iter = Data.DataLoader(dataset, batch_size, shuffle=False)

## 3.10.2 读取数据并训练模型

In [118]:
# Training configuration: cross-entropy on the network's raw scores,
# optimised with plain SGD.
num_epochs = 150
loss = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(net.parameters(), lr=0.5)

# The two None arguments are presumably the helper's manual params/lr slots,
# unused because an optimizer object is supplied -- confirm against
# d2lzh_pytorch.train_ch3.
d2l.train_ch3(
    net, train_iter, test_iter,
    loss, num_epochs, batch_size,
    None, None, optimizer,
)

epoch 1, loss 0.0048, train acc 0.487, test acc 0.497
epoch 2, loss 0.0048, train acc 0.499, test acc 0.497
epoch 3, loss 0.0048, train acc 0.499, test acc 0.497
epoch 4, loss 0.0048, train acc 0.499, test acc 0.497
epoch 5, loss 0.0048, train acc 0.499, test acc 0.497
epoch 6, loss 0.0047, train acc 0.499, test acc 0.497
epoch 7, loss 0.0047, train acc 0.499, test acc 0.497
epoch 8, loss 0.0047, train acc 0.499, test acc 0.497
epoch 9, loss 0.0047, train acc 0.499, test acc 0.497
epoch 10, loss 0.0045, train acc 0.500, test acc 0.513
epoch 11, loss 0.0043, train acc 0.515, test acc 0.542
epoch 12, loss 0.0041, train acc 0.546, test acc 0.538
epoch 13, loss 0.0039, train acc 0.568, test acc 0.541
epoch 14, loss 0.0038, train acc 0.580, test acc 0.583
epoch 15, loss 0.0037, train acc 0.592, test acc 0.580
epoch 16, loss 0.0035, train acc 0.606, test acc 0.566
epoch 17, loss 0.0035, train acc 0.611, test acc 0.585
epoch 18, loss 0.0033, train acc 0.636, test acc 0.584
epoch 19, loss 0.00

epoch 150, loss 0.0002, train acc 0.974, test acc 0.612
