In [1]:
import time
import torch
from torch import nn
import torch.optim as optim
import numpy as np

In [2]:
# DataUtils
# 生成二值化的输入
def binary_encoder(input_size):
    """Build an encoder that maps a non-negative integer to a fixed-width bit list.

    Args:
        input_size: Number of binary digits every encoded value must have.

    Returns:
        A function ``wrapper(num)`` that returns a list of exactly
        ``input_size`` ints (each 0 or 1), most significant bit first and
        left-padded with zeros.

    The returned function raises ``ValueError`` when ``num`` is negative or
    needs more than ``input_size`` bits. Previously an oversized ``num``
    silently produced a list longer than ``input_size``, which yields ragged
    feature rows downstream.
    """
    def wrapper(num):
        if num < 0:
            raise ValueError(
                'cannot binary-encode negative number {}'.format(num))
        ret = [int(i) for i in '{0:b}'.format(num)]
        if len(ret) > input_size:
            # Without this guard the padding count below would be negative,
            # `[0] * negative` would be empty, and the result would be too long.
            raise ValueError(
                '{} needs {} bits but input_size is {}'.format(
                    num, len(ret), input_size))
        return [0] * (input_size - len(ret)) + ret
    return wrapper


# Decode a most-significant-bit-first sequence of binary digits back into an integer
def decoder(array):
    """Convert an iterable of binary digits (most significant first) to an int.

    Each element is passed through ``int()`` before being accumulated, so
    numpy integer scalars and 0.0/1.0 floats are accepted. An empty iterable
    decodes to 0.
    """
    value = 0
    for bit in array:
        # Shift what we have one binary place to the left, then add the new digit.
        value = (value << 1) + int(bit)
    return value


# Randomly split the data into a training set and a test set
def training_test_gen(x, y):
    """Shuffle paired samples and split them 90% / 10% into train and test.

    Args:
        x: Feature container; must accept integer-array indexing
            (e.g. a numpy array).
        y: Label container of the same length as ``x``, indexed the same way.

    Returns:
        Tuple ``(trX, trY, teX, teY)``. The first ``int(0.9 * len(x))``
        shuffled positions form the training part and the rest the test part.

    Uses numpy's global random state, so the split differs between runs
    unless that state is seeded by the caller.
    """
    assert len(x) == len(y)
    total = len(x)
    shuffled = np.random.permutation(range(total))
    cut = int(0.9 * total)
    train_idx, test_idx = shuffled[:cut], shuffled[cut:]
    return x[train_idx], y[train_idx], x[test_idx], y[test_idx]


def get_pytorch_data(input_size=10, limit=1000):
    """Build a FizzBuzz dataset with integer class labels and split it.

    Every integer in ``range(limit)`` is encoded with
    ``binary_encoder(input_size)`` and labelled with a class index:

        0 - divisible by 15
        1 - divisible by 5 (but not 15)
        2 - divisible by 3 (but not 15)
        3 - none of the above

    Returns:
        Whatever ``training_test_gen`` returns for the feature and label
        arrays, i.e. ``(trX, trY, teX, teY)``.
    """
    encode = binary_encoder(input_size)
    numbers = range(limit)
    features = [encode(n) for n in numbers]
    labels = [
        0 if n % 15 == 0 else
        1 if n % 5 == 0 else
        2 if n % 3 == 0 else
        3
        for n in numbers
    ]
    return training_test_gen(np.array(features), np.array(labels))


def get_numpy_data(input_size=10, limit=1000):
    """Build a FizzBuzz dataset with one-hot labels and split it.

    Every integer in ``range(limit)`` is encoded with
    ``binary_encoder(input_size)`` and labelled with a 4-element one-hot row
    whose hot position means: 0 = divisible by 15, 1 = divisible by 5,
    2 = divisible by 3, 3 = none of these (checked in that order).

    Returns:
        Whatever ``training_test_gen`` returns for the feature and label
        arrays, i.e. ``(trX, trY, teX, teY)``.
    """
    one_hot_rows = (
        [1, 0, 0, 0],
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 1],
    )
    encode = binary_encoder(input_size)
    features, targets = [], []
    for n in range(limit):
        features.append(encode(n))
        # The first matching divisor decides the class; no match falls
        # through to the last row.
        for cls, divisor in enumerate((15, 5, 3)):
            if n % divisor == 0:
                break
        else:
            cls = 3
        targets.append(one_hot_rows[cls])
    return training_test_gen(np.array(features), np.array(targets))


def check_fizbuz(i):
    """Return the ground-truth FizzBuzz label for ``i``.

    Returns 'fizbuz' for multiples of 15, 'buz' for other multiples of 5,
    'fiz' for other multiples of 3, and 'number' for everything else.
    """
    # Ordered from most to least specific so that 15 wins over 5 and 3.
    for divisor, label in ((15, 'fizbuz'), (5, 'buz'), (3, 'fiz')):
        if i % divisor == 0:
            return label
    return 'number'

根据已有数据信息，构建网络的思路：
- 将输入数字二值化为10位数字，因此第一个输入层需要10个神经元才能接受这10个数字
- 输出始终是大小为4的向量，因此需要4个输出神经元
- 首先构建一个结点数为100的隐藏层
- 使用64个数据点进行数据的批处理


In [3]:
# Hyperparameters (six in total), grouped by role.

# Network shape: 10 input bits -> 100 hidden units -> 4 output classes
input_size = 10
hidden_size = 100
output_size = 4

# Optimisation settings
epochs = 500    # full passes over the training data
batches = 64    # number of samples per mini-batch
lr = 1e-2       # gradient-descent step size

In [4]:
# One-hot-labelled train/test splits as numpy arrays
trX, trY, teX, teY = get_numpy_data(input_size)

# Run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# All tensors in this notebook are created in double precision
dtype = torch.float64

In [5]:
# Prepare the inputs: move the training split to the chosen device and dtype
x, y = (
    torch.from_numpy(split).to(device=device, dtype=dtype)
    for split in (trX, trY)
)
# x was not created with requires_grad, so it has no gradient and no grad_fn
print(x.grad, x.grad_fn, x)
# Expected output: None None tensor([[...]])

None None tensor([[1., 0., 0.,  ..., 0., 1., 0.],
        [1., 1., 1.,  ..., 0., 0., 0.],
        [1., 1., 0.,  ..., 1., 0., 0.],
        ...,
        [1., 1., 1.,  ..., 0., 1., 1.],
        [0., 1., 1.,  ..., 1., 0., 1.],
        [0., 1., 1.,  ..., 1., 1., 0.]], device='cuda:0', dtype=torch.float64)


In [6]:
# Prepare the initial weights, drawn from a standard normal distribution.
# requires_grad=True makes them leaf tensors that autograd tracks.
weight_opts = dict(requires_grad=True, device=device, dtype=dtype)
w1 = torch.randn(input_size, hidden_size, **weight_opts)   # input  -> hidden
w2 = torch.randn(hidden_size, output_size, **weight_opts)  # hidden -> output

# Before any backward pass a leaf tensor has neither a gradient nor a grad_fn
print(w1.grad, w1.grad_fn, w1)
# Expected output: None None tensor([[...]])

None None tensor([[ 2.6937e-01, -1.2346e+00,  8.7802e-01,  2.4474e+00, -3.8610e-01,
          1.4135e+00,  1.0142e+00,  2.4813e-02,  9.3307e-01,  1.0022e+00,
         -4.3590e-01, -1.3530e-01,  3.4596e-01,  1.6907e-02, -1.2306e+00,
          3.4348e-01,  5.0399e-01, -1.1721e+00,  1.3460e+00, -1.6082e-02,
          7.8378e-01,  2.2673e-01, -2.8122e-01,  1.2901e+00,  8.8757e-01,
         -1.3858e-01, -9.3914e-01,  1.1388e+00, -3.5818e-01,  1.8353e-02,
          1.2423e+00,  6.1580e-01,  9.7386e-01, -1.0645e+00, -1.0333e-01,
          1.9774e-01, -3.2640e-01,  9.8945e-01,  2.7390e-01, -2.2330e+00,
          1.3827e+00,  1.1603e-01, -4.7850e-01,  6.1885e-01,  6.1713e-02,
         -5.9669e-01,  2.5547e-01, -4.9556e-01,  1.0010e-01, -2.6631e-01,
         -1.3187e+00, -1.3085e+00, -1.4812e+00,  3.9448e-01, -5.4051e-01,
          9.6986e-01, -4.5480e-01,  8.1977e-01,  4.8560e-01,  9.0852e-02,
         -1.2053e+00,  2.0573e+00,  2.5806e-01, -2.1518e-01, -3.3136e-01,
          1.0085e-01,  1.376

In [7]:
# Prepare the bias terms: one zero-initialised row per layer; the leading
# dimension of 1 lets them broadcast across the rows of a batch.
bias_opts = dict(requires_grad=True, device=device, dtype=dtype)
b1 = torch.zeros((1, hidden_size), **bias_opts)
b2 = torch.zeros((1, output_size), **bias_opts)

In [8]:
# Number of *full* mini-batches per epoch; the division rounds down, so a
# trailing partial batch is not counted.
no_of_batches = len(trX) // batches

In [9]:
# PyTorch supports dynamic-graph networks: the graph is rebuilt on every
# iteration. Walking through the data creates the graph on the fly, and
# back-propagation runs once the last (root) node has been reached.
for epoch in range(epochs):
    for batch in range(no_of_batches):
        start = batch * batches
        end = start + batches
        x_, y_ = x[start:end], y[start:end]

        # Hidden layer: affine transform followed by a sigmoid.
        a2 = x_.matmul(w1).add(b1)
        # a2 is a non-leaf tensor: a2.grad is None, a2.grad_fn is AddBackward0.
        h2 = a2.sigmoid()

        # Output layer: affine transform followed by a sigmoid.
        a3 = h2.matmul(w2).add(b2)
        hyp = a3.sigmoid()

        # Half of the summed squared error over the batch.
        error = hyp - y_
        output = error.pow(2).sum() / 2.0
        output.backward()

        # After backward() (per the notes left in the original cell):
        #   x.grad  -> None    (x does not require grad)
        #   w1.grad -> tensor  (leaf that requires grad; its grad_fn stays None)
        #   a2.grad -> None    (non-leaf; it only carries a grad_fn)

        # Parameters that require grad may not be modified in place while
        # autograd is recording, so the SGD step runs under no_grad().
        # Gradients accumulate across backward() calls, so each one is
        # reset right after it has been applied.
        with torch.no_grad():
            for param in (w1, w2, b1, b2):
                param -= lr * param.grad
                param.grad.zero_()
    # Optional progress logging, left disabled:
    #     if epoch % 10 == 0:
    #         print(epoch, output.item())

In [10]:
# Walk the autograd graph backwards from the loss through .grad_fn /
# .next_functions, printing each node as it is reached.
loss_node = output.grad_fn
print(loss_node)
# <DivBackward0 object at 0x...>   (the "/ 2.0")
sum_node = loss_node.next_functions[0][0]
print(sum_node)
# <SumBackward0 object at 0x...>   (the ".sum()")
pow_node = sum_node.next_functions[0][0]
print(pow_node)
# <PowBackward0 object at 0x...>   (the ".pow(2)")

<DivBackward0 object at 0x000002A7292D0948>
<SumBackward0 object at 0x000002A7292ED988>
<PowBackward0 object at 0x000002A7292D0948>


In [11]:
# Evaluate the trained parameters on the held-out split.
# NOTE(review): this cell rebinds the global names x and y to the *test*
# tensors. Re-run the input-preparation cell before re-running the training
# loop, otherwise training would use the test split.
with torch.no_grad():
    x = torch.from_numpy(teX).to(device=device, dtype=dtype)
    y = torch.from_numpy(teY).to(device=device, dtype=dtype)

    # Same forward pass as in training, without recording a graph.
    a2 = x.matmul(w1).add(b1)
    h2 = a2.sigmoid()
    a3 = h2.matmul(w2).add(b2)
    hyp = a3.sigmoid()

    error = hyp - y
    output = error.pow(2).sum() / 2.

    # Class index -> label, in the same order as the one-hot targets.
    outli = ['fizbuz', 'buz', 'fiz', 'number']
    for i in range(len(teX)):
        num = decoder(teX[i])
        predicted = outli[hyp[i].max(0)[1].item()]
        print('Number: {} -- Actual: {} -- Prediction: {}'.format(num, check_fizbuz(num), predicted))

    # Loss averaged over the number of test samples.
    mean_loss = output.item() / len(x)
    print('Test loss: ', mean_loss)

    # A sample counts as correct when predicted and true argmax coincide.
    accuracy = hyp.max(1)[1] == y.max(1)[1]
    print('accuracy: ', accuracy.sum().item() / len(accuracy))

Number: 21 -- Actual: fiz -- Prediction: number
Number: 78 -- Actual: fiz -- Prediction: fiz
Number: 866 -- Actual: number -- Prediction: number
Number: 710 -- Actual: buz -- Prediction: number
Number: 96 -- Actual: fiz -- Prediction: fiz
Number: 998 -- Actual: number -- Prediction: number
Number: 827 -- Actual: number -- Prediction: number
Number: 115 -- Actual: buz -- Prediction: fiz
Number: 845 -- Actual: buz -- Prediction: number
Number: 137 -- Actual: number -- Prediction: fiz
Number: 641 -- Actual: number -- Prediction: number
Number: 965 -- Actual: buz -- Prediction: number
Number: 994 -- Actual: number -- Prediction: number
Number: 485 -- Actual: buz -- Prediction: number
Number: 0 -- Actual: fizbuz -- Prediction: fiz
Number: 648 -- Actual: fiz -- Prediction: number
Number: 740 -- Actual: buz -- Prediction: number
Number: 578 -- Actual: number -- Prediction: number
Number: 863 -- Actual: number -- Prediction: number
Number: 55 -- Actual: buz -- Prediction: number
Number: 57 -- 