## 准备数据

In [None]:
import torch
from torchvision import datasets, transforms

def mnist_dataset():
    # 定义数据预处理
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])

    # 加载训练集和测试集
    train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True)
    test_dataset = datasets.MNIST(root='./data', train=False, transform=transform, download=True)

    return train_dataset, test_dataset


## Demo numpy based auto differentiation

In [None]:
import torch


class Matmul:
    def __init__(self):
        self.mem = {}

    def forward(self, x, W):
        h = torch.matmul(x, W)
        self.mem = {"x": x, "W": W}
        return h

    def backward(self, grad_y):
        """
        x: shape(N, d)
        W: shape(d, d')
        grad_y: shape(N, d')
        """
        x = self.mem["x"]
        W = self.mem["W"]

        grad_x = torch.matmul(grad_y, W.T)  # 计算输入的梯度
        grad_W = torch.matmul(x.T, grad_y)  # 计算权重的梯度

        return grad_x, grad_W


class Relu:
    def __init__(self):
        self.mem = {}

    def forward(self, x):
        self.mem["x"] = x
        return torch.where(x > 0, x, torch.zeros_like(x))  # 使用 PyTorch 的功能

    def backward(self, grad_y):
        """
        grad_y: same shape as x
        """
        x = self.mem["x"]
        grad_x = torch.where(x > 0, grad_y, torch.zeros_like(grad_y))  # 计算ReLU的导数
        return grad_x


class Softmax:
    def __init__(self):
        self.epsilon = 1e-12
        self.mem = {}

    def forward(self, x):
        """
        x: shape(N, c)
        """
        x_exp = torch.exp(x)
        # x_max = torch.max(x, dim=1, keepdim=True).values  # 计算每行的最大值
        # x_exp = torch.exp(x - x_max)  # 数值稳定性
        partition = torch.sum(x_exp, axis=1, keepdims=True)
        out = x_exp / (partition + self.epsilon)

        self.mem["out"] = out
        self.mem["x_exp"] = x_exp
        return out

    def backward(self, grad_y):
        """
        grad_y: same shape as x, shape(N, c)
        """
        s = self.mem["out"]
        sisj = torch.matmul(s.unsqueeze(2), s.unsqueeze(1))  # (N, c, c)
        g_y_exp = grad_y.unsqueeze(1)
        tmp = torch.matmul(g_y_exp, sisj)  # (N, 1, c)
        tmp = tmp.squeeze(1)  # (N, c)
        tmp = -tmp + grad_y * s
        return tmp


class Log:
    def __init__(self):
        self.epsilon = 1e-12
        self.mem = {}

    def forward(self, x):
        """
        x: shape(N, c)
        """
        out = torch.log(x + self.epsilon)

        self.mem["x"] = x
        return out

    def backward(self, grad_y):
        """
        grad_y: same shape as x , shape(N, c)
        """
        x = self.mem["x"]
        return 1.0 / (x + self.epsilon) * grad_y  # 导数计算

## Gradient check

In [None]:
import torch
import torch.nn.functional as F
from IPython.display import display

# 随机生成数据
x = torch.normal(mean=1.0, std=5.0, size=(5, 6))
W = torch.normal(mean=0.0, std=1.0, size=(6, 4))

aa = Matmul()
out = aa.forward(x, W)  # shape(5, 4)
grad = aa.backward(torch.ones_like(out))
display(grad)  

x, W = x.requires_grad_(), W.requires_grad_()
y = torch.matmul(x, W)
loss = y.sum()
loss.backward()
grads = x.grad
display(grads)  

aa = Relu()
out = aa.forward(x)  # shape(5, 6)
grad = aa.backward(torch.ones_like(out))
display(grad)  
x = torch.normal(mean=1.0, std=5.0, size=(5, 6), requires_grad=True)
y = F.relu(x)
loss = y.sum()
loss.backward()
grads = x.grad
display(grads)  

# Softmax 和交叉熵的实现
x = torch.normal(mean=0.0, std=1.0, size=(5, 6))
label = torch.zeros_like(x)
label[0, 1] = 1.0
label[1, 0] = 1.0
label[1, 1] = 1.0
label[2, 3] = 1.0
label[3, 5] = 1.0
label[4, 0] = 1.0

aa = Softmax()
out = aa.forward(x)  # shape(5, 6)
grad = aa.backward(label)
display(grad)  

x = torch.normal(mean=0.0, std=1.0, size=(5, 6), requires_grad=True)
y = F.softmax(x, dim=1)
loss = (y * label).sum()
loss.backward()
grads = x.grad
display(grads)  

aa = Log()
out = aa.forward(x)  # shape(5, 6)
grad = aa.backward(label)
display(grad) 

# 使用 PyTorch 自带的 autograd 进行梯度计算
x = torch.normal(mean=1.0, std=5.0, size=(5, 6), requires_grad=True)
y = torch.log(x)
loss = (y * label).sum()
loss.backward()
grads = x.grad
display(grads)  


# Final Gradient Check

In [13]:
import torch

# 随机生成输入数据和权重
x = torch.randn(5, 6)  # 使用torch生成随机输入数据
W1 = torch.randn(6, 5)  # 使用torch生成随机权重
W2 = torch.randn(5, 6)  # 使用torch生成随机权重

# 创建标签张量，采用 PyTorch 的方式
label = torch.zeros(5, 6)  # 这里确定标签的形状
label[0, 1] = 1.0
label[1, 0] = 1.0
label[2, 3] = 1.0
label[3, 5] = 1.0
label[4, 0] = 1.0

# 以 PyTorch 的形式创建 Matmul、Relu、Softmax、Log 的实例
mul_h1 = Matmul()
mul_h2 = Matmul()
relu = Relu()
softmax = Softmax()
log = Log()

# 前向传播
h1 = mul_h1.forward(x, W1)  # shape(5, 5)
h1_relu = relu.forward(h1)
h2 = mul_h2.forward(h1_relu, W2)
h2_soft = softmax.forward(h2)
h2_log = log.forward(h2_soft)

# 反向传播
h2_log_grad = log.backward(label)
h2_soft_grad = softmax.backward(h2_log_grad)
h2_grad, W2_grad = mul_h2.backward(h2_soft_grad)
h1_relu_grad = relu.backward(h2_grad)
h1_grad, W1_grad = mul_h1.backward(h1_relu_grad)

# 打印最终结果
#print(h2_log_grad)
print(W1_grad)
print(W2_grad)
print('--' * 50)

# 使用 PyTorch 的自动求导
x_tensor = x  # 使用之前生成的x
W1_tensor = W1  # 使用之前生成的W1
W2_tensor = W2  # 使用之前生成的W2
label_tensor = label  # 使用之前生成的label

# 计算图
x_tensor.requires_grad = True
W1_tensor.requires_grad = True
W2_tensor.requires_grad = True

h1 = torch.matmul(x_tensor, W1_tensor)
h1_relu = torch.relu(h1)
h2 = torch.matmul(h1_relu, W2_tensor)
prob = torch.softmax(h2, dim=1)
log_prob = torch.log(prob)
loss = torch.sum(label_tensor * log_prob)

# 计算梯度
loss.backward()

grads = [W1_tensor.grad, W2_tensor.grad]
for grad in grads:
    print(grad.numpy())


tensor([[ 0.1642, -3.4692,  0.7434,  1.2980, -1.5350],
        [-2.0963, -4.4178, -3.9099, -6.9246,  1.2034],
        [ 0.5456, -1.2010,  0.4848,  2.0277, -1.4400],
        [ 0.3582, -4.9010,  0.5285,  0.4637,  0.6839],
        [ 0.4142,  2.0812,  1.3810,  1.4073,  0.7473],
        [-0.6213,  0.8196, -0.8686, -1.7750, -0.3635]])
tensor([[ 1.4916e+00,  1.5099e-01, -5.6423e-02, -1.5412e+00, -6.2669e-03,
         -3.8729e-02],
        [-1.8614e+00,  1.7616e+00, -1.3660e-01, -9.3748e-01, -4.1187e-03,
          1.1779e+00],
        [ 2.7911e+00, -6.6404e-03, -5.0846e-02, -2.7001e+00, -2.4836e-03,
         -3.0975e-02],
        [ 2.1555e+00,  2.3055e+00, -2.8566e-01, -3.9659e+00, -3.3285e-02,
         -1.7611e-01],
        [-3.6194e-01,  1.0228e-01, -2.2656e-01,  9.0856e-01, -1.3109e-01,
         -2.9125e-01]])
----------------------------------------------------------------------------------------------------
[[ 0.16419981 -3.4691637   0.7433523   1.2979882  -1.534984  ]
 [-2.0962892  -4.41

## 建立模型

In [None]:
class myModel:
    def __init__(self):
        # 使用torch生成权重
        self.W1 = torch.randn(28 * 28 + 1, 100)  # 输入层的权重
        self.W2 = torch.randn(100, 10)  # 隐藏层到输出层的权重

        self.mul_h1 = Matmul()
        self.mul_h2 = Matmul()
        self.relu = Relu()
        self.softmax = Softmax()
        self.log = Log()

    def forward(self, x):
        x = x.view(-1, 28 * 28)  # 使用torch的view函数进行形状调整
        bias = torch.ones(x.size(0), 1)  # 创建一个偏置项
        x = torch.cat([x, bias], dim=1)  # 使用torch.cat进行拼接

        self.h1 = self.mul_h1.forward(x, self.W1)  # Shape: (batch_size, 100)
        self.h1_relu = self.relu.forward(self.h1)
        self.h2 = self.mul_h2.forward(self.h1_relu, self.W2)
        self.h2_soft = self.softmax.forward(self.h2)
        self.h2_log = self.log.forward(self.h2_soft)

    def backward(self, label):
        self.h2_log_grad = self.log.backward(-label)
        self.h2_soft_grad = self.softmax.backward(self.h2_log_grad)
        self.h2_grad, self.W2_grad = self.mul_h2.backward(self.h2_soft_grad)
        self.h1_relu_grad = self.relu.backward(self.h2_grad)
        self.h1_grad, self.W1_grad = self.mul_h1.backward(self.h1_relu_grad)


# 创建模型实例
model = myModel()

## 计算 loss

In [21]:
from torch import nn
def compute_loss(log_prob, labels):
    cross_loss = nn.CrossEntropyLoss()

    return cross_loss(log_prob, labels)


def compute_accuracy(log_prob, labels):
    predictions = torch.argmax(log_prob, axis=1)
    truth = torch.argmax(labels, axis=1)
    predictions = predictions.to(torch.int64)
    truth = truth.to(torch.int64)
    
    accuracy = torch.mean(predictions.eq(truth).to(torch.float32))  
    return accuracy


def train_one_step(model, x, y):
    model.forward(x)
    model.backward(y)
    model.W1 -= 1e-5 * model.W1_grad
    model.W2 -= 1e-5 * model.W2_grad
    loss = compute_loss(model.h2_log, y)
    accuracy = compute_accuracy(model.h2_log, y)
    return loss, accuracy


def test(model, x, y):
    model.forward(x)
    loss = compute_loss(model.h2_log, y)
    accuracy = compute_accuracy(model.h2_log, y)
    return loss, accuracy

## 实际训练

In [22]:

# 生成数据集
train_data, test_data = mnist_dataset()

# 生成零标签，注意这里的shape根据实际的样本数量调整
train_label = torch.zeros(len(train_data), 10)
test_label = torch.zeros(len(test_data), 10)

# 根据数据集中的标签更新one-hot标签
for i in range(len(train_data)):
    train_label[i][train_data[i][1]] = 1.0

for i in range(len(test_data)):
    test_label[i][test_data[i][1]] = 1.0

# 训练模型
for epoch in range(50):
    # 这里假设train_one_step函数和model定义是已存在的
    loss, accuracy = train_one_step(model, torch.stack([data[0] for data in train_data]), train_label)
    print('epoch', epoch, ': loss', loss.item(), '; accuracy', accuracy)

# 测试模型
loss, accuracy = test(model, torch.stack([data[0] for data in test_data]), test_label)
print('test loss', loss.item(), '; test accuracy', accuracy)

epoch 0 : loss nan ; accuracy tensor(0.0987)
epoch 1 : loss nan ; accuracy tensor(0.0987)
epoch 2 : loss nan ; accuracy tensor(0.0987)
epoch 3 : loss nan ; accuracy tensor(0.0987)


KeyboardInterrupt: 