In [1]:
import numpy as np
import struct
import random
import matplotlib.pyplot as plt
import pandas as pd
import math
import cv2

In [2]:
def load_labels(file):
    with open(file, "rb") as f:
        data = f.read()
    
    magic_number, num_samples = struct.unpack(">ii", data[:8])
    if magic_number != 2049:   # 0x00000801
        print(f"magic number mismatch {magic_number} != 2049")
        return None
    
    labels = np.frombuffer(data[8:], dtype=np.uint8)
    return labels

def load_images(file):
    with open(file, "rb") as f:
        data = f.read()

    magic_number, num_samples, image_width, image_height = struct.unpack(">iiii", data[:16])
    if magic_number != 2051:   # 0x00000803
        print(f"magic number mismatch {magic_number} != 2051")
        return None
    
    image_data = np.frombuffer(data[16:], dtype=np.uint8).reshape(num_samples, -1)
    return image_data

def one_hot(labels, classes, label_smoothing=0):
    n = len(labels)
    eoff = label_smoothing / classes
    output = np.ones((n, classes), dtype=np.float32) * eoff
    for row, label in enumerate(labels):
        output[row, label] = 1 - label_smoothing + eoff
    return output

In [3]:
val_labels = load_labels("dataset/t10k-labels-idx1-ubyte")   #  10000,
val_images = load_images("dataset/t10k-images-idx3-ubyte")   #  10000, 784
val_images = (val_images - np.mean(val_images)) / np.var(val_images)
#val_images = val_images / 255 - 0.5
val_images = val_images.reshape(-1, 1, 28, 28)

train_labels = load_labels("dataset/train-labels-idx1-ubyte") # 60000,
train_images = load_images("dataset/train-images-idx3-ubyte") # 60000, 784
#train_images = train_images / 255 - 0.5
train_images = (train_images - np.mean(train_images)) / np.var(train_images)
train_images = train_images.reshape(-1, 1, 28, 28)
#train_images = (train_images - np.mean(train_images)) / np.var(train_images)

FileNotFoundError: [Errno 2] No such file or directory: 'dataset/t10k-labels-idx1-ubyte'

In [None]:
np.min(val_images), np.max(val_images)

In [None]:
class Dataset:
    def __init__(self, images, labels):
        self.images = images
        self.labels = labels
        
    # 获取他的一个item，  dataset = Dataset(),   dataset[index]
    def __getitem__(self, index):
        return self.images[index].reshape(1, 1, 28, 28), self.labels[index]
    
    # 获取数据集的长度，个数
    def __len__(self):
        return len(self.images)
    
class DataLoaderIterator:
    def __init__(self, dataloader):
        self.dataloader = dataloader
        self.cursor = 0
        self.indexs = list(range(self.dataloader.count_data))  # 0, ... 60000
        if self.dataloader.shuffle:
            # 打乱一下
            np.random.shuffle(self.indexs)
            
    def __next__(self):
        if self.cursor >= self.dataloader.count_data:
            raise StopIteration()
            
        batch_data = []
        remain = min(self.dataloader.batch_size, self.dataloader.count_data - self.cursor)  #  256, 128
        for n in range(remain):
            index = self.indexs[self.cursor]
            data = self.dataloader.dataset[index]
            
            # 如果batch没有初始化，则初始化n个list成员
            if len(batch_data) == 0:
                batch_data = [[] for i in range(len(data))]
                
            #直接append进去
            for index, item in enumerate(data):
                batch_data[index].append(item)
            self.cursor += 1
            
        # 通过np.vstack一次性实现合并，而非每次一直在合并
        for index in range(len(batch_data)):
            batch_data[index] = np.vstack(batch_data[index])
        return batch_data

class DataLoader:
    
    # shuffle 打乱
    def __init__(self, dataset, batch_size, shuffle):
        self.dataset = dataset
        self.shuffle = shuffle
        self.count_data = len(dataset)
        self.batch_size = batch_size
        
    def __iter__(self):
        return DataLoaderIterator(self)

In [None]:
np.random.binomial(size=(10, 10), p=0.1, n=1)

In [None]:
class Module:
    def __init__(self, name):
        self.name = name
        self.train_mode = False
        
    def __call__(self, *args):
        return self.forward(*args)
    
    def train(self):
        self.train_mode = True
        for m in self.modules():
            m.train()
        
    def eval(self):
        self.train_mode = False
        for m in self.modules():
            m.eval()
        
    def modules(self):
        ms = []
        for attr in self.__dict__:
            m = self.__dict__[attr]
            if isinstance(m, Module):
                ms.append(m)
        return ms
    
    def params(self):
        ps = []
        for attr in self.__dict__:
            p = self.__dict__[attr]
            if isinstance(p, Parameter):
                ps.append(p)
            
        ms = self.modules()
        for m in ms:
            ps.extend(m.params())
        return ps
    
    def info(self, n):
        ms = self.modules()
        output = f"{self.name}\n"
        for m in ms:
            output += ('  '*(n+1)) + f"{m.info(n+1)}\n"
        return output[:-1]
    
    def __repr__(self):
        return self.info(0)
    
class Initializer:
    def __init__(self, name):
        self.name = name
        
    def __call__(self, *args):
        return self.apply(*args)
        
class GaussInitializer(Initializer):
    # where :math:`\mu` is the mean and :math:`\sigma` the standard
    # deviation. The square of the standard deviation, :math:`\sigma^2`,
    # is called the variance.
    def __init__(self, mu, sigma):
        self.mu = mu
        self.sigma = sigma
        
    def apply(self, value):
        value[...] = np.random.normal(self.mu, self.sigma, value.shape)
    
class Parameter:
    def __init__(self, value):
        self.value = value
        self.delta = np.zeros(value.shape)
        
    def zero_grad(self):
        self.delta[...] = 0
        
class Linear(Module):
    def __init__(self, input_feature, output_feature):
        super().__init__("Linear")
        self.input_feature = input_feature
        self.output_feature = output_feature
        self.weights = Parameter(np.zeros((input_feature, output_feature)))
        self.bias = Parameter(np.zeros((1, output_feature)))
        
        # 权重初始化 
        initer = GaussInitializer(0, np.sqrt(2 / input_feature))  # np.sqrt(2 / input_feature)
        initer.apply(self.weights.value)
        
    def forward(self, x):
        self.x_save = x.copy()
        return x @ self.weights.value + self.bias.value
    
    #AB = C  G
    #dB = A.T @ G
    #dA = G @ B.T
    def backward(self, G):
        self.weights.delta += self.x_save.T @ G
        self.bias.delta += np.sum(G, 0)  #值复制
        return G @ self.weights.value.T
    
class ReLU(Module):
    def __init__(self, inplace=True):
        super().__init__("ReLU")
        self.inplace = inplace
        
    # 亿点点
    def forward(self, x):
        self.negative_position = x < 0
        if not self.inplace:
            x = x.copy()
            
        x[self.negative_position] = 0
        return x
    
    def backward(self, G):
        if not self.inplace:
            G = G.copy()
            
        G[self.negative_position] = 0
        return G

def sigmoid(x):
    p0 = x < 0
    p1 = ~p0
    x = x.copy()

    # 如果x的类型是整数，那么会造成丢失精度
    x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
    x[p1] = 1 / (1 + np.exp(-x[p1]))
    return x

class SWish(Module):
    def __init__(self):
        super().__init__("SWish")
        
    def forward(self, x):
        self.x_save = x.copy()
        self.sx = sigmoid(x)
        return x * self.sx
    
    def backward(self, G):
        return G * (self.sx + self.x_save * self.sx * (1 - self.sx))
    
class Dropout(Module):
    def __init__(self, prob_keep=0.5, inplace=True):
        super().__init__("Dropout")
        self.prob_keep = prob_keep
        self.inplace = inplace
        
    def forward(self, x):
        if not self.train_mode:
            return x
        
        self.mask = np.random.binomial(size=x.shape, p=1 - self.prob_keep, n=1)
        if not self.inplace:
            x = x.copy()
            
        x[self.mask] = 0
        x *= 1 / self.prob_keep  # rescale
        return x
    
    def backward(self, G):
        if not self.inplace:
            G = G.copy()
        G[self.mask] = 0
        G *= 1 / self.prob_keep
        return G
    
class Conv2d(Module):
    def __init__(self, in_feature, out_feature, kernel_size, padding=0, stride=1):
        super().__init__("Conv2d")
        self.in_feature = in_feature
        self.out_feature = out_feature
        self.kernel_size = kernel_size
        self.padding = padding
        self.stride = stride
        self.kernel = Parameter(np.ones((out_feature, in_feature, kernel_size, kernel_size)))
        self.bias = Parameter(np.zeros((out_feature)))
        initer = GaussInitializer(0, np.sqrt(2 / in_feature))  # np.sqrt(2 / input_feature)
        initer.apply(self.kernel.value)
        
    def forward(self, x):
        self.in_shape = x.shape
        ib, ic, ih, iw = self.in_shape
        self.oh = (ih + self.padding * 2 - self.kernel_size) // self.stride + 1
        self.ow = (iw + self.padding * 2 - self.kernel_size) // self.stride + 1
        col_w = self.oh * self.ow
        col_h = self.kernel_size * self.kernel_size * self.in_feature
        self.column = np.zeros((ib, col_h, col_w))
        self.output = np.zeros((ib, self.out_feature, self.oh, self.ow))
        khalf = self.kernel_size // 2
        self.kcol = self.kernel.value.reshape(self.out_feature, -1)
        for b in range(ib):
            for c in range(ic):
                for oy in range(self.oh):
                    for ox in range(self.ow):
                        for ky in range(self.kernel_size):
                            for kx in range(self.kernel_size):
                                column_y = ky * self.kernel_size + kx + c * self.kernel_size * self.kernel_size
                                column_x = ox + oy * self.ow
                                ix = ox * self.stride + kx - self.padding
                                iy = oy * self.stride + ky - self.padding
                                if ix >= 0 and iy >= 0 and ix < iw and iy < ih:
                                    self.column[b, column_y, column_x] = x[b, c, iy, ix]
            self.output[b] = (self.kcol @ self.column[b]).reshape(self.out_feature, self.oh, self.ow) + self.bias.value.reshape(self.out_feature, 1, 1)
        return self.output
    
    #AB = C  G
    #dB = A.T @ G
    #dA = G @ B.T
    def backward(self, G):
        
        ib, ic, ih, iw = self.in_shape
        for b in range(ib):
            self.kernel.delta += (G[b].reshape(self.out_feature, -1) @ self.column[b].T).reshape(self.kernel.value.shape)
    
        self.bias.delta += np.sum(G, axis=(0, 2, 3))
        self.Gout = np.zeros((self.in_shape))
        for b in range(ib):
            dcolumn = self.kcol.T @ G[b].reshape(self.out_feature, -1)
            
            for c in range(ic):
                for oy in range(self.oh):
                    for ox in range(self.ow):
                        for ky in range(self.kernel_size):
                            for kx in range(self.kernel_size):
                                column_y = ky * self.kernel_size + kx + c * self.kernel_size * self.kernel_size
                                column_x = ox + oy * self.ow
                                ix = ox * self.stride + kx - self.padding
                                iy = oy * self.stride + ky - self.padding
                                if ix >= 0 and iy >= 0 and ix < iw and iy < ih:
                                    self.Gout[b, c, iy, ix] += dcolumn[column_y, column_x]
        return self.Gout
    
class Flatten(Module):
    def __init__(self):
        super().__init__("Flatten")
        
    def forward(self, x):
        self.in_shape = x.shape
        out = x.reshape(self.in_shape[0], -1)
        return out
    
    def backward(self, G):
        return G.reshape(self.in_shape)
    
class ModuleList(Module):
    def __init__(self, *args):
        super().__init__("ModuleList")
        self.ms = list(args)
        
    def modules(self):
        return self.ms
    
    def forward(self, x):
        for m in self.ms:
            x = m(x)
        return x
    
    def backward(self, G):
        for i in range(len(self.ms)-1, -1, -1):
            G = self.ms[i].backward(G)
        return G
    
class SigmoidCrossEntropy(Module):
    def __init__(self, params, weight_decay=1e-5):
        super().__init__("CrossEntropyLoss")
        self.params = params
        self.weight_decay = weight_decay
        
    def sigmoid(self, x):
        #return 1 / (1 + np.exp(-x))
        p0 = x < 0
        p1 = ~p0
        x = x.copy()
        x[p0] = np.exp(x[p0]) / (1 + np.exp(x[p0]))
        x[p1] = 1 / (1 + np.exp(-x[p1]))
        return x
    
    def decay_loss(self):
        loss = 0
        for p in self.params:
            loss += np.sqrt(np.sum(p.value ** 2)) / (2 * p.value.size) * self.weight_decay
        return loss
    
    def decay_backward(self):
        for p in self.params:
            eps = 1e-8
            p.delta += 1 / (2 * np.sqrt(np.sum(p.value ** 2)) + eps) / (2 * p.value.size) * self.weight_decay * 2 * p.value

    def forward(self, x, label_onehot):
        eps = 1e-6
        self.label_onehot = label_onehot
        self.predict = self.sigmoid(x)
        self.predict = np.clip(self.predict, a_max=1-eps, a_min=eps)  # 裁切
        self.batch_size = self.predict.shape[0]
        return -np.sum(label_onehot * np.log(self.predict) + (1 - label_onehot) * 
                        np.log(1 - self.predict)) / self.batch_size + self.decay_loss()
    
    def backward(self):
        self.decay_backward()
        return (self.predict - self.label_onehot) / self.batch_size
    
class SoftmaxCrossEntropy(Module):
    def __init__(self):
        super().__init__("SoftmaxCrossEntropy")
        
    def softmax(self, x):
        #return np.exp(x) / np.sum(np.exp(x), axis=1, keepdims=True)
        max_x = np.max(x, axis=1, keepdims=True)
        exp_x = np.exp(x - max_x)
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, x, label_onehot):
        eps = 1e-6
        self.label_onehot = label_onehot
        self.predict = self.softmax(x)
        self.predict = np.clip(self.predict, a_max=1-eps, a_min=eps)  # 裁切
        self.batch_size = self.predict.shape[0]
        return -np.sum(label_onehot * np.log(self.predict)) / self.batch_size
    
    def backward(self):
        return (self.predict - self.label_onehot) / self.batch_size

In [None]:
# class Conv2d(Module):
#     def __init__(self, in_feature, out_feature, kernel_size, padding=0, stride=1):
#         super().__init__("Conv2d")
#         self.in_feature = in_feature
#         self.out_feature = out_feature
#         self.kernel_size = kernel_size
#         self.padding = padding
#         self.stride = stride
#         self.kernel = Parameter(np.ones((out_feature, in_feature, kernel_size, kernel_size)))
#         self.bias = Parameter(np.zeros((out_feature)))
#         #initer = GaussInitializer(0, np.sqrt(2 / in_feature))  # np.sqrt(2 / input_feature)
#         #initer.apply(self.kernel.value)
        
#     def forward(self, x):
#         self.in_shape = x.shape
#         ib, ic, ih, iw = self.in_shape
#         self.oh = (ih + self.padding * 2 - self.kernel_size) // self.stride + 1
#         self.ow = (iw + self.padding * 2 - self.kernel_size) // self.stride + 1
#         col_w = self.oh * self.ow
#         col_h = self.kernel_size * self.kernel_size * self.in_feature
#         self.column = np.zeros((col_h, col_w))
#         self.output = np.zeros((ib, self.out_feature, self.oh, self.ow))
#         khalf = self.kernel_size // 2
#         self.kcol = self.kernel.value.reshape(self.out_feature, -1)
#         for b in range(ib):
#             for c in range(ic):
#                 for oy in range(self.oh):
#                     for ox in range(self.ow):
#                         for ky in range(self.kernel_size):
#                             for kx in range(self.kernel_size):
#                                 column_y = ky * self.kernel_size + kx + c * self.kernel_size * self.kernel_size
#                                 column_x = ox + oy * self.ow
#                                 ix = ox * self.stride + kx - self.padding
#                                 iy = oy * self.stride + ky - self.padding
#                                 if ix >= 0 and iy >= 0 and ix < iw and iy < ih:
#                                     self.column[column_y, column_x] = x[b, c, iy, ix]
#             self.output[b] = (self.kcol @ self.column).reshape(self.out_feature, self.oh, self.ow) + self.bias.value.reshape(self.out_feature, 1, 1)
#         return self.output
    
#     #AB = C  G
#     #dB = A.T @ G
#     #dA = G @ B.T
#     def backward(self, G):
        
#         ib, ic, ih, iw = self.in_shape
#         for b in range(ib):
#             self.kernel.delta += (G[b].reshape(self.out_feature, -1) @ self.column.T).reshape(self.kernel.value.shape)
    
#         self.bias.delta += np.sum(G, axis=(0, 2, 3))
#         self.Gout = np.zeros((self.in_shape))
#         for b in range(ib):
#             dcolumn = self.kcol.T @ G[b].reshape(self.out_feature, -1)
            
#             for c in range(ic):
#                 for oy in range(self.oh):
#                     for ox in range(self.ow):
#                         for ky in range(self.kernel_size):
#                             for kx in range(self.kernel_size):
#                                 column_y = ky * self.kernel_size + kx + c * self.kernel_size * self.kernel_size
#                                 column_x = ox + oy * self.ow
#                                 ix = ox * self.stride + kx - self.padding
#                                 iy = oy * self.stride + ky - self.padding
#                                 if ix >= 0 and iy >= 0 and ix < iw and iy < ih:
#                                     self.Gout[b, c, iy, ix] += dcolumn[column_y, column_x]
#         return self.Gout

In [None]:
conv = Conv2d(1, 2, 3, 1, 1)
x = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9]
]).reshape(1, 1, 3, 3).astype(np.float32)
x = np.repeat(x, 2, axis=0)
x[1] *= 0.5
o = conv(x)
g = conv.backward(o)
o, g

In [None]:
x = np.ones((256, 10))

In [None]:
np.sum(x, axis=1, keepdims=True).shape

$$ p(x) = \frac{1}{\sqrt{ 2 \pi \sigma^2 }}
                     e^{ - \frac{ (x - \mu)^2 } {2 \sigma^2} } $$

In [None]:
class Optimizer:
    def __init__(self, name, model, lr):
        self.name = name
        self.model = model
        self.lr = lr
        self.params = model.params()
                
    def zero_grad(self):
        for param in self.params:
            param.zero_grad()
            
    def set_lr(self, lr):
        self.lr = lr
        
class SGD(Optimizer):
    def __init__(self, model, lr=1e-3):
        super().__init__("SGD", model, lr)
    
    def step(self):
        for param in self.params:
            param.value -= self.lr * param.delta
            
class SGDMomentum(Optimizer):
    def __init__(self, model, lr=1e-3, momentum=0.9):
        super().__init__("SGDMomentum", model, lr)
        self.momentum = momentum
        
        for param in self.params:
            param.v = 0
    
    # 移动平均
    def step(self):
        for param in self.params:
            param.v = self.momentum * param.v - self.lr * param.delta
            param.value += param.v
            
class Adam(Optimizer):
    def __init__(self, model, lr=1e-3, beta1=0.9, beta2=0.999, l2_regularization = 0):
        super().__init__("Adam", model, lr)
        self.beta1 = beta1
        self.beta2 = beta2
        self.l2_regularization = l2_regularization
        self.t = 0
        
        for param in self.params:
            param.m = 0
            param.v = 0
            
    # 指数移动平均
    def step(self):
        eps = 1e-8
        self.t += 1
        for param in self.params:
            g = param.delta
            param.m = self.beta1 * param.m + (1 - self.beta1) * g
            param.v = self.beta2 * param.v + (1 - self.beta2) * g ** 2
            mt_ = param.m / (1 - self.beta1 ** self.t)
            vt_ = param.v / (1 - self.beta2 ** self.t)
            param.value -= self.lr * mt_ / (np.sqrt(vt_) + eps) + self.l2_regularization * param.value

In [None]:
class Model(Module):
    def __init__(self, num_feature, num_hidden, num_classes):
        super().__init__("Model")
        self.backbone = ModuleList(
            Conv2d(1, 16, 3, 0, 2),
            ReLU(),
            Conv2d(16, 8, 3, 0, 2),
            ReLU(),
            Conv2d(8, 8, 3, 0, 1),
            ReLU(),
            Flatten(),
            Linear(num_feature, num_hidden),
            ReLU(),
            Dropout(),
            Linear(num_hidden, num_classes)
        )
        
    def forward(self, x):
        return self.backbone(x)
    
    def backward(self, G):
        return self.backbone.backward(G)

In [None]:
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
    
def estimate_val(predict, gt_labels, classes, loss_func):
    plabel = predict.argmax(1)
    positive = plabel == gt_labels
    total_images = predict.shape[0]
    accuracy = sum(positive) / total_images
    return accuracy, loss_func(predict, one_hot(gt_labels, classes))

def lr_schedule_cosine(lr_min, lr_max, per_epochs):
    def compute(epoch):
        return lr_min + 0.5 * (lr_max - lr_min) * (1 + np.cos(epoch / per_epochs * np.pi))
    return compute

In [None]:
image = cv2.imread("cat.jpg")
show = image.copy()
image = cv2.resize(image, (28, 28))
plt.imshow(image[..., ::-1])
image = ((image / 255) - 0.5).astype(np.float32)
image = image.transpose(2, 0, 1)[None]
image.shape, show.shape

In [None]:
sx = 1 / 450
sy = 1 / 324
x, y, r, b = 41 * sx, 21 * sy, 210 * sx, 189 * sy
#cv2.rectangle(show, (x,y), (r, b), (0, 255, 0), 2)
#plt.imshow(show[..., ::-1])
x, y, r, b

In [None]:
class Model(Module):
    def __init__(self, num_feature, num_hidden, nreg):
        super().__init__("Model")
        self.backbone = ModuleList(
            Conv2d(3, 16, 3, 0, 2),
            ReLU(),
            Conv2d(16, 8, 3, 0, 2),
            ReLU(),
            Conv2d(8, 8, 3, 0, 1),
            ReLU(),
            Flatten(),
            Linear(num_feature, num_hidden),
            ReLU(),
            Linear(num_hidden, nreg)
        )
        
    def forward(self, x):
        return self.backbone(x)
    
    def backward(self, G):
        return self.backbone.backward(G)

np.random.seed(3)
lr = 1e-2
data_dims = 128

model = Model(data_dims, 64, 4)
optim = Adam(model, lr)
gt = np.array([x, y, r, b])

for i in range(100):
    predict = model(image)
    loss = np.sum((gt - predict) ** 2) * 0.5
    print(loss)

    optim.zero_grad()
    G = predict - gt
    model.backward(G)
    optim.step()   # 应用梯度，更新参数



In [None]:
predict = model(image)
predict, gt

In [None]:
h, w = show.shape[:2]
nx, ny, nr, nb = predict[0] * np.array([w, h, w, h])
nx, ny, nr, nb

In [None]:
cv2.rectangle(show, (int(nx), int(ny)), (int(nr), int(nb)), (0, 255, 0), 2)
plt.imshow(show[..., ::-1])

In [None]:
np.random.seed(3)
classes = 10                  # 定义10个类别
batch_size = 32              # 定义每个批次的大小
epochs = 20                   # 退出策略，也就是最大把所有数据看10次
lr = 1e-2
numdata, _, _, _ = train_images.shape  # 60000, 784
data_dims = 128

# 定义dataloader和dataset，用于数据抓取
train_data = DataLoader(Dataset(train_images, one_hot(train_labels, classes)), batch_size, shuffle=True)
model = Model(data_dims, 64, classes)
#loss_func = SoftmaxCrossEntropy()
loss_func = SigmoidCrossEntropy(model.params(), 0)
optim = Adam(model, lr)
iters = 0   # 定义迭代次数，因为我们需要展示loss曲线，那么x将会是iters

lr_schedule = {
    5: 1e-3,
    15: 1e-4,
    18: 1e-5
}

# 开始进行epoch循环，总数是epochs次
for epoch in range(epochs):
    
    if epoch in lr_schedule:
        lr = lr_schedule[epoch]
        optim.set_lr(lr)
    
    model.train()
    # 对一个批次内的数据进行迭代，每一次迭代都是一个batch（即256）
    for index, (images, labels) in enumerate(train_data):
        
        x = model(images)
        
        # 计算loss值
        loss = loss_func(x, labels)
        
        optim.zero_grad()
        G = loss_func.backward()
        model.backward(G)
        optim.step()   # 应用梯度，更新参数
        iters += 1
        
        print(f"Iter {iters}, {epoch} / {epochs}, Loss {loss:.3f}, LR {lr:g}")
    
    model.eval()
    val_accuracy, val_loss = estimate_val(model(val_images), val_labels, classes, loss_func)
    print(f"Val set, Accuracy: {val_accuracy:.6f}, Loss: {val_loss:.3f}")

### 今日总结
1. Makefile的使用
    - \\$\@ 生成项  \\$\< 依赖项第一个  \\$^ 依赖项的所有
    - var := \\$(shell command)  执行command，结果赋值给var
    - 生成项 : 依赖项1 依赖项2 依赖项n
         - command
    - \\$(patsubst src,dst,list)
    - 数据类型，字符串，数组，数组以空格区分
    - find . -name "*.cpp" 查找当前目录下的所有cpp文件
    - %.o : %.cpp  通配
2. 优化了程序结构
    - 使用Layer的方式抽象每一个层
    - 使用Model的方式抽象模型
    - 使用Parameter抽象可训练参数
    - 使用Optimizer抽象优化器
    - 引入参数初始化器，高斯初始化
    - 加入zero_grad，清空梯度，其实是摆设。多次迭代，一次更新时有用（etc. GAN、强化学习）
3. 引入优化器
    - Momentum SGD，动量SGD，移动平均，物理解释（算是合理的）是惯性，这个也很常用
    - Adam，指数移动平均，这个很常用
4. ReLU激活函数
    - 在0点的导数，是不可导。可以指定为0或者1，自己去研究
5. 没有了

### 作业
1. 周五24：00交
2. 交什么？
    - a. 实现老师今天总结的BP引入的内容（把这个程序打一遍）
    - b. 要求精度大于0.965以上
    - c. 精度第一的，考虑奖励一些东西