## 知识蒸馏-mnist手写数据集

In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets
import utils.calculate_param as cp
from tqdm import tqdm
import math
import numpy as np
import pandas as pd
import geatpy as ea
from multiprocessing.dummy import Pool as ThreadPool
from sklearn.metrics import mean_absolute_error, explained_variance_score, r2_score, mean_squared_error

### 数据准备

In [2]:
# 没有就下载-手写数据集
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=transforms.ToTensor(),
)

test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=transforms.ToTensor(),
)

In [3]:
batch_size = 64

train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break


Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


### 设备准备

In [4]:
# 获取设备类型
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cpu'

### 定义教师模型

In [5]:
class TeacherModel(nn.Module):
    def __init__(self, num_classes= 10):
        super(TeacherModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 1200)
        self.fc2 = nn.Linear(1200, 1200)
        self.fc3 = nn.Linear(1200, num_classes)
        self.dropout = nn.Dropout(p = 0.5)

    # 处理逻辑：fc1->dropout->relu->fc2->dropout->relu->fc3
    def forward(self, x):
        x = x.view(-1,784)
        x = self.fc1(x)
        x = self.dropout(x)
        x = self.relu(x)

        x = self.fc2(x)
        x = self.dropout(x)
        x = self.relu(x)

        x = self.fc3(x)
        return x


### 教师模型设置

In [6]:
model = TeacherModel().to(device)

criterion = nn.CrossEntropyLoss() # 设置使用交叉熵损失函数
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4) # 使用Adam优化器，学习率为lr=1e-4

### 教师模型信息

In [7]:
# 输出模型的参数信息-100w参数
cp.get_summary(model, input_size=(64, 1, 28,28))

torchinfo信息如下：
Layer (type:depth-idx)                   Output Shape              Param #
TeacherModel                             [64, 10]                  --
├─Linear: 1-1                            [64, 1200]                942,000
├─Dropout: 1-2                           [64, 1200]                --
├─ReLU: 1-3                              [64, 1200]                --
├─Linear: 1-4                            [64, 1200]                1,441,200
├─Dropout: 1-5                           [64, 1200]                --
├─ReLU: 1-6                              [64, 1200]                --
├─Linear: 1-7                            [64, 10]                  12,010
Total params: 2,395,210
Trainable params: 2,395,210
Non-trainable params: 0
Total mult-adds (M): 153.29
Input size (MB): 0.20
Forward/backward pass size (MB): 1.23
Params size (MB): 9.58
Estimated Total Size (MB): 11.02


### 教师模型训练&评估

In [8]:
%%time
epochs = 6 # 训练6轮
for epoch in range(epochs):
    model.train()
    for data,targets in tqdm(train_dataloader):
        # 前向预测
        outputs = model(data)
        loss = criterion(outputs, targets)

        # 反向传播，优化权重
        optimizer.zero_grad()  # 把梯度置为0
        loss.backward()
        optimizer.step()

    # 测试集上评估性能 进入评估模式
    model.eval()
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for x,y in test_dataloader:
            outputs = model(x)
            predictions = outputs.max(1).indices
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        acc = (num_correct / num_samples).item()

    model.train()  # 再次进入训练模式
    print("Epoch:{}\t Accuracy:{:4f}".format(epoch + 1, acc))

# 暂存教师模型为teacher_model
teacher_model = model


100%|██████████| 938/938 [00:23<00:00, 39.64it/s]


Epoch:1	 Accuracy:0.832200


100%|██████████| 938/938 [00:23<00:00, 40.62it/s]


Epoch:2	 Accuracy:0.848900


100%|██████████| 938/938 [00:22<00:00, 42.18it/s]


Epoch:3	 Accuracy:0.856800


100%|██████████| 938/938 [00:21<00:00, 43.54it/s]


Epoch:4	 Accuracy:0.863300


100%|██████████| 938/938 [00:21<00:00, 43.10it/s]


Epoch:5	 Accuracy:0.869300


100%|██████████| 938/938 [00:22<00:00, 41.94it/s]


Epoch:6	 Accuracy:0.871100
CPU times: user 4min 17s, sys: 13.6 s, total: 4min 30s
Wall time: 2min 25s


### 定义学生模型

In [24]:
# 学生模型
class StudentModel(nn.Module):
    def __init__( self, num_class=10):
        super(StudentModel, self).__init__()
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(784, 20)
        self.fc2 = nn.Linear(20, 20)
        self.fc3 = nn.Linear(20, num_class)
        self.dropout = nn.Dropout(p = 0.5)

    # 处理逻辑：fc1->relu->fc2->relu->fc3
    def forward(self, x):
        x = x.view(-1, 784)
        x = self.fc1(x)
        # x = self.dropout(x)
        x = self.relu(x)

        # x = self.fc2(x)
        # x = self.dropout(x)
        # x = self.relu(x)

        x = self.fc3(x)
        return x


### 学生模型设置

In [25]:
# 从头先训练一下学生模型
model = StudentModel().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(),lr=1e-4)

### 学生模型信息

In [26]:
# 输出模型的参数信息-1w参数
cp.get_summary(model, input_size=(64, 1, 28,28))

torchinfo信息如下：
Layer (type:depth-idx)                   Output Shape              Param #
StudentModel                             [64, 10]                  210
├─Linear: 1-1                            [64, 20]                  15,700
├─ReLU: 1-2                              [64, 20]                  --
├─Linear: 1-3                            --                        420
├─Linear: 1-6                            [64, 10]                  (recursive)
├─Dropout: 1-5                           --                        --
├─Linear: 1-6                            [64, 10]                  (recursive)
Total params: 16,330
Trainable params: 16,330
Non-trainable params: 0
Total mult-adds (M): 1.03
Input size (MB): 0.20
Forward/backward pass size (MB): 0.01
Params size (MB): 0.06
Estimated Total Size (MB): 0.28


### 学生模型训练&评估

In [27]:
%%time
epochs = 6
for epoch in range(epochs):
    model.train()
    for data,targets in tqdm(train_dataloader):
        # 前向预测
        outputs = model(data)
        loss = criterion(outputs, targets)

        # 反向传播，优化权重
        optimizer.zero_grad() # 把梯度置为0
        loss.backward()
        optimizer.step()

    model.eval()
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for x,y in  test_dataloader:
            outputs = model(x)
            predictions = outputs.max(1).indices
            num_correct += (predictions==y).sum()
            num_samples += predictions.size(0)
            acc = (num_correct / num_samples).item()

    model.train()
    print("Epoch:{}\t Accuracy:{:4f}".format(epoch + 1, acc))

# 暂存不加蒸馏学生模型为student_model
student_model = model


100%|██████████| 938/938 [00:05<00:00, 178.49it/s]


Epoch:1	 Accuracy:0.727300


100%|██████████| 938/938 [00:05<00:00, 180.80it/s]


Epoch:2	 Accuracy:0.774200


100%|██████████| 938/938 [00:05<00:00, 180.90it/s]


Epoch:3	 Accuracy:0.798800


100%|██████████| 938/938 [00:05<00:00, 180.39it/s]


Epoch:4	 Accuracy:0.807700


100%|██████████| 938/938 [00:05<00:00, 170.02it/s]


Epoch:5	 Accuracy:0.818200


100%|██████████| 938/938 [00:05<00:00, 173.47it/s]


Epoch:6	 Accuracy:0.824100
CPU times: user 1min 10s, sys: 1.09 s, total: 1min 11s
Wall time: 36.7 s


### 知识蒸馏准备

In [28]:
# 准备预训练好的教师模型
teacher_model.eval()

# 准备新的学生模型
model = StudentModel().to(device)

# 蒸馏温度
T = 7

### 蒸馏参数设置

In [29]:
# hard_loss
hard_loss = nn.CrossEntropyLoss()
# hard_loss权重
alpha = 0.27
# soft_loss kl散度
soft_loss = nn.KLDivLoss(reduction='batchmean')

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

### 知识蒸馏训练&评估

In [30]:
%%time
epochs = 8
for epoch in range(epochs):
    for data,targets in tqdm(train_dataloader):
        data, targets = data.to(device), targets.to(device)
        # 教师模型预测
        with torch.no_grad():
            teacher_outputs = teacher_model(data)
        # 学生模型预测
        student_outputs = model(data)
        student_loss = hard_loss(student_outputs, targets)
        # 计算蒸馏后的预测结果及soft_loss
        distillation_loss = soft_loss(
            F.softmax(student_outputs/T, dim=1),
            F.softmax(teacher_outputs/T, dim=1)
        )
        # 将 hard_loss 和 soft_loss 加权求和
        loss = alpha * student_loss + (1-alpha) * distillation_loss
        # 反向传播,优化权重
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # 测试集上评估性能
    model.eval()
    num_correct = 0
    num_samples = 0

    with torch.no_grad():
        for x,y in test_dataloader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            pred = outputs.max(1).indices
            num_correct += (pred == y).sum()
            num_samples += pred.size(0)
        acc = (num_correct/num_samples).item()

    model.train()
    print("Epoch:{}\t Accuracy:{:4f}".format(epoch + 1, acc))


100%|██████████| 938/938 [00:08<00:00, 115.25it/s]


Epoch:1	 Accuracy:0.730700


100%|██████████| 938/938 [00:07<00:00, 117.88it/s]


Epoch:2	 Accuracy:0.781900


100%|██████████| 938/938 [00:07<00:00, 120.81it/s]


Epoch:3	 Accuracy:0.800800


100%|██████████| 938/938 [00:07<00:00, 117.26it/s]


Epoch:4	 Accuracy:0.809700


100%|██████████| 938/938 [00:07<00:00, 120.69it/s]


Epoch:5	 Accuracy:0.815500


100%|██████████| 938/938 [00:07<00:00, 120.39it/s]


Epoch:6	 Accuracy:0.821200


100%|██████████| 938/938 [00:08<00:00, 112.96it/s]


Epoch:7	 Accuracy:0.824300


100%|██████████| 938/938 [00:07<00:00, 118.21it/s]


Epoch:8	 Accuracy:0.827000
CPU times: user 2min 14s, sys: 1.84 s, total: 2min 16s
Wall time: 1min 10s


### 蒸馏模型保存

In [18]:
# 保存模型
torch.save(model.state_dict(), "./models/distillation_model.pth")

### 定义问题类

In [19]:
class MOEA(ea.Problem):
    def __init__(self, train_dataloader, test_dataloader):
        name = 'MOEA'
        M = 1 # 初始化M（目标维数）
        maxormins = [-1] # 初始化maxormins（目标最小最大化标记列表，1：最小化该目标；-1：最大化该目标）
        Dim = 2 # 初始化Dim（决策变量维数）
        varTypes = np.array([0] * Dim) # 初始化varTypes 0-连续
        lb = [5, 0.1] # 决策变量下界
        ub = [10, 0.9] # 决策变量上界
        lbin = [1] * Dim # 决策变量下边界（0表示不包含该变量的下边界，1表示包含）
        ubin = [1] * Dim # 决策变量上边界（0表示不包含该变量的上边界，1表示包含）
        # 调用父类构造方法完成实例化
        ea.Problem.__init__(self, name, M, maxormins, Dim, varTypes, lb, ub, lbin, ubin)
        # 数据设置
        self.train_dataloader = train_dataloader
        self.test_dataloader = test_dataloader


    # 目标函数，采用多线程加速计算
    def aimFunc(self, pop):
        Vars = pop.Phen # 得到决策变量矩阵
        # print(Vars)
        pop.ObjV = np.zeros((pop.sizes, 1)) # 初始化种群个体目标函数值列向量
        def subAimFunc(i):
            epochs, alpha = int(Vars[i, 0]), float(Vars[i, 1])
            print(epochs, alpha)
            final_loss = 0
            for epoch in range(epochs):
                for data,targets in tqdm(train_dataloader):
                    data, targets = data.to(device), targets.to(device)
                    # 教师模型预测
                    with torch.no_grad():
                        teacher_outputs = teacher_model(data)
                    # 学生模型预测
                    student_outputs = model(data)
                    student_loss = hard_loss(student_outputs, targets)
                    # 计算蒸馏后的预测结果及soft_loss
                    distillation_loss = soft_loss(
                        F.softmax(student_outputs/T, dim=1),
                        F.softmax(teacher_outputs/T, dim=1)
                    )
                    # 将 hard_loss 和 soft_loss 加权求和
                    loss = alpha * student_loss + (1-alpha) * distillation_loss
                    final_loss = loss.item()
                    # 反向传播,优化权重
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

            pop.ObjV[i] = final_loss # 最小化最终的损失作为目标函数
        pool = ThreadPool(processes=2) # 设置池的大小
        pool.map(subAimFunc, list(range(pop.sizes))) # 散列种群每个个体进行加速计算


    # 代入优化后的参数先训练再对测试集进行检验，计算指标
    def test(self, epochs, alpha):
        for epoch in range(epochs):
            for data,targets in tqdm(train_dataloader):
                data, targets = data.to(device), targets.to(device)
                # 教师模型预测
                with torch.no_grad():
                    teacher_outputs = teacher_model(data)
                # 学生模型预测
                student_outputs = model(data)
                student_loss = hard_loss(student_outputs, targets)
                # 计算蒸馏后的预测结果及soft_loss
                distillation_loss = soft_loss(
                    F.softmax(student_outputs/T, dim=1),
                    F.softmax(teacher_outputs/T, dim=1)
                )
                # 将 hard_loss 和 soft_loss 加权求和
                loss = alpha * student_loss + (1-alpha) * distillation_loss
                # 反向传播,优化权重
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # 测试集上评估性能
            model.eval()
            num_correct = 0
            num_samples = 0

            with torch.no_grad():
                for x,y in test_dataloader:
                    x, y = x.to(device), y.to(device)
                    outputs = model(x)
                    pred = outputs.max(1).indices
                    num_correct += (pred == y).sum()
                    num_samples += pred.size(0)
                acc = (num_correct/num_samples).item()

            model.train()
            print("Epoch:{}\t Accuracy:{:4f}".format(epoch + 1, acc))

        torch.save(model.state_dict(), "./models/moea_distillation.pth")



In [20]:
"""===============================实例化问题对象==========================="""

problem = MOEA(train_dataloader, test_dataloader) # 生成问题对象

"""=================================种群设置==============================="""

Encoding = 'RI'       # 编码方式
NIND = 10             # 种群规模
Field = ea.crtfld(Encoding, problem.varTypes, problem.ranges, problem.borders) # 创建区域描述器
population = ea.Population(Encoding, Field, NIND) # 实例化种群对象（此时种群还没被初始化，仅仅是完成种群对象的实例化）

"""===============================算法参数设置============================="""

myAlgorithm = ea.soea_DE_rand_1_bin_templet(problem, population) # 实例化一个算法模板对象
myAlgorithm.MAXGEN = 10 # 最大进化代数
myAlgorithm.trappedValue = 1e-6 # “进化停滞”判断阈值
myAlgorithm.maxTrappedCount = 10 # 进化停滞计数器最大上限值，如果连续maxTrappedCount代被判定进化陷入停滞，则终止进化
myAlgorithm.logTras = 1  # 设置每隔多少代记录日志，若设置成0则表示不记录日志
myAlgorithm.verbose = True  # 设置是否打印输出日志信息
myAlgorithm.drawing = 1  # 设置绘图方式（0：不绘图；1：绘制结果图；2：绘制目标空间过程动画；3：绘制决策空间过程动画）

"""===========================调用算法模板进行种群进化======================="""

[BestIndi, population] = myAlgorithm.run()  # 执行算法模板，得到最优个体以及最后一代种群
BestIndi.save()  # 把最优个体的信息保存到文件中

"""==================================输出结果============================="""

print('用时：%f 秒' % myAlgorithm.passTime)
print('评价次数：%d 次' % myAlgorithm.evalsNum)
if BestIndi.sizes != 0:
    print('最优的目标函数值为：%s' % BestIndi.ObjV[0][0])
    print('最优的控制变量值为：')
    for i in range(BestIndi.Phen.shape[1]):
        print(BestIndi.Phen[0, i])
else:
    print('没找到可行解。')

"""=================================检验结果==============================="""

problem.test(epochs= int(BestIndi.Phen[0][0]), alpha= float(BestIndi.Phen[0][1]))


7 0.3189171239733696





6 0.2855765145272017


  0%|          | 0/938 [00:00<?, ?it/s][A

  0%|          | 2/938 [00:00<00:54, 17.05it/s]


6 0.8424268286675215



  0%|          | 0/938 [00:00<?, ?it/s][A

  0%|          | 3/938 [00:00<00:39, 23.75it/s][A[A
  0%|          | 2/938 [00:00<00:52, 17.78it/s][A

  1%|          | 6/938 [00:00<00:44, 20.86it/s][A[A
  0%|          | 4/938 [00:00<00:51, 18.15it/s][A

 94%|█████████▎| 878/938 [00:08<00:01, 36.62it/s]A[A
 94%|█████████▎| 878/938 [00:08<00:00, 102.62it/s]


  1%|          | 9/938 [00:00<00:48, 19.11it/s]][A[A


6 0.8656420316547155


  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 1/938 [00:00<01:05, 14.39it/s]][A[A


9 0.6277091335505247


  0%|          | 0/938 [00:00<?, ?it/s]

  0%|          | 4/938 [00:00<00:25, 37.06it/s]][A[A

  1%|          | 10/938 [00:00<00:26, 34.80it/s][A[A


  3%|▎         | 31/938 [00:00<00:23, 38.58it/s][A[A

  4%|▍         | 38/938 [00:01<00:19, 45.81it/s][A[A

  5%|▍         | 46/938 [00:01<00:16, 54.51it/s][A[A

  6%|▌         | 54/938 [00:01<00:14, 61.32it/s][A[A

  7%|▋         | 65/938 [00:01<00:11, 74.37it/s][A[A

  8%|▊         | 76/938 [00:01<00:10, 83.75it/s][A[A

  9%|▉         | 86/938 [00:01<00:09, 88.29it/s][A[A

 10%|█         | 96/938 [00:01<00:09, 89.71it/s][A[A

 11%|█▏        | 106/938 [00:01<00:09, 85.26it/s][A[A

 12%|█▏        | 116/938 [00:01<00:09, 87.53it/s][A[A

 13%|█▎        | 125/938 [00:02<00:09, 87.56it/s][A[A

 14%|█▍        | 134/938 [00:02<00:09, 86.86it/s][A[A

 15%|█▌        | 143/938 [00:02<00:09, 87.52it/s][A[A

 16%|█▌        | 152/938 [00:02<00:09, 86.56it/s][A[A

 17%|█▋        | 163/938 [00:02<00:08, 92.49it/s][A[A


8 0.2740337282419205


100%|██████████| 938/938 [00:08<00:00, 109.97it/s]
100%|██████████| 938/938 [00:07<00:00, 118.57it/s]
100%|██████████| 938/938 [00:08<00:00, 108.56it/s]
100%|██████████| 938/938 [00:08<00:00, 114.24it/s]
100%|██████████| 938/938 [00:08<00:00, 104.54it/s]
100%|██████████| 938/938 [00:08<00:00, 114.32it/s]
100%|██████████| 938/938 [00:08<00:00, 106.33it/s]
100%|██████████| 938/938 [00:08<00:00, 116.82it/s]


RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [20, 10]], which is output 0 of AsStridedBackward0, is at version 23416; expected version 23415 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).