In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random


class SimpleMLP(nn.Module):
    """Baseline two-layer perceptron: Linear -> ReLU -> Linear.

    Later cells instantiate this class and load checkpoints whose keys are
    ``fc1.weight``, ``fc1.bias``, ``fc2.weight`` and ``fc2.bias``, so the
    attribute names ``fc1`` / ``fc2`` must not change.

    Args:
        input_size: Number of features per sample after flattening.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output scores per sample.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super(SimpleMLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Flatten each sample and return the raw (un-normalised) class scores."""
        # Collapse every non-batch dimension so image batches can be fed directly.
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

class CombinedMLP(nn.Module):
    """Two-layer MLP whose weights are a learnable blend of three pretrained MLPs.

    The only trainable parameters are the scalars ``alpha``, ``beta`` and
    ``gamma``. On every forward pass they are divided by their sum and used to
    mix the ``fc1`` / ``fc2`` weights and biases taken from the supplied state
    dicts.

    Args:
        input_size: Unused by this class.
        hidden_size: Unused by this class.
        num_classes: Unused by this class.
        state_dicts: Indexable collection whose items 0, 1 and 2 are state
            dicts providing ``fc1.weight``, ``fc1.bias``, ``fc2.weight`` and
            ``fc2.bias``.
    """

    def __init__(self, input_size, hidden_size, num_classes, state_dicts):
        super(CombinedMLP, self).__init__()
        self.alpha = nn.Parameter(torch.randn(1))
        self.beta = nn.Parameter(torch.randn(1))
        self.gamma = nn.Parameter(torch.randn(1))

        # Stored as a plain attribute (not buffers/parameters), so these tensors
        # are not part of this module's state_dict and are not moved by .to().
        self.pretrained_weights = state_dicts
        self.relu = nn.ReLU()

    def _mix(self, name, coeffs):
        """Return the coefficient-weighted sum of entry ``name`` over the three state dicts."""
        sources = self.pretrained_weights
        return (coeffs[0] * sources[0][name]
                + coeffs[1] * sources[1][name]
                + coeffs[2] * sources[2][name])

    def forward(self, x):
        """Flatten ``x`` and run it through the blended two-layer network."""
        x = x.view(x.size(0), -1)

        # Normalise the coefficients by their sum.
        # NOTE: this is a plain sum normalisation (not a softmax); it is
        # undefined when alpha + beta + gamma is zero.
        total = self.alpha + self.beta + self.gamma
        coeffs = (self.alpha / total, self.beta / total, self.gamma / total)

        # nn.functional is reached through the existing `nn` import, so no
        # separate `F` alias has to be in scope for this class to work.
        x = nn.functional.linear(
            x, self._mix('fc1.weight', coeffs), self._mix('fc1.bias', coeffs))
        x = self.relu(x)
        x = nn.functional.linear(
            x, self._mix('fc2.weight', coeffs), self._mix('fc2.bias', coeffs))

        return x



# Seed every random number generator the notebook uses (Python, NumPy,
# PyTorch CPU and CUDA) so that runs are repeatable.
seed = 3
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

# Ask cuDNN for deterministic algorithms and switch off its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
import numpy as np
import random





# 3. Load the MNIST dataset (ToTensor followed by Normalize with mean 0.5, std 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])
train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset, batch_size=100, shuffle=True)

test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, transform=transform)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=100, shuffle=False)

# 4. Build the model, the loss function and the optimizer
model = SimpleMLP(784, 500, 10).to(device)
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(model.parameters(), lr=0.01)

# 5. Train the network
num_epochs = 5
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Clear stale gradients, then forward / backward / update.
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Report the current batch loss every 100 steps.
        if (i+1) % 100 == 0:
            print(
                f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

# 6. Measure accuracy on the test split
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)

        outputs = model(images)
        # Predicted class = index of the largest score along dimension 1.
        predicted = outputs.max(dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(
    f'Accuracy of the model on the 10000 test images: {100 * correct / total} %')

# 7. Save the trained weights; the file name encodes the seed and the accuracy
torch.save(model.state_dict(), f'./{seed}_{100 * correct / total}%.pth')


Epoch [1/5], Step [100/600], Loss: 1.3662
Epoch [1/5], Step [200/600], Loss: 0.9781
Epoch [1/5], Step [300/600], Loss: 0.5905
Epoch [1/5], Step [400/600], Loss: 0.4220
Epoch [1/5], Step [500/600], Loss: 0.4833
Epoch [1/5], Step [600/600], Loss: 0.4650
Epoch [2/5], Step [100/600], Loss: 0.5053
Epoch [2/5], Step [200/600], Loss: 0.4735
Epoch [2/5], Step [300/600], Loss: 0.4230
Epoch [2/5], Step [400/600], Loss: 0.4392
Epoch [2/5], Step [500/600], Loss: 0.3488
Epoch [2/5], Step [600/600], Loss: 0.2717
Epoch [3/5], Step [100/600], Loss: 0.2979
Epoch [3/5], Step [200/600], Loss: 0.3149
Epoch [3/5], Step [300/600], Loss: 0.2674
Epoch [3/5], Step [400/600], Loss: 0.2574
Epoch [3/5], Step [500/600], Loss: 0.4853
Epoch [3/5], Step [600/600], Loss: 0.3642
Epoch [4/5], Step [100/600], Loss: 0.4420
Epoch [4/5], Step [200/600], Loss: 0.3443
Epoch [4/5], Step [300/600], Loss: 0.5042
Epoch [4/5], Step [400/600], Loss: 0.1626
Epoch [4/5], Step [500/600], Loss: 0.2240
Epoch [4/5], Step [600/600], Loss:

In [8]:
# Sanity check: display whether PyTorch reports a usable CUDA device in this kernel.
torch.cuda.is_available()


True

In [35]:
# Rebuild the three separately trained MLPs from their checkpoints.
# map_location=device remaps the stored tensors onto the active device, so
# checkpoints that were written from a GPU still load when `device` is the CPU.
model1 = SimpleMLP(784, 500, 10).to(device)
model2 = SimpleMLP(784, 500, 10).to(device)
model3 = SimpleMLP(784, 500, 10).to(device)
model1.load_state_dict(torch.load("1_92.17%.pth", map_location=device))
model2.load_state_dict(torch.load("2_92.05%.pth", map_location=device))
model3.load_state_dict(torch.load("3_92.03%.pth", map_location=device))
print(model1.fc1.weight)

Parameter containing:
tensor([[ 0.0154, -0.0187, -0.0099,  ...,  0.0038, -0.0070, -0.0005],
        [-0.0266, -0.0217, -0.0302,  ..., -0.0022,  0.0020, -0.0159],
        [ 0.0283, -0.0093, -0.0339,  ..., -0.0139,  0.0058, -0.0334],
        ...,
        [ 0.0301, -0.0203, -0.0329,  ...,  0.0113, -0.0294,  0.0163],
        [ 0.0156, -0.0219, -0.0285,  ..., -0.0188, -0.0348,  0.0248],
        [ 0.0257, -0.0343, -0.0230,  ...,  0.0342,  0.0068,  0.0211]],
       device='cuda:0', requires_grad=True)


In [49]:
# model1 = SimpleMLP(784, 500, 10).to(device)
# model2 = SimpleMLP(784, 500, 10).to(device)
# model3 = SimpleMLP(784, 500, 10).to(device)
# model1.load_state_dict(torch.load("1_92.17%.pth"))
# model2.load_state_dict(torch.load("2_92.05%.pth"))
# model3.load_state_dict(torch.load("3_92.03%.pth"))

# Load the raw state dicts of the three pretrained MLPs. map_location=device
# places the tensors on the active device, so checkpoints written from a GPU
# still load (and match the model's device) when running on the CPU.
state_dict1 = torch.load("1_92.17%.pth", map_location=device)
state_dict2 = torch.load("2_92.05%.pth", map_location=device)
state_dict3 = torch.load("3_92.03%.pth", map_location=device)


def test_model(model, test_loader):
    model.eval()

    # 推理并计算准确度
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Accuracy on the test set: {accuracy:.2f}%")
    return accuracy


def weighted_sum_parameters(fusion_model, models, weights):
    """Overwrite ``fusion_model``'s parameters with a weighted sum of ``models``' parameters.

    For each parameter position, the fused value is
    ``sum(weights[k] * models[k].param)``. The source models are not modified.

    Args:
        fusion_model: Module whose parameters are overwritten in place.
        models: Source modules; their parameters are iterated in lock-step
            with those of ``fusion_model``.
        weights: One weight per source model (numbers or tensors).

    Raises:
        ValueError: If ``models`` is empty or its length differs from
            ``weights``.
    """
    models = list(models)
    weights = list(weights)
    if not models:
        raise ValueError("at least one source model is required")
    if len(models) != len(weights):
        raise ValueError(
            f"got {len(models)} models but {len(weights)} weights")

    # The copy is a plain value assignment, so nothing here needs to be tracked
    # by autograd.
    with torch.no_grad():
        # zip walks the fusion model's parameters together with the matching
        # parameter of every source model.
        for target, *sources in zip(fusion_model.parameters(),
                                    *(m.parameters() for m in models)):
            # Pair each weight with a *source* parameter only; the fusion
            # model's own current value must not take part in the sum.
            blended = sum(w * p for w, p in zip(weights, sources))
            target.copy_(blended)


best_accuracy = 0

for seed in range(1, 2):
    print("SEED:",seed)

    # Re-seed every RNG so each candidate seed starts from a known state.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Blend model: its alpha / beta / gamma are drawn at construction time,
    # after the seeding above.
    model = CombinedMLP(28*28, 500, 10, [state_dict1, state_dict2, state_dict3])
    model = model.to(device)

    # MNIST train / test loaders.
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ])
    train_dataset = torchvision.datasets.MNIST(
        root='./data', train=True, transform=transform, download=True)
    train_loader = torch.utils.data.DataLoader(
        dataset=train_dataset, batch_size=100, shuffle=True)
    test_dataset = torchvision.datasets.MNIST(
        root='./data', train=False, transform=transform)
    test_loader = torch.utils.data.DataLoader(
        dataset=test_dataset, batch_size=100, shuffle=False)

    # Only the three blend coefficients are handed to the optimizer.
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam([model.alpha, model.beta, model.gamma], lr=0.005)

    num_epochs = 5

    for epoch in range(num_epochs):  # loop over the dataset multiple times
        for i, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # Show the coefficients after every update step.
            print("alpha:", model.alpha, "beta:",
                  model.beta, "gamma:", model.gamma)

            if (i+1) % 100 == 0:
                print(
                    f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

        # End of epoch: show the gradients left over from the last batch.
        print("alpha:", model.alpha.grad, "beta:",
              model.beta.grad, "gamma:", model.gamma.grad)

    # Keep the coefficients of the seed with the best test accuracy so far.
    accuracy = test_model(model, test_loader)
    print(f"Accuracy on the test set: {accuracy:.2f}%")
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_seed = seed
        best_alpha, best_beta, best_gamma = model.alpha, model.beta, model.gamma

# Report and persist the best alpha, beta and gamma as plain Python floats.
print(best_alpha.item(), best_beta.item(), best_gamma.item(), best_seed)
torch.save(
    {
        "alpha": best_alpha.item(),
        "beta": best_beta.item(),
        "gamma": best_gamma.item(),
    },
    "weights.pth",
)


SEED: 1
alpha: Parameter containing:
tensor([0.6664], device='cuda:0', requires_grad=True) beta: Parameter containing:
tensor([0.2619], device='cuda:0', requires_grad=True) gamma: Parameter containing:
tensor([0.0567], device='cuda:0', requires_grad=True)
alpha: Parameter containing:
tensor([0.6713], device='cuda:0', requires_grad=True) beta: Parameter containing:
tensor([0.2569], device='cuda:0', requires_grad=True) gamma: Parameter containing:
tensor([0.0517], device='cuda:0', requires_grad=True)
alpha: Parameter containing:
tensor([0.6763], device='cuda:0', requires_grad=True) beta: Parameter containing:
tensor([0.2520], device='cuda:0', requires_grad=True) gamma: Parameter containing:
tensor([0.0467], device='cuda:0', requires_grad=True)
alpha: Parameter containing:
tensor([0.6812], device='cuda:0', requires_grad=True) beta: Parameter containing:
tensor([0.2470], device='cuda:0', requires_grad=True) gamma: Parameter containing:
tensor([0.0418], device='cuda:0', requires_grad=True)


In [38]:
# Persist all three learned coefficients. A later cell reads
# loaded_weights["gamma"] from this file, so leaving gamma out here would make
# that lookup fail with KeyError.
torch.save({"alpha": best_alpha.item(),
            "beta": best_beta.item(),
            "gamma": best_gamma.item()}, "weights.pth")


In [21]:
# Read the saved coefficient file back and display alpha and beta.
loaded_weights = torch.load("weights.pth")
loaded_alpha, loaded_beta = (loaded_weights[key] for key in ("alpha", "beta"))
print("alpha:",loaded_alpha, "beta:",loaded_beta)

alpha: 0.17560148239135742 beta: -0.6090418100357056


In [26]:
# Re-seed with the seed that produced the best coefficients.
seed = best_seed
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(seed)
random.seed(seed)

# Rebuild the three pretrained MLPs. map_location=device lets checkpoints that
# were written from a GPU load when `device` is the CPU.
model1 = SimpleMLP(784, 500, 10).to(device)
model2 = SimpleMLP(784, 500, 10).to(device)
model3 = SimpleMLP(784, 500, 10).to(device)
model1.load_state_dict(torch.load("1_92.17%.pth", map_location=device))
model2.load_state_dict(torch.load("2_92.05%.pth", map_location=device))
model3.load_state_dict(torch.load("3_92.03%.pth", map_location=device))

# Saved mixing coefficients (stored as plain Python floats).
loaded_weights = torch.load("weights.pth")
loaded_alpha = loaded_weights["alpha"]
loaded_beta = loaded_weights["beta"]
loaded_gamma = loaded_weights["gamma"]
print("alpha:", loaded_alpha, "beta:", loaded_beta, "gamma:", loaded_gamma)

transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))])
test_dataset = torchvision.datasets.MNIST(
    root='./data', train=False, transform=transform)
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=100, shuffle=False)

model1.eval()
model2.eval()
model3.eval()

# The ensemble weights do not depend on the batch, so compute them once
# instead of rebuilding the tensor on every iteration.
# NOTE(review): CombinedMLP normalises alpha/beta/gamma by their sum and blends
# *parameters*, whereas this cell applies a softmax and blends model *outputs*.
# Confirm which scheme the saved coefficients are meant for.
weights = torch.softmax(torch.tensor(
    [loaded_alpha, loaded_beta, loaded_gamma], device=device), dim=0)

# Run inference and compute accuracy.
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs1 = model1(images)
        outputs2 = model2(images)
        outputs3 = model3(images)

        # Weighted sum of the three models' outputs.
        outputs = weights[0] * outputs1 + weights[1] * \
            outputs2 + weights[2] * outputs3
        predicted = outputs.max(dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Accuracy on the test set: {accuracy:.2f}%")


alpha: 0.8721233606338501 beta: 1.0146440267562866 gamma: 0.8551493287086487
Accuracy on the test set: 92.10%


In [30]:
def test_model(model, test_loader):
    model.eval()

    # 推理并计算准确度
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = 100 * correct / total
    print(f"Accuracy on the test set: {accuracy:.2f}%")


In [None]:
# Load the raw state dicts of the three pretrained MLPs. map_location=device
# places the tensors on the active device, so checkpoints written from a GPU
# still load when running on the CPU.
state_dict1 = torch.load("1_92.17%.pth", map_location=device)
state_dict2 = torch.load("2_92.05%.pth", map_location=device)
state_dict3 = torch.load("3_92.03%.pth", map_location=device)


In [45]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms

# 定义网络结构


class CombinedMLP(nn.Module):
    """Two-layer MLP whose weights are a learnable blend of three pretrained MLPs.

    The trainable parameters are the scalars ``alpha``, ``beta`` and ``gamma``.
    Each forward pass divides them by their sum and uses the result to mix the
    ``fc1`` / ``fc2`` weights and biases taken from the supplied state dicts.

    Args:
        input_size: Unused by this class.
        hidden_size: Unused by this class.
        num_classes: Unused by this class.
        state_dicts: Indexable collection whose items 0, 1 and 2 are state
            dicts providing ``fc1.weight``, ``fc1.bias``, ``fc2.weight`` and
            ``fc2.bias``.
    """

    def __init__(self, input_size, hidden_size, num_classes, state_dicts):
        super(CombinedMLP, self).__init__()
        self.alpha = nn.Parameter(torch.randn(1))
        self.beta = nn.Parameter(torch.randn(1))
        self.gamma = nn.Parameter(torch.randn(1))

        # Kept as a plain attribute rather than registered buffers/parameters.
        self.pretrained_weights = state_dicts
        self.relu = nn.ReLU()

    def _mix(self, name, coeffs):
        """Return the coefficient-weighted sum of entry ``name`` over the three state dicts."""
        sources = self.pretrained_weights
        return (coeffs[0] * sources[0][name]
                + coeffs[1] * sources[1][name]
                + coeffs[2] * sources[2][name])

    def forward(self, x):
        """Flatten ``x`` and run it through the blended two-layer network."""
        x = x.view(x.size(0), -1)

        # Normalise the coefficients by their sum.
        total = self.alpha + self.beta + self.gamma
        coeffs = (self.alpha / total, self.beta / total, self.gamma / total)

        # First layer: blended fc1 followed by ReLU.
        x = F.linear(x, self._mix('fc1.weight', coeffs),
                     self._mix('fc1.bias', coeffs))
        x = self.relu(x)

        # Second layer: blended fc2 produces the class scores.
        x = F.linear(x, self._mix('fc2.weight', coeffs),
                     self._mix('fc2.bias', coeffs))

        return x


# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the pretrained state_dicts.
# Note: these paths are only an example; load your own state_dicts here.
# map_location=device places the tensors on the active device, so checkpoints
# written from a GPU still load when running on the CPU.
state_dict1 = torch.load("1_92.17%.pth", map_location=device)
state_dict2 = torch.load("2_92.05%.pth", map_location=device)
state_dict3 = torch.load("3_92.03%.pth", map_location=device)

# Instantiate the model
model = CombinedMLP(28*28, 500, 10, [state_dict1, state_dict2, state_dict3])
model = model.to(device)

# Create the optimizer; only the three blend coefficients are optimised
optimizer = optim.Adam([model.alpha, model.beta, model.gamma], lr=0.005)

# Define the loss function
criterion = nn.CrossEntropyLoss()


def custom_loss(outputs, labels, alpha, beta, gamma, lam=1e-5):
    """Cross-entropy loss plus a squared-magnitude penalty on the blend coefficients.

    Args:
        outputs: Class scores passed to ``F.cross_entropy``.
        labels: Target class indices passed to ``F.cross_entropy``.
        alpha, beta, gamma: Blend coefficients to penalise.
        lam: Multiplier applied to ``alpha**2 + beta**2 + gamma**2``.

    Returns:
        The cross-entropy loss plus ``lam`` times the sum of squared coefficients.
    """
    squared_norm = alpha**2 + beta**2 + gamma**2
    return F.cross_entropy(outputs, labels) + lam * squared_norm


# Load the MNIST training split
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
])
train_dataset = torchvision.datasets.MNIST(
    root='./data', train=True, transform=transform, download=True)
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=100, shuffle=True)

# Train the model
num_epochs = 5
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Clear stale gradients, then run the forward pass
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update
        loss.backward()
        optimizer.step()

        # Show the gradients of the three coefficients for this batch
        print("model.alpha:", model.alpha.grad, "model.beta:",
              model.beta.grad, "model.gamma:", model.gamma.grad)

        # Report the current batch loss every 100 steps
        if (i+1) % 100 == 0:
            print(
                f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')


model.alpha: tensor([-0.5917], device='cuda:0') model.beta: tensor([0.2191], device='cuda:0') model.gamma: tensor([-0.5163], device='cuda:0')
model.alpha: tensor([-0.6746], device='cuda:0') model.beta: tensor([0.2480], device='cuda:0') model.gamma: tensor([-0.6167], device='cuda:0')
model.alpha: tensor([-0.5878], device='cuda:0') model.beta: tensor([0.2156], device='cuda:0') model.gamma: tensor([-0.5697], device='cuda:0')
model.alpha: tensor([-0.5946], device='cuda:0') model.beta: tensor([0.2112], device='cuda:0') model.gamma: tensor([-0.5686], device='cuda:0')
model.alpha: tensor([-0.4245], device='cuda:0') model.beta: tensor([0.1469], device='cuda:0') model.gamma: tensor([-0.4069], device='cuda:0')
model.alpha: tensor([-0.5984], device='cuda:0') model.beta: tensor([0.2020], device='cuda:0') model.gamma: tensor([-0.5757], device='cuda:0')
model.alpha: tensor([-0.5279], device='cuda:0') model.beta: tensor([0.1770], device='cuda:0') model.gamma: tensor([-0.5353], device='cuda:0')
model.

KeyboardInterrupt: 