In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torchvision import datasets, transforms
from art.attacks.evasion import FastGradientMethod
from art.estimators.classification import PyTorchClassifier
import shap

# 使用裝置
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"使用裝置: {device}")



使用裝置: cuda


In [2]:
# 資料加載
def load_data():
    transform = transforms.Compose([transforms.ToTensor()])
    train_data = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
    test_data = datasets.MNIST(root='./data', train=False, download=True, transform=transform)
    train_loader = torch.utils.data.DataLoader(train_data, batch_size=128, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test_data, batch_size=128, shuffle=False)
    return train_loader, test_loader

train_loader, test_loader = load_data()



In [3]:
# PyTorch 模型
class MNISTModel(nn.Module):
    def __init__(self):
        super(MNISTModel, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)
        self.pool1 = nn.MaxPool2d(kernel_size=2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)
        self.pool2 = nn.MaxPool2d(kernel_size=2)
        self.flatten = nn.Flatten()

        # 計算卷積層輸出大小，假設輸入大小為 (1, 28, 28)
        self.fc = nn.Linear(64 * 5 * 5, 10)  # 修改輸入大小

    def forward(self, x):
        x = torch.relu(self.conv1(x))
        x = self.pool1(x)
        x = torch.relu(self.conv2(x))
        x = self.pool2(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x


model = MNISTModel().to(device)


In [5]:
# 訓練目標模型
def train_classifier(model, train_loader, epochs=10):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for images, labels in train_loader:
            images, labels = images.to(device), labels.to(device)

            optimizer.zero_grad()

            outputs = model(images)

            loss = criterion(outputs, labels)

            loss.backward()

            optimizer.step()

            running_loss += loss.item()
        print(f"Epoch [{epoch+1}/{epochs}], Loss: {running_loss:.4f}")


train_classifier(model, train_loader)


Epoch [1/10], Loss: 11.3758
Epoch [2/10], Loss: 9.5694
Epoch [3/10], Loss: 7.9750
Epoch [4/10], Loss: 7.0070
Epoch [5/10], Loss: 6.0901
Epoch [6/10], Loss: 5.0333
Epoch [7/10], Loss: 4.5339
Epoch [8/10], Loss: 4.0691
Epoch [9/10], Loss: 3.2847
Epoch [10/10], Loss: 3.3009


In [10]:
# ART PyTorchClassifier
classifier = PyTorchClassifier(
    model=model,
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=0.001),
    input_shape=(1, 28, 28),
    nb_classes=10,
)



In [11]:
# 生成對抗樣本
def generate_adversarial_samples(classifier, test_loader):
    attack = FastGradientMethod(estimator=classifier, eps=0.1)
    adversarial_samples = []
    normal_samples = []
    for images, labels in test_loader:
        images = images.numpy().reshape(-1, 1, 28, 28)  # 確保形狀正確
        adversarial_images = attack.generate(x=images)
        adversarial_samples.append((adversarial_images, labels.numpy()))
        normal_samples.append((images, labels.numpy()))
    return normal_samples, adversarial_samples

normal_samples, adversarial_samples = generate_adversarial_samples(classifier, test_loader)




In [12]:
# SHAP 簽名生成
def generate_shap_signatures(model, samples, num_classes=10):
    # 使用樣本中的前 50 個作為背景資料
    background = torch.tensor(samples[0][0][:50]).to(device)
    background = background.reshape(-1, 1, 28, 28)  # MNIST 的輸入形狀
    explainer = shap.DeepExplainer(model, background)
    shap_signatures = []

    for images, labels in samples:
        images = torch.tensor(images).to(device)
        images = images.reshape(-1, 1, 28, 28)  # 確保輸入形狀正確
        shap_values = explainer.shap_values(images)

        # 確保簽名大小正確
        flattened = np.concatenate([shap_values[i].flatten() for i in range(num_classes)], axis=0)
        print("SHAP 簽名大小:", flattened.shape)  # 打印形狀檢查
        shap_signatures.append((flattened, labels))
    return shap_signatures



normal_shap_signatures = generate_shap_signatures(model, normal_samples)
adversarial_shap_signatures = generate_shap_signatures(model, adversarial_samples)



AssertionError: The SHAP explanations do not sum up to the model's output! This is either because of a rounding error or because an operator in your computation graph was not fully supported. If the sum difference of %f is significant compared to the scale of your model outputs, please post as a github issue, with a reproducible example so we can debug it. Used framework: pytorch - Max. diff: 18.664880711976622 - Tolerance: 0.01

In [None]:
# 偵測器模型
class DetectorModel(nn.Module):
    def __init__(self, input_size):
        super(DetectorModel, self).__init__()
        self.fc1 = nn.Linear(input_size, 256)  # 修改輸入大小
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 16)
        self.fc4 = nn.Linear(16, 1)

    def forward(self, x):
        x = x.view(x.size(0), -1)  # 確保展平輸入
        print("Input shape:", x.shape)  # 打印形狀
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = torch.sigmoid(self.fc4(x))
        return x


# 根據 SHAP簽名大小初始化偵測器模型
input_size = normal_shap_signatures[0][0].shape[0]
detector = DetectorModel(input_size).to(device)



In [None]:
# 訓練偵測器模型
def train_detector(detector, normal_signatures, adversarial_signatures, epochs=10):
    criterion = nn.BCELoss()
    optimizer = optim.Adam(detector.parameters(), lr=0.001)

    x_train = []
    y_train = []
    for normal, adversarial in zip(normal_signatures, adversarial_signatures):
        normal_features, normal_labels = normal
        adversarial_features, adversarial_labels = adversarial

        # 確保特徵和標籤形狀一致
        features = np.concatenate([normal_features, adversarial_features], axis=0)
        labels = np.concatenate([
            np.zeros(len(normal_labels)),
            np.ones(len(adversarial_labels))
        ], axis=0)

        x_train.append(features)
        y_train.append(labels)

    x_train = torch.tensor(np.concatenate(x_train, axis=0)).float().to(device)
    y_train = torch.tensor(np.concatenate(y_train, axis=0)).float().to(device)

    detector.train()
    for epoch in range(epochs):
        optimizer.zero_grad()

        # 前向傳播
        outputs = detector(x_train)

        # 計算損失
        loss = criterion(outputs.squeeze(), y_train)

        # 反向傳播
        loss.backward()

        # 更新參數
        optimizer.step()

        print(f"Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}")

train_detector(detector, normal_shap_signatures, adversarial_shap_signatures)


