In [1]:
!pip install shap
!pip install pandas
!pip install -U scikit-learn

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Collecting shap
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/9e/3f/247e0017d52eeef37c170d71357eb3a12a2c06718d2e184c9929b6f3d9ed/shap-0.43.0-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (532 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m532.9/532.9 kB[0m [31m129.6 MB/s[0m eta [36m0:00:00[0m
Collecting slicer==0.0.7 (from shap)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/78/c2/b3f55dfdb8af9812fdb9baf70cacf3b9e82e505b2bd4324d588888b81202/slicer-0.0.7-py3-none-any.whl (14 kB)
Collecting numba (from shap)
  Downloading https://pypi.tuna.tsinghua.edu.cn/packages/ed/13/b66627125b35f2987bd9872cf028b5e1e1ffcbc8d1e182ac4e84eed3998f/numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m137.6 MB/s[0m eta [36m0:00:00[0m
[

In [3]:
# Download the dataset, addressed by its dataset ID, through the `featurize` CLI.
# NOTE(review): the later cells read from 'data/datall/' -- presumably that is
# where this download is extracted; confirm on a fresh workspace.
!featurize dataset download 0b3523e6-71ba-4e66-9fab-fa73ecc0f58c

100%|████████████████████████████████████████| 202M/202M [00:00<00:00, 204MiB/s]
🍬  下载完成，正在解压...
🏁  数据集已经成功添加


In [9]:
import torch
from torchvision import datasets, transforms
from torch import optim
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

from resnet import resnet50 as self_resnet50

import os
import numpy as np
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# //////////////////////////////////////////////////////
# 自定义数据集类
class MyDataset(Dataset):
    """Folder-per-class dataset of whitespace-delimited text samples.

    Every sub-directory of ``root_dir`` is one class and every file inside it
    is one sample that is parsed with ``np.loadtxt`` and reshaped to (32, 512).

    Class directories and sample files are enumerated in sorted order so the
    class-name -> label-index mapping and the sample indices are identical on
    every run and every machine (``os.listdir`` returns entries in arbitrary
    order). Hidden entries (e.g. ``.ipynb_checkpoints``) and non-directory
    entries at the class level are ignored so they cannot turn into classes.

    NOTE: label indices follow the sorted class order; a checkpoint trained
    under a different class ordering is not compatible with these labels.

    Attributes:
        root_dir: Directory that contains one sub-directory per class.
        classes: Sorted list of class directory names; index == label.
        file_paths: Path of every sample, grouped by class.
        labels: Integer label of every sample, parallel to ``file_paths``.
    """

    def __init__(self, root_dir):
        self.root_dir = root_dir
        self.classes = sorted(
            entry for entry in os.listdir(root_dir)
            if not entry.startswith('.')
            and os.path.isdir(os.path.join(root_dir, entry))
        )
        self.file_paths = []
        self.labels = []
        for label, class_name in enumerate(self.classes):
            class_path = os.path.join(root_dir, class_name)
            files = sorted(
                name for name in os.listdir(class_path)
                if not name.startswith('.')
                and os.path.isfile(os.path.join(class_path, name))
            )
            self.file_paths.extend(os.path.join(class_path, name) for name in files)
            self.labels.extend([label] * len(files))

    def __len__(self):
        """Return the number of samples."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(float32 tensor of shape (32, 512), label)``."""
        file_path = self.file_paths[idx]
        label = self.labels[idx]
        data = np.loadtxt(file_path).reshape(32, 512)
        data = torch.from_numpy(data).float()
        return data, label

    def get_info(self, idx):
        """Return ``(file path, label)`` of sample ``idx`` without loading the file."""
        return self.file_paths[idx], self.labels[idx]


if __name__ == "__main__":
    # Train the ResNet-50 classifier on the folder dataset, evaluate it on a
    # held-out 20% split after every epoch, keep the best checkpoint and dump
    # the last epoch's predictions to a text file.

    # Seed torch so weight init and shuffling are repeatable.
    torch.manual_seed(42)

    # Dataset root: one sub-directory per class.
    data_dir = 'data/datall/'
    class_names = os.listdir(data_dir)

    dataset = MyDataset(data_dir)

    # 80/20 train/test split over sample indices (fixed random_state).
    train_indices, test_indices = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
    train_dataset = torch.utils.data.Subset(dataset, train_indices)
    test_dataset = torch.utils.data.Subset(dataset, test_indices)

    batch_size = 256
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

    # Place model, loss and every batch on the same device. The original called
    # .cuda() unconditionally, which crashes on a CPU-only machine.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resnet50 = self_resnet50(num_classes=9).to(device)

    loss_fn = nn.CrossEntropyLoss().to(device)

    learning_rate = 0.01
    optimizer = torch.optim.SGD(resnet50.parameters(), lr=learning_rate, momentum=0.9, nesterov=True)

    # Running counters across all epochs.
    total_train_step = 0
    total_test_step = 0
    # Number of training epochs.
    epoch = 1

    best_acc = 0
    predictions = []
    probabilities = []
    acc = 0

    for i in range(epoch):
        print("-------第{}轮训练开始-------".format(i + 1))

        # ---- training pass ----
        resnet50.train()
        for imgs, targets in tqdm(train_dataloader, ncols=100, desc='Train'):
            # Each sample is (32, 512); add the single channel axis the net expects.
            imgs = imgs.to(device).reshape(-1, 1, 32, 512)
            targets = targets.to(device)

            outputs = resnet50(imgs)
            loss = loss_fn(outputs, targets)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_train_step += 1
        # Loss of the last training batch of this epoch.
        print("训练次数：{}, Loss: {:.4f}".format(total_train_step, loss.item()))

        # ---- evaluation pass ----
        resnet50.eval()
        total_test_loss = 0
        total_correct = 0
        total_num = 0
        # Start from empty lists every epoch so the saved results describe one
        # pass over the test set instead of a concatenation of all epochs.
        predictions = []
        probabilities = []
        with torch.no_grad():
            for imgs, targets in test_dataloader:
                imgs = imgs.to(device).reshape(-1, 1, 32, 512)
                targets = targets.to(device)

                outputs = resnet50(imgs)
                pred = outputs.argmax(dim=1)
                predictions.extend(pred.tolist())

                loss = loss_fn(outputs, targets)
                total_test_loss += loss.item()
                total_test_step += 1

                total_correct += torch.eq(pred, targets).float().sum().item()
                total_num += imgs.size(0)

                probabilities.extend(torch.softmax(outputs, dim=1).tolist())

            # total_test_loss is the sum of the per-batch mean losses.
            print("测试次数：{}，Loss：{:.4f}".format(total_test_step, total_test_loss))
            acc = total_correct / total_num
            # Report the current epoch number (the original printed the epoch count).
            print(i + 1, 'test acc:', acc)

            # Keep the checkpoint with the best test accuracy so far.
            if acc > best_acc:
                best_acc = acc
                torch.save(resnet50.state_dict(), 'best_model.pt')

    # Write the last epoch's accuracy, predicted labels (one comma-separated
    # line) and class probabilities (one sample per line). The original
    # f-string wrote the literal text "+','" after every item. UTF-8 is set
    # explicitly because the section headers are non-ASCII.
    result_file = 'result.txt'
    with open(result_file, 'w', encoding='utf-8') as f:
        f.write(f"Accuracy: {acc:.4f}\n")
        f.write("定性预测结果：\n")
        f.write(",".join(str(pred) for pred in predictions) + "\n")
        f.write("定量预测结果：\n")
        for prob in probabilities:
            f.write(",".join(str(p) for p in prob) + "\n")

-------第1轮训练开始-------


Train: 100%|██████████████████████████████████████████████████████| 272/272 [01:57<00:00,  2.32it/s]

训练次数：272, Loss: 0.5525





测试次数：68，Loss：35.1699
1 test acc: 0.8300431654676259


In [13]:
import torch
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

from resnet import resnet50 as self_resnet50
from Resnet50data3 import MyDataset

import os
import numpy as np
import shap
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Compute SHAP attributions (GradientExplainer) for every test sample with
# respect to its ground-truth class and store them, grouped by label, in
# 'temp.pkl'.

# Seed torch so the shuffled background selection is repeatable.
torch.manual_seed(42)

# Dataset root: one sub-directory per class.
data_dir = 'data/datall/'
class_names = os.listdir(data_dir)

dataset = MyDataset(data_dir)

# Same 80/20 split parameters as the training cell (fixed random_state).
train_indices, test_indices = train_test_split(range(len(dataset)), test_size=0.2, random_state=42)
train_dataset = torch.utils.data.Subset(dataset, train_indices)
test_dataset = torch.utils.data.Subset(dataset, test_indices)

batch_size = 1
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_dataloader = DataLoader(test_dataset, batch_size=30, shuffle=False, num_workers=4)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
resnet50 = self_resnet50(num_classes=9)
# NOTE(review): the training cell saves its checkpoint as 'best_model.pt' but
# this cell loads 'best_modell.pt'. Confirm which file is intended; on a fresh
# workspace this load fails unless 'best_modell.pt' has been provided.
state = torch.load('best_modell.pt', map_location=device)  # load straight onto the target device
resnet50.load_state_dict(state)
resnet50.to(device).eval()

# Background set for the explainer: the first 100 shuffled training samples.
# Exactly 100 batches are pulled (the original fetched an unused 101st one)
# and they are concatenated once instead of re-concatenating on every step.
num_background = 100
iter_train = iter(train_dataloader)
background = torch.cat([next(iter_train)[0] for _ in range(num_background)], dim=0)

del iter_train
del train_dataloader
e = shap.GradientExplainer(resnet50, background.reshape(-1, 1, 32, 512).to(device))

# temps maps label -> list of [filename, label, shap values of shape (32, 512)].
temps = {}
# Position inside test_indices; valid because test_dataloader is not shuffled.
count = 0
for i, (images, labels) in enumerate(tqdm(test_dataloader, ncols=100, desc="cnts")):

    images = images.to(device)
    # Hand the explainer its own copy with the single channel axis added.
    cloned_images = images.clone().reshape(-1, 1, 32, 512)
    shap_values = e.shap_values(cloned_images)
    # The indexing below needs one array per output class. Newer shap releases
    # return a single ndarray with the class axis last, which would make
    # shap_values[label] select a sample instead of a class.
    if not isinstance(shap_values, list):
        raise TypeError(
            "expected shap_values to be a list with one array per class; "
            "got {}. Install the shap version pinned in the first cell.".format(type(shap_values).__name__)
        )

    for j, l in enumerate(labels):
        label = int(l)
        # Attribution of sample j towards its true class; drop the channel axis.
        value = shap_values[label][j][0]
        if isinstance(value, torch.Tensor):
            value = value.cpu().numpy()
        filename, _ = dataset.get_info(test_indices[count])
        temps.setdefault(label, []).append([filename, label, value])
        count += 1


import pickle as pkl
with open('temp.pkl', 'wb') as f:
    pkl.dump(temps, f)

cnts: 100%|████████████████████████████████████████████████████| 580/580 [15:50:23<00:00, 98.32s/it]


In [14]:
import pickle as pkl
import numpy as np

# Rank the 512 field positions by their aggregated SHAP contribution and report
# which of the top-K positions fall below the header-field threshold.

TopK = 180
num_classes = 9
# 1-based field indices below this value are reported as header fields.
header_field_limit = 55

# NOTE: pickle.load can execute arbitrary code; only load a 'temp.pkl' that
# this notebook produced itself.
with open('temp.pkl', 'rb') as f:
    data = pkl.load(f)

# Step 1: sum the (32, 512) SHAP maps of all samples of all classes.
# The maps are accumulated one by one so no second (n, 32, 512) copy of the
# whole pickle is materialised. Classes with no test sample have no key in
# the pickle and are skipped instead of raising KeyError.
shap_sum = None
for i in range(num_classes):
    for sample in data.get(i, []):  # sample == [filename, label, values(32, 512)]
        value = np.asarray(sample[-1])
        shap_sum = value.copy() if shap_sum is None else shap_sum + value

if shap_sum is None:
    raise ValueError("temp.pkl contains no SHAP values for classes 0..{}".format(num_classes - 1))

# Step 2: (32, 512) -> (512,), maximum over the first axis.
results_array = np.max(shap_sum, axis=0)

# Step 3: order the 512 positions by contribution, largest first.
descending = np.argsort(results_array)[::-1]
sort_value = results_array[descending]
sort_index = descending + 1  # report 1-based field indices

print("value: {}".format(sort_value[:TopK]))
print("index: {}".format(sort_index[:TopK]))

# Top-K indices that lie below the header-field threshold.
print("*"*100)
index_topK = sort_index[:TopK]
less_than_55 = index_topK[index_topK < header_field_limit]
less_than_55_sort = np.sort(less_than_55)
print("报头字段为: {}".format(less_than_55))
print("*"*100)
print("排序后字段为: {}".format(less_than_55_sort))

# -------------- 修改后的方法 ----------------- # 

value: [13623.79664855  3827.64486606  3511.75438845  2782.20875227
  2573.47644634  2197.20298206  2104.24202781  1981.46947019
  1917.20958301  1724.45580376  1664.41926583  1661.97843529
  1660.38629902  1543.68810613  1491.02414757  1392.5480196
  1299.1851391   1281.74834583  1248.10392086  1242.32749654
  1240.7238303   1204.14855015  1184.52369425  1099.78604379
  1098.29382734  1060.18043611   988.9955072    960.07658422
   958.04786774   908.7179597    872.68022879   861.79365579
   781.47949123   711.25758849   674.64443257   668.14364201
   664.73660849   610.33245464   583.43804327   580.4575966
   550.86618266   547.40051955   535.59914335   526.48282892
   526.12965875   502.19834428   491.4594207    489.92888326
   479.98866585   478.65482295   474.73095391   474.72176852
   462.48782735   458.07705267   454.0029156    445.13377533
   443.15948578   436.17507953   426.97409601   425.94693833
   404.74377253   396.51640981   365.01819747   363.88622236
   361.05703794   3