### 分析 ISIC2018_Task3_Training_LesionGroupings.csv


In [1]:
from config import dataset_dir


# 找出 ISIC2018_Task3_Training_LesionGroupings.csv 中 diagnosis_confirm_type 列不同的类别

import pandas as pd
# 读取 CSV 文件
file_path = f'{dataset_dir}/2018/ISIC2018_Task3_Training_LesionGroupings.csv'
data = pd.read_csv(file_path)

# 提取 diagnosis_confirm_type 列的不同类别
unique_categories = data['diagnosis_confirm_type'].unique()

print("Different diagnosis_confirm_type:")
for category in unique_categories:
    print(category)


Different diagnosis_confirm_type:
serial imaging showing no change
histopathology
single image expert consensus
confocal microscopy with consensus dermoscopy


In [16]:
# 按 `lesion_id` 分组，查看每个病变ID下的图像和诊断类型，避免警告
grouped_data = data.groupby('lesion_id', group_keys=False).apply(lambda x: x[['image', 'diagnosis_confirm_type']].values.tolist()).reset_index(name='images_info')

# 计数变量
total_images_count = 0

# 打印结果并计算图片总数
for _, row in grouped_data.iterrows():
    print(f"Lesion ID: {row['lesion_id']}")
    num_images = len(row['images_info'])
    total_images_count += num_images
    print(f"  Number of images: {num_images}")
    for image_info in row['images_info']:
        print(f"    Image: {image_info[0]}, Diagnosis: {image_info[1]}")
    print("\n")


Lesion ID: HAM_0000000
  Number of images: 2
    Image: ISIC_0025346, Diagnosis: histopathology
    Image: ISIC_0028498, Diagnosis: histopathology


Lesion ID: HAM_0000001
  Number of images: 1
    Image: ISIC_0027859, Diagnosis: histopathology


Lesion ID: HAM_0000002
  Number of images: 3
    Image: ISIC_0032622, Diagnosis: histopathology
    Image: ISIC_0033848, Diagnosis: histopathology
    Image: ISIC_0034246, Diagnosis: histopathology


Lesion ID: HAM_0000003
  Number of images: 1
    Image: ISIC_0027886, Diagnosis: serial imaging showing no change


Lesion ID: HAM_0000004
  Number of images: 1
    Image: ISIC_0024645, Diagnosis: serial imaging showing no change


Lesion ID: HAM_0000005
  Number of images: 4
    Image: ISIC_0024579, Diagnosis: histopathology
    Image: ISIC_0025577, Diagnosis: histopathology
    Image: ISIC_0029638, Diagnosis: histopathology
    Image: ISIC_0030591, Diagnosis: histopathology


Lesion ID: HAM_0000006
  Number of images: 3
    Image: ISIC_0032187, 

  grouped_data = data.groupby('lesion_id', group_keys=False).apply(lambda x: x[['image', 'diagnosis_confirm_type']].values.tolist()).reset_index(name='images_info')


In [17]:
# 总照片数
actual_total_images = len(data)
print(f"Calculated total images: {total_images_count}")
print(f"Actual total images: {actual_total_images}")

# 核验结果
if total_images_count == actual_total_images:
    print("All images have been accounted for.")
else:
    print("There is a discrepancy in the image count!")

Calculated total images: 10015
Actual total images: 10015
All images have been accounted for.


### 训练集数据来验证病变分组和良性/恶性标记

In [19]:
import pandas as pd

# 定义 CSV 文件路径
ground_truth_csv = f'{dataset_dir}/2018/ISIC2018_Task3_Training_GroundTruth.csv'
lesion_groupings_csv = f'{dataset_dir}/2018/ISIC2018_Task3_Training_LesionGroupings.csv'

# 读取 CSV 文件
ground_truth_data = pd.read_csv(ground_truth_csv)
lesion_groupings_data = pd.read_csv(lesion_groupings_csv)

# 合并两个数据集，通过 'image' 列
merged_data = pd.merge(ground_truth_data, lesion_groupings_data, on='image')

# 定义 `_group_lesions` 方法
def group_lesions(data):
    """ Group images by lesion_id and determine if they are benign or malignant. """
    lesion_groups = {}
    benign_labels = ['NV', 'BKL']
    malignant_labels = ['MEL', 'BCC', 'AKIEC']

    for _, row in data.iterrows():
        lesion_id = row['lesion_id']
        is_malignant = any(row[label] == 1 for label in malignant_labels)
        is_benign = any(row[label] == 1 for label in benign_labels) and not is_malignant

        if lesion_id not in lesion_groups:
            lesion_groups[lesion_id] = {
                'images': [],
                'is_malignant': False,
                'is_benign': False
            }

        lesion_groups[lesion_id]['images'].append(row['image'])
        lesion_groups[lesion_id]['is_malignant'] = lesion_groups[lesion_id]['is_malignant'] or is_malignant
        lesion_groups[lesion_id]['is_benign'] = lesion_groups[lesion_id]['is_benign'] or is_benign

    return lesion_groups

# 测试 `_group_lesions` 方法
lesion_groups = group_lesions(merged_data)

# 打印结果
for lesion_id, details in lesion_groups.items():
    print(f"Lesion ID: {lesion_id}")
    print(f"  Images: {details['images']}")
    print(f"  Is Malignant: {details['is_malignant']}")
    print(f"  Is Benign: {details['is_benign']}")
    print("\n")

# 统计良性和恶性病变的数量
benign_count = sum(1 for group in lesion_groups.values() if group['is_benign'])
malignant_count = sum(1 for group in lesion_groups.values() if group['is_malignant'])

print(f"Total benign lesions: {benign_count}")
print(f"Total malignant lesions: {malignant_count}")


Lesion ID: HAM_0000550
  Images: ['ISIC_0024306']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0003577
  Images: ['ISIC_0024307']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0001477
  Images: ['ISIC_0024308']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0000484
  Images: ['ISIC_0024309']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0003350
  Images: ['ISIC_0024310', 'ISIC_0027300', 'ISIC_0028512']
  Is Malignant: True
  Is Benign: False


Lesion ID: HAM_0000981
  Images: ['ISIC_0024311']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0001359
  Images: ['ISIC_0024312', 'ISIC_0028628']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0002869
  Images: ['ISIC_0024313', 'ISIC_0026694']
  Is Malignant: True
  Is Benign: False


Lesion ID: HAM_0002198
  Images: ['ISIC_0024314', 'ISIC_0026722']
  Is Malignant: False
  Is Benign: True


Lesion ID: HAM_0007538
  Images: ['ISIC_0024315', 'ISIC_0031941']
  Is Malignant: True
  Is 

### 这是一个图像病症分类问题 而我为什么要在 DataLoader 中获取和加载良性或恶性病变信息

1. 数据增强 (Data Augmentation)
在数据中，同一个病变 (lesion_id) 可能对应多张图像。在模型训练时，将这些图像视为一个组来处理可以帮助数据增强。例如，如果一个良性病变有多张图像，这些图像的不同视角或拍摄条件能提供更多样本，帮助模型更好地学习和泛化。
通过标记整个 lesion_id 组作为良性或恶性，可以确保数据增强时模型不会因单个图像的噪声或异常条件产生偏差。
2. 标签平滑和一致性 (Label Smoothing and Consistency)
在真实的医疗数据中，不同图像有时会有细微差别。即使是同一个 lesion_id，不同图像上的诊断可能不完全相同。如果可以通过 lesion_id 合并标记，将整个病变组作为一个整体来分析和学习，能够减少因为单个图像的不一致性带来的分类混乱。
对于恶性病变，如果一个 lesion_id 被确诊为恶性病变，通过同组中的所有图像来强化模型的学习有助于更精确地捕捉疾病特征。
3. 减少数据不平衡问题 (Address Data Imbalance)
在医学图像分类中，数据通常会存在类别不平衡（如恶性病例数量少）。通过考虑 lesion_id 分组，可以更有效地处理这种不平衡，因为这使得每个 lesion_id 的所有图像都能一起使用，从而有效扩大恶性样本集。
这样还可以减少模型因为良性和恶性样本比例差异大而造成的偏差，使模型的训练更加平衡。
4. 提高模型对复杂特征的学习能力
恶性病变通常会在一段时间内显示出更复杂或多样的特征表现，通过多张图像结合分析，可以帮助模型学习到这些复杂的特征，这在处理恶性分类任务时特别有价值。
使用分组数据，还可以在模型架构中加入多输入模型设计，通过输入多张图像的信息，模型可以捕获到单张图像无法展示的特征，提高对病变分类的整体准确率。
5. 模型决策的可解释性 (Model Interpretability)
对于实际的临床应用，通过良性或恶性分组信息来判断每张图像的诊断，有助于提高模型决策的可解释性。可以更好地理解为什么某个 lesion_id 被分类为恶性，以及它的每张图像在这个分类中的作用。

**如何在 DataLoader 中利用这些信息**
在 DataLoader 中，利用良性或恶性病变信息的方式可以是：
标签增强：将多个图像合并，并为这些图像统一分配一个标签，如 “该组病变为恶性”。
数据分组加载：通过每次加载一个组的图像，确保同一 lesion_id 的多个视角都能参与到模型的学习中。
改进的损失函数：在训练时，通过为良性和恶性设置不同的损失函数权重，可以让模型在数据不平衡时有更好的表现。

### 测试函数

In [20]:
import pandas as pd

# 定义 CSV 文件路径
ground_truth_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_GroundTruth.csv'
lesion_groupings_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_LesionGroupings.csv'

# 读取 CSV 文件
ground_truth_data = pd.read_csv(ground_truth_csv)
lesion_groupings_data = pd.read_csv(lesion_groupings_csv)

# 合并数据，确保每个 image 都有 lesion_id 信息
merged_data = pd.merge(ground_truth_data, lesion_groupings_data, on='image')

# 定义 _group_lesions 方法
def group_lesions(data):
    """ Group images by lesion_id and determine if they are benign or malignant. """
    lesion_groups = {}
    benign_labels = ['NV', 'BKL']
    malignant_labels = ['MEL', 'BCC', 'AKIEC']

    for _, row in data.iterrows():
        lesion_id = row['lesion_id']
        is_malignant = any(row[label] == 1 for label in malignant_labels)
        is_benign = any(row[label] == 1 for label in benign_labels) and not is_malignant

        if lesion_id not in lesion_groups:
            lesion_groups[lesion_id] = {
                'images': [],
                'is_malignant': False,
                'is_benign': False
            }

        lesion_groups[lesion_id]['images'].append(row['image'])
        lesion_groups[lesion_id]['is_malignant'] = lesion_groups[lesion_id]['is_malignant'] or is_malignant
        lesion_groups[lesion_id]['is_benign'] = lesion_groups[lesion_id]['is_benign'] or is_benign

    return lesion_groups

# 获取 lesion_groups
lesion_groups = group_lesions(merged_data)

# 定义 _prepare_samples_and_weights 方法
def prepare_samples_and_weights(lesion_groups):
    """ Prepare samples and weights for balanced sampling. """
    samples = []
    weights = []
    for lesion_id, details in lesion_groups.items():
        is_malignant = details['is_malignant']
        for img_name in details['images']:
            samples.append((lesion_id, img_name))
            weights.append(1.0 if is_malignant else 0.5)  # Adjust the weight here for balancing

    return samples, weights

# 测试 _prepare_samples_and_weights
samples, weights = prepare_samples_and_weights(lesion_groups)

# 打印结果以检查
print(f"Total samples: {len(samples)}")
print(f"Total weights: {len(weights)}")
print("\nSample of first 10:")
for i in range(10):
    lesion_id, img_name = samples[i]
    weight = weights[i]
    print(f"Lesion ID: {lesion_id}, Image: {img_name}, Weight: {weight}")

# 检查良性和恶性样本的分布
malignant_count = sum(1 for weight in weights if weight == 1.0)
benign_count = sum(1 for weight in weights if weight == 0.5)

print(f"Malignant samples: {malignant_count}")
print(f"Benign samples: {benign_count}")


Total samples: 10015
Total weights: 10015

Sample of first 10:
Lesion ID: HAM_0000550, Image: ISIC_0024306, Weight: 0.5
Lesion ID: HAM_0003577, Image: ISIC_0024307, Weight: 0.5
Lesion ID: HAM_0001477, Image: ISIC_0024308, Weight: 0.5
Lesion ID: HAM_0000484, Image: ISIC_0024309, Weight: 0.5
Lesion ID: HAM_0003350, Image: ISIC_0024310, Weight: 1.0
Lesion ID: HAM_0003350, Image: ISIC_0027300, Weight: 1.0
Lesion ID: HAM_0003350, Image: ISIC_0028512, Weight: 1.0
Lesion ID: HAM_0000981, Image: ISIC_0024311, Weight: 0.5
Lesion ID: HAM_0001359, Image: ISIC_0024312, Weight: 0.5
Lesion ID: HAM_0001359, Image: ISIC_0028628, Weight: 0.5
Malignant samples: 1954
Benign samples: 8061


### 测试 data_loader

[data_loader version_0](https://objectstorage.ap-sydney-1.oraclecloud.com/n/sdgfztegun7d/b/bucket-20240909-1436/o/dataset_version_0.py)

In [3]:
from config import dataset_dir
from dataset import SkinLesionDataset
from torch.utils.data import DataLoader, WeightedRandomSampler

# 使用示例
train_dataset = SkinLesionDataset(
    ground_truth_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_GroundTruth.csv',
    lesion_groupings_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_LesionGroupings.csv',
    img_dir=f'{dataset_dir}/2018/ISIC2018_Task3_Training_Input',
    transform=None
)

# 设置 WeightedRandomSampler 以处理数据不平衡问题
sampler = WeightedRandomSampler(train_dataset.weights, len(train_dataset), replacement=True)
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)

# 在训练循环中使用
for images, is_malignant in train_loader:
    print(images.size(), is_malignant)
    break


torch.Size([32, 3, 224, 224]) tensor([1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0.])


```
torch.Size([32, 3, 224, 224]) tensor([1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 1., 0., 0.,
        1., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 1., 0.])
```

一个批量 (batch) 的 32 张图像，每张图像的尺寸为 3 x 224 x 224， 3 通道 RGB，224 x 224 图像的size

这说明DataLoader 已经成功地将 32 张图像从 PIL 转换为 PyTorch 的张量格式，并调整为统一大小。

`tensor([...])`：

这是一个长度为 32 的张量，其中包含每张图像的 is_malignant 标签。
其中，1.0 表示该病变组是恶性的，0.0 表示该病变组是良性的。
例如：`[1., 0., 0., 0., 1., ...]` 表示第 1、5 张图像被标记为恶性，而第 2、3、4 张图像被标记为良性。


### 返回的tensor 关注 病症诊断的标签

[data_loader version_0](https://objectstorage.ap-sydney-1.oraclecloud.com/n/sdgfztegun7d/b/bucket-20240909-1436/o/dataset_version_0.py)只是根据 lesion_id 分组返回了一个总体的 is_malignant 标记，还需要 返回病症的具体分类标签（MEL, NV, BCC, AKIEC, BKL, DF, VASC）

In [2]:
from config import dataset_dir
from dataset import SkinLesionDataset
from torch.utils.data import DataLoader, WeightedRandomSampler

# 使用示例
train_dataset = SkinLesionDataset(
    ground_truth_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_GroundTruth.csv',
    lesion_groupings_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_LesionGroupings.csv',
    img_dir=f'{dataset_dir}/2018/ISIC2018_Task3_Training_Input',
    transform=None
)

# 设置 WeightedRandomSampler 以处理数据不平衡问题
sampler = WeightedRandomSampler(train_dataset.weights, len(train_dataset), replacement=True)
train_loader = DataLoader(train_dataset, batch_size=32, sampler=sampler)

# 在训练循环中使用
for images, diagnosis_labels, is_malignant in train_loader:
    print(images.size(), diagnosis_labels, is_malignant)
    break


torch.Size([32, 3, 224, 224]) tensor([[1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0.],
        [1., 0., 0.,

上一个版本[data_loader version_1](https://objectstorage.ap-sydney-1.oraclecloud.com/n/sdgfztegun7d/b/bucket-20240909-1436/o/dataset_version_1.py) 

lesion_groupings_csv 可选：在实例化 SkinLesionDataset 时，可以选择是否提供 lesion_groupings_csv。如果没有提供，则会默认不进行分组。

因为只有训练集 有 ISIC2018_Task3_Training_LesionGroupings.csv

### 训练 蠢蠢欲动


In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.models as models
from torch.utils.data import DataLoader, WeightedRandomSampler
from torchvision import transforms
from tqdm import tqdm  # 导入 tqdm 进度条
from dataset import SkinLesionDataset  # 请确保这个路径是正确的
from config import dataset_dir


from torchvision import transforms

import warnings

# 忽略特定的 UserWarning
warnings.filterwarnings("ignore", category=UserWarning)


# 定义训练集的数据增强策略
train_transform = transforms.Compose([
    transforms.Resize((224, 224)),  # 调整图像大小
    transforms.RandomHorizontalFlip(p=0.5),  # 随机水平翻转
    transforms.RandomRotation(degrees=15),  # 随机旋转
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),  # 随机调整亮度、对比度、饱和度和色调
    transforms.RandomResizedCrop((224, 224), scale=(0.8, 1.0)),  # 随机裁剪并调整大小
    transforms.ToTensor(),  # 转换为张量
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # 标准化
])

# 定义验证集的数据增强策略（通常不进行增强，仅调整大小）
valid_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])


# 实例化训练和验证数据集
train_dataset = SkinLesionDataset(
    ground_truth_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_GroundTruth.csv',
    lesion_groupings_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Training_LesionGroupings.csv',
    img_dir=f'{dataset_dir}/2018/ISIC2018_Task3_Training_Input',
    transform=train_transform
)

valid_dataset = SkinLesionDataset(
    ground_truth_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Validation_GroundTruth.csv',
    img_dir=f'{dataset_dir}/2018/ISIC2018_Task3_Validation_Input',
    transform=valid_transform
)

# 设置 WeightedRandomSampler 以处理数据不平衡问题
train_sampler = WeightedRandomSampler(train_dataset.weights, len(train_dataset), replacement=True)
train_loader = DataLoader(train_dataset, batch_size=32, sampler=train_sampler)

valid_loader = DataLoader(valid_dataset, batch_size=32, shuffle=False)

# 定义模型
class SkinLesionClassifier(nn.Module):
    def __init__(self, num_classes=7):
        super(SkinLesionClassifier, self).__init__()
        # 使用预训练的 ResNet 模型
        self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        # 修改最后的全连接层用于多标签分类（7 种病症）
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, num_classes)
        # 单独的二分类层用于良性/恶性判断
        self.malignant_classifier = nn.Linear(self.backbone.fc.in_features, 1)

    def forward(self, x):
        # 获取到 ResNet 的中间特征
        features = self.backbone.avgpool(self.backbone.layer4(self.backbone.layer3(self.backbone.layer2(self.backbone.layer1(self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(x))))))))).flatten(1)
        
        # 使用特征向量输出病症分类
        diagnosis_output = torch.sigmoid(self.backbone.fc(features))
        
        # 使用特征向量进行良性/恶性判断
        malignant_output = torch.sigmoid(self.malignant_classifier(features))
        return diagnosis_output, malignant_output

# 实例化模型
model = SkinLesionClassifier(num_classes=7)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# 定义损失函数和优化器
criterion_diagnosis = nn.BCELoss()
criterion_malignant = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)


import logging
import os
from datetime import datetime

def setup_logger():
    """Set up a logger with a unique filename based on the current timestamp."""
    # Create output/log directory if it doesn't exist
    log_dir = 'output/log'
    os.makedirs(log_dir, exist_ok=True)
    
    # Generate a timestamp for unique log filename
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    log_file = os.path.join(log_dir, f'{timestamp}.log')
    
    # Configure the logger
    logging.basicConfig(
        filename=log_file,
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s'
    )
    
    # Also log to console
    console = logging.StreamHandler()
    console.setLevel(logging.INFO)
    formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
    console.setFormatter(formatter)
    logging.getLogger('').addHandler(console)
    
    logging.info(f"Logger initialized. Log file: {log_file}")
    return log_file


# 定义早停策略参数
early_stopping_patience = 5
best_val_loss = float('inf')
patience_counter = 0

import torch
import os
import logging
from tqdm import tqdm  # 导入 tqdm 进度条

# 初始化 logger
log_file = setup_logger()

# 定义用于记录损失的列表
train_diagnosis_losses = []
train_malignant_losses = []
valid_diagnosis_losses = []

# 模拟一个训练和验证循环
for epoch in range(50):  # 假设最多训练 50 个 epoch
    model.train()
    running_train_diagnosis_loss = 0.0
    running_train_malignant_loss = 0.0

    # 使用 tqdm 进度条显示训练进度
    train_progress = tqdm(train_loader, desc=f"Epoch {epoch + 1} [Training]")
    for images, diagnosis_labels, is_malignant in train_progress:
        images = images.to(device)
        diagnosis_labels = diagnosis_labels.to(device)
        is_malignant = is_malignant.to(device).unsqueeze(1)

        # 前向传播
        optimizer.zero_grad()
        diagnosis_output, malignant_output = model(images)

        # 计算多标签损失和二分类损失
        loss_diagnosis = criterion_diagnosis(diagnosis_output, diagnosis_labels)
        loss_malignant = criterion_malignant(malignant_output, is_malignant)

        # 反向传播和优化
        total_loss = loss_diagnosis + loss_malignant
        total_loss.backward()
        optimizer.step()

        # 累加每个 batch 的损失
        running_train_diagnosis_loss += loss_diagnosis.item()
        running_train_malignant_loss += loss_malignant.item()

        # 更新 tqdm 进度条描述
        train_progress.set_postfix({
            "Diagnosis Loss": f"{loss_diagnosis.item():.4f}",
            "Malignant Loss": f"{loss_malignant.item():.4f}"
        })

    # 计算平均训练损失
    avg_train_diagnosis_loss = running_train_diagnosis_loss / len(train_loader)
    avg_train_malignant_loss = running_train_malignant_loss / len(train_loader)

    train_diagnosis_losses.append(avg_train_diagnosis_loss)
    train_malignant_losses.append(avg_train_malignant_loss)

    logging.info(f"Epoch {epoch + 1}, Training Diagnosis Loss: {avg_train_diagnosis_loss:.4f}, Training Malignant Loss: {avg_train_malignant_loss:.4f}")

    # 评估验证集性能，只计算诊断损失
    model.eval()
    running_val_diagnosis_loss = 0.0

    # 使用 tqdm 进度条显示验证进度
    valid_progress = tqdm(valid_loader, desc=f"Epoch {epoch + 1} [Validation]")
    with torch.no_grad():
        for images, diagnosis_labels, _ in valid_progress:
            images = images.to(device)
            diagnosis_labels = diagnosis_labels.to(device)

            # 前向传播
            diagnosis_output, _ = model(images)  # 忽略 malignant_output

            # 计算损失
            loss_diagnosis = criterion_diagnosis(diagnosis_output, diagnosis_labels)

            # 累加每个 batch 的验证损失
            running_val_diagnosis_loss += loss_diagnosis.item()

            # 更新 tqdm 进度条描述
            valid_progress.set_postfix({
                "Diagnosis Loss": f"{loss_diagnosis.item():.4f}"
            })

    # 计算平均验证损失
    avg_val_diagnosis_loss = running_val_diagnosis_loss / len(valid_loader)

    valid_diagnosis_losses.append(avg_val_diagnosis_loss)

    logging.info(f"Epoch {epoch + 1}, Validation Diagnosis Loss: {avg_val_diagnosis_loss:.4f}")

    # 保存模型并创建路径
    output_dir = 'output/models'
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, f'best_model_epoch_{epoch + 1}.pth'))

    # 打印早停信息
    if patience_counter >= early_stopping_patience:
        logging.info("Early stopping triggered!")
        break



2024-10-19 05:16:22,689 - INFO - Logger initialized. Log file: output/log\20241019_051622.log
Epoch 1 [Training]: 100%|██████████| 313/313 [02:24<00:00,  2.17it/s, Diagnosis Loss=0.1730, Malignant Loss=0.3185]
2024-10-19 05:18:47,239 - INFO - Epoch 1, Training Diagnosis Loss: 0.2282, Training Malignant Loss: 0.4053
Epoch 1 [Validation]: 100%|██████████| 7/7 [00:01<00:00,  4.50it/s, Diagnosis Loss=0.2136]
2024-10-19 05:18:48,797 - INFO - Epoch 1, Validation Diagnosis Loss: 0.1532
Epoch 2 [Training]: 100%|██████████| 313/313 [02:25<00:00,  2.15it/s, Diagnosis Loss=0.0989, Malignant Loss=0.1967]
2024-10-19 05:21:14,136 - INFO - Epoch 2, Training Diagnosis Loss: 0.1602, Training Malignant Loss: 0.3251
Epoch 2 [Validation]: 100%|██████████| 7/7 [00:01<00:00,  4.50it/s, Diagnosis Loss=0.3120]
2024-10-19 05:21:15,693 - INFO - Epoch 2, Validation Diagnosis Loss: 0.1574
Epoch 3 [Training]: 100%|██████████| 313/313 [02:27<00:00,  2.13it/s, Diagnosis Loss=0.1329, Malignant Loss=0.2327]
2024-10-19

In [5]:
from sklearn.metrics import classification_report, accuracy_score
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision.models as models
from torch.utils.data import DataLoader
from dataset import SkinLesionDataset  # 请确保这个路径是正确的
from config import dataset_dir
from torchvision import transforms
import warnings

# 忽略 UserWarning 和 FutureWarning
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

class SkinLesionClassifier(nn.Module):
    def __init__(self, num_classes=7):
        super(SkinLesionClassifier, self).__init__()
        # 使用预训练的 ResNet 模型
        self.backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
        # 修改最后的全连接层用于多标签分类（7 种病症）
        self.backbone.fc = nn.Linear(self.backbone.fc.in_features, num_classes)
        # 单独的二分类层用于良性/恶性判断
        self.malignant_classifier = nn.Linear(self.backbone.fc.in_features, 1)

    def forward(self, x):
        # 获取到 ResNet 的中间特征
        features = self.backbone.avgpool(self.backbone.layer4(self.backbone.layer3(self.backbone.layer2(self.backbone.layer1(self.backbone.maxpool(self.backbone.relu(self.backbone.bn1(self.backbone.conv1(x))))))))).flatten(1)
        
        # 使用特征向量输出病症分类
        diagnosis_output = torch.sigmoid(self.backbone.fc(features))
        
        # 使用特征向量进行良性/恶性判断
        malignant_output = torch.sigmoid(self.malignant_classifier(features))
        return diagnosis_output, malignant_output

# 定义测试集的数据增强策略（通常不进行增强，仅调整大小）
test_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# 实例化测试数据集
test_dataset = SkinLesionDataset(
    ground_truth_csv=f'{dataset_dir}/2018/ISIC2018_Task3_Test_GroundTruth.csv',
    img_dir=f'{dataset_dir}/2018/ISIC2018_Task3_Test_Input',
    transform=test_transform
)

test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# 加载已训练的模型
model = SkinLesionClassifier(num_classes=7)
model.load_state_dict(torch.load('output/models/best_model_epoch_20.pth'))  # 当前最佳模型的路径
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# 评估模型性能
all_labels = []
all_preds = []

with torch.no_grad():
    for images, diagnosis_labels, _ in test_loader:
        images = images.to(device)
        diagnosis_labels = diagnosis_labels.to(device)

        # 预测
        diagnosis_output, _ = model(images)
        diagnosis_preds = (diagnosis_output > 0.5).float()  # 二值化预测结果

        all_labels.extend(diagnosis_labels.cpu().numpy())
        all_preds.extend(diagnosis_preds.cpu().numpy())

# 将预测结果和真实标签转换为 numpy 数组
all_labels = np.array(all_labels).reshape(-1, 7)
all_preds = np.array(all_preds).reshape(-1, 7)

# 计算分类报告
report = classification_report(all_labels, all_preds, target_names=['MEL', 'NV', 'BCC', 'AKIEC', 'BKL', 'DF', 'VASC'], zero_division=0)
accuracy = accuracy_score(all_labels, all_preds)

print("Test Accuracy: {:.2f}%".format(accuracy * 100))
print("Classification Report:\n", report)


  model.load_state_dict(torch.load('output/models/best_model_epoch_20.pth'))  # 替换为最佳模型的路径


Test Accuracy: 74.74%
Classification Report:
               precision    recall  f1-score   support

         MEL       0.57      0.53      0.55       171
          NV       0.87      0.92      0.89       909
         BCC       0.82      0.40      0.54        93
       AKIEC       0.58      0.44      0.50        43
         BKL       0.71      0.60      0.65       217
          DF       0.82      0.61      0.70        44
        VASC       0.94      0.46      0.62        35

   micro avg       0.80      0.76      0.78      1512
   macro avg       0.76      0.57      0.63      1512
weighted avg       0.80      0.76      0.77      1512
 samples avg       0.76      0.76      0.76      1512
