In [8]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torch.utils.data import DataLoader, SubsetRandomSampler
from sklearn.model_selection import train_test_split
import numpy as np
import medmnist
from medmnist import INFO
import torchvision.transforms as transforms
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# 定义模型结构
class ResNet18Baseline(nn.Module):
    def __init__(self, out_dim, num_classes):
        super(ResNet18Baseline, self).__init__()
        self.encoder = self.get_resnet('resnet18')
        self.projector = nn.Sequential(
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, out_dim)
        )
        self.classifier = nn.Linear(out_dim, num_classes)

    def get_resnet(self, base_model):
        model = models.__dict__[base_model](pretrained=False)
        model = nn.Sequential(*list(model.children())[:-1])
        return model

    def forward(self, x):
        h = self.encoder(x).squeeze()
        z = self.projector(h)
        logits = self.classifier(z)
        return logits

# 数据预处理和加载
data_flag = 'pathmnist'
download = True

info = INFO[data_flag]
DataClass = getattr(medmnist, info['python_class'])

data_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[.5], std=[.5])
])

train_dataset = DataClass(split='train', transform=data_transform, download=download)
val_dataset = DataClass(split='val', transform=data_transform, download=download)

# 抽取不同比例的训练数据
def get_data_loader(dataset, split_size, batch_size=256):
    num_samples = len(dataset)
    indices = list(range(num_samples))
    np.random.shuffle(indices)
    split_indices = indices[:int(num_samples * split_size)]
    sampler = SubsetRandomSampler(split_indices)
    data_loader = DataLoader(dataset, batch_size=batch_size, sampler=sampler)
    return data_loader

# 训练函数
def train_baseline(model, train_loader, criterion, optimizer, num_epochs=50):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for data, target in train_loader:
            data, target = data.cuda(), target.squeeze().long().cuda()
            optimizer.zero_grad()
            logits = model(data)
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        #print(f'Epoch {epoch}, Loss: {total_loss / len(train_loader)}')

# 验证函数
def validate(model, val_loader):
    model.eval()
    all_preds = []
    all_targets = []
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.cuda(), target.squeeze().long().cuda()
            logits = model(data)
            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_targets.extend(target.cpu().numpy())
    accuracy = accuracy_score(all_targets, all_preds)
    return accuracy

# 定义验证数据加载器
val_loader = DataLoader(dataset=val_dataset, batch_size=256, shuffle=False)

# 实例化模型，定义损失函数和优化器
out_dim = 128
num_classes = 9
baseline_model = ResNet18Baseline(out_dim=out_dim, num_classes=num_classes).cuda()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(baseline_model.parameters(), lr=3e-4, weight_decay=1e-6)

# 训练和验证不同数据比例下的模型
data_splits = [0.01, 0.1, 1.0]
for split in data_splits:
    train_loader = get_data_loader(train_dataset, split)
    print(f'Training with {int(split*100)}% of the data:')
    train_baseline(baseline_model, train_loader, criterion, optimizer)
    accuracy = validate(baseline_model, val_loader)
    print(f'Validation Accuracy: {accuracy:.4f}')



Using downloaded and verified file: C:\Users\xie\.medmnist\pathmnist.npz
Using downloaded and verified file: C:\Users\xie\.medmnist\pathmnist.npz




Training with 1% of the data:
Validation Accuracy: 0.5411
Training with 10% of the data:
Validation Accuracy: 0.7594
Training with 100% of the data:
Validation Accuracy: 0.8791
