# 实战 Kaggle 比赛：图像分类 (CIFAR-10)

比赛的网址是：https://www.kaggle.com/c/cifar-10

In [None]:
import sys
import collections
import math
import os
import shutil
import pandas as pd
import torch
import numpy as np
import torchvision
from torch import nn
sys.path.insert(0, "..")
from d2l import torch as d2l

我们提供包含前1000个训练图像和5个随机测试图像的数据集的小规模样本

In [None]:
# Register the small CIFAR-10 sample archive with d2l: download URL plus the
# hash string stored alongside it in DATA_HUB.
d2l.DATA_HUB['cifar10_tiny'] = (
    d2l.DATA_URL + 'kaggle_cifar10_tiny.zip',
    '2068874e4b9a9f0fb07ebe0ad2b29754449ccacd',
)

# True: work on the downloaded sample; False: read from the local directory
# below (presumably where the full Kaggle dataset was unpacked — confirm).
demo = True

data_dir = d2l.download_extract('cifar10_tiny') if demo else '../data/cifar-10/'

读取数据集标签

In [None]:
def read_csv_labels(fname):
    """Read a label CSV file and return a dict mapping name to label.

    The first line of ``fname`` is treated as a header and skipped. Every
    remaining non-blank line must consist of exactly two comma-separated
    fields, ``name,label``. Blank lines (for example a trailing empty line at
    the end of the file) are ignored.

    Args:
        fname: Path of the CSV file to read.

    Returns:
        dict: ``{name: label}`` with both keys and values as strings.

    Raises:
        ValueError: If a non-blank data line does not have exactly two fields.
    """
    with open(fname, 'r') as f:
        next(f, None)  # skip the header row; harmless on an empty file
        stripped = (line.rstrip() for line in f)
        # Dropping empty rows keeps the two-field unpacking below from
        # failing on blank lines.
        tokens = [row.split(',') for row in stripped if row]
    return {name: label for name, label in tokens}

# Load the name -> label mapping for the training images.
labels_csv = os.path.join(data_dir, 'trainLabels.csv')
labels = read_csv_labels(labels_csv)
# TASK 1.1 (5 points): print which classes the dataset contains
# TASK 1.2 (5 points): print the total number of images in the dataset

从数据集中拆分出训练集和验证集

In [None]:
def copyfile(filename, target_dir):
    """Copy a file into a target directory, creating the directory if needed.

    Args:
        filename: Path of the file to copy.
        target_dir: Directory that receives the copy; missing parent
            directories are created as well.
    """
    os.makedirs(target_dir, exist_ok=True)
    # Spell out the destination path that shutil.copy would otherwise derive
    # from the source's base name when given a directory.
    destination = os.path.join(target_dir, os.path.basename(filename))
    shutil.copy(filename, destination)

def reorg_train_valid(data_dir, labels, valid_ratio):
    """Split the original training set into train / valid / train_valid folders.

    Every file in ``<data_dir>/train`` is copied to
    ``<data_dir>/train_valid_test/train_valid/<label>``. In addition, the
    first ``n_valid_per_label`` files encountered for each label are copied
    to ``.../valid/<label>`` and all remaining files to ``.../train/<label>``.

    Args:
        data_dir: Dataset root that contains the ``train`` directory.
        labels: Mapping from file name (the part before the first ``.``) to
            its label.
        valid_ratio: Fraction of the least-populated class that is held out
            for validation, applied to every label.

    Returns:
        int: Number of validation examples taken per label (always >= 1).

    Raises:
        KeyError: If a file in ``train`` has no entry in ``labels``.
    """
    # Number of examples in the least-populated class.
    n = min(collections.Counter(labels.values()).values())
    n_valid_per_label = max(1, math.floor(n * valid_ratio))
    train_dir = os.path.join(data_dir, 'train')
    split_root = os.path.join(data_dir, 'train_valid_test')
    label_count = collections.Counter()
    # os.listdir returns entries in arbitrary order. Sorting makes the split
    # reproducible, so re-running cannot leave one image in both the 'train'
    # and 'valid' folders (earlier copies are never removed).
    for train_file in sorted(os.listdir(train_dir)):
        label = labels[train_file.split('.')[0]]
        fname = os.path.join(train_dir, train_file)
        copyfile(fname, os.path.join(split_root, 'train_valid', label))
        if label_count[label] < n_valid_per_label:
            copyfile(fname, os.path.join(split_root, 'valid', label))
            label_count[label] += 1
        else:
            copyfile(fname, os.path.join(split_root, 'train', label))
    return n_valid_per_label

def reorg_test(data_dir):
    """Copy every test file into a single ``unknown`` class folder.

    Files from ``<data_dir>/test`` are copied to
    ``<data_dir>/train_valid_test/test/unknown`` so the test set has the same
    one-folder-per-class layout as the other splits.

    Args:
        data_dir: Dataset root that contains the ``test`` directory.
    """
    source_dir = os.path.join(data_dir, 'test')
    target_dir = os.path.join(data_dir, 'train_valid_test', 'test', 'unknown')
    for entry in os.listdir(source_dir):
        copyfile(os.path.join(source_dir, entry), target_dir)

def reorg_cifar10_data(data_dir, valid_ratio):
    """Reorganise the dataset under ``data_dir`` into per-split class folders.

    Reads ``trainLabels.csv``, splits the training files into
    train / valid / train_valid folders, and regroups the test files.

    Args:
        data_dir: Dataset root containing ``trainLabels.csv``, ``train`` and
            ``test``.
        valid_ratio: Forwarded to ``reorg_train_valid``.

    Returns:
        dict: The name -> label mapping read from ``trainLabels.csv``.
    """
    label_file = os.path.join(data_dir, 'trainLabels.csv')
    name_to_label = read_csv_labels(label_file)
    reorg_train_valid(data_dir, name_to_label, valid_ratio)
    reorg_test(data_dir)
    return name_to_label

In [None]:
# TASK 2.1 （5分）建立cifar10数据集，并使用20%数据作为验证集。在注释中解释函数的参数代表的意义。

# TASK 2.2 （20分）展示训练集所有类别的前10张图片。类别按照字母顺序进行排序，并将它们放在同一张大图中进行展示。
# 每一行代码都需要写好注释，表明该行代码的作用。如果你调用了numpy/torch的函数，请在注释中解释参数的意义。
# 提示：请仔细阅读 reorg_cifar10_data, reorg_train_valid, reorg_test 函数的定义。


图像增广

In [None]:
# TASK 3.1 (5 points): modify the code below to add random horizontal
# flipping as image augmentation
# Per-channel values handed to Normalize as its two positional arguments.
channel_mean = [0.4914, 0.4822, 0.4465]
channel_std = [0.2023, 0.1994, 0.2010]

# Training pipeline: convert to tensor, then normalise each channel.
transform_train = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(channel_mean, channel_std),
])

# Evaluation pipeline: same conversion and normalisation.
transform_test = torchvision.transforms.Compose([
    torchvision.transforms.ToTensor(),
    torchvision.transforms.Normalize(channel_mean, channel_std),
])

读取由原始图像组成的数据集

In [None]:
# Root of the reorganised dataset; one sub-folder per split, each holding one
# folder per class.
split_root = os.path.join(data_dir, 'train_valid_test')

# Splits used for fitting the model go through the training transform.
train_ds = torchvision.datasets.ImageFolder(
    os.path.join(split_root, 'train'), transform=transform_train)
train_valid_ds = torchvision.datasets.ImageFolder(
    os.path.join(split_root, 'train_valid'), transform=transform_train)

# Splits used for evaluation / prediction go through the test transform.
valid_ds = torchvision.datasets.ImageFolder(
    os.path.join(split_root, 'valid'), transform=transform_test)
test_ds = torchvision.datasets.ImageFolder(
    os.path.join(split_root, 'test'), transform=transform_test)

为应用了上述图像增广操作的数据集创建数据加载器

In [None]:
# Number of examples per mini-batch.
batch_size = 32

# Training loader: reshuffled every epoch; the last incomplete batch is dropped.
train_dl = torch.utils.data.DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
)

# TASK 3.2 (10 points): define the DataLoaders for the validation and test
# sets and name them val_dl and test_dl

模型

In [None]:
def get_net():
    """Build the network used in this notebook via ``d2l.resnet18``.

    Returns:
        The model returned by ``d2l.resnet18(10, 3)``.
    """
    num_classes = 10
    # presumably the number of input channels (RGB) — confirm against
    # d2l.resnet18's signature
    in_channels = 3
    return d2l.resnet18(num_classes, in_channels)

# Loss object for training, constructed with PyTorch's default arguments.
criterion = nn.CrossEntropyLoss()

训练函数

In [None]:
def train(net, train_dl, val_dl, num_epochs, lr, wd, devices, lr_period,
          lr_decay):
    """Train ``net`` with SGD and a step learning-rate schedule, plotting progress.

    Args:
        net: Model to train. It is wrapped in ``nn.DataParallel`` and moved to
            ``devices[0]``.
        train_dl: Iterable yielding ``(x, y)`` training batches.
        val_dl: Validation loader, or ``None`` to skip validation.
        num_epochs: Number of passes over ``train_dl``.
        lr: Initial learning rate for SGD.
        wd: Weight decay for SGD.
        devices: Devices passed to ``nn.DataParallel`` as ``device_ids``.
        lr_period: ``step_size`` of ``StepLR`` (scheduler steps once per epoch).
        lr_decay: ``gamma`` of ``StepLR``.
    """
    optimizer = torch.optim.SGD(net.parameters(), lr=lr, momentum=0.9,
                                weight_decay=wd)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, lr_period, lr_decay)
    num_batches = len(train_dl)
    # Plot roughly five times per epoch; at least every batch so the modulo
    # below never divides by zero when there are fewer than five batches.
    plot_every = max(1, num_batches // 5)
    timer = d2l.Timer()
    legend = ['train loss', 'train acc']
    if val_dl is not None:
        legend.append('valid acc')
    animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs],
                            legend=legend)
    net = nn.DataParallel(net, device_ids=devices).to(devices[0])
    for epoch in range(num_epochs):
        net.train()
        # Running totals: (loss, number of correct predictions, examples).
        metric = d2l.Accumulator(3)
        for i, (x, y) in enumerate(train_dl):
            timer.start()
            optimizer.zero_grad()

            # TASK 4.1 (20 points): implement the forward and backward pass.
            # Explain the purpose of every statement in a comment.

            # TASK 4.2 (5 points): compute how many examples of this batch x
            # were classified correctly and store the result in count

            # `loss` and `count` must be defined by the two tasks above.
            # The plotted loss is metric[0] / metric[2], i.e. divided by the
            # number of examples, so `loss` should be the batch's summed loss
            # for that ratio to be a per-example average.
            metric.add(loss, count, y.shape[0])
            timer.stop()
            if (i + 1) % plot_every == 0 or i == num_batches - 1:
                animator.add(epoch + (i + 1) / num_batches,
                             (metric[0] / metric[2], metric[1] / metric[2],
                              None))
        if val_dl is not None:
            val_acc = d2l.evaluate_accuracy_gpu(net, val_dl)
            animator.add(epoch + 1, (None, None, val_acc))
        scheduler.step()
    measures = (f'train loss {metric[0] / metric[2]:.3f}, '
                f'train acc {metric[1] / metric[2]:.3f}')
    if val_dl is not None:
        measures += f', valid acc {val_acc:.3f}'
    print(measures + f'\n{metric[2] * num_epochs / timer.sum():.1f}'
          f' examples/sec on {str(devices)}')

训练和验证模型

In [None]:
# Training configuration and a freshly built network.
devices = d2l.try_all_gpus()
num_epochs = 20
lr = 2e-4
wd = 5e-4
lr_period = 4
lr_decay = 0.9
net = get_net()

# TASK 4.3 (5 points): describe the meaning of every parameter in a comment

# TASK 4.4 (20 points): train the network and save the loss / accuracy plot.

In [None]:
# Predict a class index for every test image and write the Kaggle submission.
# `test_dl` is the test DataLoader defined in TASK 3.2; it must not shuffle,
# otherwise predictions no longer line up with the ids below.
preds = []
net.eval()  # use inference behaviour for layers such as batch norm
with torch.no_grad():  # no gradients are needed for prediction
    for X, _ in test_dl:
        y_hat = net(X.to(devices[0]))
        preds.extend(y_hat.argmax(dim=1).type(torch.int32).cpu().numpy())
# Ids are ordered by their string form ('1', '10', '100', ...), presumably to
# match the file-name order in which test_ds enumerates images — confirm.
sorted_ids = sorted(range(1, len(test_ds) + 1), key=str)
df = pd.DataFrame({'id': sorted_ids, 'label': preds})
# Map class indices back to class names.
df['label'] = df['label'].apply(lambda x: train_valid_ds.classes[x])
df.to_csv('submission.csv', index=False)