In [1]:
# -*- coding: utf-8 -*-

""" 
实战Kaggle比赛：classify leaf 
https://www.kaggle.com/competitions/classify-leaves
"""
import pandas as pd
import os
from PIL import Image
import torch
from torch import nn
from torchvision import models, transforms
from torch.utils import data
from torch.utils.data import Dataset, DataLoader

In [None]:
# Step 1: download the dataset
# kaggle competitions download -c classify-leaves

# Step 2: locate the dataset
# `__file__` only exists when this code is executed as a script/module. Inside a
# Jupyter kernel it is undefined and raises NameError, so fall back to the
# current working directory in that case.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

In [2]:
"""
训练数据集包括18353个样本，每个样本包含1个特征（图像路径）和1个标签，
而测试数据集包含8800个样本（只有图像路径，没有标签），
共176个类别
"""
# Load both competition CSV files from <script_dir>/data in one pass.
train, test = (
    pd.read_csv(os.path.join(script_dir, "data", csv_name))
    for csv_name in ("train.csv", "test.csv")
)
# Inspect train.shape / test.shape here when debugging.

(18353, 2)
(8800, 1)


In [7]:

train_labels = list(train["label"])
# Distinct label names. Sorting makes the name -> index mapping identical on
# every run; the iteration order of a plain set of strings can change between
# interpreter sessions, which would silently change the class indices.
labels_categories = sorted(set(train_labels))
# Dictionary lookup instead of list.index(): one pass over the labels rather
# than a linear scan of the categories for every sample.
label_to_index = {label: index for index, label in enumerate(labels_categories)}
labels_num = [label_to_index[label] for label in train_labels]

train["number"] = labels_num
# Persist the encoded labels: later cells build their datasets from this file.
# index=False keeps the DataFrame index out of the first column.
train.to_csv(os.path.join(script_dir, "data", "train_num_label.csv"), index=False)

In [None]:
# 三、预处理数据
class Leaf_Train_Dataset(Dataset):
    """Custom Dataset for the labelled leaf images.

    The CSV file must contain an ``image`` column (image path relative to
    ``image_root``) and a ``number`` column (integer class index).
    """

    def __init__(self, train_path, transform = None, image_root = None) -> None:
        """
        train_path : CSV file listing the image paths and their numeric labels
        transform : optional callable applied to every loaded PIL image
        image_root : directory the ``image`` column is relative to. Defaults to
            the directory containing ``train_path`` (or the working directory
            when ``train_path`` is not a filesystem path), replacing the
            machine-specific absolute path that used to be hard-coded.
        """
        super().__init__()
        self.train_csv = pd.read_csv(train_path)
        # Image locations, kept as a plain list for cheap positional access
        self.images_list = list(self.train_csv["image"])
        # Numeric label of each image, aligned with images_list
        self.label_nums = list(self.train_csv["number"])
        self.transform = transform
        if image_root is None:
            if isinstance(train_path, (str, os.PathLike)):
                image_root = os.path.dirname(os.path.abspath(train_path))
            else:
                image_root = os.getcwd()
        self.image_root = image_root

    def __getitem__(self, idx):
        """
        idx : index of the requested sample
        return : (image, label)
        """
        image_path = os.path.join(self.image_root, self.images_list[idx])
        # The context manager closes the file handle; convert() loads the pixel
        # data and guarantees three channels for the downstream model.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert("RGB")
        if self.transform is not None:
            image = self.transform(image)
        label = self.label_nums[idx]
        return image, label

    def __len__(self):
        """Number of samples listed in the CSV file."""
        return len(self.images_list)
    
# Channel statistics the torchvision ImageNet-pretrained weights were trained
# with. The notebook fine-tunes such a model, so inputs are normalized the same way.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

# Training pipeline: random flips, colour jitter and a random crop for
# augmentation, then tensor conversion and normalization.
transforms_train = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(p=0.5),
        transforms.RandomVerticalFlip(p=0.5),
        transforms.ColorJitter(brightness=0.5, contrast=0.5, saturation=0.5, hue=0.5),
        transforms.RandomResizedCrop(size=(224, 224), scale=(0.5 ,1), ratio=(3/4, 4/3)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)
# Evaluation pipeline: deterministic resize only, with the same normalization.
transforms_test = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
)


In [None]:
import matplotlib
matplotlib.use("Agg")  # 这一句一定要放在下面这句的前面
from matplotlib import pyplot as plt

def use_svg_display():
    """Placeholder for switching Jupyter figure output to SVG; currently a no-op.

    The SVG switch is intentionally disabled. To enable it, either run
    ``%config InlineBackend.figure_format = 'svg'`` in a cell or call
    ``backend_inline.set_matplotlib_formats('svg')``.
    """
    return None

def set_figsize(figsize=(3.5, 2.5)):
    """Set the default matplotlib figure size.

    figsize : (width, height) stored in ``plt.rcParams['figure.figsize']``
    """
    # Apply the display-format hook first, then update the global rc setting.
    use_svg_display()
    plt.rcParams['figure.figsize'] = figsize

def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend):
    """Configure labels, scales, limits, legend and grid of a matplotlib axes.

    The setters are applied in a fixed order: labels, scales, limits, then the
    legend (only when ``legend`` is truthy) and finally the grid.
    """
    settings = (
        ("set_xlabel", xlabel),
        ("set_ylabel", ylabel),
        ("set_xscale", xscale),
        ("set_yscale", yscale),
        ("set_xlim", xlim),
        ("set_ylim", ylim),
    )
    for setter_name, value in settings:
        getattr(axes, setter_name)(value)
    if legend:
        axes.legend(legend)
    axes.grid()

#通过以上三个用于图形配置的函数，定义一个plot函数来简洁地绘制多条曲线， 因为我们需要在整个书中可视化许多曲线。
def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None,
         ylim=None, xscale='linear', yscale='linear',
         fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None):
    """Draw one or several curves on a single axes.

    X, Y : either one series or a list of series. When ``Y`` is omitted, ``X``
        is treated as the y-values and plotted against the implicit index.
    fmts : matplotlib format strings, consumed in order (one per curve)
    axes : axes to draw on; the current axes is used when not given
    """
    legend = [] if legend is None else legend

    set_figsize(figsize)
    axes = axes or plt.gca()

    def has_one_axis(series):
        """True when ``series`` is a 1-d array or a flat list of scalars."""
        if hasattr(series, "ndim") and series.ndim == 1:
            return True
        return isinstance(series, list) and not hasattr(series[0], "__len__")

    # Normalize both arguments to "list of series".
    if has_one_axis(X):
        X = [X]
    if Y is None:
        X, Y = [[]] * len(X), X
    elif has_one_axis(Y):
        Y = [Y]
    # Share the x-series between all y-series when the counts differ.
    if len(X) != len(Y):
        X = X * len(Y)

    axes.cla()
    for xs, ys, fmt in zip(X, Y, fmts):
        curve_args = (xs, ys, fmt) if len(xs) else (ys, fmt)
        axes.plot(*curve_args)
    set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)


class Animator:
    """Incrementally plot several curves on the first axes of one figure."""

    def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None,
                 ylim=None, xscale='linear', yscale='linear',
                 fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1,
                 figsize=(7, 5)):
        """Create the figure and remember how its first axes must be configured."""
        legend = [] if legend is None else legend
        use_svg_display()
        self.fig, self.axes = plt.subplots(nrows, ncols, figsize=figsize)
        if nrows * ncols == 1:
            # plt.subplots returns a bare Axes for a 1x1 grid; wrap it so that
            # self.axes[0] works in every case.
            self.axes = [self.axes]

        def configure_first_axes():
            # Closure over the constructor arguments, evaluated at every redraw.
            return set_axes(
                self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend)

        self.config_axes = configure_first_axes
        self.X = None
        self.Y = None
        self.fmts = fmts

    def add(self, x, y):
        """Append one point per curve and redraw.

        x : scalar shared by all curves, or one x-value per curve
        y : scalar or one y-value per curve; ``None`` entries are skipped
        """
        if not hasattr(y, "__len__"):
            y = [y]
        num_curves = len(y)
        if not hasattr(x, "__len__"):
            x = [x] * num_curves
        # Lazily allocate one history list per curve on first use.
        if not self.X:
            self.X = [[] for _ in range(num_curves)]
        if not self.Y:
            self.Y = [[] for _ in range(num_curves)]
        for curve_idx, (x_val, y_val) in enumerate(zip(x, y)):
            if x_val is None or y_val is None:
                continue
            self.X[curve_idx].append(x_val)
            self.Y[curve_idx].append(y_val)
        first_axes = self.axes[0]
        first_axes.cla()
        for xs, ys, fmt in zip(self.X, self.Y, self.fmts):
            first_axes.plot(xs, ys, fmt)
        self.config_axes()
        plt.show()

class Accumulator:
    """Running sums over n variables."""

    def __init__(self, n):
        """Start n float totals at zero."""
        self.data = [0.0 for _ in range(n)]

    def add(self, *args):
        """Add each positional value (converted to float) to its matching total."""
        updated = []
        for total, increment in zip(self.data, args):
            updated.append(total + float(increment))
        self.data = updated

    def reset(self):
        """Set every total back to zero, keeping the number of totals."""
        self.data = [0.0 for _ in self.data]

    def __getitem__(self, idx):
        """Return the total stored at position ``idx``."""
        return self.data[idx]

def accuracy(y_hat, y):
    """Return the number of correct predictions as a float.

    y_hat : class predictions, or a 2-d score matrix with more than one column,
        in which case the arg-max of each row is taken as the prediction
    y : ground-truth labels
    """
    is_score_matrix = len(y_hat.shape) > 1 and y_hat.shape[1] > 1
    predictions = y_hat.argmax(axis=1) if is_score_matrix else y_hat
    # Compare in the dtype of the labels, then count the matches.
    matches = predictions.type(y.dtype) == y
    return float(matches.type(y.dtype).sum())

import time
import numpy as np

class Timer:
    """Record the duration of several runs."""

    def __init__(self):
        """Create an empty record list and start timing immediately."""
        self.times = []
        self.lastTimeSum = 0
        self.start()

    def start(self):
        """Start (or restart) the timer."""
        self.tik = time.time()

    def stop(self):
        """Stop the timer, append the elapsed time to the list and return it."""
        self.times.append(time.time() - self.tik)
        return self.times[-1]

    def avg(self):
        """Return the mean recorded time, or 0.0 when nothing was recorded yet."""
        # Guard against ZeroDivisionError on a timer that was never stopped.
        if not self.times:
            return 0.0
        return sum(self.times) / len(self.times)

    def sum(self):
        """Return the total recorded time and cache it in ``lastTimeSum``."""
        self.lastTimeSum = sum(self.times)
        return self.lastTimeSum

    def cumsum(self):
        """Return the cumulative recorded times as a list."""
        return np.array(self.times).cumsum().tolist()

def evaluate_accuracy_gpu(net, data_iter, device=None):
    """Compute the accuracy of ``net`` over all batches of ``data_iter``.

    net : model; when it is an ``nn.Module`` it is switched to eval mode
    data_iter : iterable of ``(X, y)`` batches
    device : device to evaluate on; when omitted and ``net`` is an
        ``nn.Module``, the device of its first parameter is used
    return : number of correct predictions divided by number of labels
    """
    if isinstance(net, torch.nn.Module):
        # Evaluation mode: disables dropout and makes batch-norm use its
        # running statistics.
        net.eval()
        if not device:
            first_param = next(iter(net.parameters()))
            device = first_param.device
    # tally[0]: correct predictions, tally[1]: total predictions
    tally = Accumulator(2)
    with torch.no_grad():
        for features, targets in data_iter:
            if isinstance(features, list):
                # List-valued inputs (needed for BERT fine-tuning)
                features = [part.to(device) for part in features]
            else:
                features = features.to(device)
            targets = targets.to(device)
            tally.add(accuracy(net(features), targets), targets.numel())
    return tally[0] / tally[1]

In [None]:
from tensorboardX import SummaryWriter
from torchinfo import summary
import datetime
import sys

class Logger():
    """Tee-style stream: everything written goes to stdout and to a log file."""

    def __init__(self, filename):
        """
        filename : path of the log file; it is created or truncated
        """
        # The stream that was active when the logger was created
        self.terminal = sys.stdout
        # Explicit UTF-8 so non-ASCII output does not depend on the platform's
        # default encoding.
        self.log = open(filename, "w", encoding="utf-8")

    def write(self, message):
        """Write ``message`` to both targets and return the character count."""
        self.terminal.write(message)
        return self.log.write(message)

    def flush(self):
        """Flush both targets so the log file does not lag behind the console."""
        self.terminal.flush()
        self.log.flush()

    def close(self):
        """Close the log file; the wrapped terminal stream is left open."""
        if not self.log.closed:
            self.log.close()

# Current date and time
current_datetime = datetime.datetime.now()
# Formatted as a string, e.g. 2023-09-10-14-30-00
formatted_datetime = current_datetime.strftime("%Y-%m-%d-%H-%M-%S")
# NOTE(review): the directory prefix says "DenseNet" although this notebook
# trains a ResNet-50; kept unchanged so existing log locations stay valid.
log_dir = os.path.join(script_dir, "logs", f"DenseNet_{formatted_datetime}")
# Make sure the directory exists before the text log is opened inside it.
os.makedirs(log_dir, exist_ok=True)

# TensorBoard writer for this run
tb_writer = SummaryWriter(log_dir = log_dir)

# If this cell is re-run, sys.stdout is already a tee from the previous run.
# Unwrap it first so loggers are not nested and the old file is closed.
if hasattr(sys.stdout, "terminal") and hasattr(sys.stdout, "log"):
    previous_logger = sys.stdout
    sys.stdout = previous_logger.terminal
    previous_logger.log.close()
# Mirror everything printed from now on into a text file inside log_dir
sys.stdout = Logger(os.path.join(log_dir, f"output_{formatted_datetime}.txt"))


In [None]:
# 训练
def try_gpu(i=0):
    """Return ``cuda:i`` when at least ``i + 1`` GPUs are visible, else the CPU."""
    gpu_available = i + 1 <= torch.cuda.device_count()
    return torch.device(f"cuda:{i}") if gpu_available else torch.device("cpu")

def train(net, train_loader, valid_loader, num_epochs, lr, device = try_gpu()):
    """Train ``net`` with SGD and cross-entropy loss, validating after every epoch.

    net : model to train; moved to ``device`` when it is not already there
    train_loader : iterable of ``(X, y)`` training batches supporting ``len()``
    valid_loader : iterable of ``(X, y)`` validation batches
    num_epochs : number of passes over ``train_loader``
    lr : SGD learning rate
    device : device to train on (default chosen once, at definition time)
    return : ``(train_loss, train_acc, valid_acc)`` of the last epoch; all three
        are NaN when ``num_epochs`` is 0
    """
    if next(net.parameters()).device != device:
        net.to(device)
        print("training on ", device)
    optimizer = torch.optim.SGD(net.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()

    # Read the batch size from the loader instead of relying on a notebook
    # global that may not exist.
    batch_size = getattr(train_loader, "batch_size", None)
    animator = Animator(xlabel=f'epoch, lr={lr}, batch_size={batch_size}, {device}', xlim=[1, num_epochs],
                            legend=['train loss', 'train acc', 'valid acc'])
    timer  = Timer()

    # Number of batches per epoch
    num_batches = len(train_loader)
    # Plot about five points per epoch; at least every batch, so that loaders
    # with fewer than five batches do not cause a modulo-by-zero.
    log_interval = max(1, num_batches // 5)
    # Defined up front so the summary below also works when num_epochs == 0.
    train_loss = train_acc = valid_acc = float("nan")
    examples_per_epoch = 0.0
    for epoch in range(num_epochs):
        net.train()
        # Sum of training loss, number of correct predictions, number of samples
        metric = Accumulator(3)
        for i, (X, y) in enumerate(train_loader):
            timer.start()
            optimizer.zero_grad()
            X, y = X.to(device), y.to(device)
            y_hat = net(X)
            l = loss(y_hat, y)
            l.backward()
            optimizer.step()
            with torch.no_grad():
                # CrossEntropyLoss returns the batch mean, so multiply by the
                # batch size to accumulate the summed loss. X.shape[0] is used
                # because the last batch may be smaller than the others.
                metric.add(l * X.shape[0], accuracy(y_hat, y), X.shape[0])
            timer.stop()
            if (i + 1) % log_interval == 0 or i == num_batches - 1:
                train_loss = metric[0] / metric[2]
                train_acc = metric[1] / metric[2]
                animator.add(
                    epoch + (i + 1) / num_batches,
                    (train_loss, train_acc, None)
                )
        examples_per_epoch = metric[2]
        valid_acc = evaluate_accuracy_gpu(net, valid_loader)
        animator.add(epoch + 1, (None, None, valid_acc))
        # Report 1-based epochs, matching the step used for TensorBoard below.
        print(f"{timer.sum():.1f} sec,", "[epoch: {}] train_loss: {:.3f}, train_acc: {:.3f}, valid_acc: {:.3f}".format(epoch + 1, train_loss, train_acc, valid_acc))

        tb_writer.add_scalar("train_loss", train_loss, epoch + 1)
        tb_writer.add_scalar("train_acc", train_acc, epoch + 1)
        tb_writer.add_scalar("valid_acc", valid_acc, epoch + 1)

    print(f"[Total] loss {train_loss:.3f}, train acc {train_acc:.3f}, valid acc {valid_acc:.3f}")
    total_time = timer.sum()
    # Avoid dividing by zero when no batch was processed.
    throughput = examples_per_epoch * num_epochs / total_time if total_time > 0 else 0.0
    print(
        f"[Total] {total_time:.1f} sec, {throughput:.1f} examples/sec "
        f"on {str(device)}"
    )
    return train_loss, train_acc, valid_acc

"""
[**K折交叉验证**]有助于模型选择和超参数调整。
在$K$折交叉验证过程中返回第$i$折的数据。
具体地说，它选择第$i$个切片作为验证数据，其余部分作为训练数据。
注意，这并不是处理数据的最有效方法，如果我们的数据集大得多，会有其他解决办法。
"""

def get_k_fold_dataset(k, i, X, y):
    """Return the training and validation datasets of fold ``i`` out of ``k``.

    The samples are cut into ``k`` contiguous slices along dimension 0. Slice
    ``i`` becomes the validation data and the remaining slices are concatenated
    into the training data. When the sample count is not divisible by ``k``,
    the last slice also takes the remainder so no sample is dropped.

    k : number of folds, must be greater than 1
    i : index of the validation fold, ``0 <= i < k``
    X : feature tensor, samples along dimension 0
    y : label tensor, aligned with ``X``
    return : ``(train_dataset, valid_dataset)`` as ``TensorDataset`` objects
    """
    assert k > 1 , "K折交叉验证 require k > 1"
    assert 0 <= i < k, "fold index i must satisfy 0 <= i < k"
    num_samples = X.shape[0]
    fold_size = num_samples // k  # floor division
    X_train, y_train = None, None
    X_valid, y_valid = None, None
    for j in range(k):
        # The last fold runs to the end so that the remainder is included.
        stop = (j + 1) * fold_size if j < k - 1 else num_samples
        idx = slice(j * fold_size, stop)
        X_part, y_part = X[idx], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            # Concatenate along the sample dimension
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)

    # The slices are tensors, so wrap them in TensorDataset. The CSV-backed
    # Leaf_Train_Dataset expects a file path and a transform, not tensors.
    train_dataset = data.TensorDataset(X_train, y_train)
    valid_dataset = data.TensorDataset(X_valid, y_valid)

    return train_dataset, valid_dataset

def k_fold_cross_train(k, X_train, y_train, num_epochs, lr, batch_size, net=None):
    """Run K-fold cross-validation and return the metrics averaged over folds.

    k : number of folds
    X_train, y_train : full training features and labels
    num_epochs, lr : passed through to ``train``
    batch_size : batch size of the per-fold data loaders
    net : model to cross-validate. When omitted, the notebook-level ``net``
        variable is used, as the original code did.
    return : ``(mean train_loss, mean train_acc, mean valid_acc)``
    """
    import copy  # local import: only this function needs it

    if net is None:
        net = globals().get("net")
        if net is None:
            raise ValueError("k_fold_cross_train needs a model: pass net=... or define a global `net`")

    train_loss, train_acc, valid_acc = [], [], []

    for i in range(k):
        train_dataset, valid_dataset = get_k_fold_dataset(k, i, X_train, y_train)
        train_loader = data.DataLoader(train_dataset, batch_size, shuffle=True)
        # Evaluation does not depend on sample order, so no shuffling here.
        valid_loader = data.DataLoader(valid_dataset, batch_size, shuffle=False)
        print(f'折{i + 1}')
        # Each fold starts from the same initial weights. Reusing one model
        # would let later folds validate on data the model was already trained on.
        fold_net = copy.deepcopy(net)
        fold_loss, fold_train_acc, fold_valid_acc = train(
            fold_net, train_loader, valid_loader, num_epochs, lr, try_gpu())
        train_loss.append(fold_loss)
        train_acc.append(fold_train_acc)
        valid_acc.append(fold_valid_acc)

    return sum(train_loss) / k, sum(train_acc) / k, sum(valid_acc) / k


In [None]:
# Model selection: ResNet-50 initialised with ImageNet-pretrained weights

resnet50 = models.resnet50(weights = models.ResNet50_Weights.IMAGENET1K_V1)
# The pretrained classifier head has one output per ImageNet class. Replace it
# with a layer that has one output per leaf category found in the training labels.
resnet50.fc = nn.Linear(resnet50.fc.in_features, len(labels_categories))

def init_weights(m):
    """Xavier-uniform initialise the weight of exact ``nn.Linear``/``nn.Conv2d`` modules.

    Intended for ``net.apply(init_weights)``; other module types are left untouched.
    """
    if type(m) in (nn.Linear, nn.Conv2d):
        nn.init.xavier_uniform_(m.weight)

# net.apply(init_weights)


In [None]:
# Build the loaders that train() needs. Validation data is held out from the
# labelled training CSV, because test.csv has no labels and cannot be used to
# measure accuracy.
batch_size = 64
train_csv_path = os.path.join(script_dir, "data", "train_num_label.csv")
# Two views of the same CSV: augmented for training, deterministic for validation.
augmented_dataset = Leaf_Train_Dataset(train_csv_path, transform = transforms_train)
plain_dataset = Leaf_Train_Dataset(train_csv_path, transform = transforms_test)

# Reproducible 90/10 split over the sample indices
num_samples = len(augmented_dataset)
num_valid = num_samples // 10
shuffled_indices = torch.randperm(num_samples, generator=torch.Generator().manual_seed(0)).tolist()
train_dataset = data.Subset(augmented_dataset, shuffled_indices[num_valid:])
valid_dataset = data.Subset(plain_dataset, shuffled_indices[:num_valid])

train_loader = data.DataLoader(train_dataset, batch_size = batch_size, shuffle=True)
valid_loader = data.DataLoader(valid_dataset, batch_size = batch_size, shuffle=False)

train(resnet50, train_loader, valid_loader, 10, 0.01)

In [None]:
# Predict and generate the submission csv file
# NOTE(review): only the dataset construction is visible in this cell; the
# prediction and csv-writing steps announced above are not present here.
# Rebuilds the training dataset from the encoded-label CSV with the training transforms.
train_dataset = Leaf_Train_Dataset(os.path.join(script_dir, "data", "train_num_label.csv"), transform = transforms_train)

In [3]:
import datetime
def get_datetime():
    """Return the current local date and time as ``YYYY-MM-DD-HH-MM-SS``.

    Example: ``2023-09-10-14-30-00``
    """
    return datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
get_datetime()

'2023-10-06-20-35-27'