## Import Libs

In [None]:
import os
import warnings # 避免一些可以忽略的报错
warnings.filterwarnings('ignore')
import random
import gc
import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm import tqdm # 进度条
import time

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from collections import defaultdict # 记录 loss lr 等相关参数的变化
# 改变 终端颜色 方便观察
from colorama import Fore, Back, Style
b_ = Fore.BLUE
sr_ = Style.RESET_ALL

## CONFIG

In [None]:
# Debug switch: when True, CONFIG.epochs drops to 2 for a quick smoke run.
is_debug = False

# Central container for the hyper-parameters and paths used by the rest of the notebook.
class CONFIG:
    # Seed handed to set_seed().
    seed = 308
    
    # Default number of training epochs (2 in debug mode).
    epochs = 10 if not is_debug else 2
    # Default starting value for the best validation accuracy tracked by run_training().
    now_cv = 0
    
    # Batch sizes used by the training and validation DataLoaders.
    train_batch_size = 64
    valid_batch_size = 512
    
    # Input width of the first linear layer and output width of the last one.
    in_features = 784
    n_classes = 10

    # Number of DataLoader worker processes.
    n_workers = 1
    
    # Optimizer hyper-parameters.
    learning_rate = 1e-3
    weight_decay = 1e-6 # weight decay passed to the optimizer
    
    # First CUDA device when available, otherwise the CPU.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    
    # Training CSV location and the directory where checkpoints are written.
    train_csv = "/kaggle/input/digit-recognizer/train.csv"
    ckpt_save_path = "./output"

## Set Random Seed

In [None]:
def set_seed(seed=308):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU and all CUDA devices),
    exports ``PYTHONHASHSEED`` and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Explicitly cover every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different kernels from run to run, so it is
    # switched off to keep the deterministic setting meaningful.
    torch.backends.cudnn.benchmark = False
    
set_seed(CONFIG.seed) # fix the random seeds so results can be reproduced

## Data Processing

In [None]:
train = pd.read_csv(CONFIG.train_csv) # load the training .csv into a DataFrame
# Bare expression: shown as the cell output when run in a notebook.
train

In [None]:
# Split into training and validation sets: roughly the first 80% of the rows
# are used for training and the remaining rows for validation.
# The split is positional; rows are not shuffled beforehand.
num_train = len(train)

df_train = train.iloc[: num_train // 5 * 4, :].reset_index(drop=True) # reset_index(drop=True) renumbers the rows from 0
df_valid = train.iloc[num_train // 5 * 4:, :].reset_index(drop=True)
# Bare expression: shown as the cell output when run in a notebook.
df_valid

## Dataset and DataLoader

In [None]:
class MyDataset(Dataset):
    """Map-style dataset that serves one DataFrame row per sample.

    The frame must have a ``label`` column and a ``pixel0`` column; every
    column from ``pixel0`` to the end of the row is treated as a feature.
    """

    def __init__(self, df):
        super().__init__()
        self.df = df

    def __len__(self):
        # One sample per row.
        return len(self.df)

    def __getitem__(self, idx):
        # Pull out the idx-th row as a Series.
        record = self.df.iloc[idx]
        # Label-based slice: everything from "pixel0" through the last column.
        features = record["pixel0":].values
        target = record["label"]

        # A dataset item is the usual (features, label) pair.
        return features, target

In [None]:
def prepare_loaders():
    """Build the training and validation DataLoaders.

    Wraps the module-level ``df_train`` / ``df_valid`` frames in ``MyDataset``
    and takes batch sizes and worker count from ``CONFIG``.

    Returns:
        Tuple ``(train_loader, valid_loader)``.
    """
    # Options shared by both loaders.
    common = dict(num_workers=CONFIG.n_workers, pin_memory=True)

    train_loader = DataLoader(
        MyDataset(df=df_train),
        batch_size=CONFIG.train_batch_size,
        shuffle=True,
        **common,
    )
    # Validation data is normally not shuffled, hence shuffle=False.
    valid_loader = DataLoader(
        MyDataset(df=df_valid),
        batch_size=CONFIG.valid_batch_size,
        shuffle=False,
        **common,
    )

    return train_loader, valid_loader

## Evaluation

In [None]:
def cal_ACC(y_true, y_preds):
    """Return the fraction of predictions that equal the true labels.

    Args:
        y_true: 1-D array-like of ground-truth labels.
        y_preds: 1-D array-like of predicted labels, same length as ``y_true``.

    Returns:
        Accuracy as a float in ``[0, 1]``.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    # Accept plain lists/tuples as well as arrays; on lists `==` would yield a
    # single bool with no `.sum()`.
    y_true = np.asarray(y_true)
    y_preds = np.asarray(y_preds)

    if len(y_true) != len(y_preds):
        # Raising a bare string is itself a TypeError; raise a real exception.
        raise ValueError("len(y_true) != len(y_preds)")
    length = len(y_true)
    acc = (y_true == y_preds).sum() / length

    return acc

## Model

In [None]:
def MLP_Layer(in_features, out_features):
    """Build one MLP block: fully connected layer, batch norm, activation.

    Args:
        in_features: Width of the block's input.
        out_features: Width of the block's output.

    Returns:
        ``nn.Sequential`` of ``Linear -> BatchNorm1d -> LeakyReLU(0.01)``.
    """
    return nn.Sequential(
        nn.Linear(in_features=in_features, out_features=out_features),
        nn.BatchNorm1d(out_features),
        nn.LeakyReLU(0.01),
    )

In [None]:
class DigitRecognizerModel(nn.Module):
    """Three-block MLP mapping CONFIG.in_features inputs to CONFIG.n_classes outputs.

    Layer widths are ``in_features -> 1024 -> 384 -> n_classes``.

    NOTE(review): every block, including the last, is built by ``MLP_Layer``,
    so the class scores also pass through BatchNorm1d and LeakyReLU. Confirm
    this is intended for the output layer.
    """

    def __init__(self):
        super().__init__()
        widths = (CONFIG.in_features, 1024, 384, CONFIG.n_classes)
        # One MLP block per pair of consecutive widths, in order.
        blocks = [
            MLP_Layer(in_features=n_in, out_features=n_out)
            for n_in, n_out in zip(widths[:-1], widths[1:])
        ]
        self.model = nn.Sequential(*blocks)

    def forward(self, x):
        return self.model(x)

In [None]:
model = DigitRecognizerModel() # instantiate the model
# Move the parameters to the configured device.
model.to(CONFIG.device)

## Train and Valid Function

In [None]:
criterion = nn.CrossEntropyLoss() # loss function instance; cross-entropy is the usual choice for multi-class classification

In [None]:
def train_one_epoch(model, optimizer, train_loader, epoch):
    """Train ``model`` for one pass over ``train_loader``.

    Uses the module-level ``criterion`` and ``CONFIG.device``.

    Args:
        model: Network to optimise; switched to training mode.
        optimizer: Optimizer stepped once per batch.
        train_loader: Iterable of ``(features, labels)`` batches.
        epoch: Epoch number, only shown in the progress bar.

    Returns:
        Tuple ``(epoch_loss, train_cv)``: the sample-weighted mean loss and
        the accuracy over all samples seen. Both are ``0.0`` when the loader
        yields no batches.
    """
    model.train()

    n_seen = 0
    n_correct = 0
    running_loss = 0.0
    # Defined up front so an empty loader does not leave them unbound.
    epoch_loss = 0.0
    train_cv = 0.0

    bar = tqdm(train_loader, total=len(train_loader)) # progress bar
    for X, labels in bar:
        batch_size = X.size(0)
        # Features go in as float (torch.float is float32).
        X = X.to(CONFIG.device, dtype=torch.float)
        # nn.CrossEntropyLoss needs class-index labels as long.
        labels = labels.to(CONFIG.device, dtype=torch.long)

        optimizer.zero_grad()
        logits = model(X)
        # The loss receives raw logits: nn.CrossEntropyLoss applies
        # log-softmax itself, so a softmax here would be applied twice.
        loss = criterion(logits, labels)
        loss.backward()
        optimizer.step()

        # Running counters avoid re-concatenating all earlier batches each step.
        n_correct += (logits.argmax(1) == labels).sum().item()
        running_loss += loss.item() * batch_size
        n_seen += batch_size

        epoch_loss = running_loss / n_seen
        train_cv = n_correct / n_seen

        # Values displayed next to the progress bar.
        bar.set_postfix(Epoch=epoch,
                        Train_Loss=epoch_loss,
                        Train_ACC=train_cv,
                        LR=optimizer.param_groups[0]['lr'])

    return epoch_loss, train_cv

In [None]:
@torch.inference_mode()
def valid_one_epoch(model, valid_loader, epoch, optimizer=None):
    """Evaluate ``model`` on one pass over ``valid_loader``.

    Runs under ``torch.inference_mode`` and uses the module-level
    ``criterion`` and ``CONFIG.device``.

    Args:
        model: Network to evaluate; switched to eval mode.
        valid_loader: Iterable of ``(features, labels)`` batches.
        epoch: Epoch number, only shown in the progress bar.
        optimizer: Optional optimizer whose learning rate is shown in the
            progress bar. The function no longer reads a global ``optimizer``;
            when omitted, the LR field is left out.

    Returns:
        Tuple ``(epoch_loss, valid_cv)``: the sample-weighted mean loss and
        the accuracy over all samples seen. Both are ``0.0`` when the loader
        yields no batches.
    """
    model.eval()

    n_seen = 0
    n_correct = 0
    running_loss = 0.0
    # Defined up front so an empty loader does not leave them unbound.
    epoch_loss = 0.0
    valid_cv = 0.0

    bar = tqdm(valid_loader, total=len(valid_loader))
    for X, labels in bar:
        batch_size = X.size(0)

        X = X.to(CONFIG.device, dtype=torch.float)
        labels = labels.to(CONFIG.device, dtype=torch.long)

        logits = model(X)
        # Raw logits go to the loss: nn.CrossEntropyLoss applies log-softmax
        # itself, so a softmax here would be applied twice.
        loss = criterion(logits, labels)

        # Running counters avoid re-concatenating all earlier batches each step.
        n_correct += (logits.argmax(1) == labels).sum().item()
        running_loss += loss.item() * batch_size
        n_seen += batch_size

        epoch_loss = running_loss / n_seen
        valid_cv = n_correct / n_seen

        postfix = dict(Epoch=epoch,
                       Valid_Loss=epoch_loss,
                       Valid_ACC=valid_cv)
        if optimizer is not None:
            postfix["LR"] = optimizer.param_groups[0]['lr']
        bar.set_postfix(**postfix)

    return epoch_loss, valid_cv

In [None]:
def run_training(model, optimizer, train_loader, valid_loader, num_epochs=CONFIG.epochs, now_cv=CONFIG.now_cv):
    """Run the full train/validate loop and keep the best-scoring weights.

    After each epoch the validation accuracy is compared with the best value
    so far. On a tie or an improvement the weights are snapshotted in memory
    and written to ``CONFIG.ckpt_save_path``.

    Args:
        model: Network to train.
        optimizer: Optimizer used by ``train_one_epoch``.
        train_loader: Training batches.
        valid_loader: Validation batches.
        num_epochs: Number of epochs to run.
        now_cv: Starting value for the best validation accuracy; an epoch
            must reach at least this value for a checkpoint to be saved.

    Returns:
        Tuple ``(model, history, best_model_path)``: the model with the best
        weights loaded, a dict of per-epoch metric lists, and the path of the
        last checkpoint written (``None`` if none was written).
    """
    if torch.cuda.is_available():
        print("[INFO] Using GPU: {} x {}\n".format(torch.cuda.get_device_name(), torch.cuda.device_count()))

    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict()) # weights of the best-scoring epoch so far
    best_epoch_cv = now_cv
    best_model_path = None # path of the checkpoint holding those weights
    history = defaultdict(list)

    for epoch in range(1, num_epochs + 1):
        gc.collect() # collect garbage right away before the next epoch
        train_epoch_loss, train_epoch_cv = train_one_epoch(model, optimizer, train_loader, epoch)
        valid_epoch_loss, valid_epoch_cv = valid_one_epoch(model, valid_loader, epoch)
        print(f"epoch: {epoch}, LOSS = {valid_epoch_loss}, CV(Acc) = {valid_epoch_cv}")

        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(valid_epoch_loss)
        history['Train CV(Acc)'].append(train_epoch_cv)
        history['Valid CV(Acc)'].append(valid_epoch_cv)
        history['lr'].append(optimizer.param_groups[0]['lr'])

        # Snapshot the model whenever validation accuracy ties or beats the best.
        if valid_epoch_cv >= best_epoch_cv:
            print(f"{b_}epoch: {epoch}, Validation CV(Acc) Improved ({best_epoch_cv} ---> {valid_epoch_cv})")
            best_epoch_cv = valid_epoch_cv
            best_model_wts = copy.deepcopy(model.state_dict())
            # exist_ok avoids the check-then-create race of a separate exists() test.
            os.makedirs(CONFIG.ckpt_save_path, exist_ok=True)

            PATH = "{}/CV_{:.4f}_Loss{:.4f}_epoch{:.0f}.bin".format(CONFIG.ckpt_save_path, best_epoch_cv, valid_epoch_loss, epoch)
            best_model_path = PATH
            torch.save(model.state_dict(), PATH) # save only the weights, not the whole module
            print(f"Model Saved{sr_}")

        print()

    # Report the total wall-clock training time.
    time_elapsed = time.time() - start
    hours, remainder = divmod(time_elapsed, 3600)
    minutes, seconds = divmod(remainder, 60)
    print('Training complete in {:.0f}h {:.0f}m {:.0f}s'.format(hours, minutes, seconds))
    print("Best CV(Acc): {:.4f}".format(best_epoch_cv))

    # Restore the best weights before handing the model back.
    model.load_state_dict(best_model_wts)

    return model, history, best_model_path

## Optimizer

In [None]:
# Use Adam as the optimizer, with the learning rate and weight decay from CONFIG.
optimizer = torch.optim.Adam(model.parameters(), lr=CONFIG.learning_rate, 
                             weight_decay=CONFIG.weight_decay)

## Start Training

In [None]:
# Build the training and validation DataLoaders.
train_loader, valid_loader = prepare_loaders()

In [None]:
# Run the training loop; returns the model with its best weights loaded,
# the per-epoch metric history, and the path of the last saved checkpoint.
model, history, best_model_path = run_training(model, optimizer, train_loader, valid_loader, 
                                               num_epochs=CONFIG.epochs, now_cv=CONFIG.now_cv)

## Logs

In [None]:
# Training vs. validation loss per epoch.
# The x axis is the 0-based position in the history list, while run_training numbers epochs from 1.
plt.plot( range(len(history["Train Loss"])), history["Train Loss"], label="Train Loss")
plt.plot( range(len(history["Valid Loss"])), history["Valid Loss"], label="Valid Loss")
plt.xlabel("epochs")
plt.ylabel("Loss")
plt.grid()
plt.legend()
plt.show()

In [None]:
# Training vs. validation accuracy per epoch (x axis is the 0-based history index).
plt.plot( range(len(history["Train CV(Acc)"])), history["Train CV(Acc)"], label="Train CV(Acc)")
plt.plot( range(len(history["Valid CV(Acc)"])), history["Valid CV(Acc)"], label="Valid CV(Acc)")
plt.xlabel("epochs")
plt.ylabel("CV(Acc)")
plt.grid()
plt.legend()
plt.show()

In [None]:
# Learning rate recorded at the end of each epoch (x axis is the 0-based history index).
plt.plot( range(len(history["lr"])), history["lr"], label="lr")
plt.xlabel("epochs")
plt.ylabel("lr")
plt.grid()
plt.legend()
plt.show()