# Reference
- [transfer-1](https://officeguide.cc/pytorch-transfer-learning-resnet18-classify-mnist-tutorial-examples/)
- [transfer-2](https://pytorch.org/tutorials/beginner/transfer_learning_tutorial.html)
- [train, val subset](https://stackoverflow.com/questions/50544730/how-do-i-split-a-custom-dataset-into-training-and-test-datasets/50544887#50544887)

---
- [pytorch训练模型时出现nan原因整合](https://blog.csdn.net/ytusdc/article/details/122321907)
```
5、进行梯度減枝
对超出值域范围的梯度进行约束，避免梯度持续大于1，造成梯度爆炸。（没办法规避梯度消失）
 pytorch 使用 nn.utils.clip_grad_value(parameters, clip_value)．将所有的参数剪裁到［-clip_value , clip_value]。如 clip_value =1,[100,0.1]=>[1,0.1]，该操作会改变梯度的方向
使用 nn.utils.clip_grad_norm_ 按照范数大小进行归一化，当参数的范数（ norm_type=2 范数）大于最大值时，则会将其归约到最大值。该方法可以保证梯度的方向是完全一致的，可能会导致梯度值被缩放到特别小（如［100,0.1]=>[1,0.0001])。
```

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import pydicom
import copy
import time

import torch
from torch import nn, optim
from torchvision import transforms, io
from torchvision.transforms import functional as F

from package.dataset import DicomDataset
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

In [2]:
def display(tr: torch.Tensor):
    infos = {
        'min': torch.amin(tr),
        'max': torch.amax(tr),
        'dtype': tr.dtype,
        'size': tr.size()
    }

    return infos

# Parameters

In [3]:
device = "cuda:0" if torch.cuda.is_available() else "cpu"
# device = "cpu"
print(f"Using {device} device")

Using cuda:0 device


In [4]:
batch_size = 1
validation_split = .2
shuffle_dataset = True
random_seed= 2022
num_epochs = 10
conv_threshold = 30

lr = 1e4

# Data

In [5]:
pd.read_csv("./data/DICOM/train.csv")

Unnamed: 0,ID,Age,Gender,FilePath,index,Stage
0,A175204,69,0,/DICOM/A175204/00010018,7,1
1,A122221,56,0,/DICOM/A122221/00010034,15,1
2,A54671,82,0,/DICOM/A54671/00010021,8,1
3,A31117,71,1,/DICOM/A31117/00010022,10,1
4,A653195,68,0,/DICOM/A653195/00010016,7,1
...,...,...,...,...,...,...
156,A717094,80,1,/DICOM/A717094/00010022,11,3
157,A741758,54,1,/DICOM/A741758/00010017,9,3
158,A646753,75,1,/DICOM/A646753/00010023,12,3
159,A679904,62,1,/DICOM/A679904/00010020,9,3


In [6]:
preprocess = transforms.Compose([
    transforms.CenterCrop(50), transforms.Resize(224),
])

In [7]:
dataset = DicomDataset(root="./data", transform=preprocess)

In [8]:
classes = ('Stage 1', 'Stage 2', 'Stage 3')

In [9]:
print(len(dataset))
display(dataset[0][0])

161


{'min': tensor(0.),
 'max': tensor(258.3909),
 'dtype': torch.float32,
 'size': torch.Size([3, 224, 224])}

In [10]:
#  Creating data indices for training and validation splits:
dataset_size = len(dataset)
indices = list(range(dataset_size))
split = int(np.floor(validation_split * dataset_size))
if shuffle_dataset :
    np.random.seed(random_seed)
    np.random.shuffle(indices)
train_indices, val_indices = indices[split:], indices[:split]
print(f"Train Data Length: { len(train_indices) }")
print(f"Validation Data Length: { len(val_indices) }")

dataset_sizes = {
    'train': len(train_indices),
    'val': len(val_indices),
}

Train Data Length: 129
Validation Data Length: 32


In [11]:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(val_indices)

dataloaders = {
    'train': DataLoader(dataset, batch_size=batch_size, sampler=train_sampler),
    'val': DataLoader(dataset, batch_size=batch_size, sampler=valid_sampler)
}

# Model

In [12]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'vgg16_bn', pretrained=True)

Using cache found in /home/azetry/.cache/torch/hub/pytorch_vision_v0.10.0


In [13]:
print(model)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [14]:
model.classifier[6] = nn.Linear(in_features=4096, out_features=3, bias=True)
model.to(device)

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (7): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (8): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (9): ReLU(inplace=True)
    (10): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (12): ReLU(inplace=True)
    (13): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (14): Conv2d(128, 256

In [15]:
# test = training_data[0][0].unsqueeze(0)
# with torch.no_grad():
#     output = model(test)

# print(output[0])
# print( torch.nn.functional.softmax(output[0], dim=0) )

# Loss Function and Optimizer

In [16]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)

# Training

In [17]:
def train_model(dataloader, model, loss_fn, optimizer, num_epochs):
    since = time.time()

    # 儲存最佳參數
    prev_acc = 0.0
    best_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())

    # 計算是否收斂和提前結束
    count_cont = 0
    finish = False

    # Level: Epoch
    for epoch in range(num_epochs):
        print(f"Epoch {epoch}/{num_epochs-1}:")
        print("-"*8)

        # 每次 epoch 都要跑一次 training 和 validation
        # Level: Phase (train, val)
        for phase in ['train', 'val']:
            if phase == 'train': model.train()
            else: model.eval()

            running_loss = 0.0
            running_corrects = 0

            # 批次讀取資料進行訓練
            # Level: Batch Data
            for batch, (X, y) in enumerate(dataloader[phase]):
                # 將資料放置於 GPU 或 CPU
                X, y = X.to(device), y.to(device)

                # optimizer.zero_grad() # 重設參數梯度（gradient）

                # forward
                # 只有在訓練階段才要計算梯度
                with torch.set_grad_enabled(phase == 'train'): # phase = True or False
                    outputs = model(X)                  # 計算預測值
                    _, preds = torch.max(outputs, 1)    # 計算預測結果
                    loss = loss_fn(outputs, y)          # 計算損失值（loss）

                    # 只有在訓練階段才要優化
                    if phase == 'train':
                        optimizer.zero_grad()           # 重設參數梯度（gradient）
                        loss.backward()                 # 反向傳播（backpropagation）
                        nn.utils.clip_grad_norm(model.parameters(), max_norm=20, norm_type=2) # 梯度剪枝
                        optimizer.step()                # 更新參數

                # 統計
                running_loss += loss.item() * Ｘ.size(0) # Batch size
                running_corrects += torch.sum(preds == y.data)
            # End of Level: Batch Data

            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]

            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')

            if phase == 'train':
                if epoch_acc == prev_acc: count_cont += 1
                else: count_cont = 0
                prev_acc = epoch_acc

                if count_cont > conv_threshold: 
                    print("Convergence. Ｅnd training early.")
                    finish = True
                    break

            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_model_wts = copy.deepcopy(model.state_dict())
        # End of Level: Phase (train, val)

        print("-"*8)
        if finish: break
    # End of Level: Epoch

    time_elapsed = time.time() - since

    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best val Acc: {best_acc:4f}')

    # 載入模型最佳參數
    model.load_state_dict(best_model_wts)

    return model



In [18]:
model = train_model(dataloaders, model, criterion, optimizer, num_epochs)

Epoch 0/9:
--------


  nn.utils.clip_grad_norm(model.parameters(), max_norm=20, norm_type=2) # 梯度剪枝


train Loss: 36733470857200427597824.0000 Acc: 0.3953
val Loss: 14220262773965289684992.0000 Acc: 0.3750
--------
Epoch 1/9:
--------
train Loss: 125357153412159872.0000 Acc: 0.3333
val Loss: 7671780191182836989952.0000 Acc: 0.2812
--------
Epoch 2/9:
--------
train Loss: 2102187898145836288.0000 Acc: 0.3566
val Loss: 19617767220877117620224.0000 Acc: 0.1562
--------
Epoch 3/9:
--------
train Loss: 81647817217.8177 Acc: 0.3566
val Loss: 2963793992823158603776.0000 Acc: 0.1875
--------
Epoch 4/9:
--------
train Loss: 84866364296264008335360.0000 Acc: 0.3256
val Loss: 5301879829043695952330752.0000 Acc: 0.4062
--------
Epoch 5/9:
--------
train Loss: 12925304729907612688056320.0000 Acc: 0.2946
val Loss: 11640191369984647547060224.0000 Acc: 0.3438
--------
Epoch 6/9:
--------
train Loss: 17422895297608890842087424.0000 Acc: 0.2946
val Loss: 10807005947341918527553536.0000 Acc: 0.3438
--------
Epoch 7/9:
--------
train Loss: 17845934989859230473781248.0000 Acc: 0.3023
val Loss: 114516977111

訓練成功但是 loss 太詭異了

In [19]:
torch.save(model, "20221030003_clipgrad.pth")