In [1]:
import time
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.cuda.comm
import torch.optim
import torch.utils.data
import torch.utils.data.distributed
from models.ST_Former import GenerateModel
from dataloader.dataset_NIA import train_data_loader, test_data_loader

In [2]:
# Build the model imported from models.ST_Former and wrap it in DataParallel.
# The target device is chosen from what the runtime actually offers instead of
# assuming CUDA, and the model is moved exactly once (the former extra
# model.to('cuda') after .cuda() was a no-op).
model_device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = torch.nn.DataParallel(GenerateModel()).to(model_device)

# Reflects the device the model was really placed on (True on a CUDA machine,
# matching the previous hard-coded value there).
train_on_gpu = model_device == 'cuda'

In [3]:
# Report the CUDA / cuDNN environment visible to this kernel.
use_cuda = torch.cuda.is_available()
if use_cuda:
    print('__CUDNN VERSION:', torch.backends.cudnn.version())
    print('__Number CUDA Devices:', torch.cuda.device_count())
    print('__CUDA Device Name:',torch.cuda.get_device_name(0))
    print('__CUDA Device Total Memory [GB]:',torch.cuda.get_device_properties(0).total_memory/1e9)
else:
    # Per-device queries (get_device_name(0), get_device_properties(0)) raise
    # when no CUDA device is usable, so only report what is safe to query here.
    print("Can't use CUDA")
    print('__Number CUDA Devices:', torch.cuda.device_count())

__CUDNN VERSION: 8500
__Number CUDA Devices: 1
__CUDA Device Name: NVIDIA GeForce RTX 3090
__CUDA Device Total Memory [GB]: 25.438322688


## Optimizer

In [4]:
import torch.optim as optim
import matplotlib.pyplot as plt 

## FastAI training

In [5]:
from fastai.vision.all import *
import torchvision

import fastai
from fastai.optimizer import OptimWrapper

from fastai.data.core import DataLoaders
from fastai.learner import Learner
from fastai.callback.progress import ProgressCallback

In [6]:
import datetime
import wandb
from fastai.callback.wandb import *

In [14]:
# Run configuration shared by the cells below.
batch_size = 16    # samples per batch for both DataLoaders
num_workers = 0    # DataLoader worker processes
data_set = 0       # split index passed to the dataset builders and embedded in output file names
valid_size = 0.2   # NOTE(review): not referenced by any cell visible in this notebook

In [15]:
# Output locations for this run. Every file name shares a "<timestamp>set<N>"
# tag so that logs and checkpoints from the same run can be matched up.
now = datetime.datetime.now()
time_str = now.strftime("[%m-%d]-[%H:%M]-")
project_path = '/media/di/data/lee/nia/Former-DFER/nia/data/'

run_tag = f"{time_str}set{data_set}"
log_dir = f"{project_path}log/"
checkpoint_dir = f"{project_path}checkpoint/"

log_txt_path = f"{log_dir}{run_tag}-log.txt"
log_curve_path = f"{log_dir}{run_tag}-log.png"
checkpoint_path = f"{checkpoint_dir}{run_tag}-model.pth"
best_checkpoint_path = f"{checkpoint_dir}{run_tag}-model_best.pth"

In [16]:
# Cross-entropy classification loss, with .cuda() applied as in the model setup.
loss_module = nn.CrossEntropyLoss()
criterion = loss_module.cuda()

In [17]:
# Build the train/test datasets for the selected split, then wrap each one in a
# PyTorch DataLoader. Settings common to both are collected once.
dataset_args = dict(project_dir=project_path, data_set=data_set)
train_data = train_data_loader(**dataset_args)
test_data = test_data_loader(**dataset_args)

shared_loader_args = dict(batch_size=batch_size,
                          num_workers=num_workers,
                          pin_memory=True)

# Training batches are shuffled and the last incomplete batch is dropped.
train_loader = torch.utils.data.DataLoader(train_data,
                                           shuffle=True,
                                           drop_last=True,
                                           **shared_loader_args)
# Evaluation keeps the dataset order and every sample.
test_loader = torch.utils.data.DataLoader(test_data,
                                          shuffle=False,
                                          **shared_loader_args)

video number:2822
video number:806


In [19]:
# Start a Weights & Biases run in the 'nia' project; the Learner created later
# in this notebook is given a WandbCallback, which presumably logs to this run.
wandb.init(project='nia')

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666947233316023, max=1.0)…

In [20]:
# Assemble the fastai Learner: the two PyTorch loaders are used as the
# train/validation pair, with cross-entropy loss, accuracy as the reported
# metric, and Weights & Biases logging.
criterion = nn.CrossEntropyLoss()
dls = DataLoaders(train_loader, test_loader)

learn = Learner(
    dls,
    model,
    loss_func=criterion,
    metrics=accuracy,
    cbs=WandbCallback(),
)

In [22]:
# Train with fastai's fine_tune schedule for 200 epochs. The recorded output
# shows two progress tables: a single preliminary epoch followed by the main run.
# NOTE(review): the recorded main table stops at epoch 9 — presumably the run
# was interrupted or the output truncated; confirm before relying on these numbers.
learn.fine_tune(200)

Could not gather input dimensions


epoch,train_loss,valid_loss,accuracy,time
0,1.876085,1.999463,0.19603,01:17


Could not gather input dimensions


epoch,train_loss,valid_loss,accuracy,time
0,1.607912,1.656329,0.212159,01:17
1,1.598132,1.615565,0.241935,01:16
2,1.584327,1.591616,0.230769,01:17
3,1.577623,1.617465,0.224566,01:17
4,1.580426,1.616443,0.227047,01:17
5,1.575541,1.590098,0.246898,01:17
6,1.558449,1.601376,0.220844,01:17
7,1.564648,1.620848,0.213399,01:17
8,1.562739,1.594287,0.240695,01:17
9,1.565897,1.632263,0.251861,01:17


# Save checkpoint

In [28]:
# Save the model weights together with the optimizer state to test.sav.
# The file is opened in a with-block so the handle is flushed and closed as
# soon as the checkpoint has been written (previously it was left open).
with open("test.sav", "wb") as checkpoint_file:
    learn.save(checkpoint_file, with_opt=True)

<_io.BufferedWriter name='test.sav'>

In [38]:
# Save only the model weights. state_dict has to be *called*: passing the bound
# method pickles the method (and the model object it is bound to) rather than a
# plain name -> tensor mapping that load_state_dict can consume.
# The model is wrapped in DataParallel, so the saved keys carry its "module."
# prefix; load them into a DataParallel-wrapped model.
torch.save(learn.model.state_dict(), "test.pth")

In [35]:
# NOTE(review): without call parentheses this only displays the bound method's
# repr (which prints the whole model structure); use learn.parameters() to get
# the parameters themselves.
learn.parameters

<bound method Module.parameters of DataParallel(
  (module): GenerateModel(
    (s_former): ResNet(
      (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (layer1): Sequential(
        (0): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu): ReLU(inplace=True)
          (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (1): BasicBlock(
          (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=

In [34]:
# IPython help lookup for load_state_dict (interactive aid only; the trailing
# "?" is IPython syntax, not plain Python).
learn.load_state_dict?

[0;31mSignature:[0m [0mlearn[0m[0;34m.[0m[0mload_state_dict[0m[0;34m([0m[0mstate_dict[0m[0;34m:[0m [0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m,[0m [0mstrict[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mTrue[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Copies parameters and buffers from :attr:`state_dict` into
this module and its descendants. If :attr:`strict` is ``True``, then
the keys of :attr:`state_dict` must exactly match the keys returned
by this module's :meth:`~torch.nn.Module.state_dict` function.

Args:
    state_dict (dict): a dict containing parameters and
        persistent buffers.
    strict (bool, optional): whether to strictly enforce that the keys
        in :attr:`state_dict` match the keys returned by this module's
        :meth:`~torch.nn.Module.state_dict` function. Default: ``True``

Returns:
    ``NamedTuple`` with ``missing_keys`` and ``unexpected_keys`` fields:
        * **missing_keys

In [39]:
# List the working directory, newest first, to check the checkpoint files
# written by the save cells (runs as a shell command via IPython automagic).
ls -laht

total 4.0G
-rw-rw-r--  1 di di  69M 11월  9 13:35 test.pth
drwxrwxr-x 14 di di 4.0K 11월  9 13:35 [0m[01;34m.[0m/
-rw-rw-r--  1 di di  75K 11월  9 13:33 train_nia_fastai.ipynb
-rw-rw-r--  1 di di 207M 11월  9 13:31 test.sav
-rw-rw-r--  1 di di 876K 11월  9 10:00 prepare_dataset_new.ipynb
drwxrwxr-x 40 di di 4.0K 11월  8 15:56 [01;34mwandb[0m/
drwxrwxr-x  2 di di 4.0K 11월  8 12:25 [01;34m.ipynb_checkpoints[0m/
-rw-rw-r--  1 di di  11K 11월  8 10:02 [01;31mmodel.tar.gz[0m
drwxrwxr-x  7 di di 4.0K 11월  8 10:01 [01;34m..[0m/
-rw-rw-r--  1 di di  39K 11월  8 09:09 CIFAR10_ResNet9_train.ipynb
drwxrwxr-x  2 di di 4.0K 11월  8 09:07 [01;34mdata[0m/
-rw-rw-r--  1 di di 432K 11월  7 20:39 train_nia_new.ipynb
-rw-rw-r--  1 di di 5.3K 11월  7 09:54 Make_datasets_new.ipynb
drwxrwxr-x  4 di di 4.0K 11월  7 09:45 [01;34mmodels[0m/
drwxrwxr-x  4 di di 4.0K 11월  4 15:52 [01;34mdataloader[0m/
-rw-rw-r--  1 di di  26K 11월  3 15:03 train_nia.ipynb
drwxrwxr-x  2 di di 4.0K 11월  3 15:00 [01;34m__pycac