필요한 라이브러리 불러옴

In [1]:
import os
import cv2
import json  # json 파일 다루는 라이브러리
import time
import random
import logging
import easydict  ###
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm  # 프로그램 진행사항 보여주는 라이브러리
from glob import glob 
from pathlib import Path  ###
from natsort import natsorted 
from os.path import join as opj  ###
from ptflops import get_model_complexity_info  ###
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold  ###
from PIL import Image  ##

import timm  ###
import torch
import torch.nn as nn
import torch_optimizer as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, grad_scaler
from torchvision import transforms

import warnings
warnings.filterwarnings("ignore")  ###

모델 및 학습의 하이퍼파라미터를 정의한다.

In [2]:
# Hyperparameters of the model and of the training run, exposed as attributes
# (args.img_size, args.epochs, ...) through EasyDict.
args = easydict.EasyDict(dict(
    # experiment bookkeeping
    exp_num='0',
    experiment='Base',
    tag='Default',

    # paths and fold selection
    data_path='../data',
    fold=4,            # fold index held out for validation
    Kfold=5,
    model_path='results/',

    # model parameters
    encoder_name='regnety_040',
    drop_path_rate=0.2,

    # basic training parameters
    img_size=288,
    batch_size=16,
    epochs=60,
    optimizer='Lamb',
    initial_lr=5e-6,
    weight_decay=1e-3,

    # augmentation
    aug_ver=2,
    flipaug_ratio=0.3,
    margin=50,
    random_margin=True,

    # scheduler ('cos' -> cosine annealing, 'cycle' -> OneCycleLR)
    scheduler='cycle',
    warm_epoch=5,
    min_lr=5e-6,       # cosine annealing: eta_min
    tmax=145,          # cosine annealing: T_max
    max_lr=1e-3,       # OneCycleLR: peak learning rate

    # other training parameters
    patience=50,       # early-stopping patience (epochs without improvement)
    clipping=None,     # gradient-norm clipping threshold; None disables it

    # hardware
    amp=True,
    multi_gpu=False,
    logging=False,
    num_workers=4,
    seed=42,
))

data를 불러오고 train의 성능을 높이기 위하여 augmentation을 사용하여 data를 다양하게 만든다.  
augmentation:이미지 회전과 같은 변환을 적용하여 training data의 다양성을 증가시키는 기술

In [3]:
# Crop an image around its keypoints.
# The cropped image is what is fed to the network, both for training and for testing.
def crop_image(imgs, point, margin=100):
  """Load an image and crop it to the bounding box of its keypoints.

  Args:
    imgs: image path (anything accepted by PIL ``Image.open``).
    point: mapping whose ``'data'`` entry is a sequence of ``[x, y, z]``
      keypoints; only x and y are used for the crop.
    margin: number of pixels added around the keypoint bounding box. The
      bottom edge receives the margin twice so that the wrist is included.

  Returns:
    The cropped region as an RGB ``numpy`` array.
  """
  # Close the file handle as soon as the pixels have been copied out.
  with Image.open(imgs) as pil_image:
    image = np.array(pil_image.convert('RGB'))
  height, width = image.shape[:2]

  coords = np.array(point['data'])
  # [:-1] drops the trailing z column and keeps (x, y)
  max_x, max_y = (np.max(coords, axis=0).astype(int) + margin)[:-1]
  min_x, min_y = (np.min(coords, axis=0).astype(int) - margin)[:-1]
  max_y += margin  # extra room at the bottom so the wrist is visible

  # Clamp the box to the actual image size instead of assuming 1920x1080.
  # The lower clamp matters: a negative start index would wrap around.
  max_x = min(max_x, width)
  max_y = min(max_y, height)
  min_x = max(min_x, 0)
  min_y = max(min_y, 0)

  return image[min_y:max_y, min_x:max_x]

In [5]:
# Build the dataframe used by the data loader: one row per training image.
train_path = '../data/train'
# natsorted sorts names that contain numbers in numeric order ('2' before '10')
train_folders = natsorted(glob(train_path + '/*'))

answers = []
for train_folder in train_folders:
  json_files = glob(train_folder + '/*.json')
  if not json_files:
    # Folders without an annotation file (e.g. the augmentation folders
    # created further below) carry no label information; skip them instead
    # of failing with an IndexError when this cell is run again.
    continue
  with open(json_files[0]) as f:
    js = json.load(f)
  # 'action' holds the class id first and the class name second
  cat = js.get('action')[0]
  cat_name = js.get('action')[1]

  images_list = glob(train_folder + '/*.png')
  for image_name in images_list:
    answers.append([image_name, cat, cat_name])  # [image path, class id, class name]

answers = pd.DataFrame(answers, columns = ['train_path', 'answer', 'answer_name'])
answers.to_csv('../data/df_train.csv', index=False)

# Classes that own a single sample folder get extra, generated images and the
# dataframe is extended with them.
## Training on the extended dataframe gave a small accuracy improvement.
seed = 42
os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)
np.random.seed(seed)

data_path = '../data'
df_train = pd.read_csv(opj(data_path, 'df_train.csv'))
df_info = pd.read_csv(opj(data_path, 'hand_gesture_pose.csv'))
df_train = df_train.merge(df_info[['pose_id', 'gesture_type', 'hand_type']],
                          how='left', left_on='answer', right_on='pose_id')
save_folder = 'train'
# output folders 649..653 continue the numbering after train folder 648
for i in range(649, 649+5):
  os.makedirs(opj(data_path, save_folder, str(i)), exist_ok=True)

## Labels that can be produced by flipping another class: 131 and 47 (one sample each)
oslabel_fliplabel = [(131,156), (47,22)]  # (one-sample label, label whose images are flipped)
folders = ['649', '650']
for label, folder in tqdm(zip(oslabel_fliplabel, folders)):
  idx = 0
  os_label, f_label = label
  one_sample = df_train[df_train['answer']==os_label].reset_index(drop=True)
  temp = df_train[df_train['answer']==f_label].reset_index(drop=True)
  # Folder of every image of the flip-source class. dirname is used rather
  # than slicing a fixed number of characters, which breaks for file names
  # of a different length.
  train_folders = natsorted(temp['train_path'].apply(os.path.dirname).unique())
  for train_folder in train_folders:
    json_path = glob(train_folder + '/*.json')[0]
    with open(json_path) as f:
      js = json.load(f)
    keypoints = js['annotations']
    images_list = natsorted(glob(train_folder + '/*.png'))
    for point, image_name in zip(keypoints, images_list):
      cropped_image = crop_image(image_name, point, margin=50)
      flip_img = cv2.flip(cropped_image, 1)  # horizontal flip
      save_path = opj(data_path, save_folder, folder, f'{idx}.png')
      idx += 1
      # crop_image returns RGB while cv2.imwrite expects BGR; convert so the
      # saved file keeps the original colours.
      cv2.imwrite(save_path, cv2.cvtColor(flip_img, cv2.COLOR_RGB2BGR))
      # the new row reuses every column of the one-sample class except the path
      df_train.loc[len(df_train)] = [save_path] + one_sample.iloc[0][1:].values.tolist()

def rotation(imgm, angle):
  """Rotate an image about its centre by a random angle.

  Args:
    imgm: image as a ``numpy`` array whose first two axes are (height, width).
    angle: bound of the rotation; the applied angle is drawn uniformly from
      [-angle, angle] degrees and truncated to an integer.

  Returns:
    The rotated image, same width and height as the input.
  """
  angle = int(random.uniform(-angle, angle))
  h, w = imgm.shape[:2]
  # getRotationMatrix2D takes (center, angle, scale) as three separate arguments
  M = cv2.getRotationMatrix2D((int(w/2), int(h/2)), angle, 1)
  return cv2.warpAffine(imgm, M, (w, h))

# One-sample classes that cannot be produced by flipping: generate randomly
# rotated copies of their own images instead.
oslabel = [92, 188, 145]
rotation_folders = ['651', '652', '653']
for label, folder in tqdm(zip(oslabel, rotation_folders)):
  idx = 0
  one_sample = df_train[df_train['answer']==label].reset_index(drop=True)
  # Source folders are those of the one-sample class itself (not the leftover
  # `temp` frame of the flip loop).
  train_folders = natsorted(one_sample['train_path'].apply(os.path.dirname).unique())
  for train_folder in train_folders:
    json_path = glob(train_folder + '/*.json')[0]
    with open(json_path) as f:
      js = json.load(f)
    keypoints = js['annotations']
    images_list = natsorted(glob(train_folder+'/*.png'))
    for point, image_name in zip(keypoints, images_list):
      cropped_image = crop_image(image_name, point, margin=50)
      aug_img = rotation(cropped_image, 30)
      save_path = opj(data_path, save_folder, folder, f'{idx}.png')
      idx += 1
      # crop_image returns RGB while cv2.imwrite expects BGR; convert so the
      # saved file keeps the original colours.
      cv2.imwrite(save_path, cv2.cvtColor(aug_img, cv2.COLOR_RGB2BGR))
      df_train.loc[len(df_train)] = [save_path] + one_sample.iloc[0][1:].values.tolist()

# write the extended dataframe once, after all classes have been processed
df_train.to_csv('../data/df_train_add.csv', index=False)

OSError: Cannot save file into a non-existent directory: '../data'

data를 train할때 사용할 함수 정의

In [None]:
# In train folders 475 and 543 keypoints were also detected on the other,
# unintended hand. Those keypoints are removed before the json keypoints are used.
def remove_keypoints(folder_num, points):
  """Drop keypoints that lie in the bottom-left or top-right image corner.

  Args:
    folder_num: folder name of the sample (not used by the filter itself).
    points: iterable of ``[x, y, z]`` keypoints.

  Returns:
    List of the ``[x, y, z]`` keypoints that were kept, in input order.
  """
  kept = []
  for x, y, z in points:
    in_bottom_left = x < 250 and y > 800   # large y = low in the image
    in_top_right = x > 1400 and y < 400
    if not (in_bottom_left or in_top_right):
      kept.append([x, y, z])
  return kept

class Train_Dataset(Dataset):
  """Dataset that crops each image around its hand keypoints.

  Each item is ``(transformed image, target as numpy array)``. Images whose
  folder has no annotation json (the pre-cropped, generated images) are
  returned without cropping.

  Args:
    df: dataframe with the columns ``train_path`` and ``answer``.
    transform: callable applied to the PIL image of every item.
    df_flip_info: dataframe with the columns ``left`` and ``right`` that pairs
      mirrored classes; ``None`` disables flip augmentation.
    flipaug_ratio: probability of flipping an image of a flippable class.
    label_encoder: encoder whose ``transform`` maps the ids of
      ``df_flip_info`` to the encoded targets (needed with ``df_flip_info``).
    margin: crop margin in pixels around the keypoints.
    random_margin: if True each side randomly gets an additional ``margin``;
      if False every side always gets it.
  """
  def __init__(self, df, transform=None, df_flip_info=None, flipaug_ratio=0, label_encoder=None, margin=50, random_margin=True):
    self.id = df['train_path'].values
    self.target = df['answer'].values
    self.transform = transform
    self.margin = margin
    self.random_margin = random_margin

    # Defaults, so that the attributes exist when flip augmentation is off.
    self.use_flip = False
    self.flip_info = {}
    self.flip_possible_class = []

    # Flip augmentation (mirroring an image also changes its target class)
    if df_flip_info is not None:
      self.use_flip = True
      print('Use Flip Augmentation')
      left = label_encoder.transform(df_flip_info['left'])
      right = label_encoder.transform(df_flip_info['right'])
      left_to_right = dict(zip(left, right))
      right_to_left = dict(zip(right, left))

      self.flip_info = left_to_right.copy()
      self.flip_info.update(right_to_left)
      self.flip_possible_class = list(set(np.concatenate([left, right])))
    self.flipaug_ratio = flipaug_ratio

    print(f'Dataset size:{len(self.id)}')

  def __len__(self):
    return len(self.id)

  def __getitem__(self, idx):
    with Image.open(self.id[idx]) as pil_image:
      image = np.array(pil_image.convert('RGB'))
    target = self.target[idx]

    # Load the annotation json that sits next to the image.
    try:
      image_num = int(Path(self.id[idx]).stem)
      folder_dir = os.path.dirname(self.id[idx])
      folder_num = os.path.basename(folder_dir)
      json_path = opj(folder_dir, folder_num + '.json')
      with open(json_path) as f:
        js = json.load(f)
    except (OSError, ValueError):
      # Folders generated by the augmentation step have no json; their
      # images are already cropped to the hand, so return them as they are.
      image = self.transform(Image.fromarray(image))
      return image, np.array(target)

    keypoints = js['annotations'][image_num]['data']  # keypoints of this image

    if folder_num in ['475', '543']:
      keypoints = remove_keypoints(folder_num, keypoints)

    # Crop the image with the keypoint bounding box.
    coords = np.array(keypoints)
    max_point = np.max(coords, axis=0).astype(int) + self.margin
    min_point = np.min(coords, axis=0).astype(int) - self.margin
    max_x, max_y = max_point[:-1]  # drop z
    min_x, min_y = min_point[:-1]  # drop z
    max_y += 100  # leave room for the wrist

    # A slightly different margin on every draw helps against overfitting on
    # the near-identical images of one folder (train only).
    if self.random_margin:
      if random.random() < 0.5:
        max_x += self.margin
      if random.random() < 0.5:
        max_y += self.margin
      if random.random() < 0.5:
        min_x -= self.margin
      if random.random() < 0.5:
        min_y -= self.margin
    else:
      max_x += self.margin
      max_y += self.margin
      min_x -= self.margin
      min_y -= self.margin

    # The box must stay inside the image; a negative start would wrap around.
    height, width = image.shape[:2]
    max_x = min(max_x, width)
    max_y = min(max_y, height)
    min_x = max(min_x, 0)
    min_y = max(min_y, 0)

    image = image[min_y:max_y, min_x:max_x]

    # Flip augmentation
    if (random.random() < self.flipaug_ratio) and (target in self.flip_possible_class):
      # flip along the width axis of (H, W, C); copy to get a contiguous array
      image = np.ascontiguousarray(np.flip(image, axis=1))
      target = self.flip_info[target]

    image = self.transform(Image.fromarray(image))
    return image, np.array(target)

def get_loader(df, batch_size, shuffle, num_workers, transform, df_flip_info=None,
               flipaug_ratio=0, label_encoder=None, margin=50, random_margin=True):
  """Wrap ``df`` in a ``Train_Dataset`` and return a ``DataLoader`` over it.

  ``df``, ``transform``, ``df_flip_info``, ``flipaug_ratio``, ``label_encoder``,
  ``margin`` and ``random_margin`` are forwarded to ``Train_Dataset``;
  ``batch_size``, ``shuffle`` and ``num_workers`` go to the ``DataLoader``,
  which keeps the last incomplete batch and uses pinned memory.
  """
  dataset = Train_Dataset(df, transform, df_flip_info=df_flip_info, flipaug_ratio=flipaug_ratio,
                          label_encoder=label_encoder, margin=margin, random_margin=random_margin)
  data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=num_workers,
                           pin_memory=True, drop_last=False)
  return data_loader

def get_train_augmentation(img_size, ver):
  """Return the torchvision transform pipeline for the given version.

  Args:
    img_size: side length of the square output image.
    ver: 1 for the deterministic test/validation pipeline, 2 for the
      training pipeline with random affine and perspective augmentation.

  Returns:
    A ``transforms.Compose`` object.

  Raises:
    ValueError: if ``ver`` is neither 1 nor 2.
  """
  if ver == 1:
    # For test
    return transforms.Compose([
        transforms.Resize((img_size, img_size)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
    ])
  elif ver == 2:
    # For train
    return transforms.Compose([
        transforms.RandomAffine(20),
        transforms.RandomPerspective(),
        transforms.ToTensor(),
        transforms.Resize((img_size, img_size)),
        transforms.Normalize(mean=[0.485, 0.456, 0.406],
                        std=[0.229, 0.224, 0.225]),
    ])
  else:
    # Fail with a clear message instead of an UnboundLocalError.
    raise ValueError(f'Unsupported augmentation version: {ver!r} (expected 1 or 2)')

Network(모델) 정의:  
pytorch image models(timm) 라이브러리를 활용하여 generalization performance에 강점을 가지는 RegNet을 base 모델로 사용하였다.

In [None]:
class Pose_Network(nn.Module):
  """timm encoder whose classification head is replaced by a new linear layer.

  Args:
    args: object with the attributes ``encoder_name`` (timm model name) and
      ``drop_path_rate``.
    num_classes: number of output classes; defaults to the 157 classes this
      notebook works with.
  """
  def __init__(self, args, num_classes=157):
    super().__init__()
    self.encoder = timm.create_model(args.encoder_name, pretrained=True,
                                     drop_path_rate=args.drop_path_rate)
    # assumes the chosen timm model exposes its classifier as `head.fc`
    in_features = self.encoder.head.fc.in_features
    self.encoder.head.fc = nn.Linear(in_features, num_classes)

  def forward(self, x):
    return self.encoder(x)

logging과 AvgMeter를 통해 실험 기록이 log 파일로 남도록 하였다. 추가로 실험 간 비교를 쉽게 하기 위해 neptune을 활용하였는데, 이 코드에서는 제거하였다.

In [None]:
# warmup learning rate scheduler
from torch.optim.lr_scheduler import _LRScheduler
class WarmUpLR(_LRScheduler):
    """Linear learning-rate warm-up scheduler.

    Args:
        optimizer: wrapped optimizer (e.g. SGD)
        total_iters: number of iterations of the warm-up phase
        last_epoch: index of the last step, forwarded to the base class
    """

    def __init__(self, optimizer, total_iters, last_epoch=-1):
        # set before the base constructor runs, since get_lr() reads it
        self.total_iters = total_iters
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        """Return base_lr * m / total_iters for every group, m being the
        number of steps taken so far (the epsilon avoids division by zero).
        """
        step = self.last_epoch
        denom = self.total_iters + 1e-8
        return [lr * step / denom for lr in self.base_lrs]

# Logging
def get_root_logger(logger_name='basicsr',
                    log_level=logging.INFO,
                    log_file=None):
    """Return the logger ``logger_name``, configuring it on first use.

    Args:
        logger_name: name passed to ``logging.getLogger``.
        log_level: level for the basic configuration and the file handler.
        log_file: optional path; when given, a file handler that overwrites
            this file is attached to the logger.

    Returns:
        The configured ``logging.Logger``. A logger that already carries
        handlers of its own is returned unchanged.
    """
    logger = logging.getLogger(logger_name)
    # Only the logger's OWN handlers mark it as initialised. hasHandlers()
    # also reports handlers of ancestors, so a pre-configured root logger
    # would make this function return before the file handler is added.
    if logger.handlers:
        return logger

    format_str = '%(asctime)s %(levelname)s: %(message)s'
    logging.basicConfig(format=format_str, level=log_level)

    if log_file is not None:
        file_handler = logging.FileHandler(log_file, 'w')
        file_handler.setFormatter(logging.Formatter(format_str))
        file_handler.setLevel(log_level)
        logger.addHandler(file_handler)

    return logger

class AvgMeter(object):
    """Keeps the latest value, the weighted running sum/count/average and the
    history of all values passed to ``update``."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear all statistics."""
        self.val = self.avg = self.sum = self.count = 0
        self.losses = []

    def update(self, val, n=1):
        """Record ``val`` observed over ``n`` samples and refresh the average."""
        self.losses.append(val)
        self.val = val
        self.count += n
        self.sum += val * n
        self.avg = self.sum / self.count


Trainer  
모델의 학습(training function)과 검증(validation)을 위한 class

In [None]:
class Trainer():
  """Runs one complete training experiment for a single validation fold.

  All of the work happens in the constructor: it builds the data loaders,
  the model, the optimizer and the schedulers, then trains and validates for
  ``args.epochs`` epochs (with early stopping) and stores the weights with
  the lowest validation loss in ``save_path/best_model.pth``.
  """
  def __init__(self, args, save_path):
    """
    args: arguments (hyperparameter container with attribute access)
    save_path: directory for the log file and the best model weights
    """
    super(Trainer, self).__init__()
    self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Logging
    log_file = os.path.join(save_path, 'log.log')
    self.logger = get_root_logger(logger_name='IR', log_level=logging.INFO, log_file=log_file)
    self.logger.info(args)
    self.logger.info(args.tag)

    # load train, valid set
    df_train = pd.read_csv(opj(args.data_path, 'df_train_add.csv'))
    df_info = pd.read_csv(opj(args.data_path, 'hand_gesture_pose.csv'))

    df_train = df_train.merge(df_info[['pose_id', 'gesture_type', 'hand_type']],
                              how='left', left_on='answer', right_on='pose_id')
    # group id of every image = name of its folder
    df_train['groups'] = df_train['train_path'].apply(lambda x:x.split('/')[3])
    df_train.loc[:,:] = natsorted(df_train.values)
    # Remove noisy images: folder 596 (fist) contains 5 images showing an
    # open palm; image 9 of folder 0 is noise as well.
    drop_idx = df_train[df_train['groups'].isin(['596'])].index.tolist()[3:8] + [9]
    df_train = df_train.drop(drop_idx).reset_index(drop=True)
    le = LabelEncoder()
    df_train['answer'] = le.fit_transform(df_train['answer'])

    # Split into folds, so that badly predicted classes can be analysed.
    # StratifiedKFold shuffles the data and keeps the class ratio in every split.
    kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=args.seed)
    for fold, (train_idx, val_idx) in enumerate(kf.split(df_train, y=df_train['answer'])):
      df_train.loc[val_idx, 'fold'] = fold
    df_val = df_train[df_train['fold']==args.fold].reset_index(drop=True)
    df_train = df_train[df_train['fold']!=args.fold].reset_index(drop=True)

    # Augmentation
    self.train_transform = get_train_augmentation(img_size=args.img_size, ver=args.aug_ver)
    self.test_transform = get_train_augmentation(img_size=args.img_size, ver=1)

    #########################################################################################
    # Mapping dataframe for flip augmentation (read from the configured data path)
    df_info = pd.read_csv(opj(args.data_path, 'hand_gesture_pose.csv'))
    df_info = df_info[df_info['hand_type'] != 'both']
    # Rows with identical pose name, gesture_type and hand_type but different
    # classes exist -> rename them so that they stay separate groups.
    df_info.loc[[105, 128], 'pose_name'] = '약속 1'
    df_info.loc[[101, 124], 'pose_name'] = '약속 2'

    # Row 41: the same pose / my hand / right combination exists as class 49
    # and 54, which breaks the mapping below, so the class-49 row is dropped.
    df_info = df_info.drop(41)

    # Make a mapping dataframe
    df_info = df_info.groupby(['pose_name', 'view_type', 'gesture_type', 'hand_type']).sum().unstack().reset_index().dropna(axis=0)
    df_info['left'] = df_info.pose_id.left.apply(int)
    df_info['right'] = df_info.pose_id.right.apply(int)
    df_flip_info = df_info.drop('pose_id', axis=1).droplevel('hand_type', axis=1).reset_index(drop=True)
    print('Mapping dataframe Length', df_flip_info.shape)

    # TrainLoader #
    self.train_loader = get_loader(df_train, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers, transform=self.train_transform,
                                   df_flip_info=df_flip_info, flipaug_ratio=args.flipaug_ratio, label_encoder=le, margin=args.margin, random_margin=args.random_margin)
    self.val_loader = get_loader(df_val, batch_size=args.batch_size, shuffle=False,
                                   num_workers=args.num_workers, transform=self.test_transform)

    # Network #
    self.model = Pose_Network(args).to(self.device)
    macs, params = get_model_complexity_info(self.model, (3, args.img_size, args.img_size), as_strings=True,
                                             print_per_layer_stat=False, verbose=False)
    self.logger.info('{:<30} {:<8}'.format('Computational complexity: ', macs))
    self.logger.info('{:<30} {:<8}'.format('Number of parameters: ', params))

    # Loss #
    self.criterion = nn.CrossEntropyLoss()

    # Optimizer & Scheduler #
    self.optimizer = optim.Lamb(self.model.parameters(), lr=args.initial_lr, weight_decay=args.weight_decay)

    iter_per_epoch = len(self.train_loader)
    self.warmup_scheduler = WarmUpLR(self.optimizer, iter_per_epoch*args.warm_epoch)

    if args.scheduler == 'cos':
      tmax = args.tmax
      self.scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=tmax, eta_min=args.min_lr, verbose=True)
    elif args.scheduler == 'cycle':
      self.scheduler = torch.optim.lr_scheduler.OneCycleLR(self.optimizer, max_lr=args.max_lr, steps_per_epoch=iter_per_epoch, epochs=args.epochs)
    else:
      # Without a scheduler the checkpoint code below would fail much later.
      raise ValueError(f"Unsupported scheduler: {args.scheduler!r} (expected 'cos' or 'cycle')")

    if args.multi_gpu:
      self.model = nn.DataParallel(self.model).to(self.device)

    # Train / validate
    best_loss = np.inf
    best_acc = 0
    best_epoch = 0  # stays 0 if the validation loss never improves
    early_stopping = 0
    start = time.time()
    for epoch in range(1, args.epochs+1):
      self.epoch = epoch

      # the cosine scheduler steps once per epoch, after the warm-up epochs
      if args.scheduler == 'cos':
        if epoch > args.warm_epoch:
          self.scheduler.step()

      # Training
      train_loss, train_acc = self.training(args)

      # Model weight in multiple gpu or single gpu
      state_dict = self.model.module.state_dict() if args.multi_gpu else self.model.state_dict()

      # validation
      val_loss, val_acc = self.validate()

      # save models
      if val_loss < best_loss:
        early_stopping = 0
        best_epoch = epoch
        best_loss = val_loss
        best_acc = val_acc

        torch.save({'epoch': epoch,
                    'state_dict':state_dict,
                    'optimizer':self.optimizer.state_dict(),
                    'scheduler':self.scheduler.state_dict(),
                    }, os.path.join(save_path, 'best_model.pth'))
        self.logger.info(f'------------------SAVE:{best_epoch}epoch-------------------')
      else:
        early_stopping += 1

      # early stopping
      if early_stopping == args.patience:
        break

    self.logger.info(f'\nBest Val Epoch:{best_epoch} | Val Loss:{best_loss:.4f} | Val Acc{best_acc:.4f}')
    end = time.time()
    self.logger.info(f'Total Process time:{(end-start) / 60:.3f}Minute')

  # Training
  def training(self, args):
    """Train for one epoch; returns (average loss, accuracy) over the train set."""
    self.model.train()
    train_loss = AvgMeter()
    train_acc = 0

    scaler = grad_scaler.GradScaler()
    for i, (images, targets) in enumerate(tqdm(self.train_loader)):
      # .to() moves/casts the batch without the extra copy of torch.tensor(tensor)
      images = images.to(self.device, dtype=torch.float32)
      targets = targets.to(self.device, dtype=torch.long)

      # the warm-up scheduler steps once per iteration during the warm-up epochs
      if self.epoch <= args.warm_epoch:
        self.warmup_scheduler.step()

      self.model.zero_grad(set_to_none=True)
      if args.amp:
        with autocast():
          preds = self.model(images)
          loss = self.criterion(preds, targets)
        scaler.scale(loss).backward()

        # Gradient clipping (gradients must be unscaled first)
        if args.clipping is not None:
          scaler.unscale_(self.optimizer)
          nn.utils.clip_grad_norm_(self.model.parameters(), args.clipping)

        scaler.step(self.optimizer)
        scaler.update()

      else:
        preds = self.model(images)
        loss = self.criterion(preds, targets)
        loss.backward()
        # clip only when a threshold is configured (clipping=None disables it)
        if args.clipping is not None:
          nn.utils.clip_grad_norm_(self.model.parameters(), args.clipping)
        self.optimizer.step()

      # the one-cycle scheduler steps once per iteration, after the warm-up epochs
      if args.scheduler == 'cycle':
        if self.epoch >  args.warm_epoch:
          self.scheduler.step()

      # metric
      train_acc += (preds.argmax(dim=1) == targets).sum().item()

      # log
      train_loss.update(loss.item(), n=images.size(0))

    train_acc /= len(self.train_loader.dataset)

    self.logger.info(f'Epoch:[{self.epoch:03d}/{args.epochs:03d}]')
    self.logger.info(f'Train Loss:{train_loss.avg:.3f} | Acc{train_acc:.4f}')
    return train_loss.avg, train_acc

  # validation or dev
  def validate(self):
    """Evaluate on the validation loader; returns (average loss, accuracy)."""
    self.model.eval()
    with torch.no_grad():
      val_loss = AvgMeter()
      val_acc = 0

      for images, targets in self.val_loader:
        images = images.to(self.device, dtype=torch.float32)
        targets = targets.to(self.device, dtype=torch.long)

        preds = self.model(images)
        loss = self.criterion(preds, targets)

        # metric
        val_acc += (preds.argmax(dim=1) == targets).sum().item()

        # log
        val_loss.update(loss.item(), n=images.size(0))
      val_acc /= len(self.val_loader.dataset)

      self.logger.info(f'Valid Loss:{val_loss.avg:.3f} | Acc{val_acc:.4f}')
    return val_loss.avg, val_acc

위의 코드와 맨 마지막 cell의 main 함수가 주요 소스코드이고, main 함수 전까지의 아래 코드는 모델이 클래스를 잘못 예측하는 경우들에 대해 코드를 수정하여 모델의 정확도를 높이고자 한 것이다.

In [None]:
# case 1: 'number 1' and 'index-finger wagging' look alike and were classified
# with comparatively low accuracy.
## They are separated by a threshold on how much the x coordinate of the
## index finger changes over the frames of a folder.

# case 2: 'clenched fist' and 'fist thrust' look alike and were classified
# with low accuracy.
## They are separated by a threshold on how much the x coordinate of the
## right-most keypoint changes over the frames of a folder.

# Computes, per training folder, the amount of change used to derive those thresholds.
def check_stats(find_list, ver):
  """Collect the per-folder x-range statistic for the given classes.

  Args:
    find_list: class ids (first entry of the json ``'action'`` field) whose
      folders are evaluated.
    ver: 1 -> x of the keypoint with the smallest y in every frame (case 1);
      2 -> largest x among the keypoints of every frame (case 2).
      Any other value yields no statistics.

  Returns:
    List with one value (max - min over the frames) per matching folder, in
    natural folder order. The list is also printed.
  """
  train_path = '../data/train'
  train_folders = natsorted(glob(train_path + '/*'))
  stat_list = []
  for train_folder in tqdm(train_folders):
    try:
      json_path = glob(train_folder + '/*.json')[0]
      with open(json_path) as f:
        js = json.load(f)
      cat = js.get('action')[0]
      # (N frames, 21 or 42 keypoints, 3 coordinates x/y/z)
      keypoints = np.array([point['data'] for point in js['annotations']])
    except (IndexError, OSError, ValueError, KeyError, TypeError):
      # No usable annotation in this folder: skip it. Falling through would
      # reuse `cat` and `keypoints` of the previous folder.
      continue

    if cat not in find_list:
      continue

    if ver == 1:
      # case 1: the index finger is the keypoint with the smallest y value
      # (highest position in the image); take its x value in every frame.
      keypoints = keypoints[:, :, :2]  # (N, 21 or 42, 2)
      x_l_finger = [point_per_img[:, 0][point_per_img[:, 1].argmin()] for point_per_img in keypoints]
      stat_list.append(np.max(x_l_finger) - np.min(x_l_finger))

    elif ver == 2:
      # case 2: take the largest x value (right-most keypoint) of every frame.
      # There is no left wrist in these classes, so this works well.
      keypoints = keypoints[:, :, 0]  # x coordinates only
      x_values = [point_per_img[point_per_img.argmax()] for point_per_img in keypoints]
      stat_list.append(np.max(x_values) - np.min(x_values))

  print(stat_list)
  return stat_list

## Class ids (original pose ids) of the classes that get confused ##
############ ver1 #############
find_list0 = [0, 10, 100, 110]  # 'number 1' - my hand / your hand, left and right
find_list1 = [42, 67, 142, 167]  # 'negation (index-finger wagging)' - my hand / your hand, left and right

############ ver2 #############
find_list2 = [146]  # 'clenched fist' - your hand, right
find_list3 = [163]  # 'warning (fist thrust)' - your hand, right

find_list4 = [171]  # 'clenched fist' - your hand, both
find_list5 = [191]  # 'warning (fist thrust)' - your hand, both


# number 1 vs. index-finger wagging
li0 = check_stats(find_list0,1)  # number 1
li1 = check_stats(find_list1,1)  # negation (index-finger wagging); only printed by check_stats
threshold_ver1 = max(li0) + 5    # largest change seen for 'number 1' plus a margin of 5
print(f'\n{threshold_ver1:.3f}보다 크면 부정(검지 흔들기) 클래스')

# clenched fist vs. fist thrust, right hand
li2 = check_stats(find_list2,2)
li3 = check_stats(find_list3,2)  # only printed by check_stats
threshold_ver2 = max(li2) + 5       # margin of 5
print(f'\n{threshold_ver2:.3f}보다 크면 주먹 내밀기(right) 클래스')

# clenched fist vs. fist thrust, both hands
li4 = check_stats(find_list4,2)
li4 = li4[1:]                       # first entry (folder 596, change 218.313) is an outlier -> excluded
li5 = check_stats(find_list5,2)  # only printed by check_stats
threshold_ver2_both = max(li4) + 5  # margin of 5
print(f'\n{threshold_ver2_both:.3f}보다 크면 주먹 내밀기(both) 클래스')


Main function

In [None]:
def main(args):
  """Seed all random number generators and run one training experiment.

  Args:
    args: hyperparameter container; reads ``seed``, ``model_path`` and
      ``exp_num`` here and is passed on to ``Trainer``.
  """
  # random seed
  seed = args.seed
  os.environ['PYTHONHASHSEED'] = str(seed)
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  # the misspelled attribute name was accepted silently and had no effect
  torch.backends.cudnn.benchmark = True

  # results/<exp_num zero-padded to 3 digits>
  save_path = os.path.join(args.model_path, (args.exp_num).zfill(3))
  # create model directory
  os.makedirs(save_path, exist_ok=True)
  Trainer(args, save_path)

# Train one model per validation fold; results go to results/000 ... results/004.
if __name__ == '__main__':
  i = 0
  while i < 5:
    args.fold, args.exp_num = i, str(i)
    main(args)
    i += 1
  i = 4  # same final value the original for-loop leaves behind

Inference with ensemble  
- 위에서 구한 임계치를 사용하여 rule base inference를 구축한다.  
- replace_dict이라는 변수를 통해 헷갈리는 두 클래스를 매칭한다.

In [None]:
# Computes the amount of change of the test keypoints (json) of one folder.
def Refiner(keypoints, ver):
  """Return the x-range statistic of one folder's keypoints.

  Args:
    keypoints: sequence of per-frame annotations, each a mapping whose
      ``'data'`` entry is a list of ``[x, y, z]`` keypoints.
    ver: 1 -> per frame, x of the keypoint with the smallest y
      (number 1 vs. index-finger wagging);
      2 -> per frame, the largest x (clenched fist vs. fist thrust).

  Returns:
    max - min of the per-frame values.

  Raises:
    ValueError: if ``ver`` is neither 1 nor 2.
  """
  keypoints = np.array([point['data'] for point in keypoints])
  # number 1 vs. index-finger wagging
  if ver == 1:
    keypoints = keypoints[:, :, :2]
    x_l_finger = [point_per_img[:, 0][point_per_img[:, 1].argmin()] for point_per_img in keypoints]
    query_value = np.max(x_l_finger) - np.min(x_l_finger)

  # clenched fist vs. fist thrust
  elif ver == 2:
    keypoints = keypoints[:, :, 0]
    x_values = [point_per_img[point_per_img.argmax()] for point_per_img in keypoints]
    query_value = np.max(x_values) - np.min(x_values)

  else:
    raise ValueError(f'Unsupported ver: {ver!r} (expected 1 or 2)')

  return query_value

Download the pretrained weights into ./results/

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
test_path = '../data/test'
test_folders = natsorted(glob(test_path + '/*'))

args = easydict.EasyDict({'encoder_name':'regnety_040',
                        'drop_path_rate':0,
                        })

load_pretrain = True        # use the downloaded pretrained weights
ensemble_test = True        # ensemble of the 5 fold models, or a single model
refine = True               # use the rule-based Refiner


def _load_pose_model(weight_path):
  """Build a Pose_Network, load the checkpoint's 'state_dict' and set eval mode."""
  model = Pose_Network(args).to(device)
  # map_location lets checkpoints saved on a GPU load on a CPU-only machine
  checkpoint = torch.load(weight_path, map_location=device)
  model.load_state_dict(checkpoint['state_dict'])
  model.eval()
  return model


if ensemble_test:  # 5-fold ensemble
  if load_pretrain:  # pretrained weights downloaded into ./results/
    model_paths = [f'./results/{fold}Fold_model.pth' for fold in range(5)]
  else:  # weights of the models trained above (results/000 ... results/004)
    model_paths = [f'./results/{fold:03d}/best_model.pth' for fold in range(5)]
  model_list = [_load_pose_model(path) for path in model_paths]

else:  # single best model (using the pretrained weight)
  model_path = './results/single_best_model.pth'
  model_list = [_load_pose_model(model_path)]


# Test-time preprocessing: tensor conversion, resize to a square, normalisation.
img_size = 288
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Resize((img_size, img_size)),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

sub = pd.read_csv('../data/sample_submission.csv')
df_info = pd.read_csv('../data/hand_gesture_pose.csv')
# Encoder from the original pose ids to consecutive class indices.
le = LabelEncoder()
le.fit(df_info['pose_id'])
trans = le.transform

# Class mapping: the encoded ids handled by rule ver1 / ver2 ...
ver1_list = trans([0, 42, 10, 67, 100, 142, 110, 167])
ver2_list = trans([146, 163, 171, 191])
total_list = np.concatenate([ver1_list, ver2_list]).tolist()

# ... and the pairs of classes that are confused with each other,
# converted from original ids (0~195) to encoded ids (0~156).
replace_dict = {146:163, 171:191, 0:42, 10:67, 100:142, 110:167}
replace_dict = dict(trans(pair) for pair in replace_dict.items())


# Predict every test folder: average the softmax output over the frames of the
# folder and over the models, then optionally apply the rule-based refinement.
for i, test_folder in enumerate(tqdm(test_folders)):
  parent_dir = os.path.dirname(test_folder)  # not named `dir`: that shadows the builtin
  folder_num = os.path.basename(test_folder)
  json_path = opj(parent_dir, folder_num, folder_num+'.json')
  with open(json_path) as f:
    js = json.load(f)
  keypoints = js['annotations']  # keypoints of every frame of this folder
  images_list = natsorted(glob(test_folder + '/*.png'))
  images = [transform(crop_image(image_name, point, margin=100))
            for point, image_name in zip(keypoints, images_list)]

  images = torch.stack(images).to(device)
  ensemble = np.zeros((157,), dtype=np.float32)
  # inference only: no autograd graph is needed
  with torch.no_grad():
    for model in model_list:
      preds = torch.softmax(model(images), dim=1)
      ensemble += torch.mean(preds, dim=0).cpu().numpy()    # shape:(157,)
  preds = ensemble / len(model_list)
  pred_class = preds.argmax().item()
  if refine and (pred_class in total_list):
    # the pair of confusable classes that contains the predicted class
    cand1, cand2 = next(pair for pair in replace_dict.items() if pred_class in pair)

    if pred_class in ver1_list:
      query_value = Refiner(keypoints, ver=1)
      answer = cand1 if query_value < threshold_ver1 else cand2

    elif pred_class in ver2_list:
      query_value = Refiner(keypoints, ver=2)
      # NOTE(review): every ver2 class uses threshold_ver2_both, and
      # threshold_ver2 (right hand) is never read - confirm this is intended.
      answer = cand1 if query_value < threshold_ver2_both else cand2

    # one-hot output: the refined class gets probability 1, all others 0
    preds = np.zeros_like(preds)
    preds[answer] = 1

  sub.iloc[i, 1:] = preds.astype(float)

sub.to_csv('./results/submission_train_add_ensemble_rule.csv',index=False)