## [Dacon] AI프렌즈 시즌2 강수량 산출 경진대회
## giba.kim (팀명)
## 2020년 5월 29일 (제출날짜)

## Model 3

#### CNN 기반 Custom Model Single

>* Data: Original Data, -9999 제거, 추가 데이터 사용 약 8300개 (-9999제거, 위 경도가 심하게 일치하지 않은 것 제외)
* Cross Validation: train_test_split(sorted(glob.glob(train_path + '/*')), test_size=0.1, random_state=31014)
* Loss: MOFLoss
* Optimizer: RAdam + LARS + LookAHead (https://github.com/mgrankin/over9000)
* Scheduler: CosineAnnealingWarmRestarts(optimizer, 10, 2,eta_min=1e-6)
* Model: CNN 기반 Custom Model
* Batch: 128
* Epoch: 150


## 1. 라이브러리 및 데이터
## Library & Data

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import time
import datetime

import glob
import os
import gc
import random
from tqdm.notebook import tqdm

import matplotlib.pyplot as plt
import seaborn as sns

import torch
from torch.utils.data import Dataset, DataLoader
from torch.utils.data import Dataset as BaseDataset
import torchvision
import torchvision.transforms.functional as TF

from torch import nn
import torch.nn.functional as F
from torch.utils.data import SubsetRandomSampler
from torch.optim import Adam,lr_scheduler,AdamW
from torchvision import transforms,models
from torch.autograd import Variable

import pretrainedmodels
from efficientnet_pytorch import EfficientNet

from torch.optim.optimizer import Optimizer
from collections import defaultdict

import math
import copy
import segmentation_models_pytorch as smp
from sklearn.model_selection import train_test_split

## Utility Function 정의

#### Optimizer
RAdam + LARS + LookAHead

Lookahead implementation from https://github.com/lonePatient/lookahead_pytorch/blob/master/optimizer.py
RAdam + LARS implementation from https://gist.github.com/redknightlois/c4023d393eb8f92bb44b2ab582d7ec20

In [2]:
class Lookahead(Optimizer):
    """Lookahead wrapper around another optimizer.

    The wrapped ``base_optimizer`` updates the "fast" weights on every
    ``step``. Every ``k`` steps a per-parameter "slow" copy is moved towards
    the fast weights by a factor ``alpha`` and the fast weights are reset to
    that slow copy.

    Args:
        base_optimizer: optimizer whose ``param_groups`` are shared with this
            wrapper and whose ``step`` performs the fast update.
        alpha: slow-weight interpolation factor, must be in ``[0, 1]``.
        k: number of fast steps between two slow updates, must be ``>= 1``.

    Raises:
        ValueError: if ``alpha`` or ``k`` is out of range.
    """

    def __init__(self, base_optimizer, alpha=0.5, k=6):
        if not 0.0 <= alpha <= 1.0:
            raise ValueError(f'Invalid slow update rate: {alpha}')
        if not 1 <= k:
            raise ValueError(f'Invalid lookahead steps: {k}')
        defaults = dict(lookahead_alpha=alpha, lookahead_k=k, lookahead_step=0)
        self.base_optimizer = base_optimizer
        # Share the very same list object so LR schedulers acting on this
        # wrapper also drive the base optimizer.
        self.param_groups = self.base_optimizer.param_groups
        # NOTE: this is the base optimizer's own dict, so the update below
        # also adds the lookahead keys to ``base_optimizer.defaults``.
        self.defaults = base_optimizer.defaults
        self.defaults.update(defaults)
        self.state = defaultdict(dict)
        # manually add our defaults to the param groups
        for name, default in defaults.items():
            for group in self.param_groups:
                group.setdefault(name, default)

    def update_slow(self, group):
        """Pull the slow weights of ``group`` towards the fast ones and sync back."""
        for fast_p in group["params"]:
            if fast_p.grad is None:
                continue
            param_state = self.state[fast_p]
            if 'slow_buffer' not in param_state:
                # First synchronisation: the slow copy starts at the current
                # fast weights, so this first call leaves the weights as is.
                param_state['slow_buffer'] = torch.empty_like(fast_p.data)
                param_state['slow_buffer'].copy_(fast_p.data)
            slow = param_state['slow_buffer']
            # slow <- slow + alpha * (fast - slow); keyword ``alpha`` replaces
            # the deprecated ``add_(Number alpha, Tensor other)`` overload.
            slow.add_(fast_p.data - slow, alpha=group['lookahead_alpha'])
            fast_p.data.copy_(slow)

    def sync_lookahead(self):
        """Force a slow-weight update on every parameter group."""
        for group in self.param_groups:
            self.update_slow(group)

    def step(self, closure=None):
        """Run one fast step and, every ``lookahead_k`` steps, a slow update.

        Returns whatever ``base_optimizer.step(closure)`` returns.
        """
        loss = self.base_optimizer.step(closure)
        for group in self.param_groups:
            group['lookahead_step'] += 1
            if group['lookahead_step'] % group['lookahead_k'] == 0:
                self.update_slow(group)
        return loss

    def state_dict(self):
        """Return the base optimizer state plus the slow buffers under ``'slow_state'``."""
        fast_state_dict = self.base_optimizer.state_dict()
        # Tensors are not usable as serialisable keys, so key them by id().
        slow_state = {
            (id(k) if isinstance(k, torch.Tensor) else k): v
            for k, v in self.state.items()
        }
        fast_state = fast_state_dict['state']
        param_groups = fast_state_dict['param_groups']
        return {
            'state': fast_state,
            'slow_state': slow_state,
            'param_groups': param_groups,
        }

    def load_state_dict(self, state_dict):
        """Restore fast state into the base optimizer and slow state into this wrapper."""
        fast_state_dict = {
            'state': state_dict['state'],
            'param_groups': state_dict['param_groups'],
        }
        self.base_optimizer.load_state_dict(fast_state_dict)

        # We want to restore the slow state, but share param_groups reference
        # with base_optimizer. This is a bit redundant but least code
        slow_state_new = False
        if 'slow_state' not in state_dict:
            print('Loading state_dict from optimizer without Lookahead applied.')
            state_dict['slow_state'] = defaultdict(dict)
            slow_state_new = True
        slow_state_dict = {
            'state': state_dict['slow_state'],
            'param_groups': state_dict['param_groups'],  # this is pointless but saves code
        }
        super(Lookahead, self).load_state_dict(slow_state_dict)
        self.param_groups = self.base_optimizer.param_groups  # make both ref same container
        if slow_state_new:
            # reapply defaults to catch missing lookahead specific ones
            for name, default in self.defaults.items():
                for group in self.param_groups:
                    group.setdefault(name, default)
                    
class Ralamb(Optimizer):
    """RAdam update combined with a layer-wise (LARS-style) trust ratio.

    Args:
        params: iterable of parameters or parameter-group dicts.
        lr: learning rate.
        betas: decay rates of the first and second moment running averages.
        eps: term added to the denominator of the adaptive step.
        weight_decay: decay factor; applied directly to the weights, scaled
            by ``lr``, before the gradient step.
    """

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0):
        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay)
        # Small cache of (step, N_sma, step_size) keyed by ``step % 10`` so the
        # rectification term is computed once per step, not once per parameter.
        self.buffer = [[None, None, None] for ind in range(10)]
        super(Ralamb, self).__init__(params, defaults)

    def __setstate__(self, state):
        super(Ralamb, self).__setstate__(state)

    def step(self, closure=None):
        """Perform a single optimisation step.

        Args:
            closure: optional callable that re-evaluates the model and
                returns the loss.

        Returns:
            The value returned by ``closure``, or ``None`` without a closure.

        Raises:
            RuntimeError: if a parameter has a sparse gradient.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:

            for p in group['params']:
                if p.grad is None:
                    continue
                grad = p.grad.data.float()
                if grad.is_sparse:
                    raise RuntimeError('Ralamb does not support sparse gradients')

                # All arithmetic is done on an fp32 copy and written back at the end.
                p_data_fp32 = p.data.float()

                state = self.state[p]

                if len(state) == 0:
                    state['step'] = 0
                    state['exp_avg'] = torch.zeros_like(p_data_fp32)
                    state['exp_avg_sq'] = torch.zeros_like(p_data_fp32)
                else:
                    state['exp_avg'] = state['exp_avg'].type_as(p_data_fp32)
                    state['exp_avg_sq'] = state['exp_avg_sq'].type_as(p_data_fp32)

                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
                beta1, beta2 = group['betas']

                # Decay the first and second moment running average coefficient.
                # Keyword ``alpha=`` / ``value=`` replace the deprecated
                # positional-scalar overloads of add_/addcmul_/addcdiv_.
                # m_t
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                # v_t
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                state['step'] += 1
                buffered = self.buffer[int(state['step'] % 10)]

                if state['step'] == buffered[0]:
                    N_sma, radam_step_size = buffered[1], buffered[2]
                else:
                    buffered[0] = state['step']
                    beta2_t = beta2 ** state['step']
                    N_sma_max = 2 / (1 - beta2) - 1
                    N_sma = N_sma_max - 2 * state['step'] * beta2_t / (1 - beta2_t)
                    buffered[1] = N_sma

                    # more conservative since it's an approximated value
                    if N_sma >= 5:
                        radam_step_size = math.sqrt((1 - beta2_t) * (N_sma - 4) / (N_sma_max - 4) * (N_sma - 2) / N_sma * N_sma_max / (N_sma_max - 2)) / (1 - beta1 ** state['step'])
                    else:
                        radam_step_size = 1.0 / (1 - beta1 ** state['step'])
                    buffered[2] = radam_step_size

                if group['weight_decay'] != 0:
                    p_data_fp32.add_(p_data_fp32, alpha=-group['weight_decay'] * group['lr'])

                # Tentative RAdam step on a clone; only its norm is used, to
                # derive the trust ratio.
                # more conservative since it's an approximated value
                radam_step = p_data_fp32.clone()
                if N_sma >= 5:
                    denom = exp_avg_sq.sqrt().add_(group['eps'])
                    radam_step.addcdiv_(exp_avg, denom, value=-radam_step_size * group['lr'])
                else:
                    radam_step.add_(exp_avg, alpha=-radam_step_size * group['lr'])

                radam_norm = radam_step.pow(2).sum().sqrt()
                weight_norm = p.data.pow(2).sum().sqrt().clamp(0, 10)
                if weight_norm == 0 or radam_norm == 0:
                    trust_ratio = 1
                else:
                    trust_ratio = weight_norm / radam_norm

                state['weight_norm'] = weight_norm
                state['adam_norm'] = radam_norm
                state['trust_ratio'] = trust_ratio

                # ``trust_ratio`` may be a 0-dim tensor; turn the final step
                # size into a plain Python float before passing it as a scalar.
                scaled_step = float(-radam_step_size * group['lr'] * trust_ratio)
                if N_sma >= 5:
                    p_data_fp32.addcdiv_(exp_avg, denom, value=scaled_step)
                else:
                    p_data_fp32.add_(exp_avg, alpha=scaled_step)

                p.data.copy_(p_data_fp32)

        return loss

#### 대회 Metric Code

In [3]:
from sklearn.metrics import f1_score

def mae(y_true, y_pred):
    """Mean absolute error over the elements whose true value is >= 0.1.

    Both inputs are converted to arrays and flattened, so any matching
    shapes are accepted. Elements with a true value below 0.1 are ignored.
    """
    truth = np.array(y_true).reshape(-1)
    guess = np.array(y_pred).reshape(-1)

    # Only elements at or above the 0.1 threshold contribute to the error.
    rainy = truth >= 0.1
    abs_error = np.abs(truth[rainy] - guess[rainy])

    return np.mean(abs_error)

def fscore(y_true, y_pred):
    """F1 score of the "value >= 0.1" classification.

    Elements whose true value is negative are dropped first; the remaining
    true and predicted values are binarised at 0.1 and passed to
    ``f1_score``.
    """
    truth = np.array(y_true).reshape(-1)
    guess = np.array(y_pred).reshape(-1)

    # Negative truth marks entries to exclude from scoring.
    valid = truth >= 0

    truth_label = (truth[valid] >= 0.1).astype(int)
    guess_label = (guess[valid] >= 0.1).astype(int)

    return f1_score(truth_label, guess_label)

def maeOverFscore(y_true, y_pred):
    """Competition metric: ``mae`` divided by ``fscore`` (lower is better).

    Prints both components, then returns their ratio; 1e-07 is added to the
    F-score so a zero F-score does not divide by zero.
    """
    f_value = fscore(y_true, y_pred)
    mae_value = mae(y_true, y_pred)
    print("F-Score: ", f_value)
    print("MAE: ", mae_value)
    ratio = mae_value / (f_value + 1e-07)
    return ratio

#### Seed Fixed

In [4]:
# Fix the seed value.
# With a fixed seed, results stay comparable whenever a hyper parameter changes.
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and current CUDA device) with ``seed``.

    Also exports ``PYTHONHASHSEED`` and switches cuDNN to deterministic mode.
    """
    # Interpreter-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)

    # Numerical libraries.
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    torch.backends.cudnn.deterministic = True

# Global seed used for every random number generator in this notebook.
SEED = 0
seed_everything(SEED)

#### Dataset 정의

In [5]:
class Dataset(BaseDataset):
    """Dataset that loads one ``.npy`` sample per file.

    Each item is a ``(feature, target)`` pair of float32 arrays laid out
    channel-first. Normalisation reads the module-level ``mean_vector`` and
    ``std_vector``, which must be defined before items are requested.
    """

    def __init__(self, train_files, is_test_or_not=False, is_transform=False, aug_ratio=0):
        """
        train_files: list of ``.npy`` file paths (train, valid or test files)
        is_test_or_not: True for test files; no channel is stripped and no
            augmentation is applied
        is_transform: True to enable random augmentation
        aug_ratio: probability that a sample is augmented when
            ``is_transform`` is True
        """
        self.train_files = train_files
        self.is_test_or_not = is_test_or_not
        self.is_transform = is_transform
        self.aug_ratio = aug_ratio

    # Six data augmentations:
    # rot90, rot180, rot270, vflip, hflip, transpose
    def aug_flip(self, feature, target):
        """Apply the same randomly chosen spatial transform to both arrays.

        Both inputs are channel-first; only axes 1 and 2 are transformed.
        Copies are returned so the results own contiguous memory.
        """
        switch = np.random.choice(6)  # pick one of the six transforms
        if switch == 0:  # rot90
            feature_aug = np.rot90(feature, k=1, axes=[1, 2]).copy()
            target_aug = np.rot90(target, k=1, axes=[1, 2]).copy()
        elif switch == 1:  # rot180
            feature_aug = np.rot90(feature, k=2, axes=[1, 2]).copy()
            target_aug = np.rot90(target, k=2, axes=[1, 2]).copy()
        elif switch == 2:  # rot270
            feature_aug = np.rot90(feature, k=3, axes=[1, 2]).copy()
            target_aug = np.rot90(target, k=3, axes=[1, 2]).copy()
        elif switch == 3:  # vflip
            feature_aug = np.flip(feature, axis=[1]).copy()
            target_aug = np.flip(target, axis=[1]).copy()
        elif switch == 4:  # hflip
            feature_aug = np.flip(feature, axis=[2]).copy()
            target_aug = np.flip(target, axis=[2]).copy()
        elif switch == 5:  # transpose
            feature_aug = np.transpose(feature, [0, 2, 1]).copy()
            target_aug = np.transpose(target, [0, 2, 1]).copy()

        return feature_aug, target_aug

    def augmetation(self, feature, target):
        """Augment the pair with probability ``aug_ratio``; otherwise return it as is."""
        # Draw u from a uniform distribution: the sample is augmented only
        # when u <= aug_ratio, otherwise it is returned untouched.
        aug_prop = np.random.uniform()
        is_aug = aug_prop <= self.aug_ratio
        if not is_aug:
            return feature, target

        feature_aug, target_aug = self.aug_flip(feature, target)

        return feature_aug, target_aug

    def __getitem__(self, i):
        path = self.train_files[i]
        try:
            dataset = np.load(path)
        except OSError:
            # The path may carry Windows separators (e.g. a file list built
            # on another OS); retry with forward slashes. Only I/O failures
            # trigger the retry, so KeyboardInterrupt and real bugs propagate.
            dataset = np.load(path.replace('\\', '/'))

        # Target: last channel, reshaped to (1, 40, 40). For test files the
        # same slice is still returned, but only as a placeholder.
        target = np.moveaxis(dataset[:, :, -1].reshape(40, 40, 1), -1, 0).astype(np.float32)

        # For train/valid files drop the target channel before preprocessing.
        if not self.is_test_or_not:
            dataset = dataset[:, :, :-1]

        # Replace the GMI longitude/latitude channels by their difference to
        # the DPR longitude/latitude channels.
        dataset[:, :, 10] = dataset[:, :, 10] - dataset[:, :, 12]
        dataset[:, :, 11] = dataset[:, :, 11] - dataset[:, :, 13]

        # Standard scaling of the first 12 channels.
        norm_temp = (dataset[:, :, :12] - mean_vector[None, None, :12]) / std_vector[None, None, :12]
        feature = np.moveaxis(norm_temp, -1, 0).astype(np.float32)

        # Test samples and non-augmented datasets return the pair as is.
        if self.is_test_or_not or not self.is_transform:
            return feature, target

        # Otherwise run the random augmentation.
        return self.augmetation(feature, target)

    def __len__(self):
        return len(self.train_files)

validset batch size를 2로하면 mae_over_fscore가 계산이 잘 안되는 문제가 있어 validset 전체에 대하여 Metric을 계산하는 Utility Function

In [6]:
def get_mof_valid(nn_model, valid_loader):
    """Compute ``maeOverFscore`` once over the whole validation loader.

    The model is put into eval mode, every batch is predicted on the GPU,
    and predictions and targets are flattened to 1600 values per sample,
    concatenated and scored together. The score is printed and returned.
    """
    print("Valid All MOF")
    nn_model.eval()

    predictions, truths = [], []
    for feature, target in valid_loader:
        # Work on copies and drop the loader's tensors right away
        # (presumably to keep loader buffers from being retained - TODO confirm).
        feature_copy = copy.deepcopy(feature)
        target_copy = copy.deepcopy(target)

        batch_pred = nn_model.predict(feature_copy.cuda())
        predictions.append(batch_pred.view(-1, 1600).cpu().numpy())
        truths.append(target_copy.view(-1, 1600).cpu().numpy())

        del feature
        del target

    all_truth = np.concatenate(truths)
    all_pred = np.concatenate(predictions)
    mof = maeOverFscore(all_truth, all_pred)
    print(mof)
    return mof

## 2. 데이터 전처리
## Data Cleansing & Pre-Processing  

1. train feature파일을 load하여 강수량이 0 이상인 것만 선택합니다. (-9999 제외)
2. GMI와 DPR의 위/경도 차이를 계산합니다.
3. Dataset에서 StandardScale을 하기 위해 mean_vector와 std_vector를 계산합니다.

In [7]:
# Load the flattened training table and keep only rows with a valid
# (non-negative) precipitation value.
train_df = pd.read_feather('../input/train.ftr')
train_df = train_df[train_df['precipitation'] >= 0].reset_index(drop=True)

# Store the GMI - DPR longitude/latitude difference in the GMI columns,
# mirroring what Dataset.__getitem__ does per sample.
train_df['long_GMI'] = train_df['long_GMI'].sub(train_df['long_DPR'])
train_df['lat_GMI'] = train_df['lat_GMI'].sub(train_df['lat_DPR'])

# Every column except the target and the identifier columns is a feature.
train_columns = [
    column
    for column in train_df.columns
    if column not in ['precipitation', 'orbit', 'subset', 'pixel']
]

# Per-feature mean / standard deviation, computed on float32 columns, used
# for standard scaling inside the Dataset.
for column in train_columns:
    train_df[column] = train_df[column].astype(np.float32)
mean_vector = np.array([train_df[column].mean() for column in train_columns])
std_vector = np.array([train_df[column].std() for column in train_columns])

#### Dataset에 전달할 train file list와 valid file list를 만드는 코드입니다.
1. 기존에 저장된 file이 있다면 pickle library를 사용하여 load하고 없다면 새로 만듭니다.
2. train path를 정의합니다. train_path = '../input/train'
3. train_test_split을 활용하여 validset size는 전체 train의 10%로 할당합니다.
4. train 데이터는 target값에 0보다 작은 값이 하나라도 있으면 사용하지 않습니다.
5. 외부데이터 폴더를 정의합니다. '../input/newtrain'
6. 외부데이터도 마찬가지로 -9999는 사용하지 않고 추가적으로 GMI와 DPR의 위/경도 차가 너무 크면 사용하지 않습니다.
7. ValidSet도 마찬가지로 -9999를 사용하지 않고 위/경도 차가 크면 사용하지 않습니다.
8. 추후 이 파일을 재사용하기 위하여 위에서 지정한 pickle 파일 이름으로 저장합니다.

In [8]:
import pickle

TRAIN_FILE = 'train_files_rmna_new_train2'
VALID_FILE = 'valid_' + '_'.join(TRAIN_FILE.split('_')[1:])
print(TRAIN_FILE, VALID_FILE)


def _target_is_complete(dataset):
    """Return True when the target (last) channel has no negative value."""
    target = np.moveaxis(dataset[:,:,-1].reshape(40,40,1),-1,0).astype(np.float32)
    return np.sum(target < 0) == 0


def _coord_mismatch(dataset):
    """Sum of absolute differences between channels 10/12 and 11/13 (GMI vs DPR lon/lat)."""
    return np.sum(np.abs(dataset[:,:,10] - dataset[:,:,12]) + np.abs(dataset[:,:,11] - dataset[:,:,13]))


# Reuse the cached file lists only when BOTH caches are present; otherwise
# rebuild them, so a missing validation cache no longer crashes the load.
if os.path.exists(TRAIN_FILE) and os.path.exists(VALID_FILE):
    # NOTE: pickle must only be used on cache files written by this notebook.
    with open(TRAIN_FILE, 'rb') as cache:
        train_files_rmna = pickle.load(cache)
    with open(VALID_FILE, 'rb') as cache:
        valid_files_rmna = pickle.load(cache)
else:
    print("Make file")
    train_path = '../input/train'
    train_files, valid_files = train_test_split(sorted(glob.glob(train_path + '/*')),test_size=0.1,random_state=31014)
    train_files = [file.replace('\\','/') for file in train_files]
    valid_files = [file.replace('\\','/') for file in valid_files]

    # Original train files: keep those without any negative target value.
    train_files_rmna = []
    for file in tqdm(train_files):
        if _target_is_complete(np.load(file)):
            train_files_rmna.append(file)

    # External data: additionally require the coordinate mismatch to be small.
    new_train_path = '../input/newtrain'
    new_train_files = sorted(glob.glob(new_train_path + '/*'))
    new_train_files = [file.replace('\\','/') for file in new_train_files]
    new_train_add_files = []
    for file in tqdm(new_train_files):
        dataset = np.load(file)
        if _target_is_complete(dataset) and _coord_mismatch(dataset) < 100:
            train_files_rmna.append(file)
            new_train_add_files.append(file)

    print("Add NewTrain: ", len(new_train_add_files))

    # Validation files use the same two filters as the external data.
    valid_files_rmna = []
    for file in tqdm(valid_files):
        dataset = np.load(file)
        if _target_is_complete(dataset) and _coord_mismatch(dataset) < 100:
            valid_files_rmna.append(file)

    # Cache both lists for later runs; the context managers close the files.
    with open(TRAIN_FILE, 'wb') as cache:
        pickle.dump(train_files_rmna, cache)
    with open(VALID_FILE, 'wb') as cache:
        pickle.dump(valid_files_rmna, cache)

print(len(train_files_rmna))
print(len(valid_files_rmna))

train_files_rmna_new_train2 valid_files_rmna_new_train2
76702
7585


#### Dataset
>* Train: 데이터의 50%를 Augmentation 수행, BatchSize 128
* Valid: Augmentation 수행 안함, BatchSize 2

In [9]:
print(len(train_files_rmna))

# Training set: random augmentation enabled, applied with probability 0.5.
train_dataset = Dataset(train_files_rmna, is_test_or_not=False, is_transform=True, aug_ratio=0.5)

# Validation set: no augmentation (is_transform keeps its default).
valid_dataset = Dataset(valid_files_rmna, is_test_or_not=False)

# Shuffle only the training batches.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=2, shuffle=False)

76702


## 5. 모델 학습 및 검증
## Model Tuning & Evaluation

In [10]:
class SimpleConv(nn.Module):
    """Fully convolutional network mapping 12 input channels to 1 output channel.

    Every convolution uses stride 1 and "same" padding, so the spatial size
    of the input is preserved. The final ReLU keeps the output non-negative.
    """

    def __init__(self,):
        super().__init__()
        stem_ch = 64
        branch_ch = 32
        concat_ch = stem_ch + (branch_ch + branch_ch + branch_ch) * 2   # 256
        fused_ch = concat_ch + 64 + 64                                  # 384

        # Attribute names and creation order are kept exactly as they were:
        # they define the state_dict keys and the seeded initialisation.
        self.bn0 = nn.BatchNorm2d(12)
        self.conv0 = nn.Conv2d(12, stem_ch, kernel_size=1, stride=1, bias=False)
        self.bn_128 = nn.BatchNorm2d(stem_ch)

        # Six parallel branches with kernel sizes 1, 3, ..., 11.
        branch_kernels = (
            ('conv1_1', 1), ('conv1_2', 3), ('conv1_3', 5),
            ('conv2_1', 7), ('conv2_2', 9), ('conv2_3', 11),
        )
        for attr, kernel in branch_kernels:
            branch = nn.Conv2d(stem_ch, branch_ch, kernel_size=kernel, stride=1,
                               bias=False, padding=kernel // 2)
            setattr(self, attr, branch)

        self.bn_concat = nn.BatchNorm2d(concat_ch)

        # Two bottleneck paths: 1x1 -> 3x3 and 3x3 -> 1x1.
        self.bottle_1 = nn.Conv2d(concat_ch, 128, kernel_size=1, stride=1, bias=False)
        self.bottle_2 = nn.Conv2d(128, 64, kernel_size=3, stride=1, bias=False, padding=1)

        self.bottle_3 = nn.Conv2d(concat_ch, 128, kernel_size=3, stride=1, bias=False, padding=1)
        self.bottle_4 = nn.Conv2d(128, 64, kernel_size=1, stride=1, bias=False)

        self.bn_bottleneck24 = nn.BatchNorm2d(fused_ch)

        self.bottle_5 = nn.Conv2d(fused_ch, 256, kernel_size=1, stride=1, bias=False)
        self.bottle_6 = nn.Conv2d(256, 128, kernel_size=3, stride=1, bias=False, padding=1)

        self.bn_output = nn.BatchNorm2d(128)
        self.conv_out = nn.Conv2d(128, 1, kernel_size=1, stride=1, bias=False)
        self.relu = nn.ReLU()

    def forward(self, inputs):
        # Stem: BN -> 1x1 conv -> BN -> ReLU.
        stem = self.relu(self.bn_128(self.conv0(self.bn0(inputs))))

        # Multi-scale branches, concatenated with the stem along channels.
        branch_layers = (
            self.conv1_1, self.conv1_2, self.conv1_3,
            self.conv2_1, self.conv2_2, self.conv2_3,
        )
        branch_maps = [layer(stem) for layer in branch_layers]
        merged = torch.cat([stem] + branch_maps, dim=1)
        merged = self.bn_concat(self.relu(merged))

        # Two bottleneck paths over the merged map.
        path_a = self.relu(self.bottle_2(self.relu(self.bottle_1(merged))))
        path_b = self.relu(self.bottle_4(self.relu(self.bottle_3(merged))))

        fused = torch.cat([merged, path_a, path_b], dim=1)
        fused = self.bn_bottleneck24(fused)

        hidden = self.relu(self.bottle_6(self.relu(self.bottle_5(fused))))

        # Output head: BN -> 1x1 conv -> ReLU.
        return self.relu(self.conv_out(self.bn_output(hidden)))

    def predict(self, x):
        """Forward pass without gradients; switches the model to eval mode if needed."""
        if self.training:
            print('get eval')
            self.eval()

        with torch.no_grad():
            return self.forward(x)

#### Model, Loss, Metrics, optimizer, scheduler 정의

In [11]:
model = SimpleConv()

# MOFLoss is short for mae_over_fscore; it is computed the same way as the
# competition metric, implemented in PyTorch.
loss = smp.utils.losses.MOFLoss()

# Likewise, the PyTorch metric version of mae_over_fscore.
metrics = [smp.utils.metrics.MAEOVERFSCORE()]

# Ralamb (RAdam + LARS) wrapped in Lookahead.
base_optimizer = Ralamb(model.parameters(), weight_decay=0.01)
optimizer = Lookahead(base_optimizer)

# Cosine annealing with warm restarts: first cycle of 10 epochs, each next
# cycle twice as long, learning rate floor of 1e-6.
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=10,
    T_mult=2,
    eta_min=1e-6,
)

#### 학습을 도와주는 Epoch Module에 Model, loss, metrics, optimizer를 넣고 epoch 객체 생성

In [12]:
DEVICE = 'cuda'

# Runner for one training epoch (needs the optimizer).
train_epoch = smp.utils.train.TrainEpoch(
    model, loss=loss, metrics=metrics, optimizer=optimizer, device=DEVICE, verbose=True,
)

# Runner for one validation epoch (no optimizer).
valid_epoch = smp.utils.train.ValidEpoch(
    model, loss=loss, metrics=metrics, device=DEVICE, verbose=True,
)

#### 150 epoch까지 학습 수행
얼리스탑핑 없이 150epoch까지 수행 후 가장 점수가 좋은 저장된 Model을 불러옵니다.<br>
2080ti 한 대에서 약 10~12시간 정도 소요됩니다. <br>
Ensemble시 사용한 가장 좋은 Model이름은 아래와 같습니다.<br>
code/best_model/NewTrain8330_CustomModel_YM_mof_loss_1.3669144273527496_epoch_130.pth

In [13]:
NUM_EPOCH = 150

# np.inf (lower case) is the spelling that exists in every NumPy version;
# the np.Inf alias was removed in NumPy 2.0.
min_score = np.inf
MODEL = "CustomModel_YM"
EXP = "NewTrain8330"

# torch.save does not create missing directories.
os.makedirs('best_model', exist_ok=True)

for i in range(0, NUM_EPOCH):

    print('\nEpoch: {}'.format(i))
    train_logs = train_epoch.run(train_loader)
    # Score the whole validation set at once rather than per batch.
    mof = get_mof_valid(model, valid_loader)
    scheduler.step()

    # Lower mae-over-fscore is better: keep a per-epoch snapshot and
    # overwrite the running "best" file.
    if min_score > mof:
        min_score = mof
        torch.save(model, f'best_model/{EXP}_{MODEL}_{loss.__name__}_{mof}_epoch_{i}.pth')
        torch.save(model, f'best_model/{EXP}_{MODEL}_{loss.__name__}.pth')
        print('Model saved!')


Epoch: 0
train:   0%|                                                                                   | 0/600 [00:00<?, ?it/s]

	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha)


train: 100%|████████████████████████████████| 600/600 [04:17<00:00,  2.33it/s, mof_loss - 2.128, maeoverfscore - 2.128]
Valid All MOF
F-Score:  0.7471034805644287
MAE:  1.4267327
1.9096852116034886
Model saved!

Epoch: 1
train:   0%|                                                                                   | 0/600 [00:00<?, ?it/s]

  "type " + obj.__name__ + ". It won't be checked "


train:  12%|████▏                            | 75/600 [00:33<03:51,  2.27it/s, mof_loss - 1.972, maeoverfscore - 1.972]


KeyboardInterrupt: 

## 6. 결과 및 결언
## Conclusion & Discussion

1. test file의 경로를 지정하고 Dataset class에 test file list를 넘겨서 객체를 생성합니다.
2. DataLoader에 batch_size 1로 하여 test dataloader를 만듭니다.
3. savedmodel/NewTrain8330_CustomModel_YM_mof_loss.pth을 Load합니다. 재현 시 이 파일이 아닌 다른 파일의 점수가 제일 좋다면 그것으로 사용하여도 큰 차이가 발생하지는 않을 것입니다. <br>

In [14]:
# Collect the test files in a stable (sorted) order.
test_path = '../input/test'
test_files = sorted(glob.glob(test_path + '/*'))

# Test mode: no channel is stripped and no augmentation is applied.
test_dataset = Dataset(test_files, is_test_or_not=True)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=1, shuffle=False)

#### Best Model Load

In [15]:
# best_model = torch.load('best_model_2_0.12573673413876285.pth') LB 2.8759346226
# The training loop saves the whole model object with torch.save(model, ...),
# so torch.load returns a ready-to-use module.
# NOTE(review): training writes to 'best_model/' while this reads from
# 'savedmodel/' - presumably the file is copied manually; confirm.
best_model = torch.load('savedmodel/NewTrain8330_CustomModel_YM_mof_loss.pth')

#### Predict 수행

In [16]:
best_model.eval()
results = []
# Wrap the loader itself so tqdm can read its length and show a real total;
# wrapping enumerate(...) hides the length from the progress bar.
for feature, _ in tqdm(test_dataloader):
    # The second element of each batch is only a placeholder for test files.
    results.append(best_model.predict(feature.cuda()).view(-1,1600).cpu().numpy())

HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




#### 0이하는 0으로 Clip하고 Model3 제출 파일 완성

In [17]:
# Stack the per-batch predictions into one (n_samples, 1600) array.
preds = np.concatenate(results, axis=0)

# Negative predictions are clipped to 0 before filling every column after
# the first one of the sample submission.
clipped_preds = np.clip(preds, 0, np.inf)
submission = pd.read_csv('../input/sample_submission.csv')
submission.iloc[:, 1:] = clipped_preds

submission.to_csv(
    '../output/NewTrain8330_CustomModel_YM_mof_loss_1.3669144273527496_epoch_130.csv',
    index=False,
)