# **📄 Document type classification baseline code**
> Î¨∏ÏÑú ÌÉÄÏûÖ Î∂ÑÎ•ò ÎåÄÌöåÏóê Ïò§Ïã† Ïó¨Îü¨Î∂Ñ ÌôòÏòÅÌï©ÎãàÎã§! üéâ     
> ÏïÑÎûò baselineÏóêÏÑúÎäî ResNet Î™®Îç∏ÏùÑ Î°úÎìúÌïòÏó¨, Î™®Îç∏ÏùÑ ÌïôÏäµ Î∞è ÏòàÏ∏° ÌååÏùº ÏÉùÏÑ±ÌïòÎäî ÌîÑÎ°úÏÑ∏Ïä§Ïóê ÎåÄÌï¥ ÏïåÏïÑÎ≥¥Í≤†ÏäµÎãàÎã§.

## Contents
- Prepare Environments
- Import Library & Define Functions
- Hyper-parameters
- Load Data
- Train Model
- Inference & Save File


## 1. Prepare Environments

* 데이터 로드를 위한 구글 드라이브를 마운트합니다.
* 필요한 라이브러리를 설치합니다.

In [1]:
# Mount Google Drive; this cell can be skipped when not running on Colab.
from google.colab import drive
# The drive is mounted at two locations: /gdrive (forcing a remount) and
# /content/drive.
# NOTE(review): the archive extraction cell reads the relative path
# drive/MyDrive/..., which presumably resolves through the /content/drive
# mount when the working directory is /content — confirm before removing either.
drive.mount('/gdrive', force_remount=True)
drive.mount('/content/drive')

Mounted at /gdrive
Mounted at /content/drive


In [2]:
# Íµ¨Í∏Ä ÎìúÎùºÏù¥Î∏åÏóê ÏóÖÎ°úÎìúÎêú ÎåÄÌöå Îç∞Ïù¥ÌÑ∞Î•º ÏïïÏ∂ï Ìï¥Ï†úÌïòÍ≥† Î°úÏª¨Ïóê Ï†ÄÏû•Ìï©ÎãàÎã§.
!tar -xvf drive/MyDrive/datasets_fin.tar > /dev/null

In [1]:
# Install the required library (timm, used below to create the model).
!pip install timm

[0m

## 2. Import Library & Define Functions
* 학습 및 추론에 필요한 라이브러리를 로드합니다.
* 학습 및 추론에 필요한 함수와 클래스를 정의합니다.

In [2]:
import os
import time
import random

import timm
import torch
import albumentations as A
import pandas as pd
import numpy as np
import torch.nn as nn
from albumentations.pytorch import ToTensorV2
from torch.optim import Adam
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import accuracy_score, f1_score

In [3]:
# Fix every random seed so that repeated runs produce the same results.
SEED = 42
# NOTE: PYTHONHASHSEED set at runtime only affects Python processes started
# after this point, not the hash seed of the current interpreter.
os.environ['PYTHONHASHSEED'] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
# cuDNN autotuning (benchmark=True) may select different convolution
# algorithms from run to run, which defeats the fixed seeds above. Disable it
# and request deterministic cuDNN kernels so the seeding actually holds.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
# Dataset class that pairs each image file with the target listed in a CSV.
class ImageDataset(Dataset):
    """Image dataset backed by a CSV of ``(image file name, target)`` rows.

    Args:
        csv: CSV source passed straight to ``pandas.read_csv``; every row must
            unpack into exactly two values, the image file name and the target.
        path: Directory that contains the image files named in the CSV.
        transform: Optional callable invoked as ``transform(image=array)`` that
            returns a mapping with an ``'image'`` entry (the convention used
            by the albumentations pipelines in this file).
    """

    def __init__(self, csv, path, transform=None):
        # Stored as a plain array of rows; other cells read ``.df`` directly.
        self.df = pd.read_csv(csv).values
        self.path = path
        self.transform = transform

    def __len__(self):
        """Return the number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return ``(image, target)`` for row ``idx``.

        The image is loaded as a three-channel RGB array and, when a transform
        was supplied, replaced by the transform's ``'image'`` output.
        """
        name, target = self.df[idx]
        # Close the file handle as soon as the pixels are read, and force
        # three channels so grayscale, palette or RGBA files still match the
        # three-channel normalization applied by the transforms.
        with Image.open(os.path.join(self.path, name)) as im:
            img = np.array(im.convert('RGB'))
        if self.transform:
            img = self.transform(image=img)['image']
        return img, target

In [5]:
# Runs a single training epoch over the given loader.
def train_one_epoch(loader, model, optimizer, loss_fn, device):
    """Train ``model`` for one pass over ``loader`` and summarize the epoch.

    Args:
        loader: Iterable of ``(image, targets)`` batches; must support ``len``.
        model: Model to optimize; it is switched to training mode.
        optimizer: Optimizer stepped once per batch.
        loss_fn: Callable computing the loss from ``(predictions, targets)``.
        device: Device that each batch is moved to before the forward pass.

    Returns:
        dict with ``"train_loss"`` (sum of per-batch losses divided by the
        number of batches), ``"train_acc"`` and ``"train_f1"`` (macro F1),
        all computed from the predictions made during this pass.
    """
    model.train()

    running_loss = 0
    epoch_preds = []
    epoch_targets = []

    progress = tqdm(loader)
    for image, targets in progress:
        image, targets = image.to(device), targets.to(device)

        # Drop gradients left over from the previous batch.
        model.zero_grad(set_to_none=True)

        logits = model(image)
        loss = loss_fn(logits, targets)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        running_loss += batch_loss

        # The predicted class is the index of the largest output per sample.
        epoch_preds.extend(logits.argmax(dim=1).detach().cpu().numpy())
        epoch_targets.extend(targets.detach().cpu().numpy())

        progress.set_description(f"Loss: {batch_loss:.4f}")

    return {
        "train_loss": running_loss / len(loader),
        "train_acc": accuracy_score(epoch_targets, epoch_preds),
        "train_f1": f1_score(epoch_targets, epoch_preds, average='macro'),
    }

## 3. Hyper-parameters
* 학습 및 추론에 필요한 하이퍼파라미터들을 정의합니다.

In [6]:
# device: use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# data config: directory of the extracted dataset
# NOTE(review): presumably the root of the train/test files — the dataset cell
# spells out the same 'datasets_fin/' prefix; confirm they are meant to agree.
data_path = 'datasets_fin/'

# model config: architecture name handed to timm.create_model
model_name = 'resnet34' # other timm names also work, e.g. 'resnet50', 'efficientnet_b0', ...

# training config
# img_size: images are resized to img_size x img_size by the transforms
img_size = 32
# LR: learning rate given to the Adam optimizer
LR = 1e-3
# EPOCHS: number of passes over the training loader
EPOCHS = 1
# BATCH_SIZE: batch size used by the data loaders
BATCH_SIZE = 32
# num_workers: worker count for the training DataLoader
num_workers = 0

## 4. Load Data
* 학습, 테스트 데이터셋과 로더를 정의합니다.

In [7]:
# Transform pipeline for training images (the place to add augmentations).
trn_transform = A.Compose([
    # Resize the image to img_size x img_size.
    A.Resize(height=img_size, width=img_size),
    # Normalize each channel with the given mean/std
    # (these are the commonly used ImageNet statistics).
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    # Convert the numpy image into a PyTorch tensor.
    ToTensorV2(),
])

# Transform pipeline for test images: the same resize, normalization and
# tensor conversion as the training pipeline.
tst_transform = A.Compose([
    A.Resize(height=img_size, width=img_size),
    A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ToTensorV2(),
])

In [8]:
# Build the train and test datasets.
# All locations are derived from the `data_path` setting instead of repeating
# the directory name, so moving the dataset only requires changing one value.
trn_dataset = ImageDataset(
    os.path.join(data_path, "train.csv"),
    os.path.join(data_path, "train/"),
    transform=trn_transform
)
# The test set is driven by the sample submission file, which supplies the
# test image names together with placeholder targets.
tst_dataset = ImageDataset(
    os.path.join(data_path, "sample_submission.csv"),
    os.path.join(data_path, "test/"),
    transform=tst_transform
)
# Number of training and test samples.
print(len(trn_dataset), len(tst_dataset))

FileNotFoundError: [Errno 2] No such file or directory: 'datasets_fin/train.csv'

In [None]:
# Build the DataLoaders.
# Training batches are reshuffled every epoch and the last, possibly smaller,
# batch is kept.
trn_loader = DataLoader(
    trn_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=num_workers,
    pin_memory=True,
    drop_last=False
)
# Test batches keep the file order so predictions line up with the rows of
# the submission file. The worker count follows the shared `num_workers`
# setting instead of a separate hard-coded value.
tst_loader = DataLoader(
    tst_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=num_workers,
    pin_memory=True
)

## 5. Train Model
* 모델을 로드하고, 학습을 진행합니다.

In [None]:
# Load the model: create the architecture named by `model_name` with
# pretrained weights and a 17-class output, then move it to the target device.
model = timm.create_model(
    model_name,
    pretrained=True,
    num_classes=17
).to(device)
# Cross-entropy loss for the classification targets.
loss_fn = nn.CrossEntropyLoss()
# Adam over all model parameters with learning rate LR.
optimizer = Adam(model.parameters(), lr=LR)

In [None]:
# Train for EPOCHS epochs and print the metrics of each epoch.
for epoch in range(EPOCHS):
    ret = train_one_epoch(trn_loader, model, optimizer, loss_fn, device=device)
    ret['epoch'] = epoch

    # One "name: value" line per entry. Float metrics are shown with four
    # decimals; the integer epoch index is printed as-is rather than being
    # rendered as "0.0000".
    log = "".join(
        f"{k}: {v}\n" if isinstance(v, int) else f"{k}: {v:.4f}\n"
        for k, v in ret.items()
    )
    print(log)

## 6. Inference & Save File
* 테스트 이미지에 대한 추론을 진행하고, 결과 파일을 저장합니다.

In [None]:
# Run the model over the test loader and collect the predicted class index
# of every test image, in loader order.
model.eval()
preds_list = []

# Gradients are not needed for inference, so the whole loop runs without
# autograd tracking.
with torch.no_grad():
    for image, _ in tqdm(tst_loader):
        image = image.to(device)
        preds = model(image)
        preds_list.extend(torch.argmax(preds, dim=1).cpu().numpy())

In [None]:
    pred_df = pd.DataFrame(tst_dataset.df, columns=['ID', 'target'])
    pred_df['target'] = preds_list

In [None]:
# Sanity check: the IDs of the prediction table must match the sample
# submission file row by row before the predictions are saved.
sample_submission_df = pd.read_csv("datasets_fin/sample_submission.csv")
assert (sample_submission_df['ID'] == pred_df['ID']).all()

In [None]:
# Write the predictions to pred.csv without the DataFrame index column.
pred_df.to_csv("pred.csv", index=False)

In [None]:
# Preview the first rows of the prediction table.
pred_df.head()