# 소상공인 QnA 카테고리 분류

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ln -s /content/drive/MyDrive/ /gdrive

In [3]:
!mkdir data/

In [4]:
!cp -r /gdrive/aicon/aicon_retrailer/train data/
!cp -r /gdrive/aicon/aicon_retrailer/val data/
!cp -r /gdrive/aicon/aicon_retrailer/test data/
!cp /gdrive/aicon/aicon_retrailer/sample_submission.csv data/

In [5]:
import logging

import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from datetime import datetime, timezone, timedelta
import numpy as np
import torch
import torch.optim as optim
import random
from sklearn.metrics import accuracy_score
from torch.utils.data import DataLoader

In [6]:
logging.disable(logging.WARNING)

# os.environ["CUDA_VISIBLE_DEVICES"]="0"
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [7]:
# 시드(seed) 설정

RANDOM_SEED = 2021
torch.manual_seed(RANDOM_SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

In [8]:
# working directory 지정
ROOT_PATH = './data'
DATA_DIR = os.path.join(ROOT_PATH)
RESULT_DIR = os.path.join(ROOT_PATH, 'result')
if not os.path.isdir(RESULT_DIR):
  os.makedirs(RESULT_DIR)

# hyper-parameters
EPOCHS = 30
BATCH_SIZE = 8
LEARNING_RATE = 0.0005
EARLY_STOPPING_PATIENCE = 5

In [9]:
def get_logger(name: str, file_path: str, stream=False) -> logging.RootLogger:
    logger = logging.getLogger(name)
    logger.setLevel(logging.INFO)

    formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')
    stream_handler = logging.StreamHandler()
    file_handler = logging.FileHandler(file_path)

    stream_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    if stream:
        logger.addHandler(stream_handler)
    logger.addHandler(file_handler)

    return logger

# Set system logger
system_logger = get_logger(name='train',file_path=os.path.join(ROOT_PATH,'/02_retailer_train_log.log'))

## Dataloader

In [10]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 8.3 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 62.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 70.2 MB/s 
[?25hCollecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 50.7 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully u

In [11]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from itertools import chain
import sys


class CustomDataset(Dataset):
    def __init__(self, data_dir, mode):
        self.mode = mode
        self.data_dir = data_dir
        self.intents = {'AS_날짜_요청': 0,'AS_날짜_질문': 1,'AS_방법_요청': 2,'AS_방법_질문': 3,'AS_비용_요청': 4,'AS_비용_질문': 5,
                        'AS_시간_질문': 6,'AS_일반_질문': 7,'결제_방식_질문': 8,'결제_수단_질문': 9,'결제_시기_질문': 10,'결제_영수증_질문': 11,
                        '결제_오류_질문': 12,'결제_일반_질문': 13,'결제_일반_확인': 14,'결제_재결제_질문': 15,'결제_추가_질문': 16,
                        '결제_취소_질문': 17,'결제_할인_질문': 18,'교환|반품|환불_방법_요청': 19,'교환|반품|환불_방법_질문': 20,
                        '교환|반품|환불_방법_확인': 21,'교환|반품|환불_비용_질문': 22,'교환|반품|환불_시간_요청': 23,
                        '교환|반품|환불_시간_질문': 24,'교환|반품|환불_일반_요청': 25,'교환|반품|환불_일반_질문': 26,
                        '교환|반품|환불_일반_확인': 27,'구매_예약_요청': 28,'구매_예약_질문': 29,'구매_제품_요청': 30,'구매_제품_질문': 31,
                        '매장_이용_요청': 32,'매장_이용_질문': 33,'매장_정보_질문': 34,'멤버십_사용_질문': 35,'멤버십_적립_질문': 36,
                        '배송_날짜_요청': 37,'배송_날짜_질문': 38,'배송_날짜_확인': 39,'배송_방법_요청': 40,'배송_방법_질문': 41,
                        '배송_방법_확인': 42,'배송_비용_질문': 43,'배송_오류_질문': 44,'배송_오류_확인': 45,'배송_일반_요청': 46,
                        '배송_일반_질문': 47,'배송_일반_확인': 48,'배송_지역_요청': 49,'배송_지역_질문': 50,'배송_택배사_질문': 51,
                        '부가서비스_날짜_요청': 52,'부가서비스_날짜_질문': 53,'부가서비스_방법_요청': 54,'부가서비스_방법_질문': 55,
                        '부가서비스_비용_요청': 56,'부가서비스_비용_질문': 57,'웹사이트_사용_질문': 58,'웹사이트_오류_질문': 59,
                        '제품_가격_비교': 60,'제품_가격_요청': 61,'제품_가격_질문': 62,'제품_가격_확인': 63,'제품_구성_요청': 64,
                        '제품_구성_질문': 65,'제품_구성_확인': 66,'제품_날짜_질문': 67,'제품_방법_요청': 68,'제품_방법_질문': 69,
                        '제품_방법_확인': 70,'제품_불량_요청': 71,'제품_불량_질문': 72,'제품_불량_확인': 73,'제품_소재_질문': 74,
                        '제품_시용_요청': 75,'제품_시용_질문': 76,'제품_용도_질문': 77,'제품_용도_확인': 78,'제품_원산지_질문': 79,
                        '제품_일반_비교': 80,'제품_일반_요청': 81,'제품_일반_질문': 82,'제품_일반_확인': 83,'제품_입고_요청': 84,
                        '제품_입고_질문': 85,'제품_재고_요청': 86,'제품_재고_질문': 87,'제품_재고_확인': 88,'제품_정보_비교': 89,
                        '제품_정보_요청': 90,'제품_정보_질문': 91,'제품_정보_확인': 92,'제품_추천_비교': 93,'제품_추천_요청': 94,
                        '제품_추천_질문': 95,'제품_추천_확인': 96,'제품_커스텀_요청': 97,'제품_커스텀_질문': 98,'제품_품질_비교': 99,
                        '제품_품질_요청': 100,'제품_품질_질문': 101,'제품_품질_확인': 102,'제품_호환_질문': 103,'제품_호환_확인': 104,
                        '포장_방식_요청': 105,'포장_방식_질문': 106,'포장_비용_질문': 107,'포장_일반_질문': 108,'행사_기간_질문': 109,
                        '행사_기간_확인': 110,'행사_날짜_질문': 111,'행사_유형_질문': 112,'행사_유형_확인': 113,'행사_일반_질문': 114,
                        '행사_일반_확인': 115,'행사_정보_요청': 116,'행사_정보_질문': 117}
        self.num_labels = len(self.intents)
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
        # Load data
        self.inputs, self.labels = self.data_loader(data_dir)


    def data_loader(self, path):
        print('Loading ' + self.mode + ' dataset..')
        # check if preprocessed data directory exists
        if not os.path.isdir(self.data_dir):
            print(f'!!! Cannot find {self.data_dir}... !!!')
            sys.exit()

        if os.path.isfile(os.path.join(path, self.mode, self.mode + '_X.pt')):
            inputs = torch.load(os.path.join(path, self.mode, self.mode + '_X.pt'))
            labels = torch.load(os.path.join(path, self.mode, self.mode + '_Y.pt'))

        else:
            file_path = os.path.join(path, self.mode, self.mode + '.csv')
            df = pd.read_csv(file_path)
            df = df.dropna(axis=0, how='all')
            inputs = df[df.columns[2:]]
            labels = df['intent']

            # Preprocessing
            inputs, labels = self.preprocessing(inputs, labels)
            # Save data
            torch.save(inputs ,os.path.join(path, self.mode, self.mode + '_X.pt'))
            torch.save(labels, os.path.join(path, self.mode, self.mode + '_Y.pt'))

        return inputs, labels

    def pad(self, data, pad_id, max_len):
        padded_data = list(map(lambda x : torch.cat([x, torch.tensor([pad_id] * (max_len - len(x)))]), data))
        return padded_data

    def preprocessing(self, inputs, labels):
        print('Preprocessing ' + self.mode + ' dataset..')
        #Encoding original
        src_tensor = []
        seg_tensor = []
        for i in range(len(inputs)):
            src_tensor.append(torch.tensor(list(chain.from_iterable([self.tokenizer.encode(inputs[col][i], add_special_tokens=True) \
                                                                     for col in inputs.columns if inputs[col][i] == inputs[col][i]]))))
            clss = torch.cat([torch.where(src_tensor[i] == 2)[0], torch.tensor([len(src_tensor[i])])])
            seg_tensor.append(torch.tensor(list(chain.from_iterable( \
                [[0] * (clss[i + 1] - clss[i]) if i % 2 == 0 else [1] * (clss[i + 1] - clss[i]) \
                for i, val in enumerate(clss[:-1])]))))

        #Padding
        max_encoding_len = max(list(map(lambda x: len(x), src_tensor)))
        assert max_encoding_len < 512, 'Encoding length is longer than maximum processing length.'
        src_tensor = self.pad(src_tensor, 0, max_encoding_len)
        seg_tensor = self.pad(seg_tensor, 0, max_encoding_len)

        #Convert to list of tensor to 2d tensor
        src_tensor = torch.stack(src_tensor, dim=0)
        seg_tensor = torch.stack(seg_tensor, dim=0)
        mask_tensor = (~ (src_tensor == 0))

        #Encoding labels
        label_tensor = torch.tensor(self.label_encoder(labels.values))


        #Integrate the tensor {1st dimension : {src, seg, mask}, 2nd dim : {number of samples}, 3rd dim : {encoding dimension}}
        input_tensor = torch.cat([src_tensor.unsqueeze(dim=1) , seg_tensor.unsqueeze(dim=1), mask_tensor.unsqueeze(dim=1)], dim=1)

        return input_tensor, label_tensor

    def label_encoder(self, labels):
        try:
            labels = list(map(lambda x : self.intents[x], labels))
            return labels
        except:
            assert 'Invalid intent'

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index, :, :], self.labels[index]


In [12]:
train_dataset = CustomDataset(data_dir=DATA_DIR, mode='train')
validation_dataset = CustomDataset(data_dir=DATA_DIR, mode='val')
train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
validation_dataloader = DataLoader(dataset=validation_dataset, batch_size=BATCH_SIZE, shuffle=False)

Downloading:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/504 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/396k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/124 [00:00<?, ?B/s]

Loading train dataset..
Loading val dataset..


## Model

In [13]:
!pip install pytorch-pretrained-bert

Collecting pytorch-pretrained-bert
  Downloading pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123 kB)
[?25l[K     |██▋                             | 10 kB 36.6 MB/s eta 0:00:01[K     |█████▎                          | 20 kB 26.0 MB/s eta 0:00:01[K     |████████                        | 30 kB 18.4 MB/s eta 0:00:01[K     |██████████▋                     | 40 kB 16.2 MB/s eta 0:00:01[K     |█████████████▎                  | 51 kB 7.2 MB/s eta 0:00:01[K     |███████████████▉                | 61 kB 8.4 MB/s eta 0:00:01[K     |██████████████████▌             | 71 kB 8.0 MB/s eta 0:00:01[K     |█████████████████████▏          | 81 kB 9.0 MB/s eta 0:00:01[K     |███████████████████████▉        | 92 kB 9.5 MB/s eta 0:00:01[K     |██████████████████████████▌     | 102 kB 7.2 MB/s eta 0:00:01[K     |█████████████████████████████▏  | 112 kB 7.2 MB/s eta 0:00:01[K     |███████████████████████████████▊| 122 kB 7.2 MB/s eta 0:00:01[K     |████████████████████████████████

In [14]:
from pytorch_pretrained_bert import BertModel
import pytorch_pretrained_bert
import transformers

class IntentClassifier(nn.Module):

    def __init__(self):
        """
        """
        super(IntentClassifier, self).__init__()
        self.model = transformers.BertForSequenceClassification.from_pretrained("beomi/KcELECTRA-base", num_labels = 118)
        self.hidden_dim = 768
        self.fc = nn.Linear(self.hidden_dim, 6)

    def forward(self, x, mask, segs, target=None):
        if target is not None:
            output = self.model(input_ids = x.long(), attention_mask = mask.float(), token_type_ids = segs.long(), labels=target.unsqueeze(1))
        else:
            output = self.model(input_ids = x.long(), attention_mask = mask.float(), token_type_ids = segs.long())
        return output


In [15]:
class LossEarlyStopper():
    """Early stopper
    
    Attributes:
        patience (int): loss가 줄어들지 않아도 학습할 epoch 수
        verbose (bool): 로그 출력 여부, True 일 때 로그 출력
        patience_counter (int): loss 가 줄어들지 않을 때 마다 1씩 증가
        min_loss (float): 최소 loss
        stop (bool): True 일 때 학습 중단

    """

    def __init__(self, patience: int, verbose: bool, logger:logging.RootLogger=None)-> None:
        """ 초기화

        Args:
            patience (int): loss가 줄어들지 않아도 학습할 epoch 수
            weight_path (str): weight 저장경로
            verbose (bool): 로그 출력 여부, True 일 때 로그 출력
        """
        self.patience = patience
        self.verbose = verbose

        self.patience_counter = 0
        self.min_loss = np.Inf
        self.logger = logger
        self.stop = False

    def check_early_stopping(self, loss: float)-> None:
        """Early stopping 여부 판단

        Args:
            loss (float):

        Examples:
            
        Note:
            
        """  

        if self.min_loss == np.Inf:
            self.min_loss = loss
            # self.save_checkpoint(loss=loss, model=model)

        elif loss > self.min_loss:
            self.patience_counter += 1
            msg = f"Early stopper, Early stopping counter {self.patience_counter}/{self.patience}"

            if self.patience_counter == self.patience:
                self.stop = True

            if self.verbose:
                self.logger.info(msg) if self.logger else print(msg)
                
        elif loss <= self.min_loss:
            self.save_model = True
            msg = f"Early stopper, Validation loss decreased {self.min_loss} -> {loss}"
            self.min_loss = loss
            # self.save_checkpoint(loss=loss, model=model)

            if self.verbose:
                self.logger.info(msg) if self.logger else print(msg)

In [16]:
class Trainer():

    def __init__(self, model, device, loss_fn, metric_fn, optimizer=None, scheduler=None, logger=None):
        """ 초기화
        """
        self.model = model
        self.device = device
        self.optimizer = optimizer
        self.loss_fn = loss_fn
        self.logger = logger
        self.scheduler = scheduler
        self.metric_fn = metric_fn

    def train_epoch(self, dataloader, epoch_index):
        """ 한 epoch에서 수행되는 학습 절차

        Args:
            dataloader (`dataloader`)
            epoch_index (int)
        """
        self.model.train()
        self.train_total_loss = 0
        target_lst = []
        pred_lst = []
        for batch_index, (data, target) in enumerate(dataloader):
            data = data.to(self.device)
            target = target.to(self.device)
            src = data[:, 0, :]
            segs = data[:, 1, :]
            mask = data[:, 2, :]
            output = self.model(src, mask, segs, target)
            loss = output.loss
            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()
            self.scheduler.step()
            self.train_total_loss += loss
            target_lst.extend(target.cpu().tolist())
            pred = output.logits.argmax(dim=1)
            pred_lst.extend(pred.cpu().tolist())
            
            batch_score = self.metric_fn(target_lst, pred_lst)
            
            msg = f"Epoch {epoch_index} train batch {batch_index}/{len(dataloader)}: {batch_index * dataloader.batch_size}/{len(dataloader)} mean loss: {loss} score: {batch_score}"
            if batch_index%100 == 0:
                if self.logger:
                    self.logger.info(msg)
                print(msg)
            
        self.train_mean_loss = self.train_total_loss / len(dataloader)
        self.train_score = accuracy_score(y_true=target_lst, y_pred=pred_lst)
        msg = f'Epoch {epoch_index}, Train, loss: {self.train_mean_loss}, Score: {self.train_score}'
        print(msg)
        self.logger.info(msg) if self.logger else print(msg)

    def validate_epoch(self, dataloader, epoch_index):
        """ 한 epoch에서 수행되는 검증 절차

        Args:
            dataloader (`dataloader`)
            epoch_index (int)
        """
        self.model.eval()
        self.val_total_loss = 0
        target_lst = []
        pred_lst = []
        with torch.no_grad():
            for batch_index, (data, target) in enumerate(dataloader):
                data = data.to(self.device)
                target = target.to(self.device)
                src = data[:, 0, :]
                segs = data[:, 1, :]
                mask = data[:, 2, :]
                output = self.model(src, mask, segs, target)
                loss = output.loss
                self.val_total_loss += loss
                target_lst.extend(target.tolist())
                pred_lst.extend(output.logits.argmax(dim=1).tolist())
            self.val_mean_loss = self.val_total_loss / len(dataloader)
            self.validation_score = accuracy_score(y_true=target_lst, y_pred=pred_lst)
            msg = f'Epoch {epoch_index}, Validation, loss: {self.val_mean_loss}, Score: {self.validation_score}'
            print(msg)
            self.logger.info(msg) if self.logger else print(msg)

    def test_epoch(self, dataloader, epoch_index):
        """ 한 epoch에서 수행되는 검증 절차

        Args:
            dataloader (`dataloader`)
            epoch_index (int)
        """
        self.model.eval()
        pred_lst = []
        with torch.no_grad():
            for batch_index, (data) in enumerate(dataloader):
                data = data.to(self.device)
                src = data[:, 0, :]
                segs = data[:, 1, :]
                mask = data[:, 2, :]
                output = self.model(src, mask, segs)
                pred_lst.extend(output.logits.argmax(dim=1).tolist())
                
                if batch_index % 100 == 0:
                    print(f'Prediction: {batch_index} batch completed')
        return pred_lst

In [17]:
# Load Model
model = IntentClassifier().to(device)

# Save Initial Model
# torch.save({'model':model.state_dict()}, os.path.join(RESULT_DIR, 'initial.pt'))

print('===== Review Model Architecture =====')
print(f'{model} \n')

# Set optimizer, scheduler, loss function, metric function
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()
scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e5, max_lr=0.0001, epochs=EPOCHS, steps_per_epoch=len(train_dataloader))

# Set metrics
metric_fn = accuracy_score

# Set trainer
trainer = Trainer(model, device, loss_fn, metric_fn ,optimizer, scheduler, logger=system_logger)

# Set earlystopper
early_stopper = LossEarlyStopper(patience=EARLY_STOPPING_PATIENCE, verbose=True, logger=system_logger)

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

===== Review Model Architecture =====
IntentClassifier(
  (model): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(50135, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in

In [None]:
criterion = 0
# Train
for epoch_index in range(EPOCHS):
    trainer.train_epoch(train_dataloader, epoch_index=epoch_index)
    trainer.validate_epoch(validation_dataloader, epoch_index=epoch_index)

    # early_stopping check
    early_stopper.check_early_stopping(loss=trainer.val_mean_loss)

    if early_stopper.stop:
        print('Early stopped')
        break

    if trainer.validation_score > criterion:
        criterion = trainer.validation_score
        check_point = {
            'model': model.state_dict(),
            'optimizer': optimizer.state_dict(),
            'scheduler': scheduler.state_dict()
        }
        torch.save(check_point, os.path.join(RESULT_DIR, 'best.pt'))

Epoch 0 train batch 0/40985: 0/40985 mean loss: 4.867671489715576 score: 0.0
Epoch 0 train batch 100/40985: 800/40985 mean loss: 4.581225872039795 score: 0.0
Epoch 0 train batch 200/40985: 1600/40985 mean loss: 4.722387313842773 score: 0.0
Epoch 0 train batch 300/40985: 2400/40985 mean loss: 4.769691467285156 score: 0.0
Epoch 0 train batch 400/40985: 3200/40985 mean loss: 4.675554275512695 score: 0.0
Epoch 0 train batch 500/40985: 4000/40985 mean loss: 4.775263786315918 score: 0.000499001996007984
Epoch 0 train batch 600/40985: 4800/40985 mean loss: 4.728079795837402 score: 0.0008319467554076539
Epoch 0 train batch 700/40985: 5600/40985 mean loss: 4.612095355987549 score: 0.000891583452211127
Epoch 0 train batch 800/40985: 6400/40985 mean loss: 4.711632251739502 score: 0.0012484394506866417
Epoch 0 train batch 900/40985: 7200/40985 mean loss: 4.885817050933838 score: 0.0015260821309655938
Epoch 0 train batch 1000/40985: 8000/40985 mean loss: 4.548514366149902 score: 0.00212287712287712

batch == 8 : Epoch 0 train batch 0/40985: 0/40985 mean loss: 4.864230632781982 score: 0.0

## Inference

In [None]:
TRAINED_MODEL_PATH = os.path.join(RESULT_DIR, 'best.pt')

BATCH_SIZE = 32

In [None]:
from torch.utils.data import Dataset
from transformers import AutoTokenizer
from itertools import chain
import sys

class TestDataset(Dataset):
    def __init__(self, data_dir, mode):
        self.mode = mode
        self.data_dir = data_dir
        self.mode = mode
        self.data_dir = data_dir
        self.tokenizer = AutoTokenizer.from_pretrained("beomi/KcELECTRA-base")
        
        self.intents = {0:'AS_날짜_요청' ,1:'AS_날짜_질문' ,2:'AS_방법_요청' ,3:'AS_방법_질문' ,4:'AS_비용_요청' ,5:'AS_비용_질문' ,
                        6:'AS_시간_질문' ,7:'AS_일반_질문' ,8:'결제_방식_질문' ,9:'결제_수단_질문' ,10:'결제_시기_질문' ,11:'결제_영수증_질문' ,
                        12:'결제_오류_질문' ,13:'결제_일반_질문' ,14:'결제_일반_확인' ,15:'결제_재결제_질문' ,16:'결제_추가_질문' ,
                        17:'결제_취소_질문' ,18:'결제_할인_질문' ,19:'교환|반품|환불_방법_요청' ,20:'교환|반품|환불_방법_질문' ,
                        21:'교환|반품|환불_방법_확인' ,22:'교환|반품|환불_비용_질문' ,23:'교환|반품|환불_시간_요청' ,
                        24:'교환|반품|환불_시간_질문' ,25:'교환|반품|환불_일반_요청' ,26:'교환|반품|환불_일반_질문' ,
                        27:'교환|반품|환불_일반_확인' ,28:'구매_예약_요청' ,29:'구매_예약_질문' ,30:'구매_제품_요청' ,31:'구매_제품_질문' ,
                        32:'매장_이용_요청' ,33:'매장_이용_질문' ,34:'매장_정보_질문' ,35:'멤버십_사용_질문' ,36:'멤버십_적립_질문' ,
                        37:'배송_날짜_요청' ,38:'배송_날짜_질문' ,39:'배송_날짜_확인' ,40:'배송_방법_요청' ,41:'배송_방법_질문' ,
                        42:'배송_방법_확인' ,43:'배송_비용_질문' ,44:'배송_오류_질문' ,45:'배송_오류_확인' ,46:'배송_일반_요청' ,
                        47:'배송_일반_질문' ,48:'배송_일반_확인' ,49:'배송_지역_요청' ,50:'배송_지역_질문' ,51:'배송_택배사_질문' ,
                        52:'부가서비스_날짜_요청' ,53:'부가서비스_날짜_질문' ,54:'부가서비스_방법_요청' ,55:'부가서비스_방법_질문' ,
                        56:'부가서비스_비용_요청' ,57:'부가서비스_비용_질문' ,58:'웹사이트_사용_질문' ,59:'웹사이트_오류_질문' ,
                        60:'제품_가격_비교' ,61:'제품_가격_요청' ,62:'제품_가격_질문' ,63:'제품_가격_확인' ,64:'제품_구성_요청' ,
                        65:'제품_구성_질문' ,66:'제품_구성_확인' ,67:'제품_날짜_질문' ,68:'제품_방법_요청' ,69:'제품_방법_질문' ,
                        70:'제품_방법_확인' ,71:'제품_불량_요청' ,72:'제품_불량_질문' ,73:'제품_불량_확인' ,74:'제품_소재_질문' ,
                        75:'제품_시용_요청' ,76:'제품_시용_질문' ,77:'제품_용도_질문' ,78:'제품_용도_확인' ,79:'제품_원산지_질문' ,
                        80:'제품_일반_비교' ,81:'제품_일반_요청' ,82:'제품_일반_질문' ,83:'제품_일반_확인' ,84:'제품_입고_요청' ,
                        85:'제품_입고_질문' ,86:'제품_재고_요청' ,87:'제품_재고_질문' ,88:'제품_재고_확인' ,89:'제품_정보_비교' ,
                        90:'제품_정보_요청' ,91:'제품_정보_질문' ,92:'제품_정보_확인' ,93:'제품_추천_비교' ,94:'제품_추천_요청' ,
                        95:'제품_추천_질문' ,96:'제품_추천_확인' ,97:'제품_커스텀_요청' ,98:'제품_커스텀_질문' ,99:'제품_품질_비교' ,
                        100:'제품_품질_요청' ,101:'제품_품질_질문' ,102:'제품_품질_확인' ,103:'제품_호환_질문' ,104:'제품_호환_확인' ,
                        105:'포장_방식_요청' ,106:'포장_방식_질문' ,107:'포장_비용_질문' ,108:'포장_일반_질문' ,109:'행사_기간_질문' ,
                        110:'행사_기간_확인' ,111:'행사_날짜_질문' ,112:'행사_유형_질문' ,113:'행사_유형_확인' ,114:'행사_일반_질문' ,
                        115:'행사_일반_확인' ,116:'행사_정보_요청' ,117:'행사_정보_질문' }
        
        # Load data
        self.inputs = self.data_loader(data_dir)
        self.conv_num = pd.read_csv(os.path.join(data_dir,'test','test.csv'))['conv_num']

    def data_loader(self, path):
        print('Loading ' + self.mode + ' dataset..')
        # check if preprocessed data directory exists
        if not os.path.isdir(self.data_dir):
            print(f'!!! Cannot find {self.data_dir}... !!!')
            sys.exit()

        if os.path.isfile(os.path.join(path, self.mode, self.mode + '_X.pt')):
            inputs = torch.load(os.path.join(path, self.mode, self.mode + '_X.pt'))

        else:
            file_path = os.path.join(path, self.mode, self.mode + '.csv')
            df = pd.read_csv(file_path)
            df = df.dropna(axis=0, how='all')
            inputs = df[df.columns[1:]]

            # Preprocessing
            inputs = self.preprocessing(inputs)
            # Save data
            torch.save(inputs ,os.path.join(path, self.mode, self.mode + '_X.pt'))

        return inputs

    def pad(self, data, pad_id, max_len):
        padded_data = list(map(lambda x : torch.cat([x, torch.tensor([pad_id] * (max_len - len(x)))]), data))
        return padded_data

    def preprocessing(self, inputs):
        print('Preprocessing ' + self.mode + ' dataset..')
        #Encoding original
        src_tensor = []
        seg_tensor = []
        for i in range(len(inputs)):
            src_tensor.append(torch.tensor(list(chain.from_iterable([self.tokenizer.encode(inputs[col][i], add_special_tokens=True) \
                                                                     for col in inputs.columns if inputs[col][i] == inputs[col][i]]))))
            clss = torch.cat([torch.where(src_tensor[i] == 2)[0], torch.tensor([len(src_tensor[i])])])
            seg_tensor.append(torch.tensor(list(chain.from_iterable( \
                [[0] * (clss[i + 1] - clss[i]) if i % 2 == 0 else [1] * (clss[i + 1] - clss[i]) \
                for i, val in enumerate(clss[:-1])]))))

        #Padding
        max_encoding_len = max(list(map(lambda x: len(x), src_tensor)))
        assert max_encoding_len < 512, 'Encoding length is longer than maximum processing length.'
        src_tensor = self.pad(src_tensor, 0, max_encoding_len)
        seg_tensor = self.pad(seg_tensor, 0, max_encoding_len)

        #Convert to list of tensor to 2d tensor
        src_tensor = torch.stack(src_tensor, dim=0)
        seg_tensor = torch.stack(seg_tensor, dim=0)
        mask_tensor = (~ (src_tensor == 0))

        #Integrate the tensor {1st dimension : {src, seg, mask}, 2nd dim : {number of samples}, 3rd dim : {encoding dimension}}
        input_tensor = torch.cat([src_tensor.unsqueeze(dim=1) , seg_tensor.unsqueeze(dim=1), mask_tensor.unsqueeze(dim=1)], dim=1)

        return input_tensor

    def label_decoder(self, labels):
        try:
            labels = list(map(lambda x : self.intents[x], labels))
            return labels
        except:
            assert 'Invalid intent'

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, index):
        return self.inputs[index, :, :]

In [None]:
# Load dataset & dataloader
test_dataset = TestDataset(data_dir=DATA_DIR, mode='test')
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# Load Model
model = IntentClassifier().to(device)
model.load_state_dict(torch.load(TRAINED_MODEL_PATH, map_location=device))
""" 현재 제공된 weight 파일은 model.state_dict()만 저장해서 위 코드로 실행하지만
    베이스라인의 학습 과정처럼 check_point를 저장하면 아래 코드로 실행 """
# model.load_state_dict(torch.load(TRAINED_MODEL_PATH)['model'], map_location = device)

# Set metrics & Loss function
metric_fn = accuracy_score
loss_fn = nn.CrossEntropyLoss()

# Set trainer
trainer = Trainer(model, device, loss_fn, metric_fn)

In [None]:
# Predict
pred = []
pred = trainer.test_epoch(test_dataloader, epoch_index=0)
pred = test_dataset.label_decoder(pred)
print('decode completed--')

# Save prediction
pred_df = pd.DataFrame()
pred_df['conv_num'] = test_dataset.conv_num
pred_df['intent'] = pred


pred_df.to_csv(os.path.join(os.path.join(RESULT_DIR,'02_retailer_pred.csv'), index=False))


Predict cell : 실행시간 14분 (GPU 사용시)