# 한국어 문장 관계 분류 경진대회 필사

📌 전제 (premise) 문장을 참고해 가설 (hypothesis) 문장이 참/거짓/알 수 없음 중 무엇인지 판별하는 문제 → 한국어 문장 쌍 분석 모델 개발 (Natural language inference) 에 관한 대회
- BERT 모델에서 나아가, 연산량 측면에서 개선을 보인 ELECTRA 모델과 RoBERTa 모델을 활용한 노트북이 상위권 수상을 차지


### 1등 노트북

- PublicLB | 0.896 | Finetuning Electra with Arcface
- https://dacon.io/competitions/official/235875/codeshare/4589?page=1&dtype=recent

**Keypoint**
-  2개의 모델 사용; 두 모델 결과값을 Softvoting Ensemble
    - (1) Tunib's KoElectra-base finetuned with Arcface Head(Public LB 0.896)
    - (2) KLUE Roberta-large finetuned with sentence pooling embeddings and special token embeddings(Public LB 0.902)
- KoElectra-base는 #params으로 따지면 Roberta-large의 1/3 규모의 작은 모델로, Attention Layers 수로 따지면 Roberta-large의 절반 정도 깊이 모델


In [None]:
from IPython.display import clear_output
# Install the third-party packages via notebook shell magics,
# then clear the pip output from the cell.
!pip install adamp
!pip install wandb
!pip install transformers
clear_output()

In [None]:
""" define train data and test data path """
import os
from glob import glob

# environment variable settings
# NOTE(review): tokenizer parallelism is forced on here while the config later
# sets CFG.num_workers = 2 — confirm this combination is safe with worker processes.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# NOTE(review): presumably a debugging aid for CUDA errors; confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

# path definitions: all data files are resolved relative to the current working directory
ROOT_PATH = os.path.abspath(".")
TRAIN_FILE_PATH = os.path.join(ROOT_PATH, "train_dataset_v6_sentiment_and_score_pseudo_labeled_pororo_added.csv")
TEST_FILE_PATH = os.path.join(ROOT_PATH, 'test_dataset_v6_sentiment_and_score_pseudo_labeled_pororo_added.csv')
SAMPLE_SUBMISSION_PATH = os.path.join(ROOT_PATH, 'sample_submission.csv')

In [None]:
""" Set configuration as dictionary format """

import wandb
from datetime import datetime
from easydict import EasyDict

# log in to wandb (required before the run is initialised further below)
wandb.login()

# CFG Configuration
CFG = wandb.config # wandb.config provides functionality of easydict.EasyDict
CFG.DEBUG = False

# Dataset Config as constants
CFG.num_labels = 3
CFG.num_workers = 2

# Train configuration
CFG.user_name = "snoop2head"
CFG.model_name = "tunib/electra-ko-base"

# Electra Paper's Hyperparameter for Finetuning GLUE details at Table 7: https://arxiv.org/pdf/2003.10555.pdf
# NOTE: every value in this group except dropout_rate is overwritten by the second group below.
CFG.learning_rate = 1e-4
CFG.adam_epsilon = 1e-6
CFG.weight_decay = 0
CFG.num_epochs = 10 
CFG.train_batch_size = 32
CFG.val_batch_size = CFG.train_batch_size
CFG.dropout_rate = 0.1

# However, while observing evaluation loss and f1 score, the following parameter resulted in better performance for KLUE NLI dataset.
# Partially referenced from KRElectra's KorNLI finetuning Hyperparams: https://github.com/snunlp/KR-ELECTRA/blob/master/finetune/config/kornli/kr-electra.json#L15-L31
# These assignments are the effective values used for training.
CFG.learning_rate = 4e-5
CFG.adam_epsilon = 1e-8
CFG.weight_decay = 1e-2
CFG.num_folds = 6 # 1st fold: Movie Review, 2nd fold: Airbnb Review, 3 ~ 6th fold: Other datasets
CFG.num_epochs = 20
CFG.train_batch_size = 3 * 40
CFG.val_batch_size = CFG.train_batch_size

In [None]:
import random
import pandas as pd
import numpy as np
import torch

def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch RNGs and pin cuDNN to deterministic mode.

    Args:
        seed: integer seed applied to every random number generator.
    """
    # PyTorch generators (CPU, current CUDA device, all CUDA devices)
    for seed_torch in (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    ):
        seed_torch(seed)

    # cuDNN: deterministic kernels, no auto-tuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # NumPy and the standard library
    np.random.seed(seed)
    random.seed(seed)


seed_everything(42)

In [None]:
def label_to_num(label):
    """Convert an iterable of NLI label strings into their integer class ids.

    Args:
        label: iterable of "entailment" / "contradiction" / "neutral" strings.

    Returns:
        List of ints in the same order as the input.

    Raises:
        KeyError: if a value is not one of the three known labels.
    """
    class_ids = {"entailment": 0, "contradiction": 1, "neutral": 2}
    return [class_ids[name] for name in label]


In [None]:
from transformers import AutoTokenizer

# get tokenizer for the backbone named in the config
tokenizer = AutoTokenizer.from_pretrained(CFG.model_name)

# marker tokens that will wrap the premise and the hypothesis sentence
added_special_tokens = ["[PREMISE]", "[HYPOTHESIS]"]

# register the markers as special tokens so the tokenizer keeps each one as a single token
tokenizer.add_special_tokens({"additional_special_tokens":added_special_tokens})

# vocabulary size after adding the two markers; models below resize their embeddings to this length
len(tokenizer) # 32002

In [None]:
from ast import literal_eval

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

def literal_eval_without_error(input_string):
    """Parse a stringified flat list of floats into a Python list of floats.

    Example: "[0.2765555, 0.7233189, 6.404926e-05]" -> [0.2765555, 0.7233189, 6.404926e-05]

    Args:
        input_string: text of the form "[a, b, c]". A value that is already a
            sequence of numbers (e.g. when the parsing cell is re-run on an
            already converted column) is accepted and converted element-wise.

    Returns:
        List of floats; an empty list for "[]".

    Raises:
        ValueError: if an element is not a valid float literal.
    """
    if not isinstance(input_string, str):
        # Already parsed: make the conversion idempotent instead of crashing on .split().
        return [float(item) for item in input_string]

    # Drop surrounding whitespace, then the enclosing brackets.
    inner = input_string.strip()[1:-1]
    if not inner.strip():
        return []

    # float() tolerates the whitespace left around each comma-separated item.
    return [float(item) for item in inner.split(",")]

# Load the training CSV and show its first rows.
df_pororo_dataset = pd.read_csv(TRAIN_FILE_PATH)
display(df_pororo_dataset.head(3))

# The probability columns are parsed from their stringified-list form into lists of floats.
for _probability_column in (
    "electra_premise_probability",
    "electra_hypothesis_probability",
    "electra_premise_score_probability",
    "electra_hypothesis_score_probability",
):
    df_pororo_dataset[_probability_column] = (
        df_pororo_dataset[_probability_column].apply(literal_eval_without_error)
    )

In [None]:
def preprocess_augmentation_electra(
    df:pd.DataFrame, 
    str_premise_start_special_token = "^", 
    str_premise_end_special_token = "^", 
    str_hypothesis_start_special_token = "*",
    str_hypothesis_end_special_token = "*",
    is_test=False
    ):
    """Wrap premise / hypothesis sentences with marker tokens and source hints.

    Every sentence is prefixed with the matching half of its `source_label`
    ("prefix/subfix") and enclosed in the given start / end marker strings.
    Rows whose source is a movie or travel review additionally get a
    "<score>점 만큼 <sentiment>인 " hint built from the pororo and electra
    pseudo labels.

    Args:
        df: frame with `premise`, `hypothesis`, `label`, `source_label` columns
            (plus the sentiment / score columns for review rows and `index`
            when `is_test` is true).
        str_premise_start_special_token: marker placed before the premise.
        str_premise_end_special_token: marker placed after the premise.
        str_hypothesis_start_special_token: marker placed before the hypothesis.
        str_hypothesis_end_special_token: marker placed after the hypothesis.
        is_test: when true, the `index` column is carried over to the result.

    Returns:
        New DataFrame with the wrapped `premise` / `hypothesis`, `label`,
        optionally `index`, the token counts `premise_len` / `hypothesis_len`,
        the marker positions `premise_index` / `hypothesis_index`, and
        `total_length` of the full `[CLS] premise [SEP] hypothesis [SEP]` input.
    """
    CLS_TOKEN_LEN = 1
    SEP_TOKEN_LEN = 1
    review_sources = ("영화후기/영화평가", "여행후기/여행평가")
    score_weights = [1, 2, 4, 5]

    def agreed_sentiment(pororo_sentiment, electra_sentiment):
        # keep the pororo label only when both labellers agree, otherwise fall back to neutral
        return pororo_sentiment if pororo_sentiment == electra_sentiment else "중립적"

    def blended_score(pororo_score, electra_score_probability):
        # electra score is the probability-weighted sum; the final score averages both sources
        electra_score = np.dot(score_weights, electra_score_probability)
        mean_score = pororo_score * 0.5 + electra_score * 0.5
        return '{:.2f}'.format(round(mean_score, 2))

    def token_count(text):
        return len(tokenizer.encode(text, add_special_tokens=False))

    wrapped_premises = []
    wrapped_hypotheses = []
    labels = []
    test_indices = []

    for _, row in df.iterrows():
        if is_test:
            test_indices.append(row['index'])
        premise = row['premise']
        hypothesis = row['hypothesis']
        labels.append(row['label'])

        # source (or categorical) information: "prefix/subfix"
        source_label = row['source_label']
        source_parts = source_label.split("/")
        source_prefix, source_subfix = source_parts[0], source_parts[1]

        premise_hint = ""
        hypothesis_hint = ""
        if source_label in review_sources:
            # sentiment hint (pororo label vs electra label)
            premise_sentiment = agreed_sentiment(
                row["premise_sentiment"], row["electra_premise_label"]
            )
            hypothesis_sentiment = agreed_sentiment(
                row["hypothesis_sentiment"], row["electra_hypothesis_label"]
            )

            # score hint (pororo score blended with the electra score distribution)
            premise_score = blended_score(
                row["premise_score"], row["electra_premise_score_probability"]
            )
            hypothesis_score = blended_score(
                row["hypothesis_score"], row["electra_hypothesis_score_probability"]
            )

            premise_hint = premise_score + "점 만큼 " + premise_sentiment + "인 "
            hypothesis_hint = hypothesis_score + "점 만큼 " + hypothesis_sentiment + "인 "

        # marker + optional hint + source tag + sentence + marker
        wrapped_premises.append("".join((
            str_premise_start_special_token,
            premise_hint,
            source_prefix, ":", premise,
            str_premise_end_special_token,
        )))
        wrapped_hypotheses.append("".join((
            str_hypothesis_start_special_token,
            hypothesis_hint,
            source_subfix, ":", hypothesis,
            str_hypothesis_end_special_token,
        )))

    df_return = pd.DataFrame({})
    df_return["premise"] = wrapped_premises
    df_return["hypothesis"] = wrapped_hypotheses
    df_return["label"] = labels
    if is_test:
        df_return["index"] = test_indices

    # token counts of each wrapped sentence, without [CLS] / [SEP]
    df_return['premise_len'] = df_return['premise'].apply(token_count)
    df_return['hypothesis_len'] = df_return['hypothesis'].apply(token_count)

    # the premise marker directly follows [CLS]; the hypothesis marker directly follows the first [SEP]
    df_return['premise_index'] = CLS_TOKEN_LEN
    df_return['hypothesis_index'] = CLS_TOKEN_LEN + df_return['premise_len'] + SEP_TOKEN_LEN
    df_return['total_length'] = (
        CLS_TOKEN_LEN + df_return['premise_len']
        + SEP_TOKEN_LEN + df_return['hypothesis_len'] + SEP_TOKEN_LEN
    )
    return df_return

In [None]:
# set display options: never truncate column contents so whole sentences are visible
pd.options.display.max_colwidth = None

# Wrap every sentence with the [PREMISE] / [HYPOTHESIS] markers that were
# registered on the tokenizer (same marker is used as start and end token).
df_pororo_dataset_preprocessed = preprocess_augmentation_electra(
    df_pororo_dataset, 
    str_premise_start_special_token = "[PREMISE]", 
    str_premise_end_special_token = "[PREMISE]", 
    str_hypothesis_start_special_token = "[HYPOTHESIS]",
    str_hypothesis_end_special_token = "[HYPOTHESIS]",
)

df_pororo_dataset_preprocessed.head(3)

In [None]:
# show distribution of total_length
df_pororo_dataset_preprocessed['total_length'].hist(bins=100)
print(max(df_pororo_dataset_preprocessed['total_length'])) # tunib electra max token length: 97
# the longest input in the data becomes the padding / truncation length used by the datasets below
CFG.max_token_length = max(df_pororo_dataset_preprocessed['total_length'])

In [None]:
from torch.utils.data import Dataset

class RBERTDataset(Dataset):
    """Dataset yielding tokenized sentence pairs plus premise / hypothesis span masks.

    Each item is the tokenizer output for "premise [SEP] hypothesis" padded to
    `CFG.max_token_length`, extended with `premise_mask`, `hypothesis_mask`,
    `label` and the positions of the two marker tokens.

    Args:
        dataset: preprocessed DataFrame with `premise`, `hypothesis`, `label`,
            `premise_index`, `premise_len`, `hypothesis_index` and
            `hypothesis_len` columns.
        is_training: when true the string labels are converted with
            `label_to_num`; otherwise the `label` column is used as is.
    """

    def __init__(self, dataset, is_training:bool=True):
        
        # object values with string items
        self.dataset = dataset
        self.premise = self.dataset['premise']
        self.hypothesis = self.dataset['hypothesis']
        # Re-index to 0..n-1 so that the positional idx used by __getitem__
        # selects the same row here as in the tensors below, even when the
        # DataFrame carries a non-default index (filtered / shuffled frame).
        self.sentence = (
            self.dataset['premise'] + ' [SEP] ' + self.dataset['hypothesis']
        ).reset_index(drop=True)
        
        # torch values with integer values
        if is_training:
            self.train_label = label_to_num(self.dataset['label'].values)
        else:
            self.train_label = self.dataset['label'].values
        self.label = torch.tensor(self.train_label)
        self.premise_start_index = torch.tensor(self.dataset['premise_index'].values)
        self.premise_len = torch.tensor(self.dataset['premise_len'].values)
        self.hypothesis_start_index = torch.tensor(self.dataset['hypothesis_index'].values)
        self.hypothesis_len = torch.tensor(self.dataset['hypothesis_len'].values)
    
    def __getitem__(self, idx):
        """Return the tokenized item at position `idx`."""
        # object values with string items
        sentence = self.sentence[idx]
        
        # torch values with integer values
        premise_start_index = self.premise_start_index[idx]
        premise_len = self.premise_len[idx]
        hypothesis_start_index = self.hypothesis_start_index[idx]
        hypothesis_len = self.hypothesis_len[idx]
        label = self.label[idx]
        
        # tokenize; token_type_ids are not requested, only input_ids and attention_mask are kept
        item = tokenizer(
            sentence,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=CFG.max_token_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            )
        
        # return_tensors="pt" adds a leading batch dimension of 1; drop it
        item['input_ids'] = item['input_ids'].squeeze(0)
        item['attention_mask'] = item['attention_mask'].squeeze(0)
        
        # masks marking which token positions belong to the premise / the hypothesis
        premise_mask, hypothesis_mask = self.add_entity_mask(
            premise_start_index,
            hypothesis_start_index,
            premise_len, 
            hypothesis_len,
        )
        item['premise_mask'] = torch.tensor(premise_mask)
        item['hypothesis_mask'] = torch.tensor(hypothesis_mask)

        # fill label
        item['label'] = label

        item['premise_special_token_index'] = premise_start_index
        item['hypothesis_special_token_index'] = hypothesis_start_index
        return item

    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataset)

    def add_entity_mask(
            self,
            premise_start_index,
            hypothesis_start_index,
            premise_len, 
            hypothesis_len
        ):
        """Build 0/1 masks over the padded sequence for the premise and hypothesis spans.

        Args:
            premise_start_index: position of the first premise token.
            hypothesis_start_index: position of the first hypothesis token.
            premise_len: number of premise tokens.
            hypothesis_len: number of hypothesis tokens.

        Returns:
            Tuple `(premise_mask, hypothesis_mask)` of integer numpy arrays of
            length `CFG.max_token_length`, set to 1 inside the respective span.
        """
        # initialize entity masks
        premise_mask = np.zeros(CFG.max_token_length, dtype=int)
        hypothesis_mask = np.zeros(CFG.max_token_length, dtype=int)

        premise_mask[
          premise_start_index : premise_start_index + premise_len
        ] = 1
        
        hypothesis_mask[
          hypothesis_start_index : hypothesis_start_index + hypothesis_len
        ] = 1
        
        return premise_mask, hypothesis_mask

In [None]:
# Build the dataset once (instead of once per sample) and pull its first three items.
_rbert_preview_dataset = RBERTDataset(df_pororo_dataset_preprocessed)
sample_1, sample_2, sample_3 = (_rbert_preview_dataset[i] for i in range(3))

print(sample_1)

In [None]:
# Sanity check: decode the sampled input ids back to text to inspect the tokenized sentence pairs.
decoded_item_1, decoded_item_2, decoded_item_3 = (
    tokenizer.decode(sample['input_ids'])
    for sample in (sample_1, sample_2, sample_3)
)

print(decoded_item_1, decoded_item_2, decoded_item_3, sep="\n")

In [None]:
# import nn from torch
from torch import nn
from transformers import AutoConfig, AutoModel

""" R-BERT: https://github.com/monologg/R-BERT """
class FCLayer(nn.Module):
    """Projection head applying dropout, an optional GELU, then a linear layer.

    Args:
        input_dim: size of the incoming feature vector.
        output_dim: size of the projected feature vector.
        dropout_rate: dropout probability applied to the input.
        use_activation: when true, GELU is applied between dropout and the linear layer.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.1, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        # GELU matches the activation used by ELECTRA
        self.activation = nn.GELU()

    def forward(self, x):
        """Return `linear(activation(dropout(x)))`, skipping the activation if disabled."""
        hidden = self.dropout(x)
        hidden = self.activation(hidden) if self.use_activation else hidden
        return self.linear(hidden)

class RBERT(nn.Module):
    """R-BERT style classifier over [CLS] plus averaged premise / hypothesis spans.

    The backbone's last hidden states are reduced to three vectors — the [CLS]
    state, the mean of the premise tokens and the mean of the hypothesis
    tokens — each passed through a fully connected layer, concatenated and
    classified.

    Args:
        model_name: Hugging Face model id of the backbone.
        num_labels: number of output classes.
        dropout_rate: dropout probability used in every FCLayer.
        special_tokens_dict: stored on the instance; not used by this class.
        is_train: accepted for interface compatibility; not used by this class.
        embedding_resizing_length: target size of the token embedding matrix
            (normally `len(tokenizer)` after adding special tokens).
    """

    def __init__(
        self, 
        model_name:str=CFG.model_name,
        num_labels:int=3,
        dropout_rate:float=0.1,
        special_tokens_dict:dict=None,
        is_train:bool=True,
        embedding_resizing_length = len(tokenizer),
        ):
        super().__init__()

        self.model_name = model_name
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = num_labels
        self.backbone_model = AutoModel.from_pretrained(model_name, config=config)
        self.dropout_rate = dropout_rate
        self.num_labels = num_labels
        
        # add special tokens: resize only when the requested length differs from
        # the backbone's actual embedding size (rather than a hard-coded 32000)
        self.special_tokens_dict = special_tokens_dict
        current_vocab_size = self.backbone_model.get_input_embeddings().num_embeddings
        if embedding_resizing_length != current_vocab_size:
            self.backbone_model.resize_token_embeddings(embedding_resizing_length)

        self.cls_fc_layer = FCLayer(config.hidden_size, config.hidden_size, self.dropout_rate)
        self.entity_fc_layer = FCLayer(config.hidden_size, config.hidden_size, self.dropout_rate)
        self.label_classifier = FCLayer(
            config.hidden_size * 3,
            self.num_labels,
            self.dropout_rate,
            use_activation=False,
        )

    def entity_average(self, hidden_output, e_mask):
        """Average the hidden states at the positions where `e_mask` is non-zero.

        Args:
            hidden_output: hidden states, [batch, seq_len, dim].
            e_mask: 0/1 mask, [batch, seq_len].

        Returns:
            Tensor [batch, dim]; rows whose mask is entirely zero yield zeros.
        """
        e_mask_unsqueeze = e_mask.unsqueeze(1)  # [b, 1, j-i+1]
        # clamp so an all-zero mask divides by 1 instead of producing NaN
        length_tensor = (e_mask != 0).sum(dim=1).unsqueeze(1).clamp(min=1)  # [batch_size, 1]

        # [b, 1, j-i+1] * [b, j-i+1, dim] = [b, 1, dim] -> [b, dim]
        sum_vector = torch.bmm(e_mask_unsqueeze.float(), hidden_output).squeeze(1)
        avg_vector = sum_vector.float() / length_tensor.float()  # broadcasting
        return avg_vector
        

    def forward(self, input_ids, attention_mask, premise_mask=None, hypothesis_mask=None, labels=None):
        """Compute class logits.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            premise_mask: 0/1 mask of the premise span (required despite the default).
            hypothesis_mask: 0/1 mask of the hypothesis span (required despite the default).
            labels: accepted for interface compatibility; not used.

        Returns:
            Logits produced by the label classifier.
        """
        discriminator_hidden_states = self.backbone_model(input_ids = input_ids, attention_mask = attention_mask)
                
        # https://github.com/huggingface/transformers/blob/db7d6a80e82d66127b2a44b6e3382969fdc8b207/src/transformers/models/electra/modeling_electra.py#L932-L951
        sequence_output = discriminator_hidden_states[0]
        pooled_output = sequence_output[:, 0, :]  # [CLS] token's hidden features (hidden state)

        # mean hidden state over the premise span and over the hypothesis span
        e1_h = self.entity_average(sequence_output, premise_mask)
        e2_h = self.entity_average(sequence_output, hypothesis_mask)

        # Dropout -> gelu -> fc_layer (the FC layer is shared between both spans)
        pooled_output = self.cls_fc_layer(pooled_output)
        e1_h = self.entity_fc_layer(e1_h)
        e2_h = self.entity_fc_layer(e2_h)

        # Concat -> fc_layer / [CLS], premise average, hypothesis average
        concat_h = torch.cat([pooled_output, e1_h, e2_h], dim=-1)
        logits = self.label_classifier(concat_h)
        return logits

In [None]:
class IBDataset(Dataset):
    """Dataset yielding tokenized sentence pairs plus the marker token positions.

    Each item is the tokenizer output for "premise [SEP] hypothesis" padded to
    `CFG.max_token_length`, extended with `label`,
    `premise_special_token_index` and `hypothesis_special_token_index`.

    Args:
        dataset: preprocessed DataFrame with `premise`, `hypothesis`, `label`,
            `premise_index` and `hypothesis_index` columns.
        is_training: when true the string labels are converted with
            `label_to_num`; otherwise the `label` column is used as is.
    """

    def __init__(self, dataset, is_training:bool=True):
        
        # pandas.Dataframe dataset
        self.dataset = dataset
        self.premise = self.dataset['premise']
        self.hypothesis = self.dataset['hypothesis']
        # Re-index to 0..n-1 so that the positional idx used by __getitem__
        # selects the same row here as in the tensors below, even when the
        # DataFrame carries a non-default index (filtered / shuffled frame).
        self.sentence = (
            self.dataset['premise'] + ' [SEP] ' + self.dataset['hypothesis']
        ).reset_index(drop=True)
        if is_training:
            self.train_label = label_to_num(self.dataset['label'].values)
        else:
            self.train_label = self.dataset['label'].values
        self.label = torch.tensor(self.train_label)
        self.premise_special_token_index = torch.tensor(self.dataset['premise_index'].values)
        self.hypothesis_special_token_index = torch.tensor(self.dataset['hypothesis_index'].values)
        
    def __len__(self):
        """Return the number of rows in the underlying DataFrame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return the tokenized item at position `idx`."""
        sentence = self.sentence[idx]
        label = self.label[idx]
        premise_special_token_index = self.premise_special_token_index[idx]
        hypothesis_special_token_index = self.hypothesis_special_token_index[idx]

        # tokenize; token_type_ids are not requested, only input_ids and attention_mask are kept
        item = tokenizer(
            sentence,
            return_tensors="pt",
            padding='max_length',
            truncation=True,
            max_length=CFG.max_token_length,
            add_special_tokens=True,
            return_token_type_ids=False,
            )
        
        # return_tensors="pt" adds a leading batch dimension of 1; drop it
        item['input_ids'] = item['input_ids'].squeeze(0)
        item['attention_mask'] = item['attention_mask'].squeeze(0)

        # fill label
        item['label'] = label
        item['premise_special_token_index'] = premise_special_token_index
        item['hypothesis_special_token_index'] = hypothesis_special_token_index
        return item

In [None]:
# Build the dataset once (instead of once per sample) and pull its first three items.
_ib_preview_dataset = IBDataset(df_pororo_dataset_preprocessed)
sample_1, sample_2, sample_3 = (_ib_preview_dataset[i] for i in range(3))

print(sample_1)

In [None]:
# Sanity check: decode the sampled input ids back to text to inspect the tokenized sentence pairs.
decoded_item_1, decoded_item_2, decoded_item_3 = (
    tokenizer.decode(sample['input_ids'])
    for sample in (sample_1, sample_2, sample_3)
)

print(decoded_item_1, decoded_item_2, decoded_item_3, sep="\n")

In [None]:
# import nn from torch
from torch import nn
from transformers import AutoConfig, AutoModel

class IBModel(nn.Module):
    """
    'An Improved Baseline for Sentence-level Relation Extraction'
    https://github.com/wzhouad/RE_improved_baseline/blob/main/model.py

    Classifies a sentence pair from the last-layer hidden states of the [CLS]
    token and of the premise / hypothesis marker tokens.

    Args:
        model_name: Hugging Face model id of the backbone (also used for its config).
        dropout_rate: dropout probability inside the classifier head.
        use_arcface: accepted for interface compatibility; not used by this class.
        embedding_resizing_length: target size of the token embedding matrix
            (normally `len(tokenizer)` after adding special tokens).
    """

    def __init__(self, 
        model_name = "tunib/electra-ko-base", 
        dropout_rate = CFG.dropout_rate,
        use_arcface = False,
        embedding_resizing_length = len(tokenizer),
    ):
        super().__init__()
        # load the config of the requested backbone, not unconditionally CFG.model_name,
        # so config and weights always belong to the same model
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = CFG.num_labels
        self.backbone_model = AutoModel.from_pretrained(model_name, config=config)
        # resize only when the requested length differs from the backbone's
        # actual embedding size (rather than a hard-coded 32000)
        current_vocab_size = self.backbone_model.get_input_embeddings().num_embeddings
        if embedding_resizing_length != current_vocab_size:
          self.backbone_model.resize_token_embeddings(embedding_resizing_length)
        hidden_size = config.hidden_size
        self.dropout_rate = dropout_rate
        self.classifier = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_size * 3, hidden_size), # 3 = [CLS] + premise marker + hypothesis marker
            nn.GELU(), 
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_size, config.num_labels),
        )
    
    def forward(
            self, 
            input_ids, 
            attention_mask,
            premise_special_token_index,
            hypothesis_special_token_index,
            labels=None):
        """Compute class logits.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            premise_special_token_index: per-example position of the premise marker.
            hypothesis_special_token_index: per-example position of the hypothesis marker.
            labels: accepted for interface compatibility; not used.

        Returns:
            Logits produced by the classifier head.
        """
        outputs = self.backbone_model(
            input_ids,
            attention_mask=attention_mask,
        )

        sequence_output = outputs["last_hidden_state"] # sequence output of the last layer
        cls_output = sequence_output[:, 0, :]  # [CLS] token's hidden features(different from pooler output)

        # extract embedding for special tokens for each premise and hypothesis
        idx_seq = torch.arange(input_ids.size(0)).to(input_ids.device) 
        premise_emb = sequence_output[idx_seq, premise_special_token_index]
        hypothesis_emb = sequence_output[idx_seq, hypothesis_special_token_index]

        # concat [CLS], premise marker, hypothesis marker
        concat_hidden_states = torch.cat((cls_output, premise_emb, hypothesis_emb), dim=-1)
      
        return self.classifier(concat_hidden_states)

In [None]:
# import nn from torch
from torch import nn
from torch.cuda.amp import autocast
from transformers import AutoConfig, AutoModel

class IBConcatModel(nn.Module):
    """IBModel variant that concatenates the last four hidden layers per token.

    For each of the [CLS], premise-marker and hypothesis-marker tokens the
    hidden states of the last four layers are concatenated, giving a
    `hidden_size * 12` feature vector that is fed to the classifier head.

    Args:
        model_name: Hugging Face model id of the backbone (also used for its config).
        dropout_rate: dropout probability inside the classifier head.
        embedding_resizing_length: target size of the token embedding matrix
            (normally `len(tokenizer)` after adding special tokens).
    """

    def __init__(self, 
        model_name = "tunib/electra-ko-base",
        dropout_rate = CFG.dropout_rate,
        embedding_resizing_length = len(tokenizer),
    ):
        super().__init__()
        # load the config of the requested backbone, not unconditionally CFG.model_name,
        # so config and weights always belong to the same model
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = CFG.num_labels
        self.backbone_model = AutoModel.from_pretrained(model_name, config=config)
        # resize only when the requested length differs from the backbone's
        # actual embedding size (rather than a hard-coded 32000)
        current_vocab_size = self.backbone_model.get_input_embeddings().num_embeddings
        if embedding_resizing_length != current_vocab_size:
          self.backbone_model.resize_token_embeddings(embedding_resizing_length)
        hidden_size = config.hidden_size
        self.dropout_rate = dropout_rate
        self.classifier = nn.Sequential(
            nn.GELU(),
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_size * 12, hidden_size * 3), # 12 = 3 tokens x 4 layers
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_size * 3, config.num_labels),
        )

    def forward(
            self, 
            input_ids, 
            attention_mask,
            premise_special_token_index,
            hypothesis_special_token_index,
            labels=None):
        """Compute class logits.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            premise_special_token_index: per-example position of the premise marker.
            hypothesis_special_token_index: per-example position of the hypothesis marker.
            labels: accepted for interface compatibility; not used.

        Returns:
            Logits produced by the classifier head.
        """
        outputs = self.backbone_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        
        # hidden states of every layer
        # https://github.com/huggingface/transformers/blob/db7d6a80e82d66127b2a44b6e3382969fdc8b207/src/transformers/models/electra/modeling_electra.py#L961
        hidden_states = outputs["hidden_states"]
        last_four_layers = [-4, -3, -2, -1]
        
        # extract embedding for [CLS] token
        # concatenating the last four hidden states is mentioned in the BERT paper: https://github.com/huggingface/transformers/issues/1328
        cls_concat = torch.cat(tuple([hidden_states[i][:, 0, :] for i in last_four_layers]), dim=-1)

        # row selector used to pick one position per example
        idx_seq = torch.arange(input_ids.size(0)).to(input_ids.device) 
        
        # premise marker embedding over the last four layers
        premise_concat = torch.cat(tuple([hidden_states[i][idx_seq, premise_special_token_index] for i in last_four_layers]), dim=-1)
        
        # hypothesis marker embedding over the last four layers
        hypothesis_concat = torch.cat(tuple([hidden_states[i][idx_seq, hypothesis_special_token_index] for i in last_four_layers]), dim=-1)

        # concat embeddings for [CLS], premise marker, hypothesis marker
        knit_hidden_states = torch.cat((
            cls_concat,
            premise_concat,
            hypothesis_concat,
        ), dim=-1)
        
        return self.classifier(knit_hidden_states)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
# from torch.cuda.amp import autocast
from torch.nn import Parameter
import math

class ArcMarginProduct(nn.Module):
    """  
    Reference: https://github.com/ronghuaiyang/arcface-pytorch/blob/master/models/metrics.py
    Implementation of the large margin arc distance head, cos(theta + m).

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        s: norm of input feature: scaling ratio from a small number to a final logit
        m: the additional angular margin
        easy_margin: when true, the margin is only applied where cosine > 0
    """
    def __init__(self, in_features, out_features, s=30.0, m=0.50, easy_margin=False):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.weight = Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m
    
    def forward(self, input, label=None):
        """Return scaled cosine logits, with the angular margin applied to the target class.

        Args:
            input: feature batch, [batch, in_features].
            label: optional class ids, one per example. When given, the margin
                is applied to each example's target class; when omitted
                (inference) plain scaled cosine similarities are returned.

        Returns:
            Logits [batch, out_features] on the same device as `input`.
        """
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt((1.0 - torch.pow(cosine, 2)).clamp(0, 1))
        # cos(theta + m) = cos(theta)cos(m) - sin(theta)sin(m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
    
        if label is None:
            return cosine * self.s

        # Allocate the one-hot target matrix on the logits' own device so the
        # head also works on CPU / non-default GPUs (was hard-coded to 'cuda').
        one_hot = torch.zeros(cosine.size(), device=cosine.device)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        # margin logits for the target class, plain cosine for every other class
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        return output * self.s

In [None]:
# import nn from torch
from torch import nn
from torch.cuda.amp import autocast
from transformers import AutoConfig, AutoModel

class ArcFaceIBModel(nn.Module):
    """Modified IBModel with an ArcMarginProduct head.

    The last four hidden layers of the [CLS], premise-marker and
    hypothesis-marker tokens are concatenated per token, projected back to
    `hidden_size` by a shared FC layer, concatenated, passed through a neck
    and scored by the ArcFace head.

    Args:
        model_name: Hugging Face model id of the backbone (also used for its config).
        dropout_rate: dropout probability in the projection layer and the neck.
        embedding_resizing_length: target size of the token embedding matrix
            (normally `len(tokenizer)` after adding special tokens).
    """
    def __init__(
        self, 
        model_name, 
        dropout_rate = CFG.dropout_rate,
        embedding_resizing_length = len(tokenizer)
    ):
        super().__init__()
        # load the config of the requested backbone, not unconditionally CFG.model_name,
        # so config and weights always belong to the same model
        config = AutoConfig.from_pretrained(model_name)
        config.num_labels = CFG.num_labels
        self.backbone_model = AutoModel.from_pretrained(model_name, config=config)
        # resize only when the requested length differs from the backbone's
        # actual embedding size (rather than a hard-coded 32000)
        current_vocab_size = self.backbone_model.get_input_embeddings().num_embeddings
        if embedding_resizing_length != current_vocab_size:
          self.backbone_model.resize_token_embeddings(embedding_resizing_length)
        hidden_size = config.hidden_size
        self.dropout_rate = dropout_rate
        self.arcface = ArcMarginProduct(config.hidden_size, config.num_labels)
        self.neck = nn.Sequential(
            nn.Dropout(p=dropout_rate),
            nn.Linear(hidden_size * 3, hidden_size),
            nn.GELU(),
            nn.Dropout(p=dropout_rate),
        )
        # shared projection: 4 concatenated layers -> hidden_size
        self.proj_fc_layer = FCLayer(config.hidden_size * 4, config.hidden_size, self.dropout_rate)

    def forward(
            self, 
            input_ids, 
            attention_mask,
            premise_special_token_index,
            hypothesis_special_token_index,
            labels=None):
        """Compute ArcFace logits.

        Args:
            input_ids: token ids passed to the backbone.
            attention_mask: attention mask passed to the backbone.
            premise_special_token_index: per-example position of the premise marker.
            hypothesis_special_token_index: per-example position of the hypothesis marker.
            labels: optional class ids; when given, the angular margin is
                applied to the target class, otherwise plain scaled cosine
                logits are returned.

        Returns:
            Logits produced by the ArcFace head.
        """
        outputs = self.backbone_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
        )
        hidden_states = outputs["hidden_states"]
        last_four_layers = [-4, -3, -2, -1]

        # [CLS] token over the last four layers, projected to hidden_size
        cls_concat = torch.cat(tuple([hidden_states[i][:, 0, :] for i in last_four_layers]), dim=-1)
        cls_output = self.proj_fc_layer(cls_concat)
        
        # extract embedding for special tokens for each premise and hypothesis
        idx_seq = torch.arange(input_ids.size(0)).to(input_ids.device) 
        premise_concat = torch.cat(tuple([hidden_states[i][idx_seq, premise_special_token_index] for i in last_four_layers]), dim=-1)
        premise_output = self.proj_fc_layer(premise_concat)
        hypothesis_concat = torch.cat(tuple([hidden_states[i][idx_seq, hypothesis_special_token_index] for i in last_four_layers]), dim=-1)
        hypothesis_output = self.proj_fc_layer(hypothesis_concat)

        # concat [CLS], premise marker, hypothesis marker
        concat_hidden_states = torch.cat((cls_output, premise_output, hypothesis_output), dim=-1)
      
        # goes through neck for projection
        out_proj = self.neck(concat_hidden_states)
        
        # input to the arcface head for logit
        if labels is not None:
            logits = self.arcface(out_proj, labels)
        else:
            logits = self.arcface(out_proj)
        return logits


In [None]:
# Use the first GPU when CUDA is available and debug mode is off; otherwise fall back to CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() and CFG.DEBUG == False else 'cpu')
print(device)

In [None]:
class Metrics(object):
    """
    Running average of a metric collected across batches (loss, f1, accuracy, ...).
    Reference: https://github.com/pytorch/examples/blob/21c240b814658e590b4fa9d4682d39831060c5b9/imagenet/main.py#L367-L385

    Attributes (all reset to 0):
        val: most recent value passed to `update`
        sum: weighted sum of all values
        count: total weight seen so far
        avg: `sum / count`
    """

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the accumulated statistics."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record `val` observed over `n` samples and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

In [None]:
# Run name used for the wandb run and for the results sub-directory; the training batch size is appended.
run_name = f"ArcfaceIBModel-Electra-4-concat-Augmentation-v7-not-shuffled-CrossEntropyLoss-{CFG.train_batch_size}"

In [None]:
# scikit-learn metrics and folds
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import KFold

# torch data utils
from torch.utils.data import DataLoader
from torch.utils.data.dataset import Subset

# optimizers and schedulers
from adamp import AdamP
from transformers import AdamW # or from torch.optim import AdamW
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup

# tqdm progress bar repetition error fixed on jupyter notebook
from tqdm.notebook import tqdm 

# Start the Weights & Biases run; CFG is passed as the run configuration.
wandb.init(
    project='KLUE-NLI', 
    name=run_name,
    config=CFG
  )

# Both datasets wrap the same dataframe; the row indices produced by the
# K-fold split decide which rows are used for training and which for validation.
train_data = IBDataset(df_pororo_dataset_preprocessed)
dev_data = IBDataset(df_pororo_dataset_preprocessed)

# shuffle=False: folds are contiguous slices of the dataframe in its current row order
kfd = KFold(n_splits = CFG.num_folds, shuffle=False)

# K-fold training: for every fold a fresh model is trained, evaluated on the fold's
# dev split every CFG.logging_steps optimizer steps, and checkpointed whenever the
# dev loss / accuracy / macro-F1 improves on the best value seen so far in this fold.
for fold_num, (train_idx, dev_idx) in enumerate(kfd.split(df_pororo_dataset_preprocessed)):

    criterion = nn.CrossEntropyLoss()
    print("using CE loss function")

    print(f"#################### Fold: {fold_num + 1} ######################")

    # Checkpoints go to ./results/<run_name>; makedirs creates both directory levels
    # and does not fail when they already exist.
    RESULT_PATH = f"./results"
    SAVE_PATH = f"./results/{run_name}"
    os.makedirs(SAVE_PATH, exist_ok=True)

    train_set = Subset(train_data, train_idx)
    dev_set = Subset(dev_data, dev_idx)

    train_loader = DataLoader(
        train_set,
        batch_size=CFG.train_batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        drop_last=True,
    )
    # NOTE(review): drop_last=True also drops the last partial dev batch, so a few
    # dev rows are never evaluated -- confirm this is intended.
    dev_loader = DataLoader(
        dev_set,
        batch_size=CFG.val_batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        drop_last=True,
    )

    # fetch model
    model = ArcFaceIBModel(
        CFG.model_name,
        dropout_rate = CFG.dropout_rate,
    )
    model.to(device)

    # fetch loss function, optimizer, scheduler outside of torch library
    # https://github.com/clovaai/AdamP
    optimizer = AdamP(
        model.parameters(), # training all params
        lr=CFG.learning_rate,
        weight_decay=CFG.weight_decay,
        eps=CFG.adam_epsilon,
        betas=(0.9, 0.999),
    )

    # Evaluate three times per epoch. Clamp to 1 so a loader with fewer than three
    # batches does not cause a modulo-by-zero in the logging check below.
    CFG.logging_steps = max(1, len(train_loader) // 3)
    CFG.warmup_steps = CFG.logging_steps # warmup steps as 1/3 of first epoch

    # https://huggingface.co/transformers/main_classes/optimizer_schedules.html
    scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=CFG.warmup_steps, num_training_steps=len(train_loader)*CFG.num_epochs)

    # class for accumulative metrics calculation over batches iteration
    train_acc = Metrics()
    train_loss = Metrics()
    dev_acc = Metrics()
    dev_loss = Metrics()
    train_f1 = Metrics()
    dev_f1 = Metrics()

    # A checkpoint is only written once a metric beats these starting thresholds.
    best_eval_loss = 5.0
    best_eval_accuracy = 0.80
    best_f1_score = 0.80

    steps = 0

    # fetch training loop
    for epoch in range(CFG.num_epochs):
        for item in tqdm(train_loader):

            # back to training mode (the evaluation below switches to eval mode)
            model.train()

            input_ids = item['input_ids'].to(device)
            attention_mask = item['attention_mask'].to(device)
            premise_special_token_index = item['premise_special_token_index'].to(device)
            hypothesis_special_token_index = item['hypothesis_special_token_index'].to(device)
            label = item['label'].to(device)

            logits = model(
                input_ids,
                attention_mask,
                premise_special_token_index=premise_special_token_index,
                hypothesis_special_token_index=hypothesis_special_token_index,
                labels=label
            )

            loss = criterion(logits, label)

            optimizer.zero_grad() # replacing model.zero_grad() with optimizer_zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_batch_size = len(input_ids)

            # update loss
            train_loss.update(loss.item(), train_batch_size)

            # move labels / predictions to numpy once and reuse them for both metrics
            predict = logits.argmax(-1)
            label_np = label.detach().cpu().numpy()
            predict_np = predict.detach().cpu().numpy()

            # accuracy
            train_accuracy_score = accuracy_score(label_np, predict_np)
            train_acc.update(train_accuracy_score, train_batch_size)

            # f1 score
            train_f1_score = f1_score(label_np, predict_np, average="macro") # macro for even classes: https://www.baeldung.com/cs/multi-class-f1-score
            train_f1.update(train_f1_score, train_batch_size)

            steps += 1

            if steps % CFG.logging_steps == 0: # batch
                print('Epoch: {}/{}'.format(epoch+1, CFG.num_epochs), 'Step: {}'.format(steps), 'Train Loss: {:.4f}'.format(train_loss.avg), 'Train Acc: {:.4f}'.format(train_acc.avg))

                # switch model to eval mode; no_grad avoids building autograd graphs
                # (and the memory they hold) during validation
                model.eval()
                with torch.no_grad():
                    for dev_item in tqdm(dev_loader):
                        dev_input_ids = dev_item['input_ids'].to(device)
                        dev_attention_mask = dev_item['attention_mask'].to(device)
                        dev_premise_special_token_index = dev_item['premise_special_token_index'].to(device)
                        dev_hypothesis_special_token_index = dev_item['hypothesis_special_token_index'].to(device)
                        dev_label = dev_item['label'].to(device)

                        # NOTE(review): labels are passed to the model here but not at
                        # test-time inference; if the ArcFace head uses them, dev logits
                        # differ from inference logits -- confirm this is intended.
                        dev_logits = model(
                            dev_input_ids,
                            dev_attention_mask,
                            premise_special_token_index=dev_premise_special_token_index,
                            hypothesis_special_token_index=dev_hypothesis_special_token_index,
                            labels=dev_label
                        )

                        dev_batch_size = len(dev_input_ids)

                        # update loss
                        dev_batch_loss = criterion(dev_logits, dev_label)
                        dev_loss.update(dev_batch_loss.item(), dev_batch_size)

                        dev_predict = dev_logits.argmax(-1)
                        dev_label_np = dev_label.cpu().numpy()
                        dev_predict_np = dev_predict.cpu().numpy()

                        # accuracy
                        dev_accuracy_score = accuracy_score(dev_label_np, dev_predict_np)
                        dev_acc.update(dev_accuracy_score, dev_batch_size)

                        # f1 score, weighted by the dev batch size (not the train batch size)
                        dev_f1_score = f1_score(dev_label_np, dev_predict_np, average="macro") # macro for even classes: https://www.baeldung.com/cs/multi-class-f1-score
                        dev_f1.update(dev_f1_score, dev_batch_size)

                # print metrics
                print('Epoch: {}/{}'.format(epoch+1, CFG.num_epochs),
                      'Step: {}'.format(steps),
                      'Dev Loss: {:.4f}'.format(dev_loss.avg),
                      'Dev Acc: {:.4f}'.format(dev_acc.avg),
                      'Dev f1: {:.4f}'.format(dev_f1.avg)
                )
                wandb.log(
                    {
                        'train/loss':train_loss.avg,
                        'train/accuracy':train_acc.avg,
                        'train/f1': train_f1.avg,
                        'train/learning_rate':optimizer.param_groups[0]['lr'],
                        'eval/loss':dev_loss.avg,
                        'eval/accuracy':dev_acc.avg,
                        'eval/f1':dev_f1.avg,
                        'Step':steps
                    }
                )

                if best_eval_loss > dev_loss.avg:
                    best_eval_loss = dev_loss.avg
                    torch.save(model.state_dict(), f'{SAVE_PATH}/{fold_num+1}-fold-{CFG.num_folds}-best-eval-loss-model.pt')
                    print('Saved model with lowest validation loss: {:.4f}'.format(best_eval_loss))
                    wandb.log({'best_eval_loss':best_eval_loss})

                if best_eval_accuracy < dev_acc.avg:
                    best_eval_accuracy = dev_acc.avg
                    torch.save(model.state_dict(), f'{SAVE_PATH}/{fold_num+1}-fold-{CFG.num_folds}-best-eval-accuracy-model.pt')
                    print('Saved model with highest validation accuracy: {:.4f}'.format(best_eval_accuracy))
                    wandb.log({'best_eval_accuracy':best_eval_accuracy})

                if best_f1_score < dev_f1.avg:
                    best_f1_score = dev_f1.avg
                    torch.save(model.state_dict(), f'{SAVE_PATH}/{fold_num+1}-fold-{CFG.num_folds}-best-eval-f1-model.pt')
                    print('Saved model with highest validation f1: {:.4f}'.format(best_f1_score))
                    wandb.log({'best_f1_score':best_f1_score})

                # reset metrics so the next logging window starts from zero
                dev_loss.reset()
                dev_acc.reset()
                dev_f1.reset()

                train_acc.reset()
                train_loss.reset()
                train_f1.reset()


    # Prevent OOM error
    model.cpu()
    del model
    torch.cuda.empty_cache()
    clear_output()

In [None]:
def num_to_label(label):
    """Map integer class ids (0, 1, 2) to their NLI label names, keeping the input order.

    Raises KeyError for an id outside {0, 1, 2}.
    """
    id_to_name = {0: "entailment", 1: "contradiction", 2: "neutral"}
    return [id_to_name[class_id] for class_id in label]

In [None]:
from torch.utils.data import DataLoader, Dataset, Subset

# Load the test CSV and preview its first two rows
test_dataset = pd.read_csv(TEST_FILE_PATH)
display(test_dataset.head(2))

In [None]:
import random

# Despite the "_shuffled" name nothing is randomized here: rows are sorted by
# source_label and the sorted order is then reversed.
test_dataset_shuffled = test_dataset.sort_values(by=["source_label"]).iloc[::-1] # order by source label: movie -> travel -> general
test_dataset_shuffled.head(3)

In [None]:
from ast import literal_eval

def literal_eval_without_error(input_string):
    """Parse the string form of a flat list of floats into a list of floats.

    Example: "[0.2765555, 0.7233189, 6.404926e-05]" -> [0.2765555, 0.7233189, 6.404926e-05]

    Args:
        input_string: text that starts with "[" and ends with "]" (surrounding
            whitespace is ignored) and holds comma-separated numbers.

    Returns:
        list of float; an empty list for "[]".

    Raises:
        ValueError: if an element cannot be converted by float().
    """
    # drop the surrounding brackets
    inner = input_string.strip()[1:-1]
    if not inner.strip():
        return []
    # float() already ignores whitespace around each element
    return [float(item) for item in inner.split(",")]


# The four probability columns hold lists serialized as strings; turn each cell back into a list of floats.
for probability_column in (
    "electra_premise_probability",
    "electra_hypothesis_probability",
    "electra_premise_score_probability",
    "electra_hypothesis_score_probability",
):
    test_dataset_shuffled[probability_column] = test_dataset_shuffled[probability_column].apply(literal_eval_without_error)

display(test_dataset_shuffled.head(6))
display(test_dataset_shuffled.tail(6))

In [None]:
# Run the test rows through the preprocessing helper in test mode. The same marker
# string is used for the start and the end token of each sentence.
df_test_dataset = preprocess_augmentation_electra(
    test_dataset_shuffled, 
    str_premise_start_special_token = "[PREMISE]", 
    str_premise_end_special_token = "[PREMISE]", 
    str_hypothesis_start_special_token = "[HYPOTHESIS]",
    str_hypothesis_end_special_token = "[HYPOTHESIS]",
    is_test = True
)

# set label as 100 (presumably a placeholder, since 100 is not one of the class ids 0-2)
df_test_dataset['label'] = 100

In [None]:
# Wrap the test frame in the dataset class and build an ordered loader for inference.
test_set = IBDataset(df_test_dataset, is_training=False)
print(len(test_set))
# shuffle=False and drop_last=False: every test row is predicted exactly once, in frame order
test_data_loader = DataLoader(
    test_set, 
    batch_size=CFG.val_batch_size, 
    num_workers=CFG.num_workers, 
    shuffle=False,
    drop_last=False
)
len(test_data_loader)

In [None]:
import torch
import numpy as np
from tqdm.notebook import tqdm
from torch.nn import functional as F
import shutil

def flatten(t):
    """Concatenate the items of every sub-iterable of `t` into one flat list."""
    flat = []
    for sublist in t:
        flat.extend(sublist)
    return flat

# Which "best" checkpoint of each fold to load: best dev accuracy, f1 or loss.
CRITERIA_LIST = ["accuracy", "f1", "loss" ]
CRITERIA ="f1"
# NOTE(review): training saves under the relative "./results/<run_name>"; this absolute
# path only matches when the working directory is /content -- confirm before running elsewhere.
DIR_PATH = f"/content/results/{run_name}" # or "/content/"

if CRITERIA not in CRITERIA_LIST:
    raise ValueError(f"CRITERIA must be one of {CRITERIA_LIST}, got {CRITERIA!r}")

# One entry per fold: class probabilities for every test row, shaped (num_rows, 1, num_classes).
oof_pred = [] # out of fold prediction list
for fold_idx in range(CFG.num_folds):
    model_path = f'{DIR_PATH}/{fold_idx+1}-fold-{CFG.num_folds}-best-eval-{CRITERIA}-model.pt'
    model = ArcFaceIBModel(
        CFG.model_name,
        dropout_rate = CFG.dropout_rate,
    )
    # map_location lets checkpoints saved on a GPU load when `device` is the CPU
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    list_logits = []
    with torch.no_grad():
        for data in tqdm(test_data_loader):
            logit_output = model(
                input_ids = data['input_ids'].to(device),
                attention_mask = data['attention_mask'].to(device),
                premise_special_token_index = data['premise_special_token_index'].to(device),
                hypothesis_special_token_index = data['hypothesis_special_token_index'].to(device),
                labels=None
            )
            list_logits.append(logit_output.cpu())

    # softmax over the class dimension, then insert a singleton axis per row
    output_probs = F.softmax(torch.cat(list_logits, dim=0), dim=1)
    oof_pred.append(output_probs.numpy()[:, np.newaxis])

    # Prevent OOM error
    model.cpu()
    del model
    torch.cuda.empty_cache()

In [None]:
# mean probability of fold predictions
# Soft voting: average the per-fold probability arrays element-wise (axis 0 is the fold axis)
oof_pred_mean = np.mean(oof_pred, axis=0)
print(oof_pred_mean.shape) # (1666, 1, 3)
print(oof_pred_mean[:10])

In [None]:
# One flat probability vector per test row, and the index of its largest entry as the predicted class id
all_probability = [flatten(pred) for pred in oof_pred_mean]
all_label = [np.argmax(probability) for probability in all_probability]

# Attach the predicted label names and probabilities, then order the rows by the 'index' column
df_oof = df_test_dataset.copy()
df_oof["label"] = num_to_label(all_label)
df_oof["label"] = df_oof["label"].str.strip()
df_oof["probability"] = all_probability
df_oof = df_oof.sort_values(by=['index']).reset_index(drop=True)
df_oof.head(24)

In [None]:
# Number of test rows predicted for each label
df_oof['label'].value_counts()

In [None]:
# The last path component of the checkpoint directory doubles as the model name in the output file names
save_model_name = DIR_PATH.rsplit("/", 1)[-1]
print(save_model_name)

# Full result: every column of df_oof, including labels and probabilities
result_save_path = f'./result-{save_model_name}-{CFG.num_epochs}-epochs-best-{CRITERIA}.csv'
df_oof.to_csv(result_save_path, index=False)

# Submission: only the 'index' and 'label' columns, ordered by 'index'
submission_sorted = df_oof.copy().sort_values(by=['index']).reset_index(drop=True)
df_submission = submission_sorted[["index", "label"]]
display(df_submission.head(6))
display(df_submission.tail(6))

submission_save_path = f'./submission-{save_model_name}-{CFG.num_epochs}-epochs-best-{CRITERIA}.csv'
df_submission.to_csv(submission_save_path, index=False)

### 2등 노트북

- Private 3등 | 0.89915 | 2.Custom Model R-Roberta
- https://dacon.io/competitions/official/235875/codeshare/4631?page=1&dtype=recent

**Keypoint**
- 학습에 KLUE Official Dev Data를 추가로 사용
- R-BERT 모델 구조 차용하여 모델 아키텍처 수정
-  5-Fold Soft Ensemble 을 시도하려 했지만 4번째와 5번째 Fold만 학습완료
- 이들 중 Public Score 0.897을 기록한 4번째 Fold 모델만을 Inference에 사용

In [None]:
!pip install transformers wandb

In [None]:
import pandas as pd
import numpy as np
import random
import pickle as pickle
import os
import math
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm, tqdm_notebook
tqdm.pandas()
# Step 4. 한글 글꼴 설정
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import  pprint
import warnings
warnings.filterwarnings('ignore')
from collections import defaultdict, Counter
from itertools import chain
from pprint import pprint
import wandb
# from pycaret.classification import *
# from pycaret.regression import *
# from pycaret.utils import check_metric
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSequenceClassification, AutoConfig
from transformers import BertConfig, BertForSequenceClassification, Trainer, TrainingArguments, BertModel, ElectraModel, RobertaModel
from importlib import import_module
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
def seed_everything(seed: int = 42, contain_cuda: bool = False):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in deterministic mode.

    Args:
        seed: value used for PYTHONHASHSEED, `random`, NumPy and all torch generators.
        contain_cuda: accepted but not read by this function; the CUDA generators
            are seeded regardless of its value.
    """
    # Python-level sources of randomness
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)

    # deterministic cuDNN kernels, no auto-tuner
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    # torch CPU generator, current CUDA device, then all CUDA devices
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)
    print(f"Seed set as {seed}")

# Seed every RNG before anything else runs
seed=42
seed_everything(seed)

# Switch the working directory to the project folder (presumably on a mounted
# Google Drive in Colab), so the relative ./data paths used later resolve there.
root_dir = "/content/drive/MyDrive/"
project_folder = "Konli"
os.chdir(os.path.join(root_dir,project_folder))


In [None]:
def wrong_batch_for_wandb(tokenizer,
                          wrong_sample_index,
                          input_ids,
                          valid_labels,
                          valid_predict,
                          valid_output,
                          ):
    """Collect the misclassified samples of a validation batch for logging.

    Args:
        tokenizer: object whose `decode(ids, skip_special_tokens=False)` turns token ids into text.
        wrong_sample_index: ignored; the indices are recomputed from the labels and predictions.
        input_ids: token ids, indexable by an integer index array along the sample axis.
        valid_labels: array of true class ids.
        valid_predict: array of predicted class ids.
        valid_output: per-sample class scores with at least three columns.

    Returns:
        Tuple of six lists, one entry per misclassified sample: decoded text, true
        label name, predicted label name, and the scores in columns 0, 1 and 2
        (entailment, contradiction, neutral).
    """
    id_to_name = {0:'entailment',1:'contradiction',2:'neutral',}

    # positions where the prediction disagrees with the label
    wrong_sample_index = np.where(valid_labels!=valid_predict)[0]

    wrong_sample_text = []
    for element in input_ids[wrong_sample_index]:
        wrong_sample_text.append(tokenizer.decode(element, skip_special_tokens=False))

    wrong_sample_label = [id_to_name[true_id] for true_id in list(valid_labels[wrong_sample_index])]
    wrong_sample_pred = [id_to_name[pred_id] for pred_id in list(valid_predict[wrong_sample_index])]

    # split the score rows into one list per class column
    score_rows = valid_output[wrong_sample_index].tolist()
    entailment_prob = [row[0] for row in score_rows]
    contradiction_prob = [row[1] for row in score_rows]
    neutral_prob = [row[2] for row in score_rows]

    return wrong_sample_text, wrong_sample_label, wrong_sample_pred, entailment_prob, contradiction_prob, neutral_prob


In [None]:
# Dataset construction.
class NLI_Dataset(Dataset):
    """Pairs a mapping of tokenized tensors with a sequence of labels.

    `tokenized_dataset` maps field names to tensors indexed by sample along the
    first axis; `labels` is indexable by the same sample position.
    """
    def __init__(self, tokenized_dataset, labels):
        self.tokenized_dataset = tokenized_dataset
        self.labels = labels

    def __len__(self):
        # the dataset length is defined by the labels
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict with a detached copy of every field at `idx` plus a 'labels' tensor."""
        item = {}
        for key, val in self.tokenized_dataset.items():
            item[key] = val[idx].clone().detach()
        item['labels'] = torch.tensor(self.labels[idx])
        return item

In [None]:
def preprocessing_dataset(args, dataset):
    """Clean a raw NLI dataframe and, for training data, add an integer 'labels' column.

    Data is treated as test data when any value of its 'label' column is the
    placeholder string 'answer'; such data is returned unchanged apart from a
    fresh 0..n-1 index. For training data, rows with a missing label and
    duplicated (premise, hypothesis, label) rows are dropped, and 'label' is
    encoded into 'labels' (entailment=0, contradiction=1, neutral=2).

    Args:
        args: unused; kept for call compatibility.
        dataset: DataFrame with 'premise', 'hypothesis' and 'label' columns.

    Returns:
        A new DataFrame with a reset index; the input frame is not modified.
    """
    # `'answer' in series` would test the index, not the values, so compare the values explicitly.
    is_test_data = dataset['label'].eq('answer').any()

    if not is_test_data: # training data only
        dataset = dataset.loc[dataset['label'].notnull(), :] # drop rows whose label is missing
        # the data contains duplicated rows; copy so the column assignment below
        # writes to an independent frame
        dataset = dataset.drop_duplicates(['premise','hypothesis','label']).copy()

        # NOTE: the train set contains sentences with special characters (", %, ~)
        # that do not occur in the test set; dropping them was considered but is not done.

        # Label Encoding
        label_to_num_dict = {'entailment':0,'contradiction':1,'neutral':2,}
        dataset['labels'] = dataset['label'].map(label_to_num_dict)

    return dataset.reset_index(drop=True)

def load_data(args, dataset_dir):
    """Read the CSV file at `dataset_dir` and return it after `preprocessing_dataset`."""
    print("===================loading data=====================")
    raw_frame = pd.read_csv(dataset_dir)
    return preprocessing_dataset(args, raw_frame)

# Tokenizing for BERT-style model input.
def tokenized_dataset(args, dataset, tokenizer):
    """Tokenize the premise/hypothesis columns of `dataset` as sentence pairs.

    Args:
        args: object providing `seq_max_len`, the maximum sequence length.
        dataset: DataFrame with 'premise' and 'hypothesis' columns.
        tokenizer: callable taking two lists of strings plus the keyword options below.

    Returns:
        Whatever `tokenizer` returns for the pairs (requested as PyTorch tensors,
        padded, truncated to `args.seq_max_len`, with special tokens added).
    """
    lst_premise = dataset['premise'].tolist()
    lst_hypothesis = dataset['hypothesis'].tolist()

    tokenized_sentences = tokenizer(
        lst_premise,
        lst_hypothesis,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=args.seq_max_len,
        add_special_tokens=True
    )

    return tokenized_sentences


def add_extra_data(args, dataset, label):
    """Append the optional KLUE / NIKL extra CSVs to `dataset` and their labels to `label`.

    Each source is loaded only when its flag on `args` (`add_klue_data`,
    `add_nikl_data`) equals True. The index of the resulting frame is reset in
    place, so when no source is enabled the caller's frame itself is re-indexed.

    Returns:
        (dataset, label) with the extra rows and labels appended in KLUE, NIKL order.
    """
    extra_sources = (
        ("add_klue_data", './data/klue_extra_data.csv'),
        ("add_nikl_data", './data/nikl_extra_data.csv'),
    )
    for flag_name, csv_path in extra_sources:
        if getattr(args, flag_name) == True:
            extra_data = load_data(args, dataset_dir=csv_path)
            extra_label = extra_data['labels'].values
            dataset = pd.concat([dataset, extra_data], axis=0)
            label = np.hstack([label, extra_label])
    dataset.reset_index(drop=True, inplace=True)
    return dataset, label

In [None]:
def get_trainLoader(args, train_data, valid_data, train_label, valid_label, tokenizer):
    """Tokenize the train/valid frames and wrap them in DataLoaders.

    Both loaders use `args.batch_size` and `args.num_workers`; only the training
    loader shuffles.

    Returns:
        (trainloader, validloader)
    """
    # tokenize both splits first
    train_tokens = tokenized_dataset(args, train_data, tokenizer)
    valid_tokens = tokenized_dataset(args, valid_data, tokenizer)

    # then wrap them as pytorch datasets
    train_dataset = NLI_Dataset(train_tokens, train_label)
    valid_dataset = NLI_Dataset(valid_tokens, valid_label)

    shared_options = dict(batch_size=args.batch_size, num_workers=args.num_workers)
    trainloader = DataLoader(train_dataset, shuffle=True, **shared_options)
    validloader = DataLoader(valid_dataset, shuffle=False, **shared_options)

    return trainloader, validloader


In [None]:
from torch.optim import Adam, AdamW
from torch.optim.optimizer import Optimizer, required
import math

class AdamP(Optimizer):
    """Adam-style optimizer with an extra projection step on multi-dimensional parameters.

    For parameters with more than one dimension, the update direction may have its
    component along the parameter removed (see `_projection`), in which case the
    weight decay for that parameter is scaled by `wd_ratio`.

    Args:
        params: iterable of parameters or parameter groups.
        lr: learning rate.
        betas: coefficients for the running averages of the gradient and its square.
        eps: term added to the denominator and to norms for numerical stability.
        weight_decay: weight decay coefficient; applied only when greater than 0.
        delta: threshold factor for the cosine-similarity test in `_projection`.
        wd_ratio: weight-decay multiplier used when the projection is applied.
        nesterov: if True, use the Nesterov-style update direction.
    """
    def __init__(
        self,
        params,
        lr=1e-3,
        betas=(0.9, 0.999),
        eps=1e-8,
        weight_decay=0,
        delta=0.1,
        wd_ratio=0.1,
        nesterov=False,
    ):
        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            delta=delta,
            wd_ratio=wd_ratio,
            nesterov=nesterov,
        )
        super(AdamP, self).__init__(params, defaults)

    def _channel_view(self, x):
        """View `x` as 2-D with one row per index of its first dimension."""
        return x.view(x.size(0), -1)

    def _layer_view(self, x):
        """View `x` as a single row containing all of its elements."""
        return x.view(1, -1)

    def _cosine_similarity(self, x, y, eps, view_func):
        """Absolute row-wise cosine similarity of `x` and `y` after applying `view_func` to both."""
        x = view_func(x)
        y = view_func(y)

        return F.cosine_similarity(x, y, dim=1, eps=eps).abs_()

    def _projection(self, p, grad, perturb, delta, wd_ratio, eps):
        """Conditionally remove from `perturb` its component along the parameter `p`.

        The channel view is tried first, then the layer view. For the first view in
        which the largest |cos(grad, p)| is below delta / sqrt(row length), `perturb`
        is modified in place and `wd_ratio` is returned as the weight-decay
        multiplier. If neither view passes, `perturb` is returned untouched with a
        multiplier of 1.
        """
        wd = 1
        # shape used to broadcast one scalar per row back onto p's shape
        expand_size = [-1] + [1] * (len(p.shape) - 1)
        for view_func in [self._channel_view, self._layer_view]:

            cosine_sim = self._cosine_similarity(grad, p.data, eps, view_func)

            if cosine_sim.max() < delta / math.sqrt(view_func(p.data).size(1)):
                # row-wise normalized parameter
                p_n = p.data / view_func(p.data).norm(dim=1).view(expand_size).add_(eps)
                # subtract the component of perturb along p_n
                perturb -= p_n * view_func(p_n * perturb).sum(dim=1).view(expand_size)
                wd = wd_ratio

                return perturb, wd

        return perturb, wd

    def step(self, closure=None):
        """Perform one optimization step.

        Args:
            closure: optional callable that is invoked once and whose return value
                is returned as the loss.

        Returns:
            The value returned by `closure`, or None when no closure is given.
        """
        loss = None
        if closure is not None:
            loss = closure()

        for group in self.param_groups:
            for p in group["params"]:
                # parameters without a gradient are left unchanged
                if p.grad is None:
                    continue

                grad = p.grad.data
                beta1, beta2 = group["betas"]
                nesterov = group["nesterov"]

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state["step"] = 0
                    state["exp_avg"] = torch.zeros_like(p.data)
                    state["exp_avg_sq"] = torch.zeros_like(p.data)

                # Adam
                exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"]

                state["step"] += 1
                bias_correction1 = 1 - beta1 ** state["step"]
                bias_correction2 = 1 - beta2 ** state["step"]

                # update the running averages of the gradient and the squared gradient in place
                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)

                denom = (exp_avg_sq.sqrt() / math.sqrt(bias_correction2)).add_(
                    group["eps"]
                )
                step_size = group["lr"] / bias_correction1

                if nesterov:
                    perturb = (beta1 * exp_avg + (1 - beta1) * grad) / denom
                else:
                    perturb = exp_avg / denom

                # Projection (only for parameters with more than one dimension)
                wd_ratio = 1
                if len(p.shape) > 1:
                    perturb, wd_ratio = self._projection(
                        p,
                        grad,
                        perturb,
                        group["delta"],
                        group["wd_ratio"],
                        group["eps"],
                    )

                # Weight decay: shrink the parameter directly, scaled by wd_ratio
                if group["weight_decay"] > 0:
                    p.data.mul_(1 - group["lr"] * group["weight_decay"] * wd_ratio)

                # Step
                p.data.add_(perturb, alpha=-step_size)

        return loss

def get_optimizer(model, args):
    """Build the optimizer selected by `args.optimizer` over all parameters of `model`.

    Args:
        model: module whose `parameters()` are optimized.
        args: object providing `optimizer` ("Adam", "AdamW" or "AdamP") and `lr`.

    Returns:
        The optimizer, with gradients already zeroed. Weight decay is fixed at 0.01.

    Raises:
        ValueError: if `args.optimizer` is not one of the supported names.
    """
    if args.optimizer == "Adam":
        optimizer = Adam(model.parameters(), lr=args.lr, weight_decay=0.01)
    elif args.optimizer == "AdamW":
        optimizer = AdamW(model.parameters(), lr=args.lr, weight_decay=0.01)
    elif args.optimizer == "AdamP":
        optimizer = AdamP(
            model.parameters(),
            lr=args.lr,
            betas=(0.9, 0.999),
            weight_decay=0.01,
            delta=0.1,
            wd_ratio=0.1,
            nesterov=False,
        )
    else:
        # fail with a clear message instead of an UnboundLocalError further down
        raise ValueError(
            f"Unsupported optimizer {args.optimizer!r}; expected 'Adam', 'AdamW' or 'AdamP'"
        )

    # reset the gradients of all parameters to zero
    optimizer.zero_grad()

    return optimizer


In [None]:
import math
import torch
from torch.optim.lr_scheduler import ReduceLROnPlateau, _LRScheduler
from transformers import get_linear_schedule_with_warmup

# https://github.com/katsura-jp/pytorch-cosine-annealing-with-warmup/blob/master/cosine_annearing_with_warmup.py
class CosineAnnealingWarmupRestarts(_LRScheduler):
    """
    Cyclic learning-rate schedule: linear warmup followed by cosine decay in each cycle.

        optimizer (Optimizer): Wrapped optimizer.
        first_cycle_steps (int): First cycle step size.
        cycle_mult(float): Cycle steps magnification. Default: 1.
        max_lr(float): First cycle's max learning rate. Default: 0.1.
        min_lr(float): Min learning rate. Default: 0.001.
        warmup_steps(int): Linear warmup step size. Default: 0.
        gamma(float): Decrease rate of max learning rate by cycle. Default: 1.
        last_epoch (int): The index of last epoch. Default: -1.
    """
    
    def __init__(self,
                 optimizer : torch.optim.Optimizer,
                 first_cycle_steps : int,
                 cycle_mult : float = 1.,
                 max_lr : float = 0.1,
                 min_lr : float = 0.001,
                 warmup_steps : int = 0,
                 gamma : float = 1.,
                 last_epoch : int = -1
        ):
        # the warmup must fit strictly inside the first cycle
        assert warmup_steps < first_cycle_steps
        
        self.first_cycle_steps = first_cycle_steps # first cycle step size
        self.cycle_mult = cycle_mult # cycle steps magnification
        self.base_max_lr = max_lr # first max learning rate
        self.max_lr = max_lr # max learning rate in the current cycle
        self.min_lr = min_lr # min learning rate
        self.warmup_steps = warmup_steps # warmup step size
        self.gamma = gamma # decrease rate of max learning rate by cycle
        
        self.cur_cycle_steps = first_cycle_steps # first cycle step size
        self.cycle = 0 # cycle count
        self.step_in_cycle = last_epoch # step size of the current cycle
        
        super(CosineAnnealingWarmupRestarts, self).__init__(optimizer, last_epoch)
        
        # set learning rate min_lr
        self.init_lr()
    
    def init_lr(self):
        """Set every param group's lr to `min_lr` and use `min_lr` as each base lr."""
        self.base_lrs = []
        for param_group in self.optimizer.param_groups:
            param_group['lr'] = self.min_lr
            self.base_lrs.append(self.min_lr)
    
    def get_lr(self):
        """Return one learning rate per param group for the current position in the cycle.

        Before the first step (step_in_cycle == -1) the base lrs are returned.
        During warmup the lr rises linearly from the base lr towards `max_lr`;
        afterwards it follows a half cosine from `max_lr` back down to the base lr
        over the remaining steps of the cycle.
        """
        if self.step_in_cycle == -1:
            return self.base_lrs
        elif self.step_in_cycle < self.warmup_steps:
            return [(self.max_lr - base_lr)*self.step_in_cycle / self.warmup_steps + base_lr for base_lr in self.base_lrs]
        else:
            return [base_lr + (self.max_lr - base_lr) \
                    * (1 + math.cos(math.pi * (self.step_in_cycle-self.warmup_steps) \
                                    / (self.cur_cycle_steps - self.warmup_steps))) / 2
                    for base_lr in self.base_lrs]

    def step(self, epoch=None):
        """Advance the schedule and write the new learning rates into the optimizer.

        With `epoch=None` the position moves forward by one step, starting a new
        cycle when the current one is exhausted. With an explicit `epoch` the cycle
        number and the position inside it are recomputed from that value.
        """
        if epoch is None:
            epoch = self.last_epoch + 1
            self.step_in_cycle = self.step_in_cycle + 1
            if self.step_in_cycle >= self.cur_cycle_steps:
                # start the next cycle; only the non-warmup part of the cycle is scaled by cycle_mult
                self.cycle += 1
                self.step_in_cycle = self.step_in_cycle - self.cur_cycle_steps
                self.cur_cycle_steps = int((self.cur_cycle_steps - self.warmup_steps) * self.cycle_mult) + self.warmup_steps
        else:
            if epoch >= self.first_cycle_steps:
                if self.cycle_mult == 1.:
                    # all cycles have the same length
                    self.step_in_cycle = epoch % self.first_cycle_steps
                    self.cycle = epoch // self.first_cycle_steps
                else:
                    # cycle lengths form a geometric series; n is the number of completed cycles
                    n = int(math.log((epoch / self.first_cycle_steps * (self.cycle_mult - 1) + 1), self.cycle_mult))
                    self.cycle = n
                    self.step_in_cycle = epoch - int(self.first_cycle_steps * (self.cycle_mult ** n - 1) / (self.cycle_mult - 1))
                    self.cur_cycle_steps = self.first_cycle_steps * self.cycle_mult ** (n)
            else:
                self.cur_cycle_steps = self.first_cycle_steps
                self.step_in_cycle = epoch
                
        # the peak lr shrinks by a factor of gamma with every completed cycle
        self.max_lr = self.base_max_lr * (self.gamma**self.cycle)
        self.last_epoch = math.floor(epoch)
        for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()):
            param_group['lr'] = lr


def get_scheduler(optimizer, args, total_batch_):
    """Build the learning-rate scheduler selected by `args.scheduler`.

    Args:
        optimizer: optimizer to wrap.
        args: object providing `scheduler` ("plateau", "linear" or "cosine") and,
            depending on the choice, `warmup_steps`, `epochs`, `cycle_mult` and `lr`.
        total_batch_: number of batches per epoch; used by the "linear" schedule
            to compute the total number of training steps.

    Returns:
        The scheduler instance.

    Raises:
        ValueError: if `args.scheduler` is not one of the supported names.
    """
    if args.scheduler == "plateau":
        scheduler = ReduceLROnPlateau(
            optimizer, patience=2, factor=0.85, mode="max", verbose=True
        )
    elif args.scheduler == "linear":
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            # num_warmup_steps=int(total_batch_*args.epochs*0.1),
            num_warmup_steps=args.warmup_steps,
            num_training_steps=int(total_batch_*args.epochs),
        )
    elif args.scheduler == "cosine":
        scheduler = CosineAnnealingWarmupRestarts( # ver1: first_cycle=20, warmup_steps=5, cycle_mult=1.0, max_lr=args.lr, min_lr=args.lr/100, gamma=0.8, patience=7, 
            optimizer,                             # ver2: first_cycle=30, warmup_steps=5, cycle_mult=0.8, max_lr=args.lr, min_lr=args.lr/100, gamma=0.8, patience=5
            first_cycle_steps=300,                  # ver3: first_cycle=50, warmup_steps=10, cycle_mult=1.0, max_lr=args.lr, min_lr=args.lr/100, gamma=0.8, patience=7
            warmup_steps=args.warmup_steps,
            cycle_mult=args.cycle_mult,
            max_lr=args.lr,
            min_lr=args.lr * 0.01,
            gamma=0.8,
        )
    else:
        # fail with a clear message instead of an UnboundLocalError at the return
        raise ValueError(
            f"Unsupported scheduler {args.scheduler!r}; expected 'plateau', 'linear' or 'cosine'"
        )

    return scheduler



In [None]:
class kobert_Classifier(nn.Module):
    """Classification head on top of a BERT-style encoder.

    The hidden state at sequence position 0 goes through a linear pooler, tanh,
    optional dropout and a final linear classifier.

    Args:
        bert: encoder called with `input_ids`, `attention_mask` and `token_type_ids`;
            element 0 of its output is used as the per-token hidden states.
        hidden_size: size of the encoder hidden states.
        num_classes: number of output logits.
        dr_rate: dropout probability; dropout is skipped entirely when it is 0.
    """
    def __init__(self, bert, hidden_size=768, num_classes=3, dr_rate=0.0):
        super().__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.pooler = nn.Linear(hidden_size, hidden_size)
        self.classifier = nn.Linear(hidden_size, num_classes)
        torch.nn.init.xavier_uniform_(self.classifier.weight)
        self.dropout = nn.Dropout(p=dr_rate)

    def forward(self, token_ids, attention_mask, segment_ids):
        """Return the class logits for a batch of token ids."""
        # the encoder output is indexed with [0] because passing the whole output on
        # fails with "dropout(): argument 'input' must be Tensor, not tuple"
        encoder_output = self.bert(input_ids=token_ids, attention_mask=attention_mask, token_type_ids=segment_ids)
        first_token_state = encoder_output[0][:, 0, :]

        # although BERT uses tanh here, it seems Electra authors used gelu here
        pooled = torch.tanh(self.pooler(first_token_state))

        if self.dr_rate:
            pooled = self.dropout(pooled)

        return self.classifier(pooled)
    

class roberta_large_Classifier(nn.Module):
    """RoBERTa encoder followed by two ``FCLayer`` blocks (pooler, classifier).

    Args:
        roberta: encoder callable accepting ``input_ids`` and
            ``attention_mask`` keywords; element ``[0]`` of its output is used
            as the per-token hidden states.
        hidden_size: width of the encoder hidden states.
        num_classes: number of output logits.
        dr_rate: dropout rate forwarded to both ``FCLayer`` blocks.
    """

    def __init__(self, roberta, hidden_size=1024, num_classes=3, dr_rate=0.0):
        super().__init__()
        self.roberta = roberta
        self.dr_rate = dr_rate

        pooled_size = hidden_size // 2
        self.pooler = FCLayer(hidden_size, pooled_size, self.dr_rate)
        # Final projection: FCLayer with its activation switched off.
        self.classifier = FCLayer(pooled_size, num_classes, self.dr_rate, False)

    def forward(self, token_ids, attention_mask, segment_ids=None):
        """Return class logits; ``segment_ids`` is accepted but not used."""
        hidden_states = self.roberta(input_ids=token_ids, attention_mask=attention_mask)[0]

        # Position 0 holds the <s> token (the RoBERTa counterpart of [CLS]).
        first_token_state = hidden_states[:, 0, :]
        return self.classifier(self.pooler(first_token_state))


# Borrows the idea of R-BERT (relation extraction): pool each sentence span
# separately and classify on the concatenation with the first-token vector.
# https://github.com/monologg/R-BERT/blob/master/model.py#L21
class r_roberta_Classifier(nn.Module):
    """RoBERTa head that classifies on [first token ; premise ; hypothesis].

    The premise and hypothesis vectors are mean-pooled over their own token
    spans, which are located per example from the separator tokens:

    * premise: tokens after position 0 and before the first separator;
    * hypothesis: tokens after the second-to-last separator and before the
      last separator.

    This covers both the ``<s> A </s> B </s>`` layout (two separators) and the
    ``<s> A </s></s> B </s>`` layout (three separators).

    Args:
        roberta: encoder callable accepting ``input_ids`` and
            ``attention_mask`` keywords; element ``[0]`` of its output is used
            as the per-token hidden states.
        hidden_size: width of the encoder hidden states.
        num_classes: number of output logits.
        dr_rate: dropout rate forwarded to every ``FCLayer``.
        sep_token_id: vocabulary id of the separator token (default 2, the
            value that was previously hard-coded).
    """

    def __init__(self, roberta, hidden_size=1024, num_classes=3, dr_rate=0.0, sep_token_id=2):
        super(r_roberta_Classifier, self).__init__()
        self.roberta = roberta
        self.dr_rate = dr_rate
        self.sep_token_id = sep_token_id

        self.cls_fc = FCLayer(hidden_size, hidden_size//2, self.dr_rate)
        self.sentence_fc = FCLayer(hidden_size, hidden_size//2, self.dr_rate)
        self.label_classifier = FCLayer(hidden_size//2 * 3, num_classes, self.dr_rate, False)

    @staticmethod
    def _masked_mean(hidden_states, mask):
        """Average ``hidden_states`` over the sequence positions selected by ``mask``.

        ``hidden_states`` is indexed as (batch, position, feature) and ``mask``
        as (batch, position). An empty span yields a zero vector instead of NaN.
        """
        weights = mask.unsqueeze(-1).to(hidden_states.dtype)
        token_count = weights.sum(dim=1).clamp(min=1.0)
        return (hidden_states * weights).sum(dim=1) / token_count

    def forward(self, token_ids, attention_mask, segment_ids=None):
        """Return class logits; ``segment_ids`` is accepted but not used.

        Raises:
            ValueError: if any example contains fewer than two separator tokens.
        """
        out = self.roberta(input_ids=token_ids, attention_mask=attention_mask)[0]

        # Locate the separators of EVERY example. Using only the first
        # example's positions for the whole batch mis-slices all other rows.
        is_sep = token_ids == self.sep_token_id
        sep_total = is_sep.long().sum(dim=1, keepdim=True)
        if bool((sep_total < 2).any()):
            raise ValueError(
                f"every input needs at least two separator tokens (id={self.sep_token_id})"
            )
        # Number of separators seen up to and including each position.
        sep_rank = is_sep.long().cumsum(dim=1)

        prem_mask = sep_rank == 0          # everything before the first separator
        prem_mask[:, 0] = False            # ... except the leading <s> token
        hypo_mask = (sep_rank == sep_total - 1) & ~is_sep

        cls_vector = out[:, 0, :]  # take <s> token (equiv. to [CLS])
        prem_vector = self._masked_mean(out, prem_mask)
        hypo_vector = self._masked_mean(out, hypo_mask)

        # Dropout -> tanh -> fc_layer (premise and hypothesis share one FC layer)
        cls_embedding = self.cls_fc(cls_vector)
        prem_embedding = self.sentence_fc(prem_vector)
        hypo_embedding = self.sentence_fc(hypo_vector)

        # Concat -> fc_layer
        concat_embedding = torch.cat([cls_embedding, prem_embedding, hypo_embedding], dim=-1)

        return self.label_classifier(concat_embedding)


class FCLayer(nn.Module):
    """Dropout -> optional tanh -> linear projection.

    Note the order: the activation is applied to the *input* of the linear
    layer, not to its output.

    Args:
        input_dim: size of the incoming feature vector.
        output_dim: size of the projected feature vector.
        dropout_rate: probability passed to ``nn.Dropout``.
        use_activation: when False the tanh step is skipped.
    """

    def __init__(self, input_dim, output_dim, dropout_rate=0.0, use_activation=True):
        super().__init__()
        self.use_activation = use_activation
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(input_dim, output_dim)
        self.tanh = nn.Tanh()

        # Replace the default weight initialisation with Xavier-uniform.
        nn.init.xavier_uniform_(self.linear.weight)

    def forward(self, x):
        """Apply dropout, then tanh when enabled, then the linear layer."""
        features = self.dropout(x)
        if self.use_activation:
            features = self.tanh(features)
        return self.linear(features)

In [None]:
def get_tokenizer(args):
    """Return the pretrained tokenizer that matches ``args.model``.

    Raises:
        NotImplementedError: if ``args.model`` is not one of the supported keys.
    """
    # Model key -> Hugging Face checkpoint name.
    # (kobert previously pointed at 'monologg/kobert' via KoBertTokenizer.)
    checkpoint_by_model = {
        'kobert': "kykim/bert-kor-base",
        'klue_roberta_large': "klue/roberta-large",
        'r_klue_roberta': "klue/roberta-large",
        'r_roberta': "xlm-roberta-large",
    }

    if args.model not in checkpoint_by_model:
        raise NotImplementedError('Tokenizer & Model not available')

    return AutoTokenizer.from_pretrained(checkpoint_by_model[args.model])


In [None]:
def get_model(args):
    """Build the classifier selected by ``args.model`` on a pretrained encoder.

    ``args.dp`` is forwarded to the classifier as its dropout rate.

    Raises:
        NotImplementedError: if ``args.model`` is not one of the supported keys.
    """
    model_key = args.model

    if model_key == 'kobert':
        # (previously loaded from "monologg/kobert")
        encoder = BertModel.from_pretrained("kykim/bert-kor-base")
        return kobert_Classifier(encoder, dr_rate=args.dp)

    # RoBERTa variants: model key -> Hugging Face checkpoint name.
    roberta_checkpoints = {
        'klue_roberta_large': "klue/roberta-large",
        'r_roberta': "xlm-roberta-large",
        'r_klue_roberta': "klue/roberta-large",
    }
    if model_key not in roberta_checkpoints:
        raise NotImplementedError('Tokenizer & Model not available')

    encoder = RobertaModel.from_pretrained(roberta_checkpoints[model_key], add_pooling_layer=False)
    if model_key == 'klue_roberta_large':
        return roberta_large_Classifier(encoder, dr_rate=args.dp)
    # Both 'r_*' keys use the R-BERT style head.
    return r_roberta_Classifier(encoder, dr_rate=args.dp)


In [None]:
# https://discuss.pytorch.org/t/is-this-a-correct-implementation-for-focal-loss-in-pytorch/43327/8
class FocalLoss(nn.Module):
    """Focal loss: log-probabilities scaled by ``(1 - p) ** gamma`` before NLL.

    Args:
        weight: optional per-class weights forwarded to ``F.nll_loss``.
        gamma: focusing exponent; 0 reduces the loss to plain cross entropy.
        reduction: reduction mode forwarded to ``F.nll_loss``.
    """

    def __init__(self, weight=None,
                 gamma=2., reduction='mean'):
        super().__init__()
        self.weight = weight
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, input_tensor, target_tensor):
        """Return the focal loss of logits ``input_tensor`` against class ids ``target_tensor``."""
        log_prob = F.log_softmax(input_tensor, dim=-1)
        # Down-weights classes that already receive high probability.
        modulating_factor = (1 - log_prob.exp()).pow(self.gamma)
        return F.nll_loss(
            modulating_factor * log_prob,
            target_tensor,
            weight=self.weight,
            reduction=self.reduction,
        )

class LabelSmoothingLoss(nn.Module):
    """Cross entropy against a label-smoothed target distribution.

    The true class receives ``1 - smoothing``; the remaining ``smoothing`` mass
    is split evenly over the other ``classes - 1`` classes.

    Args:
        classes: number of classes.
        smoothing: probability mass moved away from the true class.
        dim: dimension used for the log-softmax and the per-sample sum.
    """

    def __init__(self, classes=3, smoothing=0.0, dim=-1):
        super().__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.cls = classes
        self.dim = dim

    def forward(self, pred, target):
        """Return the mean smoothed cross entropy of logits ``pred`` against class ids ``target``."""
        log_probs = pred.log_softmax(dim=self.dim)
        with torch.no_grad():
            # Start from the off-target value everywhere, then overwrite the
            # true-class column (scatter along dimension 1) with the confidence.
            off_target = self.smoothing / (self.cls - 1)
            smoothed_target = torch.full_like(log_probs, off_target)
            smoothed_target.scatter_(1, target.data.unsqueeze(1), self.confidence)
        per_sample_loss = torch.sum(-smoothed_target * log_probs, dim=self.dim)
        return per_sample_loss.mean()


def get_criterion(args):
    """Return the loss module selected by ``args.criterion``.

    'smoothing' is only honoured when ``args.smoothing`` is non-zero; with a
    zero smoothing value it falls through to the error below.

    Raises:
        NotImplementedError: if no branch matches.
    """
    smoothing = args.smoothing
    kind = args.criterion

    if smoothing != 0 and kind == 'smoothing':
        return LabelSmoothingLoss(smoothing=smoothing)
    if kind == 'cross':
        return nn.CrossEntropyLoss()
    if kind == 'focal':
        return FocalLoss(gamma=2.0)
    raise NotImplementedError('Criterion not available')

In [None]:
def train(args, wandb, fold_lst=[4,5]):
    """Fine-tune one model per selected fold of a stratified K-fold split.

    The training file is read from ``./data/{args.train_file}`` and split with
    ``StratifiedKFold(n_splits=args.n_splits, shuffle=True, random_state=42)``.
    Folds are numbered from 1; folds whose number is not in ``fold_lst`` are
    skipped. For every trained fold the function

    * opens a wandb run named ``{args.model_name}/{fold}-fold``,
    * optionally extends the training split via ``add_extra_data`` when
      ``args.add_klue_data`` or ``args.add_nikl_data`` is set,
    * trains for up to ``args.epochs`` epochs, stepping the scheduler after
      every batch,
    * validates after every epoch and saves ``best.pt`` under
      ``./models/{args.model_name}/{fold}-fold/`` whenever the validation
      accuracy improves,
    * stops early after ``args.patience`` consecutive epochs without
      improvement.

    Args:
        args: configuration object; the attributes read here are visible below.
        wandb: the wandb module (or a compatible object) used for logging.
        fold_lst: 1-based fold numbers to train.
            NOTE(review): this is a mutable default; it is only read, never
            mutated, inside this function.

    Returns:
        None. The mean of the per-fold best validation accuracies is printed
        and logged to wandb.

    Notes:
        * ``device`` is not defined in this function; it is taken from the
          enclosing (module) scope.
        * Validation loss/accuracy are averages of per-batch values, so a
          smaller last batch carries the same weight as a full one.
        * When early stopping triggers, the ``break`` runs before that
          epoch's validation metrics are printed or logged.
    """
    criterion = get_criterion(args)
    tokenizer = get_tokenizer(args)
    all_dataset = load_data(args, dataset_dir = f'./data/{args.train_file}')
    all_label = all_dataset['labels'].values

    kf = StratifiedKFold(n_splits=args.n_splits, random_state=42, shuffle=True)
    fold_idx = 1  # folds are numbered starting at 1
    best_val_acc_list = []
    for train_index, test_index in kf.split(all_dataset, all_label):
        # Skip folds that were not requested, but keep the numbering in sync.
        if fold_idx not in fold_lst:
            fold_idx+=1
            continue

        # One wandb run per fold.
        run = wandb.init(project=args.project_name)
        wandb.run.name = f'{args.model_name}/{fold_idx}-fold'
        wandb.config.update(args)

        os.makedirs(f'./models/{args.model_name}/{fold_idx}-fold', exist_ok=True)
        ### Model Select
        model = get_model(args)
        print('===================get model===================')
        model.to(device)


        train_data, valid_data = all_dataset.iloc[train_index], all_dataset.iloc[test_index]
        train_label, valid_label = all_label[train_index], all_label[test_index]
        
        print(f"len(train_label) : {len(train_label)}")
        print(f"len(train_data) : {len(train_data)}")
        if args.add_klue_data or args.add_nikl_data: # use external data (training split only)
            train_data, train_label = add_extra_data(args, train_data, train_label)
            print('='*15,'Extra Data Added','='*15)
        print(f"len(train_label) : {len(train_label)}")
        print(f"len(train_data) : {len(train_data)}")

        trainloader, validloader = get_trainLoader(args, train_data, valid_data, train_label, valid_label, tokenizer)
        total_batch_ = len(trainloader)   # number of training batches per epoch
        valid_batch_ = len(validloader)   # number of validation batches

        ### Optimizer
        optimizer = get_optimizer(model, args)

        ### Scheduler
        scheduler = get_scheduler(optimizer, args, total_batch_)

        # best_val_loss is tracked but only best_val_acc drives checkpointing.
        best_val_loss, best_val_acc, = np.inf, 0
        early_stopping_counter = 0

        print(f"---------------------------------- {fold_idx} fold----------------------------------")
        for i in tqdm(range(1, args.epochs+1)):
            model.train()
            # [loss sum, accuracy sum] over the epoch / over the current 50-batch window
            epoch_perform, batch_perform = np.zeros(2), np.zeros(2)
            print()
            progress_bar = tqdm(enumerate(trainloader), total=len(trainloader), leave=True, position=0,)
            for j, v in progress_bar:
                input_ids, attention_mask, labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

                # Model names containing 'roberta' get no token_type_ids.
                if 'roberta' in args.model:
                    token_type_ids = None
                else:
                    token_type_ids = v['token_type_ids'].to(device)
                optimizer.zero_grad()

                output = model(input_ids, attention_mask, token_type_ids) # labels are not passed in, so only logits come back

                loss = criterion(output, labels)
                loss.backward()
                optimizer.step()
                scheduler.step()  # the schedule advances once per batch
                # One wandb.log call per value returned by the scheduler.
                for learning_rate in scheduler.get_lr():
                    wandb.log({"learning_rate": learning_rate})


                predict = output.argmax(dim=-1)
                predict = predict.detach().cpu().numpy()
                labels = labels.detach().cpu().numpy()
                acc = accuracy_score(labels, predict)

                batch_perform += np.array([loss.item(), acc])
                epoch_perform += np.array([loss.item(), acc])

                # Print the running mean of the last 50 batches, then reset the window.
                if (j + 1) % 50 == 0:
                    print(
                        f"Epoch {i:#04d} #{j + 1:#03d} -- loss: {batch_perform[0] / 50:#.5f}, acc: {batch_perform[1] / 50:#.4f}"
                        )
                    batch_perform = np.zeros(2)
            print()
            print(
                f"Epoch {i:#04d} loss: {epoch_perform[0] / total_batch_:#.5f}, acc: {epoch_perform[1] / total_batch_:#.2f}"
                )
            wandb.log({
                "epoch": i,
                "Train epoch Loss": epoch_perform[0] / total_batch_,
                "Train epoch Acc": epoch_perform[1] / total_batch_}
                )
            
            ###### Validation
            model.eval()
            valid_perform = np.zeros(2)  # [loss sum, accuracy sum] over validation batches

            all_valid_predict_lst = []
            all_valid_labels_lst = []

            # Collects the misclassified samples so they can be logged to wandb.
            wrong_sample_dict = defaultdict(list)

            with torch.no_grad():
                for v in validloader:
                    input_ids, attention_mask, valid_labels = v['input_ids'].to(device), v['attention_mask'].to(device), v['labels'].to(device)

                    if 'roberta' in args.model:
                        token_type_ids = None
                    else:
                        token_type_ids = v['token_type_ids'].to(device)

                    valid_output = model(input_ids, attention_mask, token_type_ids)
                    valid_loss = criterion(valid_output, valid_labels)

                    valid_predict = valid_output.argmax(dim=-1)
                    valid_predict = valid_predict.detach().cpu().numpy()
                    valid_labels = valid_labels.detach().cpu().numpy()

                    ###########################
                    # Gather the misclassified validation samples for wandb logging.
                    if args.logging_wrong_samples:
                        wrong_sample_index = np.where(valid_labels!=valid_predict)[0]
                        if len(wrong_sample_index)>0:
                            wrong_sample_text, wrong_sample_label, wrong_sample_pred, entailment_prob, contradiction_prob, neutral_prob = wrong_batch_for_wandb(tokenizer, wrong_sample_index, input_ids, valid_labels, valid_predict, valid_output)

                            wrong_sample_dict['입력 문장 Pair'] += wrong_sample_text
                            wrong_sample_dict['실제값'] += wrong_sample_label
                            wrong_sample_dict['예측값'] += wrong_sample_pred
                            wrong_sample_dict['entailment_logit'] += entailment_prob
                            wrong_sample_dict['contradiction_logit'] += contradiction_prob
                            wrong_sample_dict['neutral_logit'] += neutral_prob
                    ###########################

                    valid_acc = accuracy_score(valid_labels, valid_predict)
                    valid_perform += np.array([valid_loss.item(), valid_acc])

                    all_valid_predict_lst += list(valid_predict)
                    all_valid_labels_lst += list(valid_labels)
            
            ###### Model save
            # Averages of the per-batch values (not weighted by batch size).
            val_total_loss = valid_perform[0] / valid_batch_
            val_total_acc = valid_perform[1] / valid_batch_
            best_val_loss = min(best_val_loss, val_total_loss)
        
            if val_total_acc > best_val_acc:    #  and val_total_acc >= 0.25
                print(f"New best model for val accuracy : {val_total_acc:#.4f}! saving the best model..")
                torch.save(model.state_dict(), f"./models/{args.model_name}/{fold_idx}-fold/best.pt")
                
                # Reference: saving a checkpoint that allows training to be resumed
                # https://tutorials.pytorch.kr/beginner/saving_loading_models.html#checkpoint

                best_val_acc = val_total_acc
                early_stopping_counter = 0

                ### Confusion Matrix
                class_names = ['entailment','contradiction','neutral'] # (0,1,2)
                # https://wandb.ai/wandb/plots/reports/Confusion-Matrix--VmlldzozMDg1NTM
                wandb.log({f"{i}th_epoch_conf_mat" : wandb.plot.confusion_matrix(probs=None,
                            y_true=all_valid_labels_lst, preds=all_valid_predict_lst,
                            class_names=class_names)})
                
                # Wrong samples are exported only for new-best epochs above 0.91 accuracy.
                if args.logging_wrong_samples and val_total_acc > 0.91:
                    ########### Logging Wrong Samples ##########
                    # Save Wrong DataFrame
                    wrong_sample_df = pd.DataFrame(wrong_sample_dict)
                    wrong_sample_df.to_csv(f"./models/{args.model_name}/{fold_idx}-fold/wrong_df.csv",index=False)
                    print('='*15,f'{fold_idx}-Fold Wrong DataFrame Saved','='*15)
                    # Log the table to wandb
                    text_table = wandb.Table(data = wrong_sample_df)
                    run.log({f"{fold_idx}th_fold_wrong_samples" : text_table})
                    ###########################
            
            else: # score did not beat the best so far: early-stopping check
                early_stopping_counter += 1
                if early_stopping_counter >= args.patience:
                    print(
                        f"EarlyStopping counter: {early_stopping_counter} out of {args.patience}"
                    )
                    # Leaves the epoch loop before the validation print/log below.
                    break

            print()
            print(
                f">>>> Validation loss: {val_total_loss:#.5f}, Acc: {val_total_acc:#.4f}"
                )
            print()
            wandb.log({
                "epoch": i,
                "Valid Loss": val_total_loss,
                "Valid Acc": val_total_acc}
                )

        best_val_acc_list.append(best_val_acc)
        fold_idx +=1
    print('='*50)
    # The list (and therefore the mean) only covers the folds that were trained.
    print(f"{args.n_splits}-fold best_val_acc_list : {best_val_acc_list}")
    print('='*15, f'{args.n_splits}-fold Final Score(ACC) : {np.mean(best_val_acc_list)}', '='*15)
    wandb.log({
    f"Total Mean ACC ({args.n_splits}-fold)": np.mean(best_val_acc_list)}
    )

In [None]:
import easydict

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'current device : {device}')

# Experiment configuration, consumed by train() and the inference helpers.
args = easydict.EasyDict({
    "seed": 42,
    "optimizer": "AdamW",   # one of: AdamW, Adam, AdamP
    "scheduler": "linear",  # one of: linear, cosine, plateau ...
    "warmup_steps": 500,
    "cycle_mult": 1.2,
    "seq_max_len": 128,
    "batch_size": 32,
    "epochs": 20,
    "patience": 5,
    "n_splits": 5,
    "lr": 1e-05,
    "num_workers": 2,
    "criterion": 'cross',   # one of: 'smoothing', 'focal', 'cross'
    "smoothing": 0.0,
    "dp": 0.0,
    # Model type. The original note lists: kobert, koelectra, mbert,
    # roberta_base, roberta_large, klue_roberta_small, klue_roberta_base,
    # klue_roberta_large, klue_roberta_base_nli, r_roberta, r_klue_roberta.
    "model": "r_klue_roberta",

    "logging_wrong_samples": True,
    "train_file": 'train_data.csv',
    "test_file": 'test_data.csv',
    "add_klue_data": True,
    'add_nikl_data': False,
})

# The same descriptive string serves as the wandb project name and as the
# directory name for saved models.
project_name = f"{args.model}_Scdu{args.scheduler}_Dp{args.dp}_add_klue_data{args.add_klue_data}_{args.n_splits}Fd_Sm{args.smoothing}_Bs{args.batch_size}_Lr{args.lr}_Ep{args.epochs}_Cy{args.cycle_mult}"
args.project_name = project_name
args.model_name = project_name

seed_everything(args.seed)


In [None]:
# Show the assembled configuration as this cell's output.
args

In [None]:
# Log in to Weights & Biases (wandb), which train() uses for logging.
wandb.login()


In [None]:
# Train folds 4 and 5 only; train() skips every fold number not in fold_lst.
train(args, wandb, fold_lst=[4,5])

In [None]:
def load_test_dataset(args, tokenizer):
    """Load ``./data/{args.test_file}`` and tokenize it.

    Returns:
        A ``(tokenized_test, test_label)`` pair: the output of
        ``tokenized_dataset`` and the values of the 'labels' column.
    """
    raw_test = load_data(args, dataset_dir = f"./data/{args.test_file}")
    labels = raw_test['labels'].values

    # An earlier version chose an explicit sentence separator here
    # ('</s></s>' for roberta models, '[SEP]' otherwise); it is no longer used.
    encoded_test = tokenized_dataset(args, raw_test, tokenizer)
    return encoded_test, labels

def test_single_main(args, idx):
    """Run inference on the test set with the best checkpoint of fold ``idx``.

    The checkpoint is read from ``./models/{args.model_name}/{idx}-fold/best.pt``
    and the raw logits are saved next to it as ``numpy_logits.npy``.

    Args:
        args: configuration object (model type, batch size, file names ...).
        idx: fold number whose checkpoint is loaded.

    Returns:
        A numpy array holding the arg-max class index of every test example.

    Raises:
        ValueError: if the test loader yields no batches.
    """
    model = get_model(args)
    tokenizer = get_tokenizer(args)
    # load test dataset
    test_dataset, test_label = load_test_dataset(args, tokenizer)
    test_dataset = NLI_Dataset(test_dataset, test_label)
    testloader = DataLoader(
        test_dataset,
        shuffle=False,  # keep the file order so predictions line up with the submission rows
        batch_size=args.batch_size,
        num_workers=args.num_workers,
    )

    load_path = f'./models/{args.model_name}/{idx}-fold/best.pt'
    model.load_state_dict(torch.load(load_path, map_location=device))
    model.to(device)
    model.eval()

    # Same rule as the training loop: roberta models receive no token_type_ids,
    # so the key is not required to exist in their batches.
    use_token_type_ids = 'roberta' not in args.model

    batch_logits = []
    progress_bar = tqdm(testloader, total=len(testloader), leave=True, position=0,)
    with torch.no_grad():
        for data in progress_bar:
            token_type_ids = data['token_type_ids'].to(device) if use_token_type_ids else None
            logits = model(
                data['input_ids'].to(device),
                data['attention_mask'].to(device),
                token_type_ids,
            )
            # Move each batch to the CPU right away and concatenate once at
            # the end instead of re-copying a growing device tensor per batch.
            batch_logits.append(logits.detach().cpu())

    if not batch_logits:
        raise ValueError("test loader produced no batches")

    # (batch, classes) pieces -> (num_examples, classes). No squeeze here, so
    # a single-example test set stays 2-D for the axis=1 argmax below.
    one_fold_logits = torch.cat(batch_logits, dim=0).numpy()
    # Save the logits as a numpy array (np.save appends '.npy').
    np.save(f'./models/{args.model_name}/{idx}-fold/numpy_logits', one_fold_logits)

    return np.argmax(one_fold_logits, axis=1)

In [None]:
# Load the sample submission file as the frame that will receive the predicted labels.
submission_path = "./data/sample_submission.csv"
submission = pd.read_csv(submission_path)

In [None]:
# Predict with the fold-4 checkpoint.
idx = 4
single_pred = test_single_main(args, idx)

# Class index <-> label name, used to turn predictions into submission labels.
label_to_num_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}
num_to_label_dict = {num: label for label, num in label_to_num_dict.items()}
print(f"len(single_pred) : {len(single_pred)}")
print('=' * 50)

# Write the class indices first, then map them to their label names.
submission['label'] = single_pred
submission['label'] = submission['label'].map(num_to_label_dict)
# Cell output: how many rows were assigned to each label.
submission['label'].value_counts()