In [1]:
!pip install datasets transformers==4.28.0

Collecting datasets
  Downloading datasets-2.14.5-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.6/519.6 kB[0m [31m5.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0 (from transformers==4.28.0)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m27.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.28.0)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datase

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Libraries

In [17]:
import os
import json
import copy
import pandas as pd
import numpy as np
import random

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset

from sklearn.metrics import accuracy_score

from transformers import AutoTokenizer, AutoModelForSequenceClassification, PreTrainedTokenizer
from transformers import Trainer, TrainingArguments

from typing import Optional, Dict, Tuple, List, Callable

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Available device is {device}')

Available device is cuda


# Configurations, Global Variables

In [6]:
path_config = {
    'train_data': '/content/drive/MyDrive/korean_baseline/data/sample.jsonl',
    'augmented_train_data': '/content/drive/MyDrive/korean_baseline/data/train_aug_merged.jsonl',
    'dev_data': '/content/drive/MyDrive/korean_baseline/data/sample.jsonl',
    'test_data': '/content/drive/MyDrive/korean_baseline/data/sample.jsonl',
    'model': '/content/drive/MyDrive/korean_baseline/saved_model/'
    }

model_config = {
    'base_model': 'beomi/KcELECTRA-base',
}

optim_config = {
    'lr': 2e-5,
}

data_config = {
    'max_length': 128,
    'batch_size': 8,
}

train_config = {
    'n_epochs': 25,
}

In [7]:
category_id2name = ['제품 전체#일반', '제품 전체#가격', '제품 전체#디자인', '제품 전체#품질', '제품 전체#편의성', '제품 전체#인지도',
                    '본품#일반', '본품#디자인', '본품#품질', '본품#편의성', '본품#다양성', '본품#가격', '본품#인지도',
                    '패키지/구성품#일반', '패키지/구성품#디자인', '패키지/구성품#품질', '패키지/구성품#편의성', '패키지/구성품#다양성', '패키지/구성품#가격', '패키지/구성품#인지도',
                    '브랜드#일반', '브랜드#가격', '브랜드#디자인', '브랜드#품질', '브랜드#인지도']
category_name2id = {category_id2name[i]: i for i in range(len(category_id2name))}
category_idx2name = {i: category_id2name[i] for i in range(len(category_id2name))}

polarity_id2name = ['positive', 'negative', 'neutral', 'n/a']
polarity_name2id = {polarity_id2name[i]: i for i in range(len(polarity_id2name))}

In [8]:
special_tokens_dict = {
    'additional_special_tokens': ['&name&', '&affiliation&', '&social-security-num&', '&tel-num&', '&card-num&', '&bank-account&', '&num&', '&online-account&']
}

# Dataset, DataLoader

In [9]:
class CustomDataset(Dataset):
  def __init__(
      self,
      path,
      tokenizer,
      aug_path = None,
      train = True,
      max_length=128
      ):

    self.istrain = train
    self.data = self._load_data(path, aug_path)
    self.tokenizer = tokenizer
    self.max_length = max_length



  def _load_json(self, path, encoding="utf-8"):
    with open(path, encoding=encoding) as f:
        json_list = [json.loads(line) for line in f.readlines()]
    return json_list


  def _load_data(self, path, aug_path):
    df = pd.DataFrame(self._load_json(path)).drop('id', axis=1)
    df_new = pd.DataFrame({'sentence_form': [], 'category': [], 'polarity': []})

    if self.istrain:
        for item in df.itertuples():
          for pair in category_id2name:
            isPairInOpinion = False
            for annotation in item[2]:
              category, polarity = annotation[0], annotation[2]

              if category == pair:
                df_new.loc[len(df_new)] = [item[1], category, polarity]
                isPairInOpinion = True
                break

            if isPairInOpinion is False:
              df_new.loc[len(df_new)] = [item[1], pair, 'n/a']

        if aug_path is not None:
          df_aug = pd.DataFrame(self._load_json(aug_path)).drop('id', axis=1)
          for item in df_aug.itertuples():
            category, polarity = annotation[0], annotation[2]
            df_new.loc[len(df_new)] = [item[1], category, polarity]
    else:
        for item in df.itertuples():
          for pair in category_id2name:
            isPairInOpinion = False
            for annotation in item[2]:
              category, polarity = annotation[0], annotation[2]

              if category == pair:
                df_new.loc[len(df_new)] = [item[1], category, polarity]
                isPairInOpinion = True
                break

            if isPairInOpinion is False:
              df_new.loc[len(df_new)] = [item[1], pair, 'n/a']


    return df_new


  def __len__(self):
    return len(self.data)


  def __getitem__(self, idx):
    row = self.data.iloc[idx, :]

    input = self.tokenizer(row['sentence_form'], row['category'], max_length=self.max_length, truncation=True, padding='max_length')

    input_ids = input['input_ids']
    input_ids[0] = self.tokenizer.cls_token_id
    attention_mask = input['attention_mask']
    label = polarity_name2id[row['polarity']]
    del input


    return {
        'input_ids': torch.LongTensor(input_ids),
        'attention_mask': torch.LongTensor(attention_mask),
        'label': torch.tensor(label)
        }

In [10]:
tokenizer = AutoTokenizer.from_pretrained(model_config['base_model'])
tokenizer.add_special_tokens(special_tokens_dict)

train_dataset = CustomDataset(path_config['train_data'], tokenizer)
test_dataset= CustomDataset(path_config['dev_data'], tokenizer)

Downloading (…)okenizer_config.json:   0%|          | 0.00/288 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/514 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.78M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [11]:
model = AutoModelForSequenceClassification.from_pretrained(model_config['base_model'], num_labels=4)
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at beomi/KcELECTRA-base were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at beomi/KcELECTRA-base and are newly initialized: ['classifier.dense.weight', 'classifier.

ElectraForSequenceClassification(
  (electra): ElectraModel(
    (embeddings): ElectraEmbeddings(
      (word_embeddings): Embedding(30000, 768, padding_idx=3)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ElectraEncoder(
      (layer): ModuleList(
        (0-11): 12 x ElectraLayer(
          (attention): ElectraAttention(
            (self): ElectraSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ElectraSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): L

In [12]:
class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
      labels = inputs['labels']
      outputs = model(**inputs)
      logits = outputs.get('logits')
      loss_fct = nn.CrossEntropyLoss(weight=torch.tensor([1.0, 5.0, 5.0, 0.1]).to(device))
      loss = loss_fct(logits,labels)
      return (loss, outputs) if return_outputs else loss

In [13]:
training_args = TrainingArguments(
    output_dir=path_config['model']+'polarity_classification/',
    do_train = True,
    do_eval = True,
    learning_rate = 2e-5,
    warmup_steps=1000,
    weight_decay=0.01,
    num_train_epochs=train_config['n_epochs'],
    per_device_train_batch_size=data_config['batch_size'],
    per_device_eval_batch_size=data_config['batch_size'],
    evaluation_strategy='epoch',
    logging_dir='./logs',
    logging_steps=1000,
    save_total_limit=2,
)

In [18]:
def compute_metrics(pred):
  TP = 0
  FP = 0
  FN = 0
  eps = 1e-7

  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)

  # positive
  pos_label = set(np.where(labels==0)[0].tolist())
  pos_preds = set(np.where(preds==0)[0].tolist())
  # negative
  neg_label = set(np.where(labels==1)[0].tolist())
  neg_preds = set(np.where(preds==1)[0].tolist())
  # neutral
  neu_label = set(np.where(labels==2)[0].tolist())
  neu_preds = set(np.where(preds==2)[0].tolist())

  TP = len(set.intersection(pos_label, pos_preds))+len(set.intersection(neu_label, neu_preds))+len(set.intersection(neg_label, neg_preds))
  FP = len(pos_preds-pos_label)+len(neu_preds-neu_label)+len(neg_preds-neg_label)
  FN = len(pos_label-pos_preds)+len(neu_label-neu_preds)+len(neg_label-neg_preds)

  precision = TP / (TP+FP+eps)
  recall = TP / (TP+FN+eps)
  f1 = (2*precision*recall) / (precision+recall+eps)
  acc = accuracy_score(labels, preds)

  return {
      'accuracy' : acc,
      'f1' : f1,
      'precision' : precision,
      'recall' : recall
  }

In [19]:
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)

In [20]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,1.327285,0.093333,0.054795,0.028571,0.666667
2,No log,1.216858,0.96,0.0,0.0,0.0


KeyboardInterrupt: ignored