In [None]:
# Capture the output lines of `nvidia-smi` using IPython's shell-capture syntax.
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
# The substring 'failed' in the command output is treated as "no GPU attached".
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Aug 21 15:55:53 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.105.17   Driver Version: 525.105.17   CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8    11W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
# Report total system memory and whether it reaches the 20 GB "high-RAM" mark.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

is_high_ram = not (ram_gb < 20)
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 13.6 gigabytes of available RAM

Not using a high-RAM runtime


In [None]:
# Change into the project folder so the relative paths used below
# (data/, results/, logs/) resolve, then list its contents.
# NOTE(review): the /content/drive/MyDrive prefix looks like a mounted Google Drive;
# no mount step is visible in this notebook - confirm it happens elsewhere.

%cd /content/drive/MyDrive/ai/competitions/2023_conv_sw_hackathon
!ls

/content/drive/MyDrive/ai/competitions/2023_conv_sw_hackathon
code	     data	       logs	 requirements
conversions  __installer__.sh  mlmodels  results


In [None]:
# Install the libraries imported further down (transformers, datasets) plus accelerate.
# Only accelerate is pinned (0.20.3); the other two take whatever version pip resolves.
!pip install transformers
!pip install datasets
!pip install --upgrade accelerate==0.20.3



In [None]:
# Install KoNLPy and the Mecab Python bindings, then run KoNLPy's mecab.sh script.
# NOTE(review): the script is fetched from the konlpy master branch and executed
# directly, so what it installs can change between runs.
!pip install konlpy
!pip install mecab-python
!bash <(curl -s https://raw.githubusercontent.com/konlpy/konlpy/master/scripts/mecab.sh)

In [None]:
import nltk

# Download NLTK's "popular" resource collection; presumably needed by the
# tokenizer / tagger / chunker calls in the cells below - confirm.
nltk.download("popular")

In [None]:
# svgling is not imported anywhere in the visible notebook; presumably NLTK uses it
# to render parse trees inline - TODO confirm.
!pip install svgling

In [None]:
# Experiment: run NLTK's word tokenizer, POS tagger and NE chunker on one order utterance.
# NOTE(review): these NLTK components are presumably trained on English - confirm the
# result is meaningful for Korean text.
sentence = '아이스 아메리카노는 두 잔 콜드 브루 라떼 한잔이랑 새우깡도 하나 주세요'
tokens = nltk.word_tokenize(sentence)
tagged = nltk.pos_tag(tokens)
entities = nltk.chunk.ne_chunk(tagged)
entities

In [None]:
# Earlier grammar draft, kept commented out for reference:
# grammar = """
# NP: {<DT|PP\$>?<JJ>*<NN>}	  # rule 1
#   {<NNP>+}                  # rule 2
#   {<NNG>+}
#   {<NNP|NNG>+}
# """

# Chunk grammar handed to nltk.RegexpParser below: each rule groups runs of POS tags
# matching the pattern into an NP / VP / AP chunk.
# NOTE(review): `Suffix` does not look like a Mecab-style tag name - confirm it exists
# in `mecab.tagset`, otherwise that part of the NP rule never matches.
grammar = """
NP: {<N.*>*<Suffix>?}   # Noun phrase
VP: {<V.*>*}            # Verb phrase
AP: {<A.*>*}            # Adjective phrase
"""

# Chunk parser built from the grammar above; used by the `cp.parse(tagged)` cell.
cp = nltk.RegexpParser(grammar)


In [None]:
from konlpy.tag import Mecab

# Module-level Mecab analyzer; reused by tag() and the exploration cells below.
mecab = Mecab()
# Show how Mecab splits a sample order utterance into morphemes.
mecab.morphs('아이스 아메리카노 두 잔 콜드 브루 라떼 한 잔 주세요')

In [None]:
# Re-tag the sample sentence with Mecab; this overwrites the NLTK-tagged `tagged` from above.
tagged = mecab.pos(sentence)

In [None]:
# Apply the regex chunk grammar to the Mecab (word, tag) pairs and display the result.
cp.parse(tagged)

In [None]:
# Inspect Mecab's tag set; its keys are turned into integer ids in `pos_tag_ids` below.
mecab.tagset

In [None]:
# Display the (morpheme, POS tag) pairs Mecab produces for another sample utterance.
mecab.pos('아이스 아메리카노 두잔이랑 콜드 브루 라떼 한 잔 주세요')

In [None]:
# Same inspection as two cells above (duplicate cell).
mecab.tagset

In [None]:
import pandas as pd
import numpy as np
import os

from tqdm.auto import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from datasets import load_dataset
from transformers import AutoModel, AutoModelForTokenClassification, AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, Trainer, TrainingArguments

from sklearn import preprocessing
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split


In [None]:
train_files = ['슈퍼_train.csv', '음식점_train.csv', '식품_train.csv', '카페_train.csv']
valid_files = ['슈퍼_validation.csv', '음식점_validation.csv', '식품_validation.csv', '카페_validation.csv']

TRAIN_DIR = "data/라벨링데이터_train"
VALID_DIR = "data/라벨링데이터_validation"


def _load_labeled_csvs(directory, files, require_count=False):
    """Read `files` from `directory` and return one cleaned DataFrame per file.

    Rows without a '상품명' value are always dropped. When `require_count` is true,
    rows without a '수량' value are dropped as well. Each frame gets a fresh index.
    """
    frames = []
    for name in files:
        frame = pd.read_csv(f"{directory}/{name}", low_memory=False)
        frame = frame.dropna(subset=['상품명'], how='any', axis=0)
        if require_count:
            frame = frame[frame['수량'].notnull()]
        frames.append(frame.reset_index(drop=True))
    return frames


def _list_csvs(directory):
    """Return the CSV file names found in `directory`, sorted.

    os.listdir returns names in arbitrary order; sorting keeps the concatenation
    order (and therefore the row ids kept by drop_duplicates) reproducible.
    """
    return sorted(name for name in os.listdir(directory) if name.endswith(".csv"))


# Two passes per split, as before: the explicitly listed files (only '상품명' required),
# then every CSV in the directory (both '상품명' and '수량' required).
# NOTE(review): if the listed files also live in the scanned directory they are read
# twice; identical rows are removed by drop_duplicates below.
train_dfs = (
    _load_labeled_csvs(TRAIN_DIR, train_files)
    + _load_labeled_csvs(TRAIN_DIR, _list_csvs(TRAIN_DIR), require_count=True)
)
valid_dfs = (
    _load_labeled_csvs(VALID_DIR, valid_files)
    + _load_labeled_csvs(VALID_DIR, _list_csvs(VALID_DIR), require_count=True)
)

train_df = pd.concat(train_dfs, ignore_index=True).drop_duplicates(ignore_index=True)
valid_df = pd.concat(valid_dfs, ignore_index=True).drop_duplicates(ignore_index=True)

In [None]:
# Keep only these six columns; tag() and change_count_unit() read '발화문', '수량' and '상품명'.
train_df = train_df[['IDX', '발화문', '가격', '수량', '사람', '상품명']]
# train_df

In [None]:
# Same column selection as for train_df.
valid_df = valid_df[['IDX', '발화문', '가격', '수량', '사람', '상품명']]
# valid_df

In [None]:
# Integer id for every Mecab POS tag; id 0 is reserved for 'UNKNOWN' and the
# tag-set keys are numbered from 1 in iteration order.
pos_tag_ids = {'UNKNOWN': 0}
pos_tag_ids.update(
    (tag_name, position)
    for position, tag_name in enumerate(mecab.tagset.keys(), start=1)
)

pos_tag_ids

In [None]:
# Append an extra 'NA' tag with the next free id.
# Re-running this cell changes the id, because 'NA' is then already counted by len().
pos_tag_ids['NA'] = len(pos_tag_ids)

In [None]:
# NER label -> integer id. The ids match the literals tag() assigns:
# 1/2 for the first / later part of a menu name, 3/4 likewise for a count, 0 otherwise.
ner_tag_ids = {
    'O': 0,
    'B-MNU': 1,
    'I-MNU': 2,
    'B-CNT': 3,
    'I-CNT': 4
}
ner_tag_ids

In [None]:
# element -> menu or count (str)
def preprocess_element(elements):
    """Split a '|'-separated annotation string into a list of stripped items.

    Returns an empty list when `elements` is missing according to `pd.isna`.
    """
    if not pd.isna(elements):
        return [piece.strip() for piece in elements.split("|")]
    return []

preprocess_element("슈퍼 울트라 아이스 아메리카노| 카푸치노 츄이스티")

In [None]:
import random

def word_to_id_pos(pos_tag):
  """Return the integer id for a Mecab POS tag string.

  A compound tag such as 'A+B' is split on '+' and ONE component is picked at
  random, so repeated calls with the same compound tag may return different ids.
  A component missing from `pos_tag_ids` is registered there with the next free id
  (this mutates the module-level dict).
  """
  candidates = pos_tag.split("+")
  chosen = candidates[random.randrange(len(candidates))]
  # setdefault evaluates len(pos_tag_ids) before inserting, so an unseen tag gets
  # the next free id. This replaces a bare `except:` that hid every other error.
  return pos_tag_ids.setdefault(chosen, len(pos_tag_ids))

def word_to_id_ner(ner_tag):
  """Look up the integer id of an NER label in `ner_tag_ids`; raises KeyError for an unknown label."""
  return ner_tag_ids[ner_tag]

def _ner_label(word, phrases, begin_id, inside_id):
  """Return the NER id for `word` from the first phrase in `phrases` that contains it.

  `begin_id` is returned when that phrase starts with `word`, `inside_id` when `word`
  occurs later in it, and 0 ('O') when no phrase contains `word`.
  """
  for phrase in phrases:
    position = phrase.find(word)
    if position == 0:
      return begin_id
    if position > 0:
      return inside_id
  return 0


def tag(df):
  """Tokenize every utterance with Mecab and attach POS / chunk / NER ids per token.

  Args:
    df: DataFrame with the columns '발화문' (utterance), '상품명' (menu names) and
      '수량' (counts); the latter two are '|'-separated strings or missing.

  Returns:
    DataFrame with one row per input row and the columns 'id' (the input index
    value), 'token', 'pos_tag', 'chunk_tag' and 'ner_tag' (each a list per row).

  NER ids: 1/2 when the token starts / occurs later inside a menu name, 3/4 likewise
  for a count string, 0 otherwise. Menu names take precedence over counts. Matching
  is by substring only, so a token is labelled wherever it appears in the utterance.
  """
  id_list = []
  tokens_list = []
  pos_tags_list = []
  chunk_tags_list = []
  ner_tags_list = []
  for idx, sentence, menu_cell, count_cell in zip(df.index, df['발화문'], df['상품명'], df['수량']):
    # De-duplicate while keeping annotation order. A set of strings iterates in a
    # hash-dependent order, which made the first-match labelling below vary
    # between kernel sessions.
    menus = list(dict.fromkeys(preprocess_element(menu_cell)))
    counts = list(dict.fromkeys(preprocess_element(count_cell)))

    tokens = []
    pos_tags = []
    chunk_tags = []
    ner_tags = []
    for word, pos in mecab.pos(sentence):
      tokens.append(word)
      pos_tags.append(word_to_id_pos(pos))
      # NOTE(review): chunk_tag is a second, independent word_to_id_pos() call, so
      # for compound tags it can differ from pos_tag - confirm that is intended.
      chunk_tags.append(word_to_id_pos(pos))

      ner_tag = _ner_label(word, menus, 1, 2)
      if ner_tag == 0:
        ner_tag = _ner_label(word, counts, 3, 4)
      ner_tags.append(ner_tag)

    id_list.append(idx)
    tokens_list.append(tokens)
    pos_tags_list.append(pos_tags)
    chunk_tags_list.append(chunk_tags)
    ner_tags_list.append(ner_tags)

  return pd.DataFrame({
      'id': id_list,
      'token': tokens_list,
      'pos_tag': pos_tags_list,
      'chunk_tag': chunk_tags_list,
      'ner_tag': ner_tags_list
      })



In [None]:
# Scratch cell: trace the NER labelling step by step for one utterance.
sentence = "맛밤 10개 주문하려는데 오전 9시까지 배송되나요?"
menus = list(set(preprocess_element("맛밤")))
counts = list(set(preprocess_element("10개")))

tagged_words = mecab.pos(sentence)
tokens = []
pos_tags = []
chunk_tags = []
ner_tags = []
for tagged_word in tagged_words:
  print(tagged_word)
  # The POS string is bound to `pos`, not `tag`: at cell level a variable named
  # `tag` would overwrite the tag() function defined earlier in this notebook and
  # break the later `tag(changed_train_df)` call.
  word, pos = tagged_word
  tokens.append(word)
  pos_tags.append(word_to_id_pos(pos))
  chunk_tags.append(word_to_id_pos(pos))

  ner_tag = 0
  for count in counts:
    print(f"(count) count: {count}")
    print(f"(count) word: {word}")
    start_idx = count.find(word)
    print(f"(count) start_idx: {start_idx}")
    print()
    if start_idx == 0:
      ner_tag = 3
      break
    elif start_idx > 0:
      ner_tag = 4
      break

  # A menu match takes precedence over a count match (same precedence as tag()).
  # A menu miss no longer resets ner_tag to 0, so a count label found above survives.
  for menu in menus:
    start_idx = menu.find(word)
    print(f"(menu) menu: {menu}")
    print(f"(menu) word: {word}")
    print(f"(menu) start_idx: {start_idx}")
    print()
    if start_idx == 0:
      ner_tag = 1
      break
    elif start_idx > 0:
      ner_tag = 2
      break

  ner_tags.append(ner_tag)


In [None]:
def change_count_unit(df):
    """Return a copy of `df` with the count unit '개' randomly varied for augmentation.

    For every row whose '수량' value is a string:
      * if it contains no space, with probability 1/2 a space is inserted before '개';
      * if it contains '개', ONE random draw picks the unit: 40% '잔', 20% '조각',
        40% unchanged.
    When '수량' contains '개', the same rewrite is applied to the matching text in
    '발화문', so utterance and label stay in sync. Rows with a missing '수량' are
    left untouched. The input frame is not modified.
    """
    df = df.copy()
    for idx in df.index:
        original_count = df.loc[idx, '수량']
        if not isinstance(original_count, str):
            continue

        new_count = original_count
        if ' ' not in new_count and random.randrange(0, 2) == 0:
            new_count = new_count.replace('개', ' 개')

        if '개' not in new_count:
            continue

        # Single draw for the unit choice. Drawing again in the `elif` (as before)
        # made '조각' rarer than the (4, 5) bucket suggests.
        draw = random.randrange(0, 10)
        if draw in (0, 1, 2, 3):
            new_count = new_count.replace('개', '잔')
        elif draw in (4, 5):
            new_count = new_count.replace('개', '조각')

        # '수량' may hold several '|'-separated counts. The joined string never occurs
        # in the utterance, so each count is rewritten separately.
        sentence = df.loc[idx, '발화문']
        for old_part, new_part in zip(original_count.split('|'), new_count.split('|')):
            old_part, new_part = old_part.strip(), new_part.strip()
            if old_part:  # str.replace('', x) would insert x between all characters
                sentence = sentence.replace(old_part, new_part)

        df.loc[idx, '수량'] = new_count
        df.loc[idx, '발화문'] = sentence

    return df

In [None]:
# Apply the random count-unit rewrite to both splits.
# NOTE(review): no random seed is set in the visible notebook, so the result differs per run.
changed_train_df = change_count_unit(train_df)
changed_valid_df = change_count_unit(valid_df)

In [None]:
# Tokenize and label both splits; each result has the columns id / token / pos_tag / chunk_tag / ner_tag.
tagged_train_df = tag(changed_train_df)
tagged_valid_df = tag(changed_valid_df)

In [None]:
# tagged_train_df

In [None]:
# tagged_valid_df

In [None]:
# Hold out 20% of the tagged validation frame as a test set. The second pair of
# return values (the split 'ner_tag' column) is discarded; random_state fixes the shuffle.
tagged_valid_df, tagged_test_df, _, _ = train_test_split(tagged_valid_df, tagged_valid_df['ner_tag'], test_size=0.2, random_state=41)

In [None]:
# tagged_valid_df

In [None]:
# tagged_test_df

In [None]:
# tagged_train_df.to_pickle("data/tagged_total_train.pkl")
# tagged_valid_df.to_pickle("data/tagged_total_valid.pkl")
# tagged_test_df.to_pickle("data/tagged_total_test.pkl")

In [None]:
# tagged_train_df.to_json("data/tagged_total_train.json")
# tagged_valid_df.to_json("data/tagged_total_valid.json")
# tagged_test_df.to_json("data/tagged_total_test.json")

In [None]:
# Persist the three splits as CSV. The DataFrame index is written as well and comes
# back as the 'Unnamed: 0' column when the files are loaded below. The list-valued
# columns are stored as strings and parsed back later with ast.literal_eval.
tagged_train_df.to_csv("data/tagged_total_train.csv")
tagged_valid_df.to_csv("data/tagged_total_valid.csv")
tagged_test_df.to_csv("data/tagged_total_test.csv")

In [None]:
from datasets import load_dataset

# Load the CSV files written above as a DatasetDict with train / valid / test splits.
data_files = {
    "train": "data/tagged_total_train.csv",
    "valid": "data/tagged_total_valid.csv",
    "test": "data/tagged_total_test.csv"}
raw_datasets = load_dataset("csv", data_files=data_files)
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
        num_rows: 541369
    })
    valid: Dataset({
        features: ['Unnamed: 0', 'id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
        num_rows: 59712
    })
    test: Dataset({
        features: ['Unnamed: 0', 'id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
        num_rows: 14929
    })
})

In [None]:
# Handle on the train split for inspection; not referenced again in the visible notebook.
train_datasets = raw_datasets['train']
train_datasets

Dataset({
    features: ['Unnamed: 0', 'id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
    num_rows: 541369
})

In [None]:
import ast
# load the dataset and copy the features
def process(ex):
    """Parse the list-valued columns, stored as strings, back into Python lists.

    'id' is copied through unchanged; 'token', 'pos_tag', 'chunk_tag' and 'ner_tag'
    are evaluated with ast.literal_eval. Other keys of `ex` are not returned.
    """
    parsed = {"id": ex["id"]}
    for column in ("token", "pos_tag", "chunk_tag", "ner_tag"):
        parsed[column] = ast.literal_eval(ex[column])
    return parsed
# Apply process() to every example of every split (one example per call, not batched).
raw_datasets = raw_datasets.map(process)

In [None]:
# Drop the 'Unnamed: 0' column (the index written by to_csv) from every split.
# Re-running this cell fails once the column is gone.
for split in raw_datasets.keys():
    raw_datasets[split] = raw_datasets[split].remove_columns(["Unnamed: 0"])
    # raw_datasets[split] = raw_datasets[split].remove_columns(["id"])

In [None]:
# Display the splits after parsing and column removal.
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
        num_rows: 541369
    })
    valid: Dataset({
        features: ['id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
        num_rows: 59712
    })
    test: Dataset({
        features: ['id', 'token', 'pos_tag', 'chunk_tag', 'ner_tag'],
        num_rows: 14929
    })
})

In [None]:
# Spot-check one parsed training example.
raw_datasets['train'][1000]

{'id': 1000,
 'token': ['요즘', '은', '달걀', '한판', '이', '삼', '십', '개', '가', '아니', '네요', '?'],
 'pos_tag': [21, 15, 21, 21, 13, 24, 24, 20, 9, 35, 2, 27],
 'chunk_tag': [21, 15, 21, 21, 13, 24, 24, 20, 9, 35, 2, 27],
 'ner_tag': [0, 0, 1, 0, 0, 3, 4, 4, 0, 0, 0, 0]}

In [None]:
# Feature type of the 'ner_tag' column (a plain integer sequence, per the output below).
ner_feature = raw_datasets['train'].features['ner_tag']
ner_feature

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [None]:
# Label names indexed by NER id (same order as ner_tag_ids).
label_names = ['O', 'B-MNU', 'I-MNU', 'B-CNT', 'I-CNT']

# Print one example's tokens with their labels underneath.
words = raw_datasets['train'][1000]['token']
labels = raw_datasets['train'][1000]['ner_tag']
line1 = ""
line2 = ""
for word, label in zip(words, labels):
  full_label = label_names[label]
  # Pad both cells to the longer of token / label so the two rows share column widths.
  max_length = max(len(word), len(full_label))
  line1 += word + " " * (max_length - len(word) + 1)
  line2 += full_label + " " * (max_length - len(full_label) + 1)

print(line1)
print(line2)

요즘 은 달걀    한판 이 삼     십     개     가 아니 네요 ? 
O  O B-MNU O  O B-CNT I-CNT I-CNT O O  O  O 


In [None]:
# Local checkpoint directory from an earlier run, used as the starting point for
# the tokenizer, config and model.
MODEL_NAME = './results/RoBERTa_TokenCl_230821_2/checkpoint-2110'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
config = AutoConfig.from_pretrained(MODEL_NAME)
# config.num_labels = 189
# This `model` is replaced by a later cell that reloads the same checkpoint with
# id2label / label2id.
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config, ignore_mismatched_sizes=True)

In [None]:
# Check that a fast tokenizer was loaded; the cells below call word_ids() on its output.
tokenizer.is_fast

True

In [None]:
# Tokenize one pre-split example and show the resulting sub-word tokens.
inputs = tokenizer(raw_datasets["train"][1000]["token"], is_split_into_words=True)
inputs.tokens()
# inputs

['[CLS]',
 '요즘',
 '은',
 '달걀',
 '한판',
 '이',
 '삼',
 '십',
 '개',
 '가',
 '아니',
 '네요',
 '?',
 '[SEP]']

In [None]:
# For each produced token, the index of its source word (None for special tokens, per the output below).
inputs.word_ids()

[None, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, None]

In [None]:
def align_labels_with_tokens(labels, word_ids):
    """Expand word-level labels to one label per tokenizer token.

    Args:
        labels: label ids, one per word.
        word_ids: for each token, the index of the word it came from, or None for
            a special token.

    Returns:
        A list as long as `word_ids`: -100 for special tokens, the word's label
        for its first token, and for further tokens of the same word the same
        label with odd ids (B-XXX) bumped by one (I-XXX).
    """
    aligned = []
    previous_word = None
    for word_index in word_ids:
        if word_index is None:
            # Special token: ignored by the loss.
            previous_word = None
            aligned.append(-100)
            continue

        token_label = labels[word_index]
        if word_index == previous_word:
            # Continuation of the previous word: B-XXX (odd id) becomes I-XXX.
            if token_label % 2 == 1:
                token_label += 1
        else:
            previous_word = word_index
        aligned.append(token_label)

    return aligned

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split examples and attach token-aligned labels.

    Reads `examples["token"]` and `examples["ner_tag"]`, tokenizes with the
    module-level `tokenizer` (truncation enabled) and stores the labels produced by
    align_labels_with_tokens() under the "labels" key of the tokenizer output.
    """
    encoded = tokenizer(
        examples["token"], truncation=True, is_split_into_words=True
    )
    encoded["labels"] = [
        align_labels_with_tokens(word_labels, encoded.word_ids(row))
        for row, word_labels in enumerate(examples["ner_tag"])
    ]
    return encoded

In [None]:
from transformers import DataCollatorForTokenClassification

# Batch collator built from the tokenizer; passed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
# Tokenize all splits in batches. The original columns are removed, so only the
# tokenizer outputs and the aligned "labels" remain.
tokenized_datasets = raw_datasets.map(
    tokenize_and_align_labels,
    batched=True,
    remove_columns=raw_datasets["train"].column_names,
)


Map:   0%|          | 0/541369 [00:00<?, ? examples/s]

Map:   0%|          | 0/59712 [00:00<?, ? examples/s]

Map:   0%|          | 0/14929 [00:00<?, ? examples/s]

In [None]:
# tokenized_datasets['train'][1000]

In [None]:
# batch = data_collator([tokenized_datasets["train"][i+1000] for i in range(2)])
# batch["labels"]

In [None]:
# seqeval backs the "seqeval" metric loaded via evaluate.load below (unpinned install).
!pip install seqeval



In [None]:
# Provides the `evaluate` module imported in the next cell (unpinned install).
!pip install evaluate



In [None]:
import evaluate

# Metric object used by compute_metrics().
metric = evaluate.load("seqeval")

In [None]:
# Same inspection as the earlier `ner_feature` cell (duplicate).
ner_feature = raw_datasets["train"].features["ner_tag"]
ner_feature

Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None)

In [None]:
# Label names indexed by NER id; read by compute_metrics() and the id2label mapping below.
label_names = ['O', 'B-MNU', 'I-MNU', 'B-CNT', 'I-CNT']
label_names

['O', 'B-MNU', 'I-MNU', 'B-CNT', 'I-CNT']

In [None]:
# Show one example's labels as names next to the raw ids.
labels = raw_datasets['train'][1000]['ner_tag']
labels = [label_names[i] for i in labels]
print(labels)
print(raw_datasets['train'][1000]['ner_tag'])

['O', 'O', 'B-MNU', 'O', 'O', 'B-CNT', 'I-CNT', 'I-CNT', 'O', 'O', 'O', 'O']
[0, 0, 1, 0, 0, 3, 4, 4, 0, 0, 0, 0]


In [None]:
def compute_metrics(eval_preds):
    """Turn (logits, labels) into overall precision / recall / F1 / accuracy.

    The predicted class per token is the argmax over the last logits axis.
    Positions whose gold label is -100 are dropped from both predictions and
    references, ids are mapped to names via `label_names`, and the module-level
    `metric` computes the scores.
    """
    logits, labels = eval_preds
    predicted_ids = np.argmax(logits, axis=-1)

    references = []
    hypotheses = []
    for predicted_row, gold_row in zip(predicted_ids, labels):
        # Keep only positions that carry a real label (ignored positions are -100).
        kept = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        references.append([label_names[g] for _, g in kept])
        hypotheses.append([label_names[p] for p, _ in kept])

    scores = metric.compute(predictions=hypotheses, references=references)
    report = {}
    report["precision"] = scores["overall_precision"]
    report["recall"] = scores["overall_recall"]
    report["f1"] = scores["overall_f1"]
    report["accuracy"] = scores["overall_accuracy"]
    return report

In [None]:
# Two-way lookup between class indices and label names, passed to the model below.
id2label = dict(enumerate(label_names))
label2id = {name: index for index, name in id2label.items()}

In [None]:
# Display the index -> label mapping.
id2label

{0: 'O', 1: 'B-MNU', 2: 'I-MNU', 3: 'B-CNT', 4: 'I-CNT'}

In [None]:
# Display the label -> index mapping.
label2id

{'O': 0, 'B-MNU': 1, 'I-MNU': 2, 'B-CNT': 3, 'I-CNT': 4}

In [None]:
# Reload the checkpoint with the label mappings attached; this replaces the `model`
# created in the earlier checkpoint-loading cell.
model = AutoModelForTokenClassification.from_pretrained(
    MODEL_NAME,
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Sanity check: should equal len(label_names).
model.config.num_labels

5

In [None]:
# Run name and the paths derived from it. OUTPUT_PATH and LOG_PATH are used by the
# training cell; SUBMISSION_PATH and PROBS_PATH are not referenced in the visible notebook.
THE_MODEL = 'RoBERTa_TokenCl_230822_1'
OUTPUT_PATH = f'./results/{THE_MODEL}'
SUBMISSION_PATH = f'./submission/{THE_MODEL}.csv'
PROBS_PATH = f'./submission/{THE_MODEL}_probs.csv'
LOG_PATH = f'./logs/{THE_MODEL}'

In [None]:
# Training schedule. The effective batch size is reached through gradient
# accumulation: BATCH_SIZE examples per step x GRADIENT_ACCUMULATION steps.
EPOCHS = 1
TOTAL_BATCH_SIZE = 128
BATCH_SIZE = 4
GRADIENT_ACCUMULATION = max(TOTAL_BATCH_SIZE // BATCH_SIZE, 1)
# Number of optimizer updates over the whole run.
TOTAL_STEPS = len(raw_datasets['train']) * EPOCHS // TOTAL_BATCH_SIZE
# Log / evaluate / save about 20 times per run. The floor of 1 keeps the interval
# valid when the dataset is small enough for TOTAL_STEPS // 20 to be 0.
LOGGING_STEPS = max(TOTAL_STEPS // 20, 1)
SAVE_STEPS = LOGGING_STEPS

training_args = TrainingArguments(
    output_dir = OUTPUT_PATH,
    save_total_limit = 10,
    save_steps = SAVE_STEPS,
    num_train_epochs = EPOCHS,
    learning_rate = 1e-5, #1e-5, 3e-5, #5e-5
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = 4,
    gradient_accumulation_steps = GRADIENT_ACCUMULATION,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir=LOG_PATH,
    logging_steps=LOGGING_STEPS,
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    fp16=True,
    # Evaluation and checkpointing share one interval, so every evaluated step
    # also has a saved checkpoint to restore the best model from.
    eval_steps=SAVE_STEPS,
    metric_for_best_model='f1',
    remove_unused_columns=False
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['valid'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer
)

trainer.train()

# Save the tokenizer next to the model so OUTPUT_PATH can be loaded the same way
# this notebook loads MODEL_NAME (tokenizer and model from one directory).
model.save_pretrained(OUTPUT_PATH)
tokenizer.save_pretrained(OUTPUT_PATH)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
211,0.0772,0.081944,0.862463,0.865504,0.863981,0.97064
422,0.0516,0.095867,0.862247,0.858244,0.860241,0.969769
633,0.0426,0.097845,0.856151,0.860134,0.858137,0.969545
