In [1]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset
import torch
import ast

## 데이터 파일 읽기

In [2]:
file_path = './data/NER_결제승인.xlsx'
data = pd.read_excel(file_path)

data.head()

Unnamed: 0,message,result,annotations
0,"[Web발신] [신한체크승인] 장*진(4384) 01/03 17:24 (금액)5,0...","{\n""Method"":""신한체크"",\n""Location"":""서부카투사식당"",\n""T...","[(9, 13, 'METHOD'), (50, 57, 'LOCATION'), (27,..."
1,"[Web발신] [신한체크승인] 장*진(4384) 01/05 12:04 (금액)5,0...","{\n""Method"":""신한체크"",\n""Location"":""서부카투사식당"",\n""T...","[(9, 13, 'METHOD'), (50, 57, 'LOCATION'), (27,..."
2,"[Web발신] [신한체크승인] 장*진(4384) 08/16 22:47 (금액)1,5...","{\n""Method"":""신한체크"",\n""Location"":""태평할인마트"",\n""Ti...","[(9, 13, 'METHOD'), (50, 56, 'LOCATION'), (27,..."
3,"[Web발신] [신한체크승인] 장*진(8730) 12/20 13:35 (금액)4,2...","{\n""Method"":""신한체크"",\n""Location"":""한솥도시락한림대앞점"",\...","[(9, 13, 'METHOD'), (50, 60, 'LOCATION'), (27,..."
4,"[Web발신] [신한체크승인] 장*진(8730) 12/21 23:30 (금액)1,0...","{\n""Method"":""신한체크"",\n""Location"":""네이버페이"",\n""Tim...","[(9, 13, 'METHOD'), (50, 55, 'LOCATION'), (27,..."


## 데이터 전처리

In [3]:
# annotations 컬럼의 문자열을 파이썬 리스트로 변환
data['annotations'] = data['annotations'].apply(ast.literal_eval)

# 태그를 정수로 매핑하는 딕셔너리
tag2id = {'O': 0, 'METHOD': 1, 'LOCATION': 2, 'TIME': 3, 'COST': 4}

# 사전 훈련된 토크나이저 로드
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

## 토큰화 및 엔티티 태그 할당

In [4]:
# 토큰화 및 태그 할당 함수
def tokenize_and_align_labels(texts, annotations, tokenizer):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, is_split_into_words=False, return_offsets_mapping=True)
    labels = []

    for i, annotation in enumerate(annotations):
        doc_labels = np.ones(len(tokenized_inputs['input_ids'][i]), dtype=int) * -100
        offsets = tokenized_inputs['offset_mapping'][i]

        for start, end, tag in annotation:
            token_start_index = 0
            token_end_index = 0

            # 시작 인덱스와 끝 인덱스 찾기
            for idx, (offset_start, offset_end) in enumerate(offsets):
                if start >= offset_start and start < offset_end:
                    token_start_index = idx
                    break
            for idx, (offset_start, offset_end) in enumerate(offsets):
                if end > offset_start and end <= offset_end:
                    token_end_index = idx
                    break

            # 해당 범위의 토큰들에 태그 할당
            if tag in tag2id:
                doc_labels[token_start_index:token_end_index + 1] = tag2id[tag]

        labels.append(doc_labels)

    # offset_mapping 정보는 더 이상 필요 없으므로 제거
    tokenized_inputs.pop('offset_mapping')
    return tokenized_inputs, labels

In [5]:
# 실제 데이터에 대해 토큰화 및 태그 할당
texts = data['message'].tolist()
annotations = data['annotations'].tolist()
tokenized_texts, labels = tokenize_and_align_labels(texts, annotations, tokenizer)

## 훈련 데이터 셋 준비

In [6]:
# 데이터셋 클래스 정의
class NERDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


In [7]:
# 데이터셋 객체 생성
train_dataset = NERDataset(tokenized_texts, labels)

## 모델 및 학습 설정

In [8]:
# 모델 로드
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=len(tag2id))


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 훈련 설정
training_args = TrainingArguments(
    output_dir='./model',
    num_train_epochs=10,
    per_device_train_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)

In [10]:
# 트레이너 생성 및 훈련 시작
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)

trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss


TrainOutput(global_step=30, training_loss=1.4053236643473308, metrics={'train_runtime': 38.1549, 'train_samples_per_second': 11.532, 'train_steps_per_second': 0.786, 'total_flos': 22680357070800.0, 'train_loss': 1.4053236643473308, 'epoch': 10.0})

## 모델 저장

In [12]:
# 모델 저장
model_path = "./model/bert"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./model/bert/tokenizer_config.json',
 './model/bert/special_tokens_map.json',
 './model/bert/vocab.txt',
 './model/bert/added_tokens.json',
 './model/bert/tokenizer.json')