In [1]:
import pandas as pd
import torch
import ast
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModelForTokenClassification, AutoTokenizer

## 데이터 파일 읽기

In [2]:
file_path = './data/NER학습데이터/NER_자동이체.xlsx'
data = pd.read_excel(file_path)

data.head()

Unnamed: 0,message,result,annotations
0,[Web발신] [신한은행] 장＊진 고객님께서 (주) 인터파크 업체(기관코드 1000...,"{'BANK': '신한은행', 'COMPANY': '(주) 인터파크 업체'}","[(9, 13, 'BANK'), (25, 36, 'COMPANY')]"
1,[Web발신] [신한은행] 장＊진 고객님께서 주식회사 차이코퍼레이션(기관코드 C10...,"{'BANK': '신한은행', 'COMPANY': '주식회사 차이코퍼레이션'}","[(9, 13, 'BANK'), (25, 37, 'COMPANY')]"
2,[Web발신] [신한은행] 장＊진 고객님께서 차이코퍼레이션 업체(기관코드 00410...,"{'BANK': '신한은행', 'COMPANY': '차이코퍼레이션 업체'}","[(9, 13, 'BANK'), (25, 35, 'COMPANY')]"
3,[Web발신] [신한은행] 장＊진 고객님께서 비즈플레이(기관코드 K210100017...,"{'BANK': '신한은행', 'COMPANY': '비즈플레이'}","[(9, 13, 'BANK'), (25, 30, 'COMPANY')]"
4,[Web발신] [신한은행] 장＊진 고객님께서 비즈제로페이 업체(기관코드 004500...,"{'BANK': '신한은행', 'COMPANY': '비즈제로페이 업체'}","[(9, 13, 'BANK'), (25, 34, 'COMPANY')]"


## 데이터 전처리

In [3]:
# annotations 컬럼의 문자열을 파이썬 리스트로 변환
data['annotations'] = data['annotations'].apply(ast.literal_eval)

# 태그를 정수로 매핑하는 딕셔너리
tag2id = {'O': 0, 'BANK': 1, 'COMPANY': 2}

# KoBERT 토크나이저 로드
model_name = "monologg/kobert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

## 토큰화 및 엔티티 태그 할당

In [4]:
def tokenize_and_align_labels(tokenizer, texts, annotations, tag2id):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt", return_offsets_mapping=True)
    labels = []

    for i, annotation in enumerate(annotations):
        offset_mapping = tokenized_inputs['offset_mapping'][i].tolist()  # offset_mapping을 리스트로 변환
        label_ids = [tag2id['O']] * len(offset_mapping)  # Initialize with the 'O' label

        # Convert character-level annotation to token-level annotation
        for start_char, end_char, label in annotation:
            # Find the start token index
            start_token_index = None
            end_token_index = None
            for idx, (offset_start, offset_end) in enumerate(offset_mapping):
                if start_token_index is None and offset_start <= start_char < offset_end:
                    start_token_index = idx
                if offset_start < end_char <= offset_end:
                    end_token_index = idx
                    break  # Stop the loop once the end token is found

            # It's possible that a single word gets split into multiple tokens.
            # We need to assign the correct label to all tokens derived from the word.
            if start_token_index is not None and end_token_index is not None:
                for token_index in range(start_token_index, end_token_index + 1):
                    label_ids[token_index] = tag2id[label]

        # Set labels for special tokens to -100 so that they are not used in the loss calculation
        label_ids = [-100 if token_idx in (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id) else label for token_idx, label in zip(tokenized_inputs["input_ids"][i].tolist(), label_ids)]
        labels.append(label_ids)

    # Remove offset_mapping from tokenized_inputs for model training
    tokenized_inputs.pop("offset_mapping")

    return tokenized_inputs, labels

In [5]:
# 실제 데이터에 대해 토큰화 및 태그 할당
texts = data['message'].tolist()
annotations = data['annotations'].tolist()
tokenized_texts, labels = tokenize_and_align_labels(tokenizer, texts, annotations, tag2id)

## 훈련 데이터 셋 준비

In [6]:
class NERDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])  # labels를 텐서로 변환
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


In [7]:
# 데이터셋 객체 생성
train_dataset = NERDataset(tokenized_texts, labels)

## 모델 및 학습 설정

In [8]:
# 모델 로드
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(tag2id))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# 학습 준비
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
model.train()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [10]:
# 최적화기 설정
optimizer = AdamW(model.parameters(), lr=5e-5)

### GPU 사용 코드

In [11]:
# for epoch in range(3):
#     for batch in train_loader:
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(torch.device('cuda'))
#         attention_mask = batch['attention_mask'].to(torch.device('cuda'))
#         labels = batch['labels'].to(torch.device('cuda'))
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

### CPU 사용 코드

In [12]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 학습 과정
for epoch in range(25):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 0.8072622567415237
Epoch 2, Loss: 0.23196949809789658
Epoch 3, Loss: 0.19795387238264084
Epoch 4, Loss: 0.19378655403852463
Epoch 5, Loss: 0.1899193450808525
Epoch 6, Loss: 0.1906154751777649
Epoch 7, Loss: 0.18877806514501572
Epoch 8, Loss: 0.18817968666553497
Epoch 9, Loss: 0.18682911247015
Epoch 10, Loss: 0.18326665461063385
Epoch 11, Loss: 0.1763073280453682
Epoch 12, Loss: 0.1637028232216835
Epoch 13, Loss: 0.1525496169924736
Epoch 14, Loss: 0.13995250314474106
Epoch 15, Loss: 0.12373781204223633
Epoch 16, Loss: 0.11843179538846016
Epoch 17, Loss: 0.09095700830221176
Epoch 18, Loss: 0.07870538160204887
Epoch 19, Loss: 0.06261428631842136
Epoch 20, Loss: 0.055289264768362045
Epoch 21, Loss: 0.05215075425803661
Epoch 22, Loss: 0.04429098404943943
Epoch 23, Loss: 0.03517486434429884
Epoch 24, Loss: 0.02727077528834343
Epoch 25, Loss: 0.021283176727592945


## 학습된 모델 테스트 함수

In [13]:
id2tag = {0: 'O', 1: 'BANK', 2: 'COMPANY'}
def predict(model, tokenizer, device, message):
    model.eval()
    inputs = tokenizer(message, return_tensors="pt", padding=True, truncation=True, max_length=512, return_offsets_mapping=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    offset_mapping = inputs["offset_mapping"].detach().cpu().numpy()[0]  # 오프셋 매핑 정보
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

    labels = [id2tag[id] for id in predictions[0].cpu().numpy()]
    extracted_info = {"BANK": "", "COMPANY": ""}
    
    for i, (offset, label) in enumerate(zip(offset_mapping, labels)):
        if label != "O":
            start, end = offset
            extracted_text = message[start:end]
            extracted_info[label] += extracted_text + " "

    for key in extracted_info:
        extracted_info[key] = extracted_info[key].strip()

    return extracted_info



In [14]:
# 예시 메시지로 예측 테스트
test_message = "[Web발신] [신한은행] 장＊진 고객님께서 카카오페이(기관코드 C010268581)에서 신청하신 오픈뱅킹 출금이체건이 아래와 같이 등록되었습니다. 출금은행 : 신한은행 계좌번호 : 110-***-*88420 예금주명 : 장＊진 납부자번호 : 20220726347026339639 오픈뱅킹 출금이체를 신청하지 않은 경우 해당기관인 카카오페이(☎16447405)로 문의하여 주시기 바랍니다. 신한은행 고객센터(☎1599-8000) ※ 본 메시지는 오픈뱅킹 출금이체 관련 부당출금을 예방하기 위하여 고객님께 무료로 통지하고 있습니다."
predicted_info = predict(model, tokenizer, device, test_message)
for domain, text in predicted_info.items():
    print(f"{domain}: {text.strip()}")


BANK: 신한은행
COMPANY: 카카오페이


## 모델 저장

In [15]:
model_save_path = "./model/kobert_자동이체"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./model/kobert_자동이체/tokenizer_config.json',
 './model/kobert_자동이체/special_tokens_map.json',
 './model/kobert_자동이체/vocab.txt',
 './model/kobert_자동이체/added_tokens.json',
 './model/kobert_자동이체/tokenizer.json')