In [46]:
import pandas as pd
import torch
import ast
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoModelForTokenClassification, AutoTokenizer

## 데이터 파일 읽기

In [47]:
file_path = './data/NER학습데이터/NER_계좌개설.xlsx'
data = pd.read_excel(file_path)

data.head()

Unnamed: 0,message,result,annotations
0,[Web발신] [카카오뱅크] 장*진님 입출금통장(2340)이 개설되었습니다. 금융사...,"{'BANK': '카카오뱅크', 'TYPE': '입출금통장'}","[(9, 14, 'BANK'), (21, 26, 'TYPE')]"
1,"[Web발신] [카카오뱅크] 장*진님, 미래에셋증권 주식계좌 개설신청이 접수되었습니...","{'BANK': '미래에셋증권', 'TYPE': '주식계좌'}","[(22, 28, 'BANK'), (29, 33, 'TYPE')]"
2,[Web발신] 장우진님의 토스증권 계좌가 개설되었어요. 이제 토스증권에서 투자를 시...,"{'BANK': '토스증권', 'TYPE': '주식계좌'}","[(14, 18, 'BANK')]"
3,[Web발신] [카카오뱅크] 장*진님 입출금통장(4687)이 개설되었습니다. 금융사...,"{'BANK': '카카오뱅크', 'TYPE': '입출금통장'}","[(9, 14, 'BANK'), (21, 26, 'TYPE')]"
4,"[Web발신] [카카오뱅크] 장*진님, KB증권 주식계좌 개설신청이 접수되었습니다....","{'BANK': 'KB증권', 'TYPE': '주식계좌'}","[(22, 26, 'BANK'), (27, 31, 'TYPE')]"


## 데이터 전처리

In [48]:
# annotations 컬럼의 문자열을 파이썬 리스트로 변환
data['annotations'] = data['annotations'].apply(ast.literal_eval)

# 태그를 정수로 매핑하는 딕셔너리
tag2id = {'O': 0, 'BANK': 1, 'TYPE': 2}

# KoBERT 토크나이저 로드
model_name = "monologg/kobert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

## 토큰화 및 엔티티 태그 할당

In [49]:
def tokenize_and_align_labels(tokenizer, texts, annotations, tag2id):
    tokenized_inputs = tokenizer(texts, truncation=True, padding=True, return_tensors="pt", return_offsets_mapping=True)
    labels = []

    for i, annotation in enumerate(annotations):
        offset_mapping = tokenized_inputs['offset_mapping'][i].tolist()  # offset_mapping을 리스트로 변환
        label_ids = [tag2id['O']] * len(offset_mapping)  # Initialize with the 'O' label

        # Convert character-level annotation to token-level annotation
        for start_char, end_char, label in annotation:
            # Find the start token index
            start_token_index = None
            end_token_index = None
            for idx, (offset_start, offset_end) in enumerate(offset_mapping):
                if start_token_index is None and offset_start <= start_char < offset_end:
                    start_token_index = idx
                if offset_start < end_char <= offset_end:
                    end_token_index = idx
                    break  # Stop the loop once the end token is found

            # It's possible that a single word gets split into multiple tokens.
            # We need to assign the correct label to all tokens derived from the word.
            if start_token_index is not None and end_token_index is not None:
                for token_index in range(start_token_index, end_token_index + 1):
                    label_ids[token_index] = tag2id[label]

        # Set labels for special tokens to -100 so that they are not used in the loss calculation
        label_ids = [-100 if token_idx in (tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id) else label for token_idx, label in zip(tokenized_inputs["input_ids"][i].tolist(), label_ids)]
        labels.append(label_ids)

    # Remove offset_mapping from tokenized_inputs for model training
    tokenized_inputs.pop("offset_mapping")

    return tokenized_inputs, labels

In [50]:
# 실제 데이터에 대해 토큰화 및 태그 할당
texts = data['message'].tolist()
annotations = data['annotations'].tolist()
tokenized_texts, labels = tokenize_and_align_labels(tokenizer, texts, annotations, tag2id)

## 훈련 데이터 셋 준비

In [51]:
class NERDataset(Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels is not None:
            item['labels'] = torch.tensor(self.labels[idx])  # labels를 텐서로 변환
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])


In [52]:
# 데이터셋 객체 생성
train_dataset = NERDataset(tokenized_texts, labels)

## 모델 및 학습 설정

In [53]:
# 모델 로드
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(tag2id))

Some weights of BertForTokenClassification were not initialized from the model checkpoint at monologg/kobert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
# 학습 준비
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
model.train()

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(8002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [55]:
# 최적화기 설정
optimizer = AdamW(model.parameters(), lr=5e-5)

### GPU 사용 코드

In [56]:
# for epoch in range(3):
#     for batch in train_loader:
#         optimizer.zero_grad()
#         input_ids = batch['input_ids'].to(torch.device('cuda'))
#         attention_mask = batch['attention_mask'].to(torch.device('cuda'))
#         labels = batch['labels'].to(torch.device('cuda'))
#         outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
#         loss = outputs.loss
#         loss.backward()
#         optimizer.step()

### CPU 사용 코드

In [57]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 학습 과정
for epoch in range(40):
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Epoch 1, Loss: 1.1406569480895996
Epoch 2, Loss: 0.43953147530555725
Epoch 3, Loss: 0.27248793840408325
Epoch 4, Loss: 0.23816511034965515
Epoch 5, Loss: 0.22236695885658264
Epoch 6, Loss: 0.21640382707118988
Epoch 7, Loss: 0.21423012018203735
Epoch 8, Loss: 0.21606312692165375
Epoch 9, Loss: 0.21248814463615417
Epoch 10, Loss: 0.21310879290103912
Epoch 11, Loss: 0.2113528698682785
Epoch 12, Loss: 0.21176020801067352
Epoch 13, Loss: 0.20843113958835602
Epoch 14, Loss: 0.20668773353099823
Epoch 15, Loss: 0.20520201325416565
Epoch 16, Loss: 0.19755975902080536
Epoch 17, Loss: 0.19557730853557587
Epoch 18, Loss: 0.18676593899726868
Epoch 19, Loss: 0.1671893298625946
Epoch 20, Loss: 0.14836560189723969
Epoch 21, Loss: 0.1284675896167755
Epoch 22, Loss: 0.11500602215528488
Epoch 23, Loss: 0.12107397615909576
Epoch 24, Loss: 0.10536197572946548
Epoch 25, Loss: 0.10391106456518173
Epoch 26, Loss: 0.14461664855480194
Epoch 27, Loss: 0.09702137112617493
Epoch 28, Loss: 0.11427625268697739
Epoch

## 학습된 모델 테스트 함수

In [58]:
id2tag = {0: 'O', 1: 'BANK', 2: 'TYPE'}

def predict(model, tokenizer, device, message):
    model.eval()
    inputs = tokenizer(message, return_tensors="pt", padding=True, truncation=True, max_length=512, return_offsets_mapping=True)
    input_ids = inputs['input_ids'].to(device)
    attention_mask = inputs['attention_mask'].to(device)
    offset_mapping = inputs["offset_mapping"].detach().cpu().numpy()[0]  # 오프셋 매핑 정보
    
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)

    labels = [id2tag[id] for id in predictions[0].cpu().numpy()]
    extracted_info = {"BANK": "", "TYPE": ""}
    
    for i, (offset, label) in enumerate(zip(offset_mapping, labels)):
        if label != "O":
            start, end = offset
            extracted_text = message[start:end]
            extracted_info[label] += extracted_text + " "

    for key in extracted_info:
        extracted_info[key] = extracted_info[key].strip()

    return extracted_info



In [61]:
# 예시 메시지로 예측 테스트
test_message = "[Web발신] [카카오뱅크] 장*진님, 하나증권 주식계좌 개설신청이 접수되었습니다. 계좌개설은 하나증권에서 순차적으로 진행되어 시간이 다소 소요됩니다. 계좌개설이 완료되면 증권사에서 문자나 알림톡으로 안내해 드릴 예정입니다."
predicted_info = predict(model, tokenizer, device, test_message)
for domain, text in predicted_info.items():
    print(f"{domain}: {text.strip()}")


BANK: 하나증권
TYPE: 주식계좌


## 모델 저장

In [62]:
model_save_path = "./model/kobert_계좌개설"
model.save_pretrained(model_save_path)
tokenizer.save_pretrained(model_save_path)

('./model/kobert_계좌개설/tokenizer_config.json',
 './model/kobert_계좌개설/special_tokens_map.json',
 './model/kobert_계좌개설/vocab.txt',
 './model/kobert_계좌개설/added_tokens.json',
 './model/kobert_계좌개설/tokenizer.json')