# 트랜스포머 설치

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 4.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 45.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 23.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.6 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 52.3 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


# 말뭉치 다운로드

In [2]:
import pandas as pd
import tqdm
import urllib.request

In [3]:
# Fetch the chatbot Q/A corpus from GitHub into the working directory,
# then load the saved CSV into a DataFrame.
urllib.request.urlretrieve(
    url="https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv",
    filename="ChatBotData.csv",
)
chat_data = pd.read_csv("ChatBotData.csv")

In [4]:
# Preview the first rows to confirm the Q (question), A (answer) and label columns loaded.
chat_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [5]:
# Number of question/answer rows in the corpus.
len(chat_data)

11823

# 토크나이저 준비

`PreTrainedTokenizerFast`를 사용하여 KoGPT2의 Tokenizer 설정 정보를 불러온다.

In [6]:
from transformers import PreTrainedTokenizerFast

In [7]:
# Load the KoGPT2 tokenizer from the Hugging Face hub and register its special
# tokens explicitly. Note that BOS and EOS share the same '</s>' token.
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',       # beginning-of-sentence token
    eos_token='</s>',       # end-of-sentence token
    unk_token='<unk>',      # token for out-of-vocabulary input
    pad_token='<pad>',      # padding token used to equalise lengths
    mask_token='<mask>',     # masking token
)

Downloading:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


스페셜 토큰 확인

In [8]:
# Decode the first ten vocabulary ids to see which special tokens they map to.
for token_id in range(10):
    decoded = tokenizer.decode(token_id)
    print("index : ", token_id, " =  tokens : ", decoded)

index :  0  =  tokens :  <s>
index :  1  =  tokens :  </s>
index :  2  =  tokens :  <usr>
index :  3  =  tokens :  <pad>
index :  4  =  tokens :  <sys>
index :  5  =  tokens :  <unk>
index :  6  =  tokens :  <mask>
index :  7  =  tokens :  <d>
index :  8  =  tokens :  </d>
index :  9  =  tokens :  <unused0>


# 토크나이징, 데이터 구축

`</s>` `<usr>` 유저 질문 `<sys>` 모델 답변 `</s>` `<pad>`...

In [9]:
import numpy as np
from dataclasses import dataclass
from typing import List, Optional
import torch
from torch.utils.data import Dataset

In [None]:
"""
@dataclass
class Features:
    token_ids: List[int]
    attention_mask: Optional[List[int]] = None
    token_type_ids: Optional[List[int]] = None
    label_ids: Optional[List[int]] = None
"""

'\n@dataclass\nclass Features:\n    token_ids: List[int]\n    attention_mask: Optional[List[int]] = None\n    token_type_ids: Optional[List[int]] = None\n    label_ids: Optional[List[int]] = None\n'

In [10]:
# 챗봇 데이터를 처리하는 클래스
class ChatbotDataset(Dataset):
    """Turn question/answer rows into fixed-length KoGPT2 training features.

    Every sample is laid out as
    ``</s> <usr> question <sys> answer </s> <pad> ...`` and returned as four
    lists of ints, each exactly ``max_len`` long:

    * ``token``      - ids of question + answer, right-padded with the pad id.
    * ``attention``  - 1 for question/answer positions, 0 for padding.
    * ``token_type`` - 0 for the question, 1 for the answer, 0 for padding.
    * ``labels``     - the ``<mask>`` id at question positions, the answer ids
      at answer positions (same positions as in ``token``, i.e. not shifted),
      and the pad id for padding. The question/pad positions are placeholders,
      so the consumer is expected to select answer positions via ``token_type``.

    Args:
        chats: DataFrame-like object supporting ``len()`` and ``.iloc[idx]``,
            whose rows expose string columns ``"Q"`` and ``"A"``.
        max_len: Length of every returned list. Must be at least 2 so that a
            truncated question still leaves room for answer tokens.
        tokenizer: Object providing ``bos_token``, ``eos_token``,
            ``mask_token``, ``pad_token_id``, ``tokenize`` and
            ``convert_tokens_to_ids``. Defaults to the notebook-level
            ``tokenizer``.

    Raises:
        ValueError: If ``max_len`` is smaller than 2.
    """

    def __init__(self, chats, max_len=50, tokenizer=None):
        if max_len < 2:
            raise ValueError(f"max_len must be at least 2, got {max_len}")
        if tokenizer is None:
            tokenizer = self._default_tokenizer()

        self._data = chats
        self.max_len = max_len
        self.q_token = "<usr>"
        self.a_token = "<sys>"
        self.bos = tokenizer.bos_token
        self.eos = tokenizer.eos_token
        self.mask = tokenizer.mask_token
        self.tokenizer = tokenizer

    @staticmethod
    def _default_tokenizer():
        """Return the tokenizer defined at notebook (module) level."""
        return tokenizer

    def __len__(self):
        """Return the number of question/answer rows."""
        return len(self._data)

    def _truncate(self, q_toked, a_toked):
        """Shrink the token lists so that together they fit into ``max_len``.

        A question longer than ``max_len`` keeps only its last ``max_len // 2``
        tokens. If the pair is still too long, the answer is cut from the
        right; a question that fills the whole window on its own is first
        reduced to its last ``max_len // 2`` tokens to make room.
        """
        half = self.max_len // 2
        if len(q_toked) > self.max_len:
            q_toked = q_toked[-half:]
        if len(q_toked) + len(a_toked) > self.max_len:
            if len(q_toked) >= self.max_len:
                q_toked = q_toked[-half:]
            a_toked = a_toked[: self.max_len - len(q_toked)]
        return q_toked, a_toked

    def __getitem__(self, idx):
        """Build ``(token, attention, token_type, labels)`` for row ``idx``."""
        row = self._data.iloc[idx]

        # "</s> <usr> question" and "<sys> answer </s>"
        q_toked = self.tokenizer.tokenize(self.bos + self.q_token + row["Q"])
        a_toked = self.tokenizer.tokenize(self.a_token + row["A"] + self.eos)
        q_toked, a_toked = self._truncate(q_toked, a_toked)

        q_len, a_len = len(q_toked), len(a_toked)
        pad_len = self.max_len - q_len - a_len
        padding = [self.tokenizer.pad_token_id] * pad_len

        token = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked) + padding
        attention = [1] * (q_len + a_len) + [0] * pad_len
        token_type = [0] * q_len + [1] * a_len + [0] * pad_len

        # Question positions are filled with <mask>; only answer tokens are real targets.
        labels = self.tokenizer.convert_tokens_to_ids([self.mask] * q_len + a_toked) + padding

        return (token, attention, token_type, labels)

데이터셋 구축

구성 : (token_ids, attention_mask, token_type_ids, label_ids)

In [11]:
# Wrap the corpus DataFrame; every sample is padded or truncated to 50 token positions.
chat_dataset = ChatbotDataset(chat_data, max_len=50)

In [12]:
# Show the four feature lists of the first three samples. Each sample is fetched
# once, because every chat_dataset[n] access re-tokenizes the row.
for n in range(3):
    sample = chat_dataset[n]
    print("chat_dataset[",n,"]")
    print("token_ids      : ", sample[0])
    print("attention_mask : ", sample[1])
    print("token_type_ids : ", sample[2])
    print("label_ids      : ", sample[3],"\n")

chat_dataset[ 0 ]
token_ids      :  [1, 2, 9349, 7888, 739, 7318, 376, 4, 12557, 6824, 9108, 9028, 7098, 25856, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
attention_mask :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_ids :  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label_ids      :  [6, 6, 6, 6, 6, 6, 6, 4, 12557, 6824, 9108, 9028, 7098, 25856, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

chat_dataset[ 1 ]
token_ids      :  [1, 2, 9020, 8263, 7497, 10192, 11615, 8210, 8006, 4, 12422, 8711, 9535, 7483, 12521, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
attention_mask :  [1, 1,

데이터로더 구축

In [13]:
# collate_fn 구성
def collate_batch(batch):
    """Stack a list of dataset samples into four batched LongTensors.

    Args:
        batch: Sequence of samples, each indexable as
            (token_ids, attention_mask, token_type_ids, label_ids).

    Returns:
        Tuple of four ``torch.LongTensor`` objects, in the same field order,
        each holding that field for every sample in ``batch``.
    """
    columns = ([sample[field] for sample in batch] for field in range(4))
    return tuple(torch.LongTensor(column) for column in columns)

In [14]:
from torch.utils.data import DataLoader, RandomSampler

In [15]:
# Batch the dataset for training: 32 samples per batch, drawn in random order
# without replacement, and stacked into LongTensors by collate_batch.
chat_dataloader = DataLoader(
    chat_dataset,
    batch_size = 32,
    sampler = RandomSampler(chat_dataset, replacement=False),
    collate_fn = collate_batch,
    drop_last = False,    # keep the final, possibly smaller, batch
    num_workers = 0,      # load data in the main process
)

In [16]:
# Sanity-check the dataloader: pull one batch and show its first sample
# together with the shape of each batched tensor.
sample_data = iter(chat_dataloader)
sample_ids = next(sample_data)
token_ids, attention_mask, token_type_ids, label_ids = sample_ids

print("first item of batch (chat_dataloader)")
print("token_ids \n", token_ids[0], token_ids.size(), "\n")
print("attention_mask \n", attention_mask[0], attention_mask.size(), "\n")
print("token_type_ids \n", token_type_ids[0], token_type_ids.size(), "\n")
print("label_ids \n", label_ids[0], label_ids.size())

first item of batch (chat_dataloader)
token_ids 
 tensor([    1,     2,  9050,  7703,  6824, 10721, 11732, 11720,  8017, 10030,
            4,  9308, 10105,  7426,  9723,  7281,  7890,  6824,  7661, 25856,
            1,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3]) torch.Size([32, 50]) 

attention_mask 
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]) torch.Size([32, 50]) 

token_type_ids 
 tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]) torch.Size([32, 50]) 

label_ids 
 tensor([    6,     6,     6,     6,     6,     6,     6,     6,     6,     6,
            4,  

# 모델 학습

In [17]:
import torch.nn
from transformers import GPT2LMHeadModel

In [18]:
# Load the pretrained KoGPT2 checkpoint as a GPT-2 model with a language-modelling head.
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

Downloading:   0%|          | 0.00/490M [00:00<?, ?B/s]

In [19]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Training hyper-parameters: number of passes over the data and the Adam step size.
epoch = 3
learning_rate = 3e-5

# Cross-entropy with its default settings, and Adam over all model parameters.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Very large negative constant; presumably meant as a fill value for logits that should be ignored.
Sneg = -1e18

In [20]:
# Move the model's parameters to the selected device and switch it to training mode.
model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

In [None]:
# Fine-tune KoGPT2 with teacher forcing, scoring only the answer tokens.
num_batches = len(chat_dataloader)
avg_loss = 0.0
# A separate loop variable keeps `epoch` at the configured count, so the cell
# can be re-run without silently shortening training.
for epoch_idx in range(epoch):
    epoch_loss = 0.0   # reset every epoch so averages do not leak across epochs

    # `tqdm` is imported as a module in this notebook, so the progress-bar
    # class has to be reached as `tqdm.tqdm`.
    for samples in tqdm.tqdm(chat_dataloader):
        optimizer.zero_grad()       # clear gradients from the previous step

        token_ids, attention_mask, token_type_ids, label_ids = samples
        token_ids = token_ids.to(device)
        token_type_ids = token_type_ids.to(device)
        label_ids = label_ids.to(device)

        logits = model(token_ids).logits    # (batch, seq_len, vocab)

        # The logits at position t score the token at position t + 1, so drop
        # the last logit and the first label to line predictions up with targets.
        shift_logits = logits[:, :-1, :]
        shift_labels = label_ids[:, 1:]

        # Only positions whose target is an answer token (token_type == 1)
        # contribute; question and padding positions are excluded entirely.
        answer_mask = token_type_ids[:, 1:] == 1

        # Mean cross-entropy over the selected answer tokens.
        loss = criterion(shift_logits[answer_mask], shift_labels[answer_mask])

        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    # Average over the number of batches (not the last batch index).
    avg_loss = epoch_loss / max(num_batches, 1)

    print(f'epoch: {epoch_idx}, loss: {avg_loss}')