# 트랜스포머 설치

In [None]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 46.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 39.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 25.4 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.2 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found ex

# 말뭉치 다운로드

In [None]:
import pandas as pd
from tqdm import tqdm
import urllib.request

In [None]:
# Fetch the chatbot Q/A corpus from GitHub and load the saved CSV into a DataFrame.
corpus_url = "https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv"
corpus_path = "ChatBotData.csv"

urllib.request.urlretrieve(corpus_url, filename=corpus_path)
chat_data = pd.read_csv(corpus_path)

In [None]:
# Preview the first rows of the corpus; the columns used later are "Q" (question) and "A" (answer).
chat_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [None]:
# Number of rows (question/answer pairs) in the corpus.
len(chat_data)

11823

# 토크나이저 준비

`PreTrainedTokenizerFast`를 사용하여 KoGPT2의 Tokenizer 설정 정보를 불러온다.

In [None]:
from transformers import PreTrainedTokenizerFast

In [None]:
# Load the KoGPT2 tokenizer and declare which strings act as special tokens.
# bos and eos are deliberately the same string ('</s>').
# NOTE: loading prints a warning that the checkpoint declares 'GPT2Tokenizer' while
# the caller is 'PreTrainedTokenizerFast' (see the captured output of this cell).
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',       # beginning-of-sentence token
    eos_token='</s>',       # end-of-sentence token
    unk_token='<unk>',      # token for out-of-vocabulary input
    pad_token='<pad>',      # padding token used to equalize lengths
    mask_token='<mask>',     # masking token
)

Downloading:   0%|          | 0.00/2.69M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/0.98k [00:00<?, ?B/s]

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


스페셜 토큰 확인

In [None]:
# Decode the first ten vocabulary ids to see which special tokens they map to.
for token_id in range(10):
    decoded = tokenizer.decode(token_id)
    print("index : ", token_id, " =  tokens : ", decoded)

index :  0  =  tokens :  <s>
index :  1  =  tokens :  </s>
index :  2  =  tokens :  <usr>
index :  3  =  tokens :  <pad>
index :  4  =  tokens :  <sys>
index :  5  =  tokens :  <unk>
index :  6  =  tokens :  <mask>
index :  7  =  tokens :  <d>
index :  8  =  tokens :  </d>
index :  9  =  tokens :  <unused0>


# 토크나이징, 데이터 구축

`</s>` `<usr>` 유저 질문 `<sys>` 모델 답변 `</s>` `<pad>`...

In [None]:
import numpy as np
from dataclasses import dataclass
from typing import List, Optional
import torch
from torch.utils.data import Dataset

In [None]:
# Dataset that converts chatbot question/answer rows into model-ready sequences.
class ChatbotDataset(Dataset):
    """Turn (question, answer) rows into fixed-length causal-LM training samples.

    Each sample is laid out as::

        </s> <usr> question <sys> answer </s> <pad> ... <pad>

    and is padded or truncated to exactly ``max_len`` positions.

    Args:
        chats: Table of conversations; rows are read with ``chats.iloc[idx]`` and
            must expose the keys ``"Q"`` (question) and ``"A"`` (answer).
        max_len: Length of every returned sequence.
    """

    # Label value skipped by the loss: torch.nn.CrossEntropyLoss ignores targets
    # equal to -100 by default, which is the loss Hugging Face LM heads use when
    # ``labels`` is passed. Using real token ids (<mask>/<pad>) here would instead
    # train the model to *predict* those tokens.
    IGNORE_INDEX = -100

    def __init__(self, chats, max_len=50):
        """Store the conversation table and cache the special tokens in use."""
        self._data = chats
        self.max_len = max_len
        self.q_token = "<usr>"
        self.a_token = "<sys>"
        self.bos = tokenizer.bos_token
        self.eos = tokenizer.eos_token
        self.mask = tokenizer.mask_token  # kept for compatibility; labels no longer use it
        self.tokenizer = tokenizer

    def __len__(self):
        """Return the number of conversation rows."""
        return len(self._data)

    def _clip_question(self, q_toked, limit):
        """Shorten a question to ``limit`` tokens, keeping its two-token prefix.

        The leading ``</s> <usr>`` markers are preserved and the remaining budget
        is filled with the most recent (trailing) question tokens.
        """
        if len(q_toked) <= limit:
            return q_toked
        head = q_toked[:2][:limit]
        tail_room = limit - len(head)
        tail = q_toked[len(q_toked) - tail_room:] if tail_room > 0 else []
        return head + tail

    def _clip_answer(self, a_toked, limit):
        """Shorten an answer to ``limit`` tokens while keeping the closing eos."""
        if limit <= 0:
            return []
        if len(a_toked) <= limit:
            return a_toked
        if limit == 1:
            return a_toked[:1]
        # Reserve the final slot for eos so the model still sees an end marker.
        return a_toked[:limit - 1] + [self.eos]

    def __getitem__(self, idx):
        """Build one training sample.

        Returns:
            A tuple ``(token, attention, token_type, label)`` of four lists, each
            ``max_len`` long:

            * ``token``      - input ids of question + answer, padded with the pad id.
            * ``attention``  - 1 for real tokens, 0 for padding.
            * ``token_type`` - 0 for question and padding positions, 1 for the answer.
            * ``label``      - the first two ids (``</s> <usr>``) and the answer ids;
              the rest of the question and all padding are ``IGNORE_INDEX`` so they
              do not contribute to the loss.
        """
        row = self._data.iloc[idx]

        q_toked = self.tokenizer.tokenize(self.bos + self.q_token + row["Q"])   # </s> <usr> question
        a_toked = self.tokenizer.tokenize(self.a_token + row["A"] + self.eos)   # <sys> answer </s>

        half = self.max_len // 2

        # A question longer than the whole window is cut down to half of it.
        if len(q_toked) > self.max_len:
            q_toked = self._clip_question(q_toked, half)

        # If the pair does not fit, the answer takes whatever room is left.
        if len(q_toked) + len(a_toked) > self.max_len:
            if len(q_toked) >= self.max_len:
                # The question alone fills the window: shrink it to make room.
                q_toked = self._clip_question(q_toked, half)
            a_toked = self._clip_answer(a_toked, self.max_len - len(q_toked))

        q_len, a_len = len(q_toked), len(a_toked)
        pad_len = self.max_len - q_len - a_len

        ids = self.tokenizer.convert_tokens_to_ids(q_toked + a_toked)
        token = ids + [self.tokenizer.pad_token_id] * pad_len

        attention = [1] * (q_len + a_len) + [0] * pad_len
        token_type = [0] * q_len + [1] * a_len + [0] * pad_len

        # Keep the two prefix ids, hide the question body and the padding from the loss.
        visible = min(2, q_len)
        label = (
            ids[:visible]
            + [self.IGNORE_INDEX] * (q_len - visible)
            + ids[q_len:]
            + [self.IGNORE_INDEX] * pad_len
        )

        return (token, attention, token_type, label)

데이터셋 구축

구성 : (token, attention, token_type, label) = (token_ids, attention_mask, token_type_ids, label_ids)

In [None]:
# Wrap the corpus DataFrame; every sample is padded/truncated to 50 token positions.
chat_dataset = ChatbotDataset(chat_data, max_len=50)

In [None]:
# Show the four fields of the first three samples.
for n in range(3):
    sample = chat_dataset[n]  # build the sample once and reuse it for every field
    print("chat_dataset[", n, "]")
    print("token_ids      : ", sample[0])
    print("attention_mask : ", sample[1])
    print("token_type_ids : ", sample[2])
    print("label_ids      : ", sample[3], "\n")

chat_dataset[ 0 ]
token_ids      :  [1, 2, 9349, 7888, 739, 7318, 376, 4, 12557, 6824, 9108, 9028, 7098, 25856, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
attention_mask :  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
token_type_ids :  [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
label_ids      :  [1, 2, 6, 6, 6, 6, 6, 4, 12557, 6824, 9108, 9028, 7098, 25856, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 

chat_dataset[ 1 ]
token_ids      :  [1, 2, 9020, 8263, 7497, 10192, 11615, 8210, 8006, 4, 12422, 8711, 9535, 7483, 12521, 1, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3]
attention_mask :  [1, 1,

데이터로더 구축

In [None]:
# collate_fn used by the DataLoader
def collate_batch(batch):
    """Stack per-sample lists into four batch tensors.

    Args:
        batch: Sequence of samples; positions 0-3 of each sample hold
            token ids, attention mask, token type ids and label ids.

    Returns:
        Tuple of four ``torch.LongTensor`` objects in that same field order,
        each built from the corresponding field of every sample.
    """
    columns = [[sample[:][field] for sample in batch] for field in range(4)]
    ids, mask, segments, labels = (torch.LongTensor(column) for column in columns)
    return ids, mask, segments, labels

In [None]:
from torch.utils.data import DataLoader, RandomSampler

In [None]:
# Batch the dataset for training: 16 samples per batch, drawn in random order
# without replacement, stacked into tensors by collate_batch. The last partial
# batch is kept and loading happens in the main process (no worker processes).
chat_dataloader = DataLoader(
    chat_dataset,
    batch_size = 16,
    sampler = RandomSampler(chat_dataset, replacement=False),
    collate_fn = collate_batch,
    drop_last = False,
    num_workers = 0,
)

In [None]:
# Sanity-check the dataloader: draw one batch and print its first sample per field.
sample_data = iter(chat_dataloader)
sample_ids = next(sample_data)
token_ids, attention_mask, token_type_ids, label_ids = sample_ids

print("first item of batch (chat_dataloader)")
print("token_ids \n", token_ids[0], token_ids.size(), "\n")
print("attention_mask \n", attention_mask[0], attention_mask.size(), "\n")
print("token_type_ids \n", token_type_ids[0], token_type_ids.size(), "\n")
print("label_ids \n", label_ids[0], label_ids.size())

first item of batch (chat_dataloader)
token_ids 
 tensor([    1,     2, 10715, 19340, 15931,     4, 17776,  9518,  9203, 46850,
        23763,  8234,   389,     1,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3,
            3,     3,     3,     3,     3,     3,     3,     3,     3,     3]) torch.Size([16, 50]) 

attention_mask 
 tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]) torch.Size([16, 50]) 

token_type_ids 
 tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]) torch.Size([16, 50]) 

label_ids 
 tensor([    1,     2,     6,     6,     6,     4, 17776,  9518,  9203, 46850,
        23763,  

# 모델 학습

모델 준비 및 설정 초기화

In [None]:
import torch.nn
from transformers import GPT2LMHeadModel

In [None]:
# Load the pretrained KoGPT2 weights with a language-modeling head (downloads on first use).
model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')

Downloading:   0%|          | 0.00/490M [00:00<?, ?B/s]

In [None]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters: number of passes over the data and the initial learning rate.
epoch = 15
learning_rate = 3e-5

# Adam over all model parameters; the scheduler multiplies the learning rate
# by gamma (0.9) each time scheduler.step() is called.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

모델 구성 확인 및 훈련모드 설정

In [None]:
# Move the model to the selected device and switch it to training mode.
# As the last expression of the cell, model.train() also displays the module tree.
model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(51200, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dro

학습 진행

In [None]:
# Fine-tune the model for `epoch` passes over the dataloader.
# A separate loop variable is used so the configured `epoch` count is not
# overwritten (re-running this cell would otherwise train one epoch fewer each time).
for epoch_idx in range(epoch):
    model.train()       # make sure dropout is active even after an evaluation/generation run
    loss = 0.0          # loss of the most recent batch (stays 0.0 if the loader is empty)
    total_loss = 0.0    # running sum of batch losses for this epoch
    num_batches = 0

    for samples in tqdm(chat_dataloader):
        optimizer.zero_grad()       # clear gradients left over from the previous step

        # Move the batch tensors to the training device.
        token_ids, attention_mask, token_type_ids, label_ids = samples
        token_ids = token_ids.to(device)
        attention_mask = attention_mask.to(device)
        token_type_ids = token_type_ids.to(device)
        label_ids = label_ids.to(device)

        out = model(
            input_ids=token_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            labels=label_ids,
            )

        loss = out.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    # Average over the number of batches actually processed (the previous code
    # divided by the last batch index, i.e. N - 1).
    avg_loss = total_loss / max(num_batches, 1)
    scheduler.step()

    # Release cached GPU blocks once per epoch; doing it after every step only
    # forces the allocator to re-request memory and slows training down.
    torch.cuda.empty_cache()

    print(f'epoch: {epoch_idx}, loss: {loss}, avg_loss: {avg_loss} \n')

100%|██████████| 739/739 [08:05<00:00,  1.52it/s]


epoch: 0, loss: 0.48396292328834534, avg_loss: 0.605375489891383 



100%|██████████| 739/739 [08:04<00:00,  1.52it/s]


epoch: 1, loss: 0.312633752822876, avg_loss: 0.3880761875127389 



100%|██████████| 739/739 [08:04<00:00,  1.52it/s]


epoch: 2, loss: 0.25816458463668823, avg_loss: 0.3015389526641466 



100%|██████████| 739/739 [08:05<00:00,  1.52it/s]


epoch: 3, loss: 0.26147696375846863, avg_loss: 0.2356664444897879 



100%|██████████| 739/739 [08:04<00:00,  1.52it/s]


epoch: 4, loss: 0.1920185089111328, avg_loss: 0.18370983312486955 



100%|██████████| 739/739 [08:05<00:00,  1.52it/s]


epoch: 5, loss: 0.11993344873189926, avg_loss: 0.1445161362751551 



100%|██████████| 739/739 [08:05<00:00,  1.52it/s]


epoch: 6, loss: 0.08482593297958374, avg_loss: 0.11155478137376186 



100%|██████████| 739/739 [08:07<00:00,  1.52it/s]


epoch: 7, loss: 0.0853043720126152, avg_loss: 0.0868803758507337 



100%|██████████| 739/739 [08:05<00:00,  1.52it/s]


epoch: 8, loss: 0.08930700272321701, avg_loss: 0.06859463120641586 



100%|██████████| 739/739 [08:05<00:00,  1.52it/s]


epoch: 9, loss: 0.06484388560056686, avg_loss: 0.05453349054011145 



100%|██████████| 739/739 [08:02<00:00,  1.53it/s]


epoch: 10, loss: 0.04209190979599953, avg_loss: 0.043804300093372175 



100%|██████████| 739/739 [08:01<00:00,  1.53it/s]


epoch: 11, loss: 0.04811542108654976, avg_loss: 0.035453682906703574 



100%|██████████| 739/739 [08:02<00:00,  1.53it/s]


epoch: 12, loss: 0.03627637401223183, avg_loss: 0.028973009268220686 



100%|██████████| 739/739 [08:02<00:00,  1.53it/s]


epoch: 13, loss: 0.01334947720170021, avg_loss: 0.02509319289842353 



100%|██████████| 739/739 [08:02<00:00,  1.53it/s]

epoch: 14, loss: 0.01873115263879299, avg_loss: 0.021211150999567132 






# 챗봇 실행

In [None]:
# Interactive chat loop: type "quit" to stop.
model.eval()  # generation must run without dropout (the model was left in train mode)

while True:
    q = input("user > ").strip()
    # Typing "quit" ends the chat.
    if q == "quit":
        break
    if not q:
        continue

    # Frame the question exactly as during training: </s> <usr> question <sys>
    prompt = tokenizer.bos_token + "<usr>" + q + "<sys>"
    input_ids = tokenizer.encode(
        prompt, add_special_tokens=False, return_tensors="pt"
    ).to(device)
    prompt_len = input_ids.shape[-1]

    # Training used segment 0 for the question and 1 from <sys> onwards;
    # generate() repeats the last segment id for every newly produced token.
    token_type_ids = torch.zeros_like(input_ids)
    token_type_ids[:, -1] = 1

    with torch.no_grad():
        gen_ids = model.generate(
            input_ids,
            attention_mask=torch.ones_like(input_ids),
            token_type_ids=token_type_ids,
            do_sample=True,
            min_length=10,
            # max_length counts the prompt, so budget 30 tokens on top of it;
            # otherwise a long question would leave no room for an answer.
            max_length=prompt_len + 30,
            top_p=0.9,
            top_k=10,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            temperature=0.8,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Keep only the newly generated ids and stop at the first end-of-sentence id.
    # Working on ids avoids the ValueError that str.index() raised whenever
    # "<sys>" or "</s>" was missing from the decoded text.
    answer_ids = gen_ids[0, prompt_len:].tolist()
    if tokenizer.eos_token_id in answer_ids:
        answer_ids = answer_ids[:answer_ids.index(tokenizer.eos_token_id)]
    generated = tokenizer.decode(answer_ids)

    print(f'Chatbot > {generated}')

user > 여행 가고싶다
Chatbot >  저도요! 기운내세요!
user > 오늘 하루 힘내보자
Chatbot >  응원합니다!
user > 강아지 산책 시켜야겠다
Chatbot >  좋은 생각이에요.
user > quit
