# 트랜스포머 설치

In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.0 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 47.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 8.7 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 59.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 64.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: p

# 말뭉치 다운로드

In [22]:
import pandas as pd
import tqdm
import urllib.request

In [23]:
urllib.request.urlretrieve("https://raw.githubusercontent.com/songys/Chatbot_data/master/ChatbotData.csv", filename="ChatBotData.csv")
train_data = pd.read_csv('ChatBotData.csv')

In [137]:
train_data.head()

Unnamed: 0,Q,A,label
0,12시 땡!,하루가 또 가네요.,0
1,1지망 학교 떨어졌어,위로해 드립니다.,0
2,3박4일 놀러가고 싶다,여행은 언제나 좋죠.,0
3,3박4일 정도 놀러가고 싶다,여행은 언제나 좋죠.,0
4,PPL 심하네,눈살이 찌푸려지죠.,0


In [146]:
len(train_data)

11823

# 토크나이저 준비

`PreTrainedTokenizerFast`를 사용하여 KoGPT2의 Tokenizer 설정 정보를 불러온다.

In [26]:
from transformers import PreTrainedTokenizerFast

In [165]:
tokenizer = PreTrainedTokenizerFast.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',       # 문장 시작토큰
    eos_token='</s>',       # 문장 마지막토큰
    unk_token='<unk>',      # 어휘에 없는 토큰
    pad_token='<pad>',      # 크기 맞추기 토큰
    mask_token='<mask>',     # 마스킹 토큰
    model_max_length = 50
)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'GPT2Tokenizer'. 
The class this function is called from is 'PreTrainedTokenizerFast'.


스페셜 토큰 확인

In [166]:
for i in range (10):
    print("index : ",i," =  tokens : ",tokenizer.decode(i))

index :  0  =  tokens :  <s>
index :  1  =  tokens :  </s>
index :  2  =  tokens :  <usr>
index :  3  =  tokens :  <pad>
index :  4  =  tokens :  <sys>
index :  5  =  tokens :  <unk>
index :  6  =  tokens :  <mask>
index :  7  =  tokens :  <d>
index :  8  =  tokens :  </d>
index :  9  =  tokens :  <unused0>


# 토크나이징

`</s>` `<usr>` 유저 질문 `<sys>` 모델 답변 `</s>` `<pad>`...

In [254]:
import torch

In [278]:
def get_chat_data(data):
    tokens = []
    for question, answer in zip(data.Q.to_list(), data.A.to_list()):
        bos_token = [tokenizer.bos_token_id]
        eos_token = [tokenizer.eos_token_id]
        sent = tokenizer.encode('<usr>' + question + '<sys>' + answer, padding='max_length')  # 인코딩
        sentence_token = bos_token + sent + eos_token
        tokens.append(sentence_token)
    return tokens

# 데이터 구축

데이터셋 구축

In [279]:
dataset = get_chat_data(train_data)

In [280]:
print(dataset[0], '\n', "토큰화 된 데이터 갯수: ", len(dataset))

[1, 2, 9349, 7888, 739, 7318, 376, 4, 12557, 6824, 9108, 9028, 7098, 25856, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1] 
 토큰화 된 데이터 갯수:  11823


토큰 데이터셋 텐서로 변환

In [281]:
dataset = torch.tensor(dataset, dtype=torch.int32)

데이터로더 구축

In [283]:
from torch.utils.data import DataLoader, RandomSampler

In [284]:
dataloader = DataLoader(
    dataset,
    batch_size=32,
    sampler=RandomSampler(dataset, replacement=False),
    drop_last=False,
    num_workers=0,
)

데이터로더 배치 1개 확인

In [297]:
batch_iter = iter(dataloader)
batch_data = next(batch_iter)

print(batch_data, batch_data.size())

tensor([[    1,     2, 12102,  ...,     3,     3,     1],
        [    1,     2, 12383,  ...,     3,     3,     1],
        [    1,     2, 11732,  ...,     3,     3,     1],
        ...,
        [    1,     2,  9800,  ...,     3,     3,     1],
        [    1,     2, 10305,  ...,     3,     3,     1],
        [    1,     2, 14938,  ...,     3,     3,     1]], dtype=torch.int32) torch.Size([32, 52])


In [303]:
for id in range(0, 3):
    print(tokenizer.decode(batch_data[id]))

</s><usr> 나는 왜 태어났을까<sys> 사랑 받기 위해 태어났어요.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad></s>
</s><usr> 단체생활 적응이 안돼<sys> 점점 적응 될 거예요.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad></s>
</s><usr> 좀 있으면 귀국하는데 짝녀에게 뭘 챙겨줄까?<sys> 작은 것 하나라도 챙겨준다면 센스있는 사람이라고 생각할 거예요.<pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad></s>
