<a href="https://colab.research.google.com/github/DonghaeSuh/NLP_Pytorch/blob/main/Model/GPT_2/GPT2_GEN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Sentencepiece : https://github.com/google/sentencepiece/blob/master/python/README.md

### NSMC

In [5]:
import urllib

In [6]:
# Download the NSMC training split next to the notebook as 'train.txt'.
import urllib.request # `import urllib` alone does not guarantee the `request` submodule is loaded
urllib.request.urlretrieve('https://raw.githubusercontent.com/e9t/nsmc/master/ratings_train.txt','train.txt')

('train.txt', <http.client.HTTPMessage at 0x7b4c5cefc220>)

In [7]:
temp=[]

In [8]:
# Collect the second tab-separated column of every row of train.txt into `temp`.
# The encoding is pinned to UTF-8 so the Korean text is decoded the same way on any machine.
with open('train.txt','r',encoding='utf-8') as f:
  f.readline() # discard the first line (presumably the column header)
  for line in f: # stream the file instead of loading every line with readlines()
    fields = line.rstrip('\n').split('\t')
    if len(fields) > 1: # skip blank/malformed rows instead of raising IndexError
      temp.append(fields[1])

In [None]:
len(temp)

150000

In [9]:
cd drive/MyDrive/Pytorch\ NLP/GPT-2/data_in/gpt2_ckpt

/content/drive/MyDrive/Pytorch NLP/GPT-2/data_in/gpt2_ckpt


In [None]:
# Dump every collected sentence to corpus.txt (the file later passed to the SentencePiece trainer).
# The encoding is pinned to UTF-8 so the Korean text is written identically regardless of locale.
with open('corpus.txt','w',encoding='utf-8') as f:
  for sent in temp:
    f.write(f'\n {sent}') # each sentence goes on its own line, preceded by a newline and a space

### Sentencepiece

https://keep-steady.tistory.com/7

In [10]:
pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.99


In [11]:
import sentencepiece as spm

In [None]:
spm.SentencePieceTrainer.train(input='corpus.txt',model_prefix='kor_tokenizer',vocab_size=50257, model_type='bpe',max_sentence_length=9999999,pad_id=0,unk_id=1,bos_id=2,eos_id=3)

### Tokenizer

In [12]:
sp=spm.SentencePieceProcessor()
sp.Load('kor_tokenizer.model')

True

In [None]:
sp.EncodeAsPieces('안녕하세요 저는 사람이에요.')

['▁안녕하세요', '▁저는', '▁사람이', '에요', '.']

In [None]:
sp.encode('안녕하세요 저는 사람이에요.')

[20363, 1619, 1047, 802, 48546]

In [None]:
sp.encode('그래')

[312]

In [None]:
sp.bos_id()

2

In [13]:
vocab = [sp.id_to_piece(id) for id in range(sp.get_piece_size())]

In [14]:
print('0 :{}, 1 :{} , 2 :{}, 3 :{} '.format(vocab[0],vocab[1],vocab[2],vocab[3]))

0 :<pad>, 1 :<unk> , 2 :<s>, 3 :</s> 


## Input for fine-tuning

### filtering

In [15]:
import re

In [None]:
temp[5]

'막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화.ㅋㅋㅋ...별반개도 아까움.'

In [16]:
# Replace every character that is not a Hangul consonant jamo (ㄱ-ㅎ), a Hangul syllable (가-힣),
# a digit or a space with a space, then collapse runs of whitespace into single spaces.
filtered_sent = []
for sent in temp:
  cleaned = re.sub('[^ㄱ-ㅎ가-힣0-9 ]',' ',sent)
  filtered_sent.append(' '.join(cleaned.split()))

In [None]:
filtered_sent[5]

'막 걸음마 뗀 3세부터 초등학교 1학년생인 8살용영화 ㅋㅋㅋ 별반개도 아까움'

### padding and truncation

In [17]:
import numpy as np

In [None]:
tokenized_len=[len(sp.encode(sent)) for sent in filtered_sent ]

In [None]:
# NOTE(review): the printed label reads "3rd-quartile length", but the value computed here is the
# 99th percentile of the tokenized lengths (np.percentile(..., 99)).
print('3사분위 길이 :{}'.format(np.percentile(tokenized_len,99)))

3사분위 길이 :47.0


In [None]:
[0]*10

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [18]:
MAX_LEN=47

In [2]:
import torch

In [19]:
def preprocess_input_output(input,max_len):
  """Build fixed-length input/target id tensors for next-token training.

  Each sentence is tokenized with the module-level SentencePiece processor `sp`.
  The input row is `<s>` + tokens and the target row is tokens + `</s>`, i.e. the
  target is the input shifted left by one position. Rows shorter than `max_len`
  are right-padded with the pad id; longer sentences keep only their first
  `max_len-1` tokens so that every row has exactly `max_len` ids.

  Args:
    input: iterable of sentences (strings).
    max_len: length of every produced row.

  Returns:
    (train_input, train_output): two LongTensors of shape (samples, max_len).
  """
  bos_id, eos_id, pad_id = sp.bos_id(), sp.eos_id(), sp.pad_id()

  train_input=[]
  train_output=[]

  for sent in input:
    # Tokenize once per sentence (the tokenizer call is the expensive part) and
    # reserve one slot for <s> / </s>; slicing is a no-op for short sentences.
    ids = sp.encode(sent)[:max_len-1]
    padding = [pad_id]*(max_len-1-len(ids)) # empty when the sentence was truncated
    train_input.append([bos_id]+ids+padding)
    train_output.append(ids+[eos_id]+padding)

  return torch.LongTensor(train_input), torch.LongTensor(train_output) # (samples, max_len)

In [20]:
train_input, train_output = preprocess_input_output(filtered_sent,MAX_LEN)

In [None]:
train_input[0]

tensor([    2,     8,  1080,    55, 17382,  2106,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])

In [None]:
train_output[0]

tensor([    8,  1080,    55, 17382,  2106,     3,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0])

In [3]:
from torch.utils.data import Dataset,DataLoader

In [4]:
class CustomDataset(Dataset):
  """Dataset over two index-aligned tensors, yielding (x[i], y[i]) pairs."""

  def __init__(self,x,y):
    # Both tensors are stored as given: no copy and no length check.
    self.x, self.y = x, y

  def __len__(self):
    # Sample count is the size of x along its first dimension.
    return self.x.size(0)

  def __getitem__(self,index):
    sample, target = self.x[index], self.y[index]
    return sample, target

In [22]:
train_dataset = CustomDataset(train_input,train_output)

In [32]:
train_iter = DataLoader(train_dataset,batch_size=64)

In [31]:
next(iter(train_iter))

[tensor([[    2,     8,  1080,  ...,     0,     0,     0],
         [    2,  1609, 17387,  ...,     0,     0,     0],
         [    2,    25, 48579,  ...,     0,     0,     0],
         ...,
         [    2,   194, 48700,  ...,     0,     0,     0],
         [    2, 39386,  1661,  ...,     0,     0,     0],
         [    2,  4928,  3547,  ...,     0,     0,     0]]),
 tensor([[    8,  1080,    55,  ...,     0,     0,     0],
         [ 1609, 17387, 23046,  ...,     0,     0,     0],
         [   25, 48579, 50166,  ...,     0,     0,     0],
         ...,
         [  194, 48700, 48547,  ...,     0,     0,     0],
         [39386,  1661, 38441,  ...,     0,     0,     0],
         [ 4928,  3547,   610,  ...,     0,     0,     0]])]

### config & model_weight

In [None]:
urllib.request.urlretrieve('https://huggingface.co/gpt2/resolve/main/config.json','config.json')

('config.json', <http.client.HTTPMessage at 0x7fdd07c1ada0>)

In [None]:
urllib.request.urlretrieve('https://huggingface.co/gpt2/resolve/main/model.safetensors','model.safetensors')

('model.safetensors', <http.client.HTTPMessage at 0x7fdd07c19ed0>)

## Top k & Top p

### torch.multinomial

In [21]:
import torch
import torch.nn as nn

In [None]:
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-99999):
  """Mask a 1-D logits vector with top-k and/or nucleus (top-p) filtering.

  Args:
    logits: tensor of shape (vocab_size, ).
    top_k: if > 0, keep only the `top_k` largest logits.
    top_p: if > 0.0, keep the smallest set of highest-probability tokens whose
      cumulative probability exceeds `top_p`; the most likely token is always kept.
    filter_value: value written into every filtered-out position.

  Returns:
    A new tensor of shape (vocab_size, ); the `logits` argument is not modified.
  """
  logits = logits.clone() # work on a copy so the caller's tensor is left untouched

  if top_k>0:
    top_k=min(top_k,logits.size(0))
    kth_largest=torch.topk(logits,top_k).values[-1] # smallest value among the top_k values
    logits[logits<kth_largest]=filter_value

  if top_p>0.0:
    # torch.sort returns both the sorted values and the positions they came from
    sorted_logits, sorted_logits_index = torch.sort(logits,descending=True)
    prob_cumsum = torch.cumsum(torch.softmax(sorted_logits,dim=-1),dim=-1)

    sorted_indices_to_remove=prob_cumsum>top_p
    # Shift the mask right by one so that the token which crosses the threshold is kept,
    # which also guarantees the first (most likely) token is never removed.
    # new_zeros keeps the bool dtype, so the result is still a mask rather than integer indices.
    sorted_indices_to_remove=torch.cat([sorted_indices_to_remove.new_zeros(1),sorted_indices_to_remove[:-1]])
    indices_to_remove = sorted_logits_index[sorted_indices_to_remove]

    logits[indices_to_remove]=filter_value

  return logits   # logits = (vocab_size, )


## Greedy & Generate_sentence Function

In [None]:
def generate_sent(seed_word,model,max_len=100,greedy=False,top_k=0,top_p=0.0):
  """Autoregressively extend `seed_word` with tokens produced by `model`.

  Args:
    seed_word: text the generation starts from; tokenized with the module-level `sp`.
    model: callable mapping ids of shape (1, seq_len) to logits of shape (1, seq_len, vocab_size).
    max_len: maximum number of tokens to generate.
    greedy: if True take the arg-max token each step; otherwise sample after
      top-k / top-p filtering.
    top_k, top_p: forwarded to `top_k_top_p_filtering` when sampling.

  Returns:
    `seed_word` followed by the generated pieces, with '▁' rendered as a space.
    Generation stops early when the end-of-sentence token is produced.
  """
  sent = seed_word
  toked = list(sp.encode(sent))
  bos_id, eos_id = sp.bos_id(), sp.eos_id()

  # Build the inputs on the model's device when it exposes parameters; plain callables fall back to the default device.
  try:
    device = next(model.parameters()).device
  except (AttributeError, StopIteration):
    device = None

  with torch.no_grad(): # inference only: do not build an autograd graph at every step
    for _ in range(max_len):
      input_ids = torch.tensor([[bos_id]+toked],dtype=torch.long,device=device) # input_ids = (1, cumulated_seq_len)
      outputs = model(input_ids)[0,-1,:] # outputs : (vocab_size, )

      if greedy:
        gen_id = outputs.argmax().item()
      else:
        output_logits = top_k_top_p_filtering(outputs,top_k,top_p) # logits = (vocab_size, )
        gen_id = torch.multinomial(torch.softmax(output_logits,dim=-1),num_samples=1,replacement=True).item()

      if gen_id == eos_id: # compare ids rather than the piece text
        break

      sent += sp.id_to_piece(gen_id).replace('▁',' ')
      # Feed the exact sampled id back in; re-encoding the decoded string could
      # tokenize differently from what the model actually produced.
      toked.append(gen_id)

  return sent

## Huggingface Transformers

In [None]:
cd ..

/content/drive/MyDrive/Pytorch NLP/GPT-2/data_in


In [26]:
pip install transformers

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m26.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m39.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m39.2 MB/s[0m eta [36m0:00:0

In [27]:
from transformers import GPT2LMHeadModel

## Model Pre-training

In [67]:
class GPT2Gen(nn.Module):
  """GPT-2 LM-head wrapper that returns logits with the vocabulary on axis 1.

  NOTE: a second class with the same name is defined in a later cell of this
  notebook; whichever cell was executed last is the definition in effect.
  """

  def __init__(self,dir_path):
    super().__init__()
    # dir_path is handed to from_pretrained; ignore_mismatched_sizes=True is passed through as-is
    self.gpt2 = GPT2LMHeadModel.from_pretrained(dir_path,ignore_mismatched_sizes=True)

  def forward(self,x):
    logits = self.gpt2(x)[0]
    # swap the last two axes: (batch_size, max_len, vocab_size) -> (batch_size, vocab_size, max_len)
    return logits.transpose(1,2)

In [30]:
# hyperparameter
MAX_LEN=47
BATCH_SIZE=64
EPOCHS=10

device='cuda' if torch.cuda.is_available() else 'cpu'

In [72]:
model = GPT2Gen('./data_in/gpt2_ckpt').to(device)
criterion = torch.nn.CrossEntropyLoss(reduction='none')
optimizer = torch.optim.Adam(model.parameters(),lr=3e-5)

In [33]:
from tqdm import tqdm

In [59]:
a=torch.Tensor([[1,1,1,0,0],[1,1,0,0,0]])
mask=~(a==0)

b=torch.Tensor([[3,2,5,0,0],[1,5,3,1,0]])
(b*mask).sum()

tensor(16.)

In [60]:
def compute_loss(outputs,y,criterion,pad_id=0):
  """Sum the per-token loss over all non-padding target positions.

  Args:
    outputs: model scores in the layout `criterion` expects.
    y: target ids; positions equal to `pad_id` are excluded from the loss.
    criterion: loss callable; it is expected to return one value per target
      position (e.g. built with reduction='none') so the mask can be applied.
    pad_id: id marking padding in `y` (default 0).

  Returns:
    Scalar tensor: the summed loss over non-padding positions (not averaged).
  """
  token_loss = criterion(outputs,y) # one loss value per target position
  keep = y!=pad_id # True where the target is a real token

  return (token_loss*keep).sum()

In [64]:
def train(model,optimizer,criterion,train_iter):
  """Run one optimisation pass over `train_iter`.

  Args:
    model: module being trained; switched to training mode here.
    optimizer: optimizer stepping the model's parameters.
    criterion: loss callable forwarded to `compute_loss`.
    train_iter: sized iterable of (x, y) batches.

  Returns:
    float: the mean of the per-batch losses returned by `compute_loss`.
  """
  model.train()

  total_loss = 0.0
  for batch in tqdm(train_iter):
    x=batch[0].to(device)
    y=batch[1].to(device)

    outputs = model(x) # outputs : (batch_size, vocab_size, max_len)
    loss = compute_loss(outputs,y,criterion) # scalar loss for this batch

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # .item() stores a plain Python number; accumulating the tensor itself would
    # keep every batch's autograd history referenced for the whole epoch.
    total_loss+=loss.item()

  return total_loss/len(train_iter)

In [65]:
import os

In [73]:
# Train for EPOCHS epochs and keep the weights of the epoch with the lowest average loss.
best_loss = None

for e in range(EPOCHS):
  avg_loss=train(model,optimizer,criterion,train_iter)
  print('avg_loss : {}'.format(avg_loss))

  # `is None` instead of truthiness, so a loss of exactly 0 is not mistaken for "no best yet"
  if best_loss is None or avg_loss<best_loss:
    # exist_ok=True: from the second improvement onwards the directory already exists
    os.makedirs('./data_out/best_model',exist_ok=True)
    torch.save(model.state_dict(),'./data_out/best_model/best_weight.pt')
    best_loss=avg_loss

  0%|          | 0/2344 [00:00<?, ?it/s]


OutOfMemoryError: ignored

## Text Generative Model

In [29]:
cd ..

/content/drive/MyDrive/Pytorch NLP/GPT-2/data_in


In [None]:
class GPT2Gen(nn.Module):
  """GPT-2 LM-head wrapper returning the raw logits, used for text generation.

  NOTE: an earlier cell of this notebook defines a class with the same name whose
  forward additionally transposes the logits; this definition replaces it once run.
  """

  def __init__(self,dir_path):
    super().__init__()
    # dir_path is handed to from_pretrained; ignore_mismatched_sizes=True is passed through as-is
    self.gpt2 = GPT2LMHeadModel.from_pretrained(dir_path,ignore_mismatched_sizes=True)

  def forward(self,x):
    # first element of the model output: logits of shape (batch_size=1, cumulated_seq_len, vocab_size=50257)
    logits = self.gpt2(x)[0]
    return logits

In [None]:
gpt2_model = GPT2Gen('./data_in/gpt2_ckpt')

Generate_sentences

In [None]:
generate_sent('안녕',gpt2_model,top_k=10)

  gen = torch.multinomial(nn.Softmax()(output_logits),num_samples=1,replacement=True).tolist()[0]


'안녕 행복하길 바쁘노릇실상 귀여트랑실상 협주곡실상실상 협주곡 귀여태격태격태격 행복하길 있었는지 귀여실상실상 귀여 협주곡 무료영화 니가 귀여실상 귀여트랑 무료영화트랑트랑태격 귀여실상실상 귀여딩딩태격 니가 협주곡트랑태격태격실상 무료영화실상실상실상실상트랑 협주곡실상 바쁘 니가 가고싶실상실상실상 바쁘트랑태격 협주곡실상실상 기대되네요트랑실상실상 협주곡실상실상실상실상실상실상높음 협주곡실상 가고싶 바쁘 단어에실상트랑실상 단어에 짜맞추 협주곡실상 단어에트랑트랑 협주곡 협주곡 바쁘트랑트랑 짜맞추실상 바쁘'