In [1]:
# from transformers import PreTrainedTokenizerFast

# tokenizer = PreTrainedTokenizerFast.from_pretrained("skt/kogpt2-base-v2",
#     bos_token='</s>', eos_token='</s>', unk_token='<unk>',
#     pad_token='<pad>', mask_token='<mask>') 
# tokenizer.tokenize("안녕하세요. 한국어 GPT-2 입니다.😤:)l^o")


In [2]:
# import torch
# from transformers import GPT2LMHeadModel

# model = GPT2LMHeadModel.from_pretrained('skt/kogpt2-base-v2')
# text = '별 하나의 추억과, 별 하나의 사랑과'
# input_ids = tokenizer.encode(text)
# gen_ids = model.generate(torch.tensor([input_ids]),
#                            max_length=128,
#                            repetition_penalty=2.0,
#                            pad_token_id=tokenizer.pad_token_id,
#                            eos_token_id=tokenizer.eos_token_id,
#                            bos_token_id=tokenizer.bos_token_id,
#                            use_cache=True)
# generated = tokenizer.decode(gen_ids[0,:].tolist())
# print(generated)


In [3]:
# Checkpoint name passed to `from_pretrained` for both the tokenizer and the model.
MODEL_NAME = "skt/kogpt2-base-v2"
# Directory that holds the input data (presumably the slogan CSV; verify against the loader cell).
DATA_IN_PATH = './datasets'
# Directory intended for saved model weights (only referenced by the commented-out save cell).
MODEL_PATH = './models'
# File name of the training CSV inside DATA_IN_PATH.
TRAIN_DATA_FILE = "ko_slogan_test2.csv"

In [4]:
from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel

# Load the tokenizer and the LM-head model from the same checkpoint name so
# their vocabularies start out in sync (special tokens are added afterwards).
tokenizer = PreTrainedTokenizerFast.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

In [5]:
# Special tokens registered on the tokenizer. The ORDER of
# 'additional_special_tokens' matters: later cells read
# `tokenizer.additional_special_tokens_ids[0]` as <context> and `[1]` as <slogan>.
TOKENS_DICT = {
    'bos_token':'</s>',
    'eos_token':'</s>',
    'unk_token':'<unk>',
    'pad_token':'<pad>',
    'mask_token':'<mask>',
    'additional_special_tokens':['<context>', '<slogan>'],
}

# Add the special tokens to the tokenizer, then resize the model's token
# embeddings so they match the modified tokenizer's vocabulary size.
tokenizer.add_special_tokens(TOKENS_DICT)
model.resize_token_embeddings(len(tokenizer))

# Show the resulting special-token mapping for verification.
print(tokenizer.special_tokens_map)

{'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'mask_token': '<mask>', 'additional_special_tokens': "['<context>', '<slogan>']"}


In [6]:
import csv
import torch
from torch.utils.data import Dataset

#학습용 데이터 로더
class SloganDataset(Dataset):
  """Fixed-length training examples built from a two-column CSV (context, slogan).

  Every example is a triple of equal-length lists, returned by ``__getitem__``
  as one tensor of shape ``(3, seq_length)``:

  * row 0 - token ids: ``<context> ctx... <slogan> slogan... </s> <pad>...``
  * row 1 - segment ids: the ``<context>`` id over the context part and the
    ``<slogan>`` id over everything after it (padding included)
  * row 2 - labels: the slogan tokens and the closing EOS; every other
    position is ``-100`` so the loss ignores it

  Args:
    filename: Path of a UTF-8 CSV file; column 0 is the context, column 1 the slogan.
    tokenizer: Object exposing ``additional_special_tokens_ids`` (index 0 =
      ``<context>``, index 1 = ``<slogan>``), ``pad_token_id``, ``eos_token_id``
      and ``encode(text, max_length=..., truncation=...)``.
    seq_length: Total length of every example.
    skip_header: When True the first CSV row is dropped. Defaults to False,
      which keeps the historical behaviour of treating every row as data.
  """

  def __init__(self, filename, tokenizer, seq_length=32, skip_header=False):
    context_tkn = tokenizer.additional_special_tokens_ids[0]  # <context>
    slogan_tkn = tokenizer.additional_special_tokens_ids[1]   # <slogan>
    pad_tkn = tokenizer.pad_token_id
    eos_tkn = tokenizer.eos_token_id

    # Token budget per half; the special tokens added below fill the rest.
    max_context_len = seq_length//2-1   # + <context>
    max_slogan_len = seq_length//2-2    # + <slogan> and EOS

    self.examples = []
    # newline='' is what the csv module asks for so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(filename, 'r', encoding='UTF8', newline='') as csvfile:
      reader = csv.reader(csvfile)
      if skip_header:
        next(reader, None)

      for row in reader:
        # Blank lines come back as [] and short rows lack a slogan column;
        # neither can form an example, so skip them instead of raising IndexError.
        if len(row) < 2:
          continue

        # Build the two segments. truncation=True makes the max_length cut
        # explicit instead of relying on the tokenizer's warning fallback.
        context = [context_tkn] + tokenizer.encode(row[0], max_length=max_context_len, truncation=True)
        slogan = [slogan_tkn] + tokenizer.encode(row[1], max_length=max_slogan_len, truncation=True) + [eos_tkn]

        pad_len = seq_length - len(context) - len(slogan)

        # Concatenate both parts and right-pad to seq_length.
        tokens = context + slogan + [pad_tkn] * pad_len

        # Tag every position with the segment it belongs to.
        segments = [context_tkn] * len(context) + [slogan_tkn] * ( seq_length - len(context) )

        # -100 masks the context, the <slogan> marker and the padding out of the loss.
        labels = [-100] * (len(context)+1) + slogan[1:] + [-100] * pad_len

        self.examples.append((tokens, segments, labels))

  def __len__(self):
    """Number of examples kept from the CSV."""
    return len(self.examples)

  def __getitem__(self, item):
    """Return example ``item`` as a ``(3, seq_length)`` tensor: tokens, segments, labels."""
    return torch.tensor(self.examples[item])


# Build the dataset from the configured data directory and file name (instead
# of repeating the path as a literal), then display the dimensions of the
# first example for verification: (3, seq_length) = tokens / segments / labels.
slogan_dataset = SloganDataset(f'{DATA_IN_PATH}/{TRAIN_DATA_FILE}', tokenizer)
print(slogan_dataset[0].size())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


torch.Size([3, 32])


In [7]:
import math, random

from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Create the index list used to split the dataset into training and validation parts.
indices = list(range(len(slogan_dataset)))

# Fixed seed so the shuffled order (and therefore the split) is the same on every run.
random.seed(42)
random.shuffle(indices)

# The first 10% (rounded down) of the shuffled indices go to validation, the rest to training.
split = math.floor(0.1 * len(slogan_dataset))
train_indices, val_indices = indices[split:], indices[:split]

# Build the PyTorch data loaders; each sampler draws only from its own index subset.
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

train_loader = DataLoader(slogan_dataset, batch_size=32, sampler=train_sampler)
val_loader = DataLoader(slogan_dataset, batch_size=64, sampler=val_sampler)
# Note: validation involves no backpropagation, so its batch size can be doubled and still fit in GPU memory.

In [8]:
import numpy as np
from tqdm import tqdm

def fit(model, optimizer, train_dl, val_dl, epochs=1, device=torch.device('cpu')):
  """Run ``epochs`` rounds of training followed by validation, printing the costs.

  Each batch is expected to be a tensor indexed as ``[:, 0, :]`` token ids,
  ``[:, 1, :]`` segment ids and ``[:, 2, :]`` labels. The model is called as
  ``model(token_ids, token_type_ids=segment_ids, labels=labels)`` and the first
  element of its output is used as the loss.

  Args:
    model: Module to optimise; switched to train mode for the training pass
      and to eval mode for the validation pass of every epoch.
    optimizer: Optimizer stepped once per training batch.
    train_dl: Iterable of training batches.
    val_dl: Iterable of validation batches.
    epochs: Number of train + validation rounds.
    device: Device every batch is moved to before the forward pass.

  Returns:
    None. Per-epoch costs are only printed.
  """

  def batch_loss(batch):
    # Move the batch to the target device and return the model's loss for it.
    on_device = batch.to(device)
    result = model(on_device[:,0,:], token_type_ids=on_device[:,1,:], labels=on_device[:,2,:])
    return result[0]

  def epoch_cost(batch_losses, batch_sizes):
    # Mean of the per-batch losses, weighted by the number of examples per batch.
    return np.sum(np.multiply(batch_losses, batch_sizes)) / sum(batch_sizes)

  for i in range(epochs):

    print(f'\n--- Starting epoch #{i+1} ---')

    # ---- training pass ----
    model.train()
    train_losses, train_sizes = [], []

    for xb in tqdm(train_dl, desc="Training"):
      loss = batch_loss(xb)
      train_losses.append(loss.item())
      train_sizes.append(len(xb))

      # Backpropagate, update the weights, then clear gradients for the next batch.
      loss.backward()
      optimizer.step()
      model.zero_grad()

    train_cost = epoch_cost(train_losses, train_sizes)

    # ---- validation pass (no gradients tracked) ----
    model.eval()
    val_losses, val_sizes = [], []

    with torch.no_grad():
      for xb in tqdm(val_dl, desc="Validation"):
        val_losses.append(batch_loss(xb).item())
        val_sizes.append(len(xb))

    val_cost = epoch_cost(val_losses, val_sizes)

    print(f'\n--- Epoch #{i+1} finished --- Training cost: {train_cost} / Validation cost: {val_cost}')


In [9]:
from transformers import AdamW

# Move the model to the GPU when one is available; fall back to the CPU so the
# cell still runs (slowly) on machines without CUDA instead of raising.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune GPT2 for 5 epochs with AdamW at its default hyper-parameters.
# NOTE(review): this is the AdamW exported by `transformers`; confirm it still
# exists in the installed version before upgrading the library.
optimizer = AdamW(model.parameters())
fit(model, optimizer, train_loader, val_loader, epochs=5, device=device)

Training:   0%|          | 0/311 [00:00<?, ?it/s]


--- Starting epoch #1 ---


Training: 100%|██████████| 311/311 [00:58<00:00,  5.29it/s]
Validation: 100%|██████████| 18/18 [00:02<00:00,  8.52it/s]
Training:   0%|          | 0/311 [00:00<?, ?it/s]


--- Epoch #1 finished --- Training cost: 6.6821430814315335 / Validation cost: 5.847744272844824

--- Starting epoch #2 ---


Training: 100%|██████████| 311/311 [01:00<00:00,  5.13it/s]
Validation: 100%|██████████| 18/18 [00:01<00:00,  9.34it/s]
Training:   0%|          | 0/311 [00:00<?, ?it/s]


--- Epoch #2 finished --- Training cost: 5.635900312961076 / Validation cost: 5.566500684902139

--- Starting epoch #3 ---


Training: 100%|██████████| 311/311 [00:59<00:00,  5.24it/s]
Validation: 100%|██████████| 18/18 [00:01<00:00,  9.27it/s]
Training:   0%|          | 0/311 [00:00<?, ?it/s]


--- Epoch #3 finished --- Training cost: 5.281440420119124 / Validation cost: 5.450374875996447

--- Starting epoch #4 ---


Training: 100%|██████████| 311/311 [01:00<00:00,  5.15it/s]
Validation: 100%|██████████| 18/18 [00:01<00:00,  9.02it/s]
Training:   0%|          | 0/311 [00:00<?, ?it/s]


--- Epoch #4 finished --- Training cost: 4.986320364761621 / Validation cost: 5.339524038668672

--- Starting epoch #5 ---


Training: 100%|██████████| 311/311 [00:59<00:00,  5.24it/s]
Validation: 100%|██████████| 18/18 [00:01<00:00,  9.31it/s]


--- Epoch #5 finished --- Training cost: 4.71373273202649 / Validation cost: 5.228613368849948





In [10]:
#torch.save(model.state_dict(), MODEL_PATH, 'model_weights.pth')
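# TODO: model checkpointing (the save code above) is still being written.
# Plan: split this notebook into separate .py files (processing, learning, generation, etc.) so it is less cumbersome to modify.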

In [11]:
# HuggingFace에서 top k와 top p로 함수 샘플링

import torch.nn.functional as F
from tqdm import trange


def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """Mask out logits that fall outside the top-k and/or nucleus (top-p) set.

    Filtering is applied along the last dimension, so both a single
    distribution of shape ``(vocab,)`` and a batch of shape ``(batch, vocab)``
    are accepted.

    Args:
        logits: Tensor of unnormalised scores. It is modified IN PLACE.
        top_k: Keep only the ``top_k`` highest-scoring entries; ``0`` disables
            this filter. Values above the vocabulary size are clamped.
        top_p: Keep the smallest set of highest-scoring entries whose
            cumulative probability exceeds ``top_p``; ``0.0`` disables this
            filter. The single most likely entry is always kept.
        filter_value: Value written into every removed position.

    Returns:
        The same ``logits`` tensor that was passed in, after masking.
    """
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove every entry scoring below the k-th best one.
        kth_best = torch.topk(logits, top_k)[0][..., -1, None]
        logits[logits < kth_best] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Mark entries whose cumulative probability is above the threshold.
        sorted_indices_to_remove = cumulative_probs > top_p

        # Shift the mask one step to the right so the first entry that crosses
        # the threshold is kept as well, and never drop the best entry.
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = False

        # Map the mask from sorted order back to vocabulary order. Scattering
        # along the last dimension (rather than a fixed dim=1) is what makes
        # 1-D logits work too.
        indices_to_remove = sorted_indices_to_remove.scatter(dim=-1, index=sorted_indices, src=sorted_indices_to_remove)
        logits[indices_to_remove] = filter_value
    return logits


# HuggingFace에서 컨텍스트/슬로건 분리 작업에 맞게 조정됨
def sample_sequence(model, length, context, segments_tokens=None, num_samples=1, temperature=1, top_k=0, top_p=0.0, repetition_penalty=1.0,
                    device='cpu'):
    """Autoregressively extend ``context`` by ``length`` tokens, ``num_samples`` times in parallel.

    Args:
        model: Callable invoked as ``model(input_ids=..., token_type_ids=...)``
            (``token_type_ids`` only when ``segments_tokens`` is given); the
            first element of its output must be logits of shape
            ``(num_samples, seq, vocab)``.
        length: Number of tokens to append to every sample.
        context: Sequence of prompt token ids shared by all samples.
        segments_tokens: Optional per-position segment ids. Position ``i`` is
            used for input token ``i``. If it is shorter than prompt +
            generated tokens, its last id is repeated for the missing
            positions.
        num_samples: Number of sequences generated side by side.
        temperature: Logits are divided by this value before sampling;
            ``0`` selects greedy decoding (argmax) instead of sampling.
        top_k: Forwarded to ``top_k_top_p_filtering``.
        top_p: Forwarded to ``top_k_top_p_filtering``.
        repetition_penalty: Values above 1 make tokens already present in a
            sample less likely; ``1.0`` disables the penalty.
        device: Device on which the input tensors are created. The model is
            NOT moved; the caller must have placed it on the same device.

    Returns:
        LongTensor of shape ``(num_samples, len(context) + length)`` holding
        the prompt followed by the generated tokens.
    """
    context = torch.tensor(context, dtype=torch.long, device=device)
    generated = context.unsqueeze(0).repeat(num_samples, 1)

    # Build the segment ids once, on the same device as the inputs, long enough
    # for the longest sequence that will be fed to the model.
    segment_ids = None
    if segments_tokens is not None:
        segment_ids = torch.as_tensor(segments_tokens, dtype=torch.long, device=device)
        longest_input = generated.shape[1] + max(length - 1, 0)
        missing = longest_input - segment_ids.shape[0]
        if segment_ids.numel() > 0 and missing > 0:
            segment_ids = torch.cat((segment_ids, segment_ids[-1:].expand(missing)))

    with torch.no_grad():
        for _ in trange(length):

            inputs = {'input_ids': generated}
            if segment_ids is not None:
                inputs['token_type_ids'] = segment_ids[:generated.shape[1]].unsqueeze(0).repeat(num_samples, 1)

            # Note: cached hidden states ("past") are not used; the full
            # sequence is re-encoded at every step.
            outputs = model(**inputs)
            next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)

            # Repetition penalty from CTRL (https://arxiv.org/abs/1909.05858).
            # Dividing a NEGATIVE logit would raise it, so negative scores are
            # multiplied instead; both cases move the score downwards.
            if repetition_penalty != 1.0:
                for i in range(num_samples):
                    seen_ids = torch.unique(generated[i])
                    scores = next_token_logits[i, seen_ids]
                    next_token_logits[i, seen_ids] = torch.where(
                        scores < 0, scores * repetition_penalty, scores / repetition_penalty)

            filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
            if temperature == 0:  # greedy decoding
                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
            else:
                next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
            generated = torch.cat((generated, next_token), dim=1)
    return generated


In [12]:
context = "현대카드, 청년들을 위한 신용카드"

NUM_SAMPLES = 20      # number of slogans to generate
MAX_NEW_TOKENS = 20   # tokens generated per slogan
MIN_SEGMENTS = 32     # segment-list length used so far (matches the dataset's default seq_length)

context_tkn = tokenizer.additional_special_tokens_ids[0]
slogan_tkn = tokenizer.additional_special_tokens_ids[1]

input_ids = [context_tkn] + tokenizer.encode(context)

# Segment ids: <context> over the prompt, <slogan> over everything after it.
# The list must cover the prompt, the <slogan> marker appended below and every
# generated token; a fixed 32 entries would run out for longer prompts.
n_segments = max(MIN_SEGMENTS, len(input_ids) + 1 + MAX_NEW_TOKENS)
segments = [context_tkn] * len(input_ids) + [slogan_tkn] * (n_segments - len(input_ids))

input_ids += [slogan_tkn]

# Move the model back to the CPU for inference and make sure dropout is
# disabled regardless of which cell ran last.
model.to(torch.device('cpu'))
model.eval()

# Generate NUM_SAMPLES samples of max length MAX_NEW_TOKENS
generated = sample_sequence(model, length=MAX_NEW_TOKENS, context=input_ids, segments_tokens=segments, num_samples=NUM_SAMPLES)

print('\n\n--- Generated Slogans ---\n')

for g in generated:
  # Keep only the text between the <slogan> marker and the first end-of-sequence token.
  slogan = tokenizer.decode(g.squeeze().tolist())
  slogan = slogan.split('</s>')[0].split('<slogan>')[1]
  print(slogan)

100%|██████████| 20/20 [00:16<00:00,  1.24it/s]



--- Generated Slogans ---

 작은 결제 
 재룸, 살아난기에요 
 본격품도 키워 계산해 
 차없는 커져를 챔피까지, 신선하게 
일지 웃게 전에 공개은 마음처럼 살지 않군자 
 소중한 대한민국을 걷다 
 예런지난 공거리 말않기만든만 나더 
 제대로 마침내의 키움이 경험하다 
 더하면 바꿔리다 
 HAPwes 
 은행이 모약에 '국 
 공홍성의 감부로 다시 
 스타일시스를 찾은 판을 만나야 
 DE이 다 않아도 쉽게 누릴다면 
 앞장서 않는 내일을 바꾸다 
 직접 대한기 블랙지 않도록 과학은통제상담 
 하고않지 못한 우리만의 소중한 고기 
 컴몬을 계속하다 
 신선하게 더하다 
 무대체 여행됩니다, 삶의 건조 



