In [1]:
# Variable declarations

# Pretrained checkpoint name handed to GPT2Tokenizer / GPT2LMHeadModel.from_pretrained().
MODEL_NAME = 'distilgpt2' # other checkpoint noted by the author: 'gpt2-medium'
# Directory of the training data; concatenated directly with TRAIN_DATA_FILE,
# so the trailing slash is required.
DATA_IN_PATH = "./datasets/"
# Directory where fit() writes model checkpoints (created by fit() if missing).
DATA_OUT_PATH = "./models/"
# CSV file read by SloganDataset (column 0 = context, column 1 = slogan).
TRAIN_DATA_FILE = "slogans.csv"
# Prefix of the checkpoint file names written by fit().
TRAIN_DATA_NAME = "en slogan"
# NOTE(review): not referenced in the cells visible here; presumably a plot output directory.
PLOT_OUT_PATH = "./plots/"



In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

# Load the pretrained tokenizer and the LM-head model for the checkpoint named
# by MODEL_NAME. Both are module-level globals used by the following cells.
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2LMHeadModel.from_pretrained(MODEL_NAME)

In [3]:
# Declare special tokens for padding and separating the context from the slogan:
# NOTE: SloganDataset reads '<context>' and '<slogan>' by position
# (additional_special_tokens_ids[0] and [1]), so the order of this list matters.
SPECIAL_TOKENS_DICT = {
    'pad_token': '<pad>',
    'additional_special_tokens': ['<context>', '<slogan>'],
}

# Add these special tokens to the vocabulary and resize the model's embeddings
# to match the new vocabulary size:
tokenizer.add_special_tokens(SPECIAL_TOKENS_DICT)
model.resize_token_embeddings(len(tokenizer))

# Show the resulting special-token mapping as a sanity check.
print(tokenizer.special_tokens_map)

{'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<pad>', 'additional_special_tokens': "['<context>', '<slogan>']"}


In [4]:
import csv

import torch
from torch.utils.data import Dataset



class SloganDataset(Dataset):
  """Fixed-length training examples built from a (context, slogan) CSV file.

  Every CSV row becomes three parallel sequences of length ``seq_length``:

  * tokens   -- ``<context> ctx... <slogan> slogan... <eos> <pad>...``
  * segments -- the ``<context>`` id over the context part and the ``<slogan>``
                id over everything after it (padding included)
  * labels   -- ``-100`` everywhere except the slogan text and its ``<eos>``,
                where the label equals the token id

  ``dataset[i]`` returns the three sequences stacked as one integer tensor of
  shape ``(3, seq_length)``.

  Args:
    filename: path of a UTF-8 CSV file; column 0 is the context, column 1 the
      slogan. Rows with fewer than two columns (e.g. blank lines) are skipped.
    tokenizer: object exposing ``encode(text)``, ``additional_special_tokens_ids``
      (``[context_id, slogan_id]``), ``pad_token_id`` and ``eos_token_id``.
    seq_length: total length of every example.
  """

  def __init__(self, filename, tokenizer, seq_length=64):

    context_tkn = tokenizer.additional_special_tokens_ids[0]
    slogan_tkn = tokenizer.additional_special_tokens_ids[1]
    pad_tkn = tokenizer.pad_token_id
    eos_tkn = tokenizer.eos_token_id

    # Token budgets for the raw text, leaving room for the marker tokens:
    # <context> + ctx (half the sequence) and <slogan> + slogan + <eos> (other half).
    max_context_len = max(seq_length // 2 - 1, 0)
    max_slogan_len = max(seq_length // 2 - 2, 0)

    self.examples = []
    # newline='' is what the csv module requires so that quoted fields with
    # embedded line breaks are parsed correctly.
    with open(filename, 'r', encoding='UTF-8', newline='') as csvfile:
      for row in csv.reader(csvfile):

        # Blank lines come back as [] and malformed rows may lack a slogan
        # column; indexing them would raise IndexError.
        if len(row) < 2:
          continue

        # Truncate explicitly instead of depending on how the tokenizer treats
        # a bare `max_length` argument: every row must have exactly seq_length
        # entries or the examples cannot be stacked into tensors.
        context = [context_tkn] + tokenizer.encode(row[0])[:max_context_len]
        slogan = [slogan_tkn] + tokenizer.encode(row[1])[:max_slogan_len] + [eos_tkn]

        n_pad = seq_length - len(context) - len(slogan)

        tokens = context + slogan + [pad_tkn] * n_pad

        segments = [context_tkn] * len(context) + [slogan_tkn] * ( seq_length - len(context) )

        # -100 marks positions excluded from the loss: the whole context, the
        # <slogan> marker itself, and the padding.
        labels = [-100] * (len(context)+1) + slogan[1:] + [-100] * n_pad

        self.examples.append((tokens, segments, labels))

  def __len__(self):
    """Number of examples kept from the CSV file."""
    return len(self.examples)

  def __getitem__(self, item):
    """Return example ``item`` as a ``(3, seq_length)`` tensor (tokens, segments, labels)."""
    return torch.tensor(self.examples[item])

# Build the dataset from the training CSV (default seq_length) and print the
# size of the first example as a sanity check.
slogan_dataset = SloganDataset(DATA_IN_PATH + TRAIN_DATA_FILE, tokenizer)
print(next(iter(slogan_dataset)).size())

torch.Size([3, 64])


In [5]:
import math, random

from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Positions of all examples in the dataset.
indices = list(range(len(slogan_dataset)))

# Shuffle with a fixed seed so the train/validation membership is the same on every run.
random.seed(100)
random.shuffle(indices)

# Hold out the first 10% (rounded down) of the shuffled indices for validation.
split = math.floor(0.1 * len(slogan_dataset))
train_indices, val_indices = indices[split:], indices[:split]

# Each sampler restricts its loader to one of the two index subsets.
train_sampler = SubsetRandomSampler(train_indices)
val_sampler = SubsetRandomSampler(val_indices)

# Both loaders share the same dataset object; validation uses a larger batch size.
train_loader = DataLoader(slogan_dataset, batch_size=32, sampler=train_sampler)
val_loader = DataLoader(slogan_dataset, batch_size=64, sampler=val_sampler)

In [6]:
import numpy as np
from tqdm import tqdm
import os


def _batch_loss(model, xb, device):
  """Run one forward pass on a (batch, 3, seq) tensor and return the loss tensor."""
  inputs = xb.to(device)
  # Axis 1 holds the three parallel sequences: tokens, segment ids, labels.
  outputs = model(inputs[:,0,:], token_type_ids=inputs[:,1,:], labels=inputs[:,2,:])
  return outputs[0]


def _weighted_mean(losses, nums):
  """Average per-batch losses weighted by batch size; NaN when no batch was seen."""
  total = sum(nums)
  if total == 0:
    return float('nan')
  return np.sum(np.multiply(losses, nums)) / total


def fit(model, optimizer, train_dl, val_dl, epochs=1, device=torch.device('cpu'), save_every=1):
  """Train `model`, validate after every epoch and write checkpoints.

  Args:
    model: module called as ``model(tokens, token_type_ids=..., labels=...)``
      whose first output is the loss.
    optimizer: optimizer stepping the model's parameters.
    train_dl: iterable of ``(batch, 3, seq)`` tensors used for training.
    val_dl: iterable of ``(batch, 3, seq)`` tensors used for validation.
    epochs: number of passes over ``train_dl``.
    device: device every batch is moved to before the forward pass; the model
      is expected to be on that device already.
    save_every: write a checkpoint every ``save_every`` epochs (the default of
      1 saves after every epoch); a falsy value disables checkpointing.

  Side effects:
    Creates ``DATA_OUT_PATH`` if needed and saves ``model.state_dict()`` there
    as ``<TRAIN_DATA_NAME>_<epoch>epoch_model.pth``. Progress is printed.
  """

  if not os.path.exists(DATA_OUT_PATH):
    os.makedirs(DATA_OUT_PATH, exist_ok=True)
    print('--- Directory creation completed successfully ---')
  else:
    print('--- Directory already exists ---')


  for i in range(epochs):

    print('\n--- Starting epoch #{} ---'.format(i+1))

    model.train()

    # Drop any gradients left over from earlier work (e.g. an interrupted run
    # in the same kernel) so they are not folded into the first update.
    model.zero_grad()

    losses = []
    nums = []

    for xb in tqdm(train_dl, desc="Training"):

      loss = _batch_loss(model, xb, device)
      losses.append(loss.item())
      nums.append(len(xb))

      loss.backward()

      optimizer.step()
      model.zero_grad()

    train_cost = _weighted_mean(losses, nums)

    model.eval()

    losses = []
    nums = []

    with torch.no_grad():
      for xb in tqdm(val_dl, desc="Validation"):
        losses.append(_batch_loss(model, xb, device).item())
        nums.append(len(xb))

    val_cost = _weighted_mean(losses, nums)

    print('\n--- Epoch #{} finished --- Training cost: {} / Validation cost: {}'.format(i+1, train_cost, val_cost))

    if save_every and (i + 1) % save_every == 0:
      checkpoint_name = TRAIN_DATA_NAME + '_' + f'{i+1}epoch' + '_' + 'model.pth'
      # os.path.join works whether or not DATA_OUT_PATH ends with a separator.
      torch.save(model.state_dict(), os.path.join(DATA_OUT_PATH, checkpoint_name))
      print(f'\n--- Epoch #{i+1} Saving complete ! ---')

    torch.cuda.empty_cache()


In [7]:
from transformers import AdamW

# Move the model to the GPU when one is available; otherwise stay on the CPU
# instead of failing on machines without CUDA:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Fine-tune GPT2 for 3 epochs (optimizer left at its default hyperparameters):
optimizer = AdamW(model.parameters())
fit(model, optimizer, train_loader, val_loader, epochs=3, device=device)

Training:   0%|          | 0/268 [00:00<?, ?it/s]

--- Directory already exists ---

--- Starting epoch #1 ---


Training: 100%|██████████| 268/268 [01:03<00:00,  4.25it/s]
Validation: 100%|██████████| 15/15 [00:02<00:00,  6.16it/s]



--- Epoch #1 finished --- Training cost: 4.357697346853832 / Validation cost: 3.238239254270281


Training:   0%|          | 1/268 [00:00<00:43,  6.09it/s]


--- Epoch #1 Saving complete ! ---

--- Starting epoch #2 ---


Training: 100%|██████████| 268/268 [01:02<00:00,  4.30it/s]
Validation: 100%|██████████| 15/15 [00:02<00:00,  6.36it/s]



--- Epoch #2 finished --- Training cost: 2.6955439815111633 / Validation cost: 3.2977525446595264


Training:   0%|          | 0/268 [00:00<?, ?it/s]


--- Epoch #2 Saving complete ! ---

--- Starting epoch #3 ---


Training: 100%|██████████| 268/268 [01:01<00:00,  4.33it/s]
Validation: 100%|██████████| 15/15 [00:02<00:00,  6.59it/s]



--- Epoch #3 finished --- Training cost: 2.0005977358582085 / Validation cost: 3.5433764998652353

--- Epoch #3 Saving complete ! ---
