In [1]:
import random
import re
import time

import numpy as np

import torch
from torch.optim import Adam
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import Dataset, DataLoader

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
from datasets import load_dataset

from tqdm.notebook import tqdm
import wandb

RuntimeError: Failed to import transformers.models.auto because of the following error (look up to see its traceback):
libssl.so.10: cannot open shared object file: No such file or directory

In [None]:
MODEL_NAME = 'gpt2'

# Marker tokens registered on top of the pretrained GPT-2 vocabulary.
special_tokens = {
    'bos_token': '<BOS>',
    'eos_token': '<EOS>',
    'unk_token': '<UNK>',
    'pad_token': '<PAD>',
}

# Load the pretrained tokenizer and extend it with the markers above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.add_special_tokens(special_tokens)

# Load the pretrained causal LM, then resize its token embeddings so they
# match the enlarged tokenizer vocabulary.
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)
model.resize_token_embeddings(len(tokenizer))

In [None]:
# Run on the GPU when one is available; otherwise fall back to the CPU so the
# notebook still executes end to end instead of failing on the first .to(DEVICE).
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
MAXLEN = 100  # token length every sentence pair is padded / truncated to
EPOCHS = 10   # number of passes over the training set
BS = 4        # batch size for both data loaders
# NOTE(review): 1e-3 looks high for fine-tuning a pretrained model with Adam;
# confirm this value is intentional.
LR = 0.001

# Seed every RNG the notebook relies on (dataset shuffling uses `random`,
# the loaders and model use torch) so that a fresh run is repeatable.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

def create_sentence_pairs(x):
    """Turn one dialogue record into consecutive (utterance, reply) strings.

    The text under ``x['dialogue']`` is split on its ``#PersonN#: `` speaker
    tags; whatever precedes the first tag is discarded. Each utterance is then
    paired with the one that follows it and rendered as
    ``<bos> utterance <pad> reply <eos>`` (space separated), using the marker
    strings of the module-level ``tokenizer``.

    Returns:
        dict with a single key ``'sentence_pairs'`` holding the list of
        rendered strings (empty when the dialogue has fewer than two turns).
    """
    turns = re.split(r'[\s]*#Person\d#: ', x['dialogue'])[1:]

    pairs = []
    # zip stops at the shorter sequence, so every turn except the last is
    # matched with its successor.
    for prompt, reply in zip(turns, turns[1:]):
        pieces = (tokenizer.bos_token, prompt, tokenizer.pad_token, reply, tokenizer.eos_token)
        pairs.append(' '.join(pieces))

    return {'sentence_pairs': pairs}


In [None]:
# Fetch the DialogSum splits and attach a 'sentence_pairs' column to each
# record by mapping create_sentence_pairs over the split.
train_dataset = load_dataset('knkarthick/dialogsum', split='train').map(create_sentence_pairs)
val_dataset = load_dataset('knkarthick/dialogsum', split='validation').map(create_sentence_pairs)

In [None]:
class GPTDataset(Dataset):
    """Map-style dataset of pre-tokenized sentence pairs.

    The constructor receives an iterable of lists of pair strings, flattens it
    into a single list, shuffles that list in place and tokenizes everything in
    one call to the module-level ``tokenizer`` (padded / truncated to
    ``MAXLEN``). Items are dicts with ``'input_ids'`` and ``'attention_mask'``
    rows taken from the tokenizer output.
    """

    def __init__(self, sentence_pairs):
        flat_pairs = []
        for pair_ls in sentence_pairs:
            flat_pairs.extend(pair_ls)
        random.shuffle(flat_pairs)
        self.sentence_pairs = flat_pairs

        # Tokenize once up front so __getitem__ is a plain row lookup.
        encoded = tokenizer(flat_pairs, max_length=MAXLEN, padding='max_length', truncation=True, return_tensors='pt')
        self.tokenized_pairs = encoded
        self.tokenized_inputs = encoded['input_ids']
        self.tokenized_mask = encoded['attention_mask']

    def __getitem__(self, item):
        """Return the token ids and attention mask stored at index ``item``."""
        return dict(input_ids=self.tokenized_inputs[item],
                    attention_mask=self.tokenized_mask[item])

    def __len__(self):
        """Number of sentence pairs held by the dataset."""
        return len(self.sentence_pairs)


In [None]:
# Pull the generated pair lists out of each split, then build the tokenized
# datasets from them.
train_pairs = train_dataset['sentence_pairs']
val_pairs = val_dataset['sentence_pairs']

gpt_train = GPTDataset(train_pairs)
gpt_valid = GPTDataset(val_pairs)

In [None]:
# Training batches are reshuffled every epoch.
train_loader = DataLoader(gpt_train, batch_size=BS, shuffle=True)
# Validation only averages the loss over all batches, so batch order does not
# matter; keep it fixed instead of shuffling.
val_loader = DataLoader(gpt_valid, batch_size=BS, shuffle=False)

In [None]:
# Fixed prompt used to sample a reply from the model during training.
# Training pairs are built as '<bos> utterance <pad> reply <eos>' joined with
# single spaces (see create_sentence_pairs), so the prompt is assembled the
# same way to make its tokenization match what the model is fine-tuned on.
sent = ' '.join([tokenizer.bos_token, 'Hi, how are you today?', tokenizer.pad_token])
# max_length only takes effect together with truncation.
tok_sent = tokenizer(sent, max_length=MAXLEN, truncation=True, return_tensors='pt')

# Move every tensor of the encoded prompt to the training device.
tok_sent = {k: v.to(DEVICE) for k, v in tok_sent.items()}

In [None]:
## training loop
# NOTE(review): 'gpt2_tuner' is passed positionally; if it is meant to be the
# run name, passing it as name=... would be explicit — confirm against the
# installed wandb version.
wandb.init('gpt2_tuner', project='chat2learn', config={'batch_size': BS,
                                                       'learning_rate': LR,
                                                       'epochs': EPOCHS})

# Move the model to the target device before handing its parameters to the optimizer.
model = model.to(DEVICE)

optimizer = Adam(model.parameters(), lr=LR)
#scheduler = StepLR(optimizer, step_size=500, gamma=0.5)


def _mask_padding_labels(batch):
    """Build LM labels from a batch, ignoring padded positions.

    Positions whose attention mask is 0 are set to -100, the label value the
    Hugging Face causal-LM loss skips, so the loss is computed on real tokens
    only instead of being dominated by padding.
    """
    return batch['input_ids'].masked_fill(batch['attention_mask'] == 0, -100)


# Best validation loss seen so far. It must live outside the epoch loop:
# resetting it every epoch would overwrite the checkpoint unconditionally.
min_val_loss = None

for epoch in range(EPOCHS):
    model.train()
    # enumerate() hides the loader length from tqdm, so pass it explicitly.
    pbar = tqdm(enumerate(train_loader), total=len(train_loader))
    train_losses = []
    for idx, x in pbar:
        x = {k: v.to(DEVICE) for k, v in x.items()}

        optimizer.zero_grad()
        loss = model(**x, labels=_mask_padding_labels(x)).loss
        loss.backward()
        optimizer.step()

        train_losses.append(loss.item())
        pbar.set_description(f'Loss: {np.mean(train_losses):.3f}')

        wandb.log({'epochs': epoch,
                   'learning_rate': LR,
                   'loss': loss.item()})

        # Every 1000 batches, print a sample generation for the fixed prompt.
        if idx % 1000 == 999:
            model.eval()
            gpt_out = model.generate(**tok_sent)
            print(list(map(tokenizer.decode, gpt_out)))

            model.train()

        #scheduler.step()

    model.eval()
    pbar = tqdm(val_loader)
    val_losses = []
    # No gradients are needed for evaluation; skipping them saves memory and time.
    with torch.no_grad():
        for x in pbar:
            x = {k: v.to(DEVICE) for k, v in x.items()}

            loss = model(**x, labels=_mask_padding_labels(x)).loss

            val_losses.append(loss.item())
            pbar.set_description(f'Validation Loss: {np.mean(val_losses):.3f}')

            wandb.log({'epochs': epoch, 'learning_rate': LR, 'val_loss': loss.item()})

    # Checkpoint only when this epoch improves on the best validation loss.
    mean_val_loss = np.mean(val_losses)
    if min_val_loss is None or mean_val_loss < min_val_loss:
        min_val_loss = mean_val_loss
        torch.save(model, 'gpt_model.pt')

In [None]:
# Persist the model object in its current state (the training loop separately
# writes its best-validation checkpoint to 'gpt_model.pt').
final_model_path = 'gpt_model'
torch.save(model, final_model_path)

In [None]:
# Block forever (presumably to keep the kernel / session alive after training —
# confirm this is still needed). Sleeping inside the loop avoids pinning a CPU
# core the way an empty busy-wait does; interrupt the kernel to stop it.
while True:
    time.sleep(60)
