In this notebook we tried to fine tuned a gpt2 model with the wow dataset

# GPT fine-tuning

We load the data for train, validation and test 

In [2]:
import json
import torch

# Base_directory
base_dir = './wizard_of_wikipedia/'

# Load the data
with open(base_dir + 'train.json') as f:
    train_data = json.load(f)
with open(base_dir + 'valid_random_split.json') as f:
    valid_data = json.load(f)
with open(base_dir + 'test_random_split.json') as f:
    test_data = json.load(f)

In [3]:
train_data[0]

{'chosen_topic': 'Science fiction',
 'persona': 'i enjoy movies about aliens invading the earth.',
 'wizard_eval': 5,
 'dialog': [{'speaker': '0_Wizard',
   'text': "I think science fiction is an amazing genre for anything. Future science, technology, time travel, FTL travel, they're all such interesting concepts.",
   'checked_sentence': {'chosen_Science_fiction_0': 'Science fiction (often shortened to SF or sci-fi) is a genre of speculative fiction, typically dealing with imaginative concepts such as futuristic science and technology, space travel, time travel, faster than light travel, parallel universes, and extraterrestrial life.'},
   'checked_passage': {'chosen_topic_0_Science_fiction': 'Science fiction'},
   'retrieved_passages': [{'Hyperspace (science fiction)': ['Hyperspace is a faster-than-light (FTL) method of traveling used in science fiction.',
      'It is typically described as an alternative "sub-region" of space co-existing with our own universe which may be entered u

We started from this gpt model loaded from huggingface

In [21]:
from transformers import GPTNeoForCausalLM, AutoTokenizer

model_id = "EleutherAI/gpt-neo-125M"

# default_device = 'cpu'
default_device = 'mps' # apple silicon
device = torch.device('cuda' if torch.cuda.is_available() else default_device)

tokenizer = AutoTokenizer.from_pretrained(model_id, padding_side='left')
model = GPTNeoForCausalLM.from_pretrained('./model').to(device)

In [22]:
tokenizer.pad_token = tokenizer.eos_token

In [3]:
def extract_checked_sentence(utterance):
    try:
        checked_sentence = list(utterance['checked_sentence'].values())[0]
        return 'PASSAGE: ' + checked_sentence + '\n'
    except:
        return ''

def parse_dialog(dialog):
        return '\n'.join([
            f'SPEAKER: {utterance["speaker"]}\n' + \
            extract_checked_sentence(utterance) + \
            f'TEXT: {utterance["text"]}\n'
        for utterance in dialog])

def parse_data(dataset):
    return [
        f'CHOSEN_TOPIC: {sample["chosen_topic"]}\n' + \
        f'PERSONA: {sample["persona"]}\n' + \
        parse_dialog(sample['dialog'])
    for sample in dataset]

In [4]:
train_parsed = parse_data(train_data)
valid_parsed = parse_data(valid_data)
test_parsed = parse_data(test_data)

In [8]:
from datasets import Dataset

train_parsed = Dataset.from_dict({'text': train_parsed})
valid_parsed = Dataset.from_dict({'text': valid_parsed})
test_parsed = Dataset.from_dict({'text': test_parsed})

In [8]:
from datasets import DatasetDict

data = DatasetDict()
data['train'] = train_parsed
data['validation'] = valid_parsed
data['test'] = test_parsed

In [9]:
def tokenize_function(examples):
    input_encodings = tokenizer(examples["text"], padding=True, truncation=True)
    sample = {
        'input_ids': input_encodings.input_ids
    }
    return sample

tokenized_data = data.map(tokenize_function, batched=True)

100%|██████████| 19/19 [00:06<00:00,  2.94ba/s]
100%|██████████| 1/1 [00:00<00:00,  2.68ba/s]
100%|██████████| 1/1 [00:00<00:00,  2.77ba/s]


In [10]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

In [11]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    "cooler_trainer_name", 
    evaluation_strategy="steps",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=6.25e-5,
    lr_scheduler_type="linear",
    per_device_eval_batch_size=1,
    use_mps_device=True
)

In [12]:
from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=tokenized_data['train'], 
    eval_dataset=tokenized_data['validation'],
    data_collator=data_collator
)

In [31]:
trainer.train(resume_from_checkpoint=True)

  0%|          | 0/13821 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
 90%|█████████ | 12500/13821 [31:18<1:28:19,  4.01s/it]

{'loss': 0.6522, 'learning_rate': 5.973699442876782e-06, 'epoch': 2.71}


                                                       
 90%|█████████ | 12500/13821 [33:53<1:28:19,  4.01s/it]

{'eval_loss': 0.7444853782653809, 'eval_runtime': 154.7224, 'eval_samples_per_second': 6.34, 'eval_steps_per_second': 6.34, 'epoch': 2.71}


 94%|█████████▍| 13000/13821 [1:05:40<45:10,  3.30s/it]  

{'loss': 0.663, 'learning_rate': 3.712647420591853e-06, 'epoch': 2.82}


                                                       
 94%|█████████▍| 13000/13821 [1:08:18<45:10,  3.30s/it]

{'eval_loss': 0.7434998750686646, 'eval_runtime': 157.9971, 'eval_samples_per_second': 6.209, 'eval_steps_per_second': 6.209, 'epoch': 2.82}


 98%|█████████▊| 13500/13821 [1:39:30<22:36,  4.23s/it]   

{'loss': 0.6493, 'learning_rate': 1.4515953983069243e-06, 'epoch': 2.93}


                                                       
 98%|█████████▊| 13500/13821 [1:42:08<22:36,  4.23s/it]

{'eval_loss': 0.7430979609489441, 'eval_runtime': 158.1466, 'eval_samples_per_second': 6.203, 'eval_steps_per_second': 6.203, 'epoch': 2.93}


100%|██████████| 13821/13821 [2:03:42<00:00,  1.86it/s]  

{'train_runtime': 7422.7047, 'train_samples_per_second': 7.449, 'train_steps_per_second': 1.862, 'train_loss': 0.08617220559053591, 'epoch': 3.0}





TrainOutput(global_step=13821, training_loss=0.08617220559053591, metrics={'train_runtime': 7422.7047, 'train_samples_per_second': 7.449, 'train_steps_per_second': 1.862, 'train_loss': 0.08617220559053591, 'epoch': 3.0})

Let's test the model on a some sentences of the test set

In [None]:
GENERATION_LENGTH = 200

test_index = [0, 5, 6]

outputs = []

for i in test_index:
    train = train_parsed[i]
    split_train = train.split('\n')
    input = '\n'.join(split_train[:5])
    encoded_input = tokenizer.encode(input, return_tensors="pt").to(device)
    encoded_output = model.generate(encoded_input, top_k=50, max_length=GENERATION_LENGTH, top_p=0.95, temperature=0.7)
    decoded_output = tokenizer.decode(encoded_output[0], skip_special_tokens=True)
    output = decoded_output.split('\n')
    topic_output = []
    topic_output.append(output[0])
    topic_output.append(output[2])
    topic_output.append(output[4])
    topic_output.append(output[6:8])
    outputs.append(topic_output)

In [67]:
for output in outputs:
    for elem in output:
        print(elem)
    print('\n')

CHOSEN_TOPIC: Science fiction
SPEAKER: 0_Wizard
TEXT: I think science fiction is an amazing genre for anything. Future science, technology, time travel, FTL travel, they're all such interesting concepts.
['SPEAKER: 1_Apprentice', "TEXT: I agree, I think it's a great idea. I think it's also a great idea to explore the universe."]


CHOSEN_TOPIC: Romance (love)
SPEAKER: 0_Wizard
TEXT: I don't know how to be romantic. I have trouble expressing emotional attraction. I'm not very good at it.
['SPEAKER: 1_Apprentice', 'TEXT: I have been in a lot of romantic relationships that have ended up poorly. I have been in a lot of romantic relationships that have ended up poorly.']


CHOSEN_TOPIC: Krav Maga
SPEAKER: 0_Wizard
TEXT: Hello. I hope you might enjoy or know something about Krav Maga?
['SPEAKER: 1_Apprentice', 'TEXT: I have heard of it. I have heard of it before. I have heard of it before. I have heard of it before. I have heard of it before. I have heard of it before. I have heard of it bef