In [10]:
import numpy as np
import pandas as pd
import tqdm

import evaluate
import torch
from datasets import (
    Dataset, load_dataset, 
    load_metric
)

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, 
    TrainingArguments, Trainer, 
    DataCollatorForLanguageModeling,
    EarlyStoppingCallback
)

In [11]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Ask PyTorch to release any cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()
device

device(type='cpu')

In [12]:
# Tokeniser and model weights are loaded from the same pretrained checkpoint.
checkpoint = "microsoft/DialoGPT-small"

print("Loading model... ", end='', flush=True)
tokeniser = AutoTokenizer.from_pretrained(checkpoint)
# Load the weights, then move the model onto the device selected earlier.
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)
print('Done')

Loading model... Done


In [19]:
# Load the full training split, then keep a small, deterministically shuffled
# sample (seed 42, first 10 rows after shuffling).
full_train_split = load_dataset("vicgalle/alpaca-gpt4", split="train")
raw_dataset = full_train_split.shuffle(seed=42).select(range(10))
raw_dataset

Found cached dataset parquet (C:/Users/chong/.cache/huggingface/datasets/vicgalle___parquet/vicgalle--alpaca-gpt4-1e85e31ce0639161/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
Loading cached shuffled indices for dataset at C:\Users\chong\.cache\huggingface\datasets\vicgalle___parquet\vicgalle--alpaca-gpt4-1e85e31ce0639161\0.0.0\2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec\cache-4978e7bfa952f753.arrow


Dataset({
    features: ['instruction', 'input', 'output', 'text'],
    num_rows: 10
})

In [14]:
# Build one training string per row: the prompt (instruction plus optional
# input) and the response, each terminated by the tokeniser's EOS token.
eos = tokeniser.eos_token
temp_lst = [
    {"text": (row["instruction"] + " " + row["input"]).strip() + eos + row["output"].strip() + eos}
    for row in tqdm.tqdm(raw_dataset, desc="Re-formatting dataset", unit=" rows", leave=False)
]

temp_df = pd.DataFrame(temp_lst, columns=["text"])
# DataFrame.dropna() returns a new frame rather than modifying in place, so the
# result must be kept for the filter to take effect. The index is reset so that
# Dataset.from_pandas does not carry a non-contiguous index along as a column.
temp_df = temp_df.dropna().reset_index(drop=True)

processed_dataset = Dataset.from_pandas(temp_df)
processed_dataset



Dataset({
    features: ['text'],
    num_rows: 10
})

In [15]:
# Reuse the EOS token as the padding token so that padding has a defined id.
tokeniser.pad_token = tokeniser.eos_token

def preprocess(example):
    """Tokenise the ``text`` field, truncating to the tokeniser's maximum length.

    Works on a single example or on a batch (``example["text"]`` may be one
    string or a list of strings). No padding is applied here: sequences are
    padded per training batch by the data collator, so padding at map time
    would be either a no-op (one example at a time) or wasted tokens.

    Returns the tokeniser output (``input_ids`` and ``attention_mask``).
    """
    return tokeniser(example["text"], truncation=True)

# batched=True lets the tokeniser process many rows per call.
tokenised_dataset = processed_dataset.map(preprocess, batched=True)

tokenised_dataset = tokenised_dataset.remove_columns(["text"])
tokenised_dataset = tokenised_dataset.with_format("torch", columns=["input_ids", "attention_mask"])
# Seed the split so the train/test membership is identical on every run
# (same seed as the shuffle used when sampling the raw dataset).
tokenised_dataset = tokenised_dataset.train_test_split(test_size=0.1, seed=42)
tokenised_dataset

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 9
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 1
    })
})

In [16]:
# Collator for causal language modelling (mlm=False): it pads each batch and
# derives the labels from the input ids.
# NOTE(review): the pad token is set to EOS elsewhere in this notebook; the
# collator presumably masks pad-token positions out of the loss, which would
# also mask the real EOS tokens - confirm against the transformers docs.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokeniser, mlm=False)

In [17]:
# Evaluate, checkpoint and log once per epoch; the best checkpoint is reloaded
# when training finishes.
training_args = TrainingArguments(
    output_dir="../models/test_model",
    overwrite_output_dir=True,
    optim="adamw_torch",
    report_to="all",
    # Per-epoch cadence for evaluation, checkpointing and logging.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    load_best_model_at_end=True,
)

# Stop early once the monitored metric has failed to improve for 3 evaluations.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenised_dataset["train"],
    eval_dataset=tokenised_dataset["test"],
    tokenizer=tokeniser,
    callbacks=[early_stopping],
)

In [18]:
# Run fine-tuning, then write the resulting model to its own directory.
trainer.train()
final_model_dir = "../models/test_model/final"
trainer.save_model(final_model_dir)

  0%|          | 0/6 [00:00<?, ?it/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 9.5746, 'learning_rate': 3.3333333333333335e-05, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.9337158203125, 'eval_runtime': 0.0748, 'eval_samples_per_second': 13.362, 'eval_steps_per_second': 13.362, 'epoch': 1.0}
{'loss': 7.4222, 'learning_rate': 1.6666666666666667e-05, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.8238205909729, 'eval_runtime': 0.077, 'eval_samples_per_second': 12.991, 'eval_steps_per_second': 12.991, 'epoch': 2.0}
{'loss': 7.7933, 'learning_rate': 0.0, 'epoch': 3.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 4.782866954803467, 'eval_runtime': 0.0904, 'eval_samples_per_second': 11.063, 'eval_steps_per_second': 11.063, 'epoch': 3.0}
{'train_runtime': 65.4265, 'train_samples_per_second': 0.413, 'train_steps_per_second': 0.092, 'train_loss': 8.263354142506918, 'epoch': 3.0}
