In [12]:
import numpy as np

import torch

Hugging Face Datasets library

https://huggingface.co/docs/datasets/v2.14.4/en/index

In [30]:
from datasets import load_dataset              

# Passing split="train" returns a single Dataset object; without it, load_dataset returns a DatasetDict
encoded_dataset = load_dataset('json', split='train', data_files="Dataset/Encoded_Ecommerce_FAQ_Chatbot_dataset.json")

Found cached dataset json (C:/Users/User/.cache/huggingface/datasets/json/default-b1a5f79678292222/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


In [31]:
encoded_dataset

Dataset({
    features: ['questions', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 110
})

In [32]:
type(encoded_dataset)

datasets.arrow_dataset.Dataset

In [51]:
# Train / evaluation split: hold out 15% of the rows as the 'test' split.
# A fixed seed keeps the split identical across kernel restarts so runs are comparable.
splited_dataset = encoded_dataset.train_test_split(test_size=0.15, seed=42)

In [2]:
# with open("Dataset/Encoded_Ecommerce_FAQ_Chatbot_dataset.json", "r", encoding='utf-8') as json_file:
#     encoded_dataset = json.load(json_file)

# # Convert python list -> numpy array -> pytorch tensor
# for key in ['input_ids', 'attention_mask', 'labels']:
#     encoded_dataset[key] = torch.tensor(np.array(encoded_dataset[key]))

In [9]:
# encoded_dataset['train']

Dataset({
    features: ['questions', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 110
})

Hugging Face Transformers library documentation

https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.Trainer

DialoGPT model

DialoGPT (from Microsoft Research) released with the paper DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.

source : https://arxiv.org/abs/1911.00536

Microsoft github : https://github.com/microsoft/DialoGPT#retraining-full-models

In [36]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding

In [37]:
# Checkpoint to fine-tune; swap in one of the commented alternatives below to use a smaller variant
model_name = "microsoft/DialoGPT-large"
# model_name = "microsoft/DialoGPT-medium"
# model_name = "microsoft/DialoGPT-small"
# Load the tokenizer and the causal-LM model from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [61]:
# Reuse the end-of-sequence token as the padding token
# (presumably this tokenizer ships without a pad token of its own — confirm)
tokenizer.pad_token = tokenizer.eos_token

In [38]:
# Read the tokenizer's single-sentence length limit and print it for reference
max_token_limit = tokenizer.max_len_single_sentence
print(f"Maximum token limit for single sentence: {max_token_limit}")

Maximum token limit for single sentence: 1024


Hugging Face evaluate library

https://huggingface.co/docs/evaluate/index

In [39]:
import evaluate

# Candidate metrics for generated text: perplexity, BLEU, ROUGE. Only BLEU is loaded here.
metric = evaluate.load("bleu")

def compute_metrics(eval_pred):
    """Compute BLEU between greedy-decoded model output and the reference text.

    BLEU compares strings, so both the model output and the labels are
    converted from token ids back to text before scoring.

    Args:
        eval_pred: ``(predictions, labels)`` pair as passed by ``Trainer``.
            ``predictions`` is expected to be the logits array (or a
            tuple/list whose first element is the logits); ``labels`` is the
            array of reference token ids, where ``-100`` marks positions to
            ignore.

    Returns:
        dict: Whatever ``metric.compute`` returns for the decoded texts.
    """
    predictions, labels = eval_pred

    # If the model returned several outputs, the logits are taken to be the first one.
    if isinstance(predictions, (tuple, list)):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits (batch, seq, vocab) -> most likely token id at each position.
    if predictions.ndim == 3:
        predictions = predictions.argmax(axis=-1)

    # Causal LM alignment: the output at position i is the prediction for token i + 1.
    pred_ids = predictions[:, :-1]
    label_ids = labels[:, 1:]

    # Blank out ignored positions on both sides so they are dropped when decoding.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    keep = label_ids != -100
    pred_ids = np.where(keep, pred_ids, pad_id)
    label_ids = np.where(keep, label_ids, pad_id)

    decoded_preds = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(label_ids, skip_special_tokens=True)

    # One reference per prediction, wrapped in a list as the BLEU metric expects.
    references = [[text] for text in decoded_labels]
    return metric.compute(predictions=decoded_preds, references=references)

In [20]:
# training_args = TrainingArguments(
#     output_dir="./output",
#     overwrite_output_dir=True,
#     num_train_epochs=5,
#     per_device_train_batch_size=1,
#     save_steps=1000,
#     save_total_limit=2,
#     optim="adamw_torch"
# )

In [40]:
# Number of mini-batches whose gradients are accumulated before each optimizer step
# (the backward pass still runs on every mini-batch; only the parameter update is deferred)
gradient_accumulation_steps = 1

# Base logging interval, in steps
logging_steps = 100

# Scale the logging interval by the accumulation factor (a no-op while the factor is 1)
# NOTE(review): confirm whether Trainer already counts logging_steps in optimizer updates; if so this division is unnecessary
logging_steps = logging_steps // gradient_accumulation_steps

In [41]:
# The whole run is only 235 optimizer steps (see the trainer.train() progress bar),
# so step-based schedules of 500 would never trigger: no evaluation, no checkpoint,
# and a learning rate that never leaves warm-up. Schedules are therefore tied to
# epochs and to a fraction of the total steps instead.
training_args = TrainingArguments(
    output_dir="./logs", # Where checkpoints and logs are written
    overwrite_output_dir=True, # Overwrite the output directory if it exists
    num_train_epochs=5, # Number of passes over the training split
    per_device_train_batch_size=2, # Samples per batch on each device
    gradient_accumulation_steps=gradient_accumulation_steps, # Mini-batches accumulated per optimizer step
    evaluation_strategy="epoch", # Evaluate on the held-out split at the end of every epoch
    save_strategy="epoch", # Checkpoint on the same schedule as evaluation (required by load_best_model_at_end)
    save_total_limit=3, # Maximum number of checkpoints to keep on disk
    logging_steps=logging_steps, # Log training information every `logging_steps` steps
    learning_rate=1e-4, # Peak learning rate. NOTE(review): above the 1e-5 to 5e-5 range previously cited from the DialoGPT paper — confirm this is intended
    warmup_ratio=0.1, # Warm up over the first 10% of total steps, whatever the run length
    weight_decay=0.01, # Weight decay applied by the optimizer
    load_best_model_at_end=True, # Reload the best checkpoint after training; needs matching save/evaluation strategies
    metric_for_best_model="eval_loss", # Metric used to pick the best checkpoint (lower is better)
    optim="adamw_torch" # Optimizer: PyTorch AdamW
)

In [42]:
# Collator that pads each batch using the tokenizer
# NOTE(review): presumably only the tokenizer's own fields (input_ids / attention_mask) get padded;
# confirm that 'labels' already share a fixed length, otherwise batching may fail
data_collator_ = DataCollatorWithPadding(tokenizer=tokenizer)

In [53]:
splited_dataset['train']

Dataset({
    features: ['questions', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 93
})

In [62]:
# Bundle the model, its training configuration, both data splits, the padding
# collator and the metric callback into a single Trainer instance.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator_,
    train_dataset=splited_dataset["train"],
    eval_dataset=splited_dataset["test"],
    compute_metrics=compute_metrics,
)

In [55]:
# Clear PyTorch's CUDA cache before training starts
torch.cuda.empty_cache()

In [57]:
# Sanity check: print how many entries each model-input column has in the training split.
# Note this is the row count per column (93 for each, per the output below), not the token length of any example.
for key in ['input_ids', 'attention_mask', 'labels']:
    print(f"{key} length : {len(splited_dataset['train'][key])}")

input_ids length : 93
attention_mask length : 93
labels length : 93


In [63]:
trainer.train()

  0%|          | 0/235 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
import pandas as pd

pd.DataFrame(trainer.state.log_history)

In [None]:
# Save the fine-tuned weights and the tokenizer into the same directory so the
# result can be reloaded from that one path with from_pretrained().
fine_tuned_dir = "fine_tuned_dialogpt_FAQ_Ecommerce"
model.save_pretrained(fine_tuned_dir)
tokenizer.save_pretrained(fine_tuned_dir)