In [2]:
import torch

Hugging Face Datasets library

https://huggingface.co/docs/datasets/v2.14.4/en/index

In [3]:
from datasets import load_dataset

# Passing split="train" makes load_dataset return a single Dataset object;
# without it the result is a DatasetDict.
encoded_dataset = load_dataset(
    "json",
    data_files="Dataset/Encoded_Ecommerce_FAQ_Chatbot_dataset.json",
    split="train",
)

Found cached dataset json (C:/Users/User/.cache/huggingface/datasets/json/default-b1a5f79678292222/0.0.0/8bb11242116d547c741b2e8a1f18598ffdd40a1d4f2a2872c7a28b697434bc96)


In [4]:
# Show the loaded dataset's columns and row count.
encoded_dataset

Dataset({
    features: ['questions', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 110
})

In [5]:
# Check which dataset class load_dataset returned.
type(encoded_dataset)

datasets.arrow_dataset.Dataset

In [6]:
# Hold out 15% of the rows for evaluation and train on the remaining 85%.
# The fixed seed keeps the shuffle, and therefore the split, identical across
# kernel restarts so that training and evaluation results are reproducible.
splited_dataset = encoded_dataset.train_test_split(test_size=0.15, seed=42)

In [7]:
# Inspect the sizes of the resulting train and test splits.
splited_dataset

DatasetDict({
    train: Dataset({
        features: ['questions', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 93
    })
    test: Dataset({
        features: ['questions', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 17
    })
})

In [8]:
# with open("Dataset/Encoded_Ecommerce_FAQ_Chatbot_dataset.json", "r", encoding='utf-8') as json_file:
#     encoded_dataset = json.load(json_file)

# # Convert python list -> numpy array -> pytorch tensor
# for key in ['input_ids', 'attention_mask', 'labels']:
#     encoded_dataset[key] = torch.tensor(np.array(encoded_dataset[key]))

In [9]:
# encoded_dataset['train']

Hugging Face Transformers library documentation

https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.Trainer

DialoGPT model

DialoGPT (from Microsoft Research) released with the paper DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.

source : https://arxiv.org/abs/1911.00536

Microsoft github : https://github.com/microsoft/DialoGPT#retraining-full-models

In [10]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorWithPadding

In [11]:
# Checkpoint to fine-tune; the commented-out names are smaller alternatives.
model_name = "microsoft/DialoGPT-large"
# model_name = "microsoft/DialoGPT-medium"
# model_name = "microsoft/DialoGPT-small"
# Load the tokenizer and the causal language model for the chosen checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [12]:
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): presumably needed because the pretrained tokenizer defines no pad token - confirm.
tokenizer.pad_token = tokenizer.eos_token

In [13]:
# Read the tokenizer's maximum length for a single sequence and print it.
max_token_limit = tokenizer.max_len_single_sentence
print(f"Maximum token limit for single sentence: {max_token_limit}")

Maximum token limit for single sentence: 1024


Hugging Face evaluate library

https://huggingface.co/docs/evaluate/index

BLEU score: https://huggingface.co/spaces/evaluate-metric/bleu

The BLEU metric in `evaluate` expects:
predictions (list of strs): Translations to score.
references (list of lists of strs): references for each translation.

In [14]:
import evaluate

def compute_metrics(eval_pred):
    """Compute the BLEU score between decoded predictions and decoded labels.

    BLEU compares text, so the token ids are decoded back to strings with the
    notebook's ``tokenizer`` before scoring. Calling ``str()`` on the raw
    arrays would make the metric compare printed numbers instead of words.

    Args:
        eval_pred: Pair ``(predictions, labels)``. ``predictions`` is either
            an array of logits with a trailing vocabulary axis, an array of
            token ids, or a tuple whose first item is one of those.
            ``labels`` holds the reference token ids.
            Assumes both are padded 2-D ``(batch, sequence)`` arrays (plus the
            vocabulary axis for logits) - TODO confirm against the dataset.

    Returns:
        dict: Whatever ``bleu.compute`` returns for the decoded texts.
    """
    import numpy as np

    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the first item.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Logits carry one extra (vocabulary) axis: reduce them to token ids.
    if predictions.ndim == labels.ndim + 1:
        predictions = predictions.argmax(axis=-1)

    # A causal LM predicts token t+1 at position t, so align both sequences.
    predictions = predictions[:, :-1]
    labels = labels[:, 1:]

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = tokenizer.eos_token_id

    # Positions whose target is ignored (-100) or padding carry no reference
    # text; blank them in both arrays so they are dropped while decoding.
    ignored = (labels == -100) | (labels == pad_id)
    labels = np.where(ignored, pad_id, labels)
    predictions = np.where(ignored, pad_id, predictions)

    decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # BLEU expects one string per prediction and a list of reference strings
    # for each prediction.
    predictions = [text.strip() for text in decoded_predictions]
    references = [[text.strip()] for text in decoded_labels]

    bleu = evaluate.load("bleu")
    return bleu.compute(predictions=predictions, references=references)

In [14]:
# training_args = TrainingArguments(
#     output_dir="./output",
#     overwrite_output_dir=True,
#     num_train_epochs=5,
#     per_device_train_batch_size=1,
#     save_steps=1000,
#     save_total_limit=2,
#     optim="adamw_torch"
# )

In [15]:
training_args = TrainingArguments(
    # --- Output and checkpointing ---
    output_dir="./logs",  # Directory for checkpoints and logs
    overwrite_output_dir=True,  # Overwrite the output directory if it exists
    save_steps=100,  # Save a checkpoint every 100 steps
    save_total_limit=3,  # Maximum number of checkpoints to keep
    # --- Training schedule ---
    num_train_epochs=5,  # Several epochs because the dataset is small (110 rows)
    per_device_train_batch_size=2,  # Samples per training batch on each device
    # NOTE(review): 1e-4 is above the 1e-5 to 5e-5 range the original note
    # attributes to the DialoGPT paper - confirm this value is intended.
    learning_rate=1e-4,
    optim="adamw_torch",  # AdamW optimizer
    # warmup_steps=500,  # Warm-up steps for the learning rate (helps with stability)
    # weight_decay=0.01,  # Weight decay (L2 regularization) to limit overfitting
    # --- Evaluation and logging ---
    evaluation_strategy="steps",  # Evaluate on the eval dataset every eval_steps
    eval_steps=100,  # Evaluate every 100 steps
    logging_steps=25,  # Log training information every 25 steps
    # --- Best-model selection ---
    # Reload the best checkpoint when training ends; save_steps equals
    # eval_steps so every evaluation has a matching checkpoint.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",  # Metric used to pick the best checkpoint
)

In [16]:
# Padding collator built from the tokenizer.
# NOTE(review): the Trainer cell leaves its data_collator argument commented
# out, so this object currently appears to be unused - confirm.
data_collator_ = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# Inspect the training split.
splited_dataset['train']

Dataset({
    features: ['questions', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 93
})

In [18]:
# Keep a handle on the held-out "test" split and display it.
validation_set = splited_dataset['test']

validation_set

Dataset({
    features: ['questions', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 17
})

In [25]:
# Bundle the model, the training configuration and both dataset splits.
# The optional arguments at the bottom are intentionally left disabled.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=splited_dataset["train"],
    eval_dataset=splited_dataset["test"],  # the held-out split
    # tokenizer=tokenizer,
    # compute_metrics=compute_metrics,
    # data_collator = data_collator_
)

In [26]:
# Release unused cached GPU memory held by PyTorch before training starts.
torch.cuda.empty_cache()

In [21]:
# Report how many entries each model-input column holds in the training split.
for key in ('input_ids', 'attention_mask', 'labels'):
    column = splited_dataset['train'][key]
    print(f"{key} length : {len(column)}")

input_ids length : 93
attention_mask length : 93
labels length : 93


In [27]:
# Run fine-tuning; the returned TrainOutput reports the final step and metrics.
trainer.train()

  0%|          | 0/235 [00:00<?, ?it/s]

{'loss': 1.6248, 'learning_rate': 1e-05, 'epoch': 1.06}
{'loss': 0.8983, 'learning_rate': 2e-05, 'epoch': 2.13}
{'loss': 0.4606, 'learning_rate': 3e-05, 'epoch': 3.19}
{'loss': 0.3149, 'learning_rate': 4e-05, 'epoch': 4.26}
{'train_runtime': 5412.5213, 'train_samples_per_second': 0.086, 'train_steps_per_second': 0.043, 'train_loss': 0.7423461345916099, 'epoch': 5.0}


TrainOutput(global_step=235, training_loss=0.7423461345916099, metrics={'train_runtime': 5412.5213, 'train_samples_per_second': 0.086, 'train_steps_per_second': 0.043, 'train_loss': 0.7423461345916099, 'epoch': 5.0})

In [28]:
import pandas as pd

# Tabulate the entries the trainer logged during training.
pd.DataFrame(trainer.state.log_history)

Unnamed: 0,loss,learning_rate,epoch,step,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,1.6248,1e-05,1.06,50,,,,,
1,0.8983,2e-05,2.13,100,,,,,
2,0.4606,3e-05,3.19,150,,,,,
3,0.3149,4e-05,4.26,200,,,,,
4,,,5.0,235,5412.5213,0.086,0.043,1011922000000000.0,0.742346


In [29]:
# Save the fine-tuned weights together with the tokenizer in one directory so
# the result can be reloaded from that path alone, including the pad-token
# setting applied to the tokenizer earlier in the notebook.
output_model_dir = "fine_tuned_dialogpt_FAQ_Ecommerce_1"
model.save_pretrained(output_model_dir)
tokenizer.save_pretrained(output_model_dir)