## Self Learning Chatbot - Generative

task : Retrain dialoGPT

In [1]:
import json
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

In [2]:
import torch

torch.cuda.is_available()

print(torch.__version__)

1.13.1+cpu


dialoGPT model

DialoGPT (from Microsoft Research) released with the paper DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.

source : https://arxiv.org/abs/1911.00536

Microsoft github : https://github.com/microsoft/DialoGPT#retraining-full-models

Initialize pre trained Model

In [3]:
# model_name = "microsoft/DialoGPT-large"
model_name = "microsoft/DialoGPT-medium"
# model_name = "microsoft/DialoGPT-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

In [4]:

tokenizer.add_special_tokens({'pad_token': '[PAD]'})

1

Data Preprocessing

In [5]:
def load_intent_data(file_path):
    # load intent based json format dataset

    with open(file_path, "r", encoding='utf-8') as json_file:
            data = json.load(json_file)
    return data['intents']


In [6]:
intent_data = load_intent_data('Dataset/intents_it.json')

In [7]:
dataset = Dataset.from_dict({
    'dialog': [intent['patterns'] for intent in intent_data],
    'response': [intent['responses'][0] for intent in intent_data]  # Using the first response as the target
})

In [8]:
def encode_data(data):

    dialog_responses = [
        f"User: {' '.join(dialog)}\nAssistant: {response}" 
        for dialog, response in zip(data['dialog'], data['response'])
    ]

    encoded_inputs = tokenizer(dialog_responses)
    
    # encoded_inputs = tokenizer(
    #     dialog_responses,
    #     padding='max_length',
    #     truncation=True,
    #     max_length=512,
    #     return_tensors='pt'
    # )
    
    return {
        'input_ids': encoded_inputs['input_ids'],
        'attention_mask': encoded_inputs['attention_mask'],
        'labels': encoded_inputs['input_ids']
    }

In [9]:
encoded_dataset = dataset.map(encode_data, batched=True, batch_size=8)

Map:   0%|          | 0/18 [00:00<?, ? examples/s]

In [10]:
print(encoded_dataset)

Dataset({
    features: ['dialog', 'response', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 18
})


In [11]:
print(len(encoded_dataset['dialog']))

print(encoded_dataset['response'])

18
['Hello, Welcome to Our IT Chatbot', 'See you!', 'Happy to help! Any other issues?', "Sorry, can't understand you", 'I can guide you through\n 1)Password Reset\n2) Trouble-Shooting issues\n3) Virus Issues\n4) Printing Issues\n5) many more IT issues', 'Confirm your email address', 'The reset pin has been sent on your registered mobile number', 'Simply rebooting can fix the blue screen of death (or STOP error, as it is otherwise known).', 'The first step is to check the recycle bin. If that fails, you can contact your IT Support partner.', 'However, all your work is not definitely lost. If you have Auto-Recover options enabled in Microsoft Office, then there are some easy steps to recover your work.\nIf not, you can also search for Word backup files by clicking “open”, “computer” and then browsing to the folder where the file was last saved.\nYou may also be able to find your file by performing a search on your computer for temporary files with a .tmp file extension or a ~ prefix.', '

In [12]:
# def load_and_preprocess_data(file_path):

#     with open(file_path, "r") as json_file:
#         data = json.load(json_file)

#     conversations = []
#     processed_dialogues = []

#     for intent in data['intents']:

#         for pattern, response in zip(intent['patterns'], intent['responses']):

#             # make it pattern|response pair
#             conversation = f"User: {pattern}|Bot: {response}"
#             conversations.append(conversation)

#     print(conversations)

#     # Perform Tokenization and preprocess each dialogue in the conversations
#     for dialogue in conversations:

#         user_input, bot_response = dialogue.split('|')

#         # Tokenize user input and bot response separately
#         user_input_tokens = tokenizer.encode(user_input.strip(), add_special_tokens=False)
#         bot_response_tokens = tokenizer.encode(bot_response.strip(), add_special_tokens=False)

#         processed_dialogues.append((user_input_tokens, bot_response_tokens))

#     return processed_dialogues

In [13]:
# processed_data = load_and_preprocess_data(file_path="Dataset\intents_1.json")

Hugging Face Transformer Library documentation

https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.Trainer

 Data format : input_tokens_n [SEP] [MASK] [target_token]

In [14]:
# Data collator
# batching and processing the training data for model

# data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [15]:
# # Define training arguments
# training_args = TrainingArguments(
#     output_dir="./output",   # output directory
#     num_train_epochs=10,             # total number of training epochs
#     per_device_train_batch_size=16,  # batch size per device during training
#     per_device_eval_batch_size=64,   # batch size for evaluation
#     warmup_steps=500,                # number of warmup steps for learning rate scheduler
#     weight_decay=0.01,               # strength of weight decay
#     logging_dir=None,                # directory for storing logs
#     fp16=True                        # use floating point 16 bit precision for training
#     gradient_accumulation_steps=8,
#     logging_steps=100,
# )

In [24]:
# Training param

training_args = TrainingArguments(
    output_dir="./output",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=1,
    save_steps=1000,
    save_total_limit=2,
    optim="adamw_torch"
)

In [20]:
print(encoded_dataset)

Dataset({
    features: ['dialog', 'response', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 18
})


In [25]:
# Initialize Trainer

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=encoded_dataset
)

In [19]:
print("Encoded Dataset type:", type(encoded_dataset))

Encoded Dataset type: <class 'datasets.arrow_dataset.Dataset'>


In [26]:
input_ids_length = len(encoded_dataset['input_ids'])
attention_mask_length = len(encoded_dataset['attention_mask'])
labels_length = len(encoded_dataset['labels'])

print("Input IDs Length:", input_ids_length)
print("Attention Mask Length:", attention_mask_length)
print("Labels Length:", labels_length)

Input IDs Length: 18
Attention Mask Length: 18
Labels Length: 18


In [27]:
trainer.train()

  0%|          | 0/90 [00:00<?, ?it/s]

{'train_runtime': 212.3662, 'train_samples_per_second': 0.424, 'train_steps_per_second': 0.424, 'train_loss': 2.4217152913411457, 'epoch': 5.0}


TrainOutput(global_step=90, training_loss=2.4217152913411457, metrics={'train_runtime': 212.3662, 'train_samples_per_second': 0.424, 'train_steps_per_second': 0.424, 'train_loss': 2.4217152913411457, 'epoch': 5.0})

In [22]:
model.save_pretrained("tuned_dialogpt")
tokenizer.save_pretrained("tokenizer")

('tokenizer\\tokenizer_config.json',
 'tokenizer\\special_tokens_map.json',
 'tokenizer\\vocab.json',
 'tokenizer\\merges.txt',
 'tokenizer\\added_tokens.json',
 'tokenizer\\tokenizer.json')