# Import Toolkits

In [1]:
import pandas as pd
from transformers import T5Tokenizer , T5ForConditionalGeneration , Trainer , TrainingArguments

# Load Dataset

In [2]:
df = pd.read_csv("/content/domain_specific_chatbot_data.csv")
df.head()

Unnamed: 0,query,response,intent,domain
0,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
1,How can I schedule an appointment with my doctor?,You can schedule an appointment by calling our...,appointment booking,healthcare
2,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
3,How can I check my account balance?,You can check your balance by logging into you...,balance inquiry,finance
4,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   query     3000 non-null   object
 1   response  3000 non-null   object
 2   intent    3000 non-null   object
 3   domain    3000 non-null   object
dtypes: object(4)
memory usage: 93.9+ KB


# Data Pre-processing

In [4]:
from sklearn.model_selection import train_test_split

train_df , val_df = train_test_split(df , test_size = 0.2 , random_state = 42)

In [5]:
print(f"Train shape : {train_df.shape}")
print(f"Validation shape : {val_df.shape}")

Train shape : (2400, 4)
Validation shape : (600, 4)


In [6]:
train_df

Unnamed: 0,query,response,intent,domain
642,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
700,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
226,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
1697,How do I update my contact details on my account?,"To update your contact details, log into your ...",contact update,finance
1010,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
...,...,...,...,...
1638,Can I make changes to my loan repayment schedule?,Changes to your loan repayment schedule can be...,loan repayment adjustment,finance
1095,"I lost my credit card, what should I do?",Please contact our customer service immediatel...,lost card reporting,finance
1130,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
1294,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance


In [7]:
train_df = train_df.reset_index(drop = True)
val_df = val_df.reset_index(drop = True)
train_df

Unnamed: 0,query,response,intent,domain
0,What should I do if I miss a dose of my medica...,"If you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
1,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
2,What are the symptoms of flu?,"Flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
3,How do I update my contact details on my account?,"To update your contact details, log into your ...",contact update,finance
4,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
...,...,...,...,...
2395,Can I make changes to my loan repayment schedule?,Changes to your loan repayment schedule can be...,loan repayment adjustment,finance
2396,"I lost my credit card, what should I do?",Please contact our customer service immediatel...,lost card reporting,finance
2397,What are the side effects of the COVID-19 vacc...,Common side effects of the COVID-19 vaccine in...,side effects inquiry,healthcare
2398,What is the interest rate for a personal loan?,The current interest rate for personal loans i...,loan inquiry,finance


In [8]:
import re

def clean_text(text):
  text = re.sub(r"\r\n" , " " , text)
  text = re.sub(r"\s+" , " " , text)
  text = re.sub(r"<.*?>" , "" , text)
  text = text.strip().lower()
  return text

In [9]:
train_df['query'] = train_df['query'].apply(clean_text)
train_df['response'] = train_df['response'].apply(clean_text)

val_df['query'] = val_df['query'].apply(clean_text)
val_df['response'] = val_df['response'].apply(clean_text)

train_df

Unnamed: 0,query,response,intent,domain
0,what should i do if i miss a dose of my medica...,"if you miss a dose, take it as soon as you rem...",medication inquiry,healthcare
1,what are the side effects of the covid-19 vacc...,common side effects of the covid-19 vaccine in...,side effects inquiry,healthcare
2,what are the symptoms of flu?,"flu symptoms include fever, cough, sore throat...",flu symptoms inquiry,healthcare
3,how do i update my contact details on my account?,"to update your contact details, log into your ...",contact update,finance
4,what are the side effects of the covid-19 vacc...,common side effects of the covid-19 vaccine in...,side effects inquiry,healthcare
...,...,...,...,...
2395,can i make changes to my loan repayment schedule?,changes to your loan repayment schedule can be...,loan repayment adjustment,finance
2396,"i lost my credit card, what should i do?",please contact our customer service immediatel...,lost card reporting,finance
2397,what are the side effects of the covid-19 vacc...,common side effects of the covid-19 vaccine in...,side effects inquiry,healthcare
2398,what is the interest rate for a personal loan?,the current interest rate for personal loans i...,loan inquiry,finance


# Tokenization

In [10]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [16]:
def tokenize_func(examples):
  inputs = tokenizer(
      examples['query'],
      max_length = 250,
      truncation = True,
      padding = "max_length"
  )
  target = tokenizer(
      examples['response'],
      max_length = 250,
      truncation = True,
      padding = "max_length"
  )

  inputs['labels'] = target['input_ids']

  return inputs

In [20]:
train_df = train_df.apply(tokenize_func , axis = 1)
val_df=val_df.apply(tokenize_func , axis = 1)

In [22]:
train_df[0]

{'input_ids': [125, 225, 3, 23, 103, 3, 99, 3, 23, 3041, 3, 9, 6742, 13, 82, 7757, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

# Fine-Tuning Model

In [24]:
model = T5ForConditionalGeneration.from_pretrained("t5-base")

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=6,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_steps=500,
    eval_steps=50
)

trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_df,
    eval_dataset = val_df
)

trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mabdelrhmanessam829[0m ([33mabdelrhmanessam829-a-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.


Step,Training Loss
50,13.2423
100,7.4263
150,1.4396
200,0.1834
250,0.0697
300,0.0283
350,0.0139
400,0.0072
450,0.0046
500,0.0032


TrainOutput(global_step=1800, training_loss=0.6231648543213184, metrics={'train_runtime': 2135.4908, 'train_samples_per_second': 6.743, 'train_steps_per_second': 0.843, 'total_flos': 4281735168000000.0, 'train_loss': 0.6231648543213184, 'epoch': 6.0})

# Save and Load Model

In [25]:
model.save_pretrained("./chatbot_model")
tokenizer.save_pretrained("./chatbot_model")

loaded_model = T5ForConditionalGeneration.from_pretrained("./chatbot_model")
loaded_tokenizer = T5Tokenizer.from_pretrained("./chatbot_model")

# Chatbot System

In [27]:
device = model.device

def chatbot(query):
  query = clean_text(query)
  input_ids = tokenizer(query , return_tensors = "pt" , max_length = 250 , truncation = True)

  inputs = {key : value.to(device) for key , value in input_ids.items()}

  outputs = model.generate(
      input_ids = inputs['input_ids'],
      max_length = 250,
      num_beams = 5,
      early_stopping = True
  )
  return tokenizer.decode(outputs[0] , skip_special_tokens = True)

while True:
  user_input = input("You: ")
  if user_input.lower() == "exit":
    break
  response = chatbot(user_input)
  print(f"Chatbot: {response}")

You: how to login to the system?
Chatbot: to log into the system, go to the system and go to the login page.
You: how to find setting option?
Chatbot: .
You: how to find setting option?
Chatbot: you can set setting option by logging into your account or using our mobile app.
You: where to find setting option?
Chatbot: setting option available on our site .
You: exit
