# Importing Libraries & Dataset

In [None]:
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

In [None]:
# Location of the training CSV. This is a Colab-style absolute path; change this
# one constant when running the notebook in a different environment.
DATA_PATH = '/content/domain_specific_chatbot_data.csv'

df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message if the columns used by the later preprocessing
# and tokenization cells are absent.
missing_columns = {'query', 'response'} - set(df.columns)
if missing_columns:
    raise KeyError(f'{DATA_PATH} is missing required column(s): {sorted(missing_columns)}')

df.head()

# Data Splitting & Training

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)
(train_df.shape, val_df.shape)

In [None]:
# Give both splits a fresh 0..n-1 index, discarding the row labels inherited from `df`.
train_data, val_data = (
    split.reset_index(drop=True) for split in (train_df, val_df)
)
val_data.head()

# Preprocessing Text

In [None]:
import re

def preprocess_text(text):
    """Normalize one raw query/response string before tokenization.

    Steps, in order:
      1. Missing values (``None`` or float NaN, as pandas yields for empty
         CSV cells) become the empty string instead of raising ``TypeError``.
      2. Anything of the form ``<...>`` on a single line is removed.
      3. Every run of whitespace, including ``\\r\\n``, ``\\n`` and ``\\r``, is
         collapsed to one space so words on adjacent lines are not fused.
      4. Leading/trailing whitespace is stripped and the text is lower-cased.

    Args:
        text: The raw text; non-string values are converted with ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # NaN is the only value that is not equal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    text = re.sub(r'<.*?>', '', text)
    # Replace line breaks and repeated whitespace with a single space rather
    # than deleting them, which would glue neighbouring words together.
    text = re.sub(r'\s+', ' ', text)
    return text.strip().lower()

# Clean both text columns of both splits with preprocess_text, overwriting the
# columns in place (training split first, then validation).
for frame in (train_data, val_data):
    for column in ('query', 'response'):
        frame[column] = frame[column].apply(preprocess_text)

train_data.head()

# Importing Model & Tokenizing

In [None]:
# Load the tokenizer from the same checkpoint as the model that is fine-tuned
# later in this notebook ('t5-small'), so tokenizer and model come from one source.
tokenizer = T5Tokenizer.from_pretrained('t5-small')

Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
def tokenization(demo, max_length=100):
    """Tokenize one query/response row into model inputs with labels.

    Args:
        demo: A mapping (e.g. a DataFrame row) with 'query' and 'response' text.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The tokenizer output for the query, with an added 'labels' entry holding
        the response token ids. Padding positions in the labels are set to -100
        so that the training loss skips them.
    """
    inputs = tokenizer(demo['query'], padding= 'max_length', truncation= True, max_length= max_length)
    targets = tokenizer(demo['response'], padding= 'max_length', truncation= True, max_length= max_length)

    # -100 is the label value the loss ignores; leaving the pad id in place would
    # train the model on the padding positions as well.
    pad_id = tokenizer.pad_token_id
    inputs['labels'] = [
        token_id if token_id != pad_id else -100
        for token_id in targets['input_ids']
    ]
    return inputs

# Tokenize every row of each split; each result is a Series with one encoded
# example per row.
train_dataset, val_dataset = (
    frame.apply(tokenization, axis=1) for frame in (train_data, val_data)
)

In [None]:
# Show the cleaned response text of the first training row.
train_data.iloc[0]['response']

In [None]:
# Show the tokenized form (input ids, attention mask, labels) of the first training row.
train_dataset.iat[0]

{'input_ids': [125, 225, 3, 23, 103, 3, 99, 3, 23, 3041, 3, 9, 6742, 13, 82, 7757, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [3, 99, 25, 3041, 3, 9, 6742, 6, 240, 34, 38, 1116, 38, 25, 1423, 3, 3227, 34, 31, 7, 966, 97, 21, 39, 416, 6742, 5, 3, 99, 25, 22, 60, 3, 20305, 6, 574, 39, 4640, 3175, 5, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Training the Model

In [None]:
# Start from the pretrained 't5-small' checkpoint; this is the model that gets fine-tuned.
model = T5ForConditionalGeneration.from_pretrained('t5-small')

training_args = TrainingArguments(
    output_dir= './results',            # training outputs are written here
    num_train_epochs= 5,
    per_device_train_batch_size= 8,
    per_device_eval_batch_size= 8,
    warmup_steps= 50,                   # learning-rate warmup steps
    weight_decay= 0.01,
    logging_dir= './logs',
    logging_steps= 10,                  # log every 10 steps
    # Evaluate once per epoch. `eval_steps` was removed because it only applies
    # when eval_strategy='steps' and was misleading here.
    eval_strategy= 'epoch',
)

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Wire the model, the training configuration and both tokenized splits into a
# Trainer, then run fine-tuning.
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

trainer.train()

  | |_| | '_ \/ _` / _` |  _/ -_)


<IPython.core.display.Javascript object>

# Running the Model

In [None]:
# Write the fine-tuned model and the tokenizer files into the ./chatbot_model directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./chatbot_model')

# Load both back from that directory so the chatbot runs from the saved copy.
tokenizer = T5Tokenizer.from_pretrained('./chatbot_model')
model = T5ForConditionalGeneration.from_pretrained('./chatbot_model')

# ChatBot Function

In [None]:
device = model.device # Device holding the model; input tensors must be moved onto it.

def chatbot(query):
    """Generate the model's reply to a single user query.

    Args:
        query: Raw user text; it is cleaned with preprocess_text before tokenizing.

    Returns:
        The decoded reply string with special tokens removed.
    """
    query = preprocess_text(query)
    # Convert the query into tensors of token ids plus an attention mask (max 100 tokens).
    encoded = tokenizer(query, truncation= True, return_tensors= 'pt', max_length= 100)
    # Move every tensor to the model's device.
    inputs = {key : value.to(device) for key, value in encoded.items()}

    # Pass the device-resident tensors (ids and attention mask) to generate; the
    # un-moved encoding must not be used here or the device move above is wasted.
    outputs = model.generate(
        **inputs,
        max_length= 100,
        num_beams= 5, # beam search keeping 5 candidate continuations per step
    )

    return tokenizer.decode(outputs[0], skip_special_tokens= True) # token ids back to text

# Interactive chat loop: type 'quit' (any casing, surrounding spaces allowed) to stop.
while True:
    try:
        user_inps = input('You: ')
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the cell was interrupted: leave the loop cleanly
        # instead of surfacing a traceback.
        break

    user_inps = user_inps.strip()
    if user_inps.lower() == 'quit':
        break
    if not user_inps:
        # Nothing typed; ask again rather than sending an empty query to the model.
        continue

    response = chatbot(user_inps)
    print('Bot:', response)