# 1 - Install Required Libraries

In [None]:
!pip install -q pandas numpy scikit-learn nltk

In [None]:
!pip install -U transformers accelerate



# 2 - Upload Dataset

In [None]:
# Colab-only: prompt for a local file to upload. The keys of `uploaded` are the
# uploaded file names (the loading cell reads the first key).
from google.colab import files
uploaded = files.upload()

Saving chatbot.csv.txt to chatbot.csv (3).txt


In [None]:
import os
# List the working directory to confirm the upload landed; repeated uploads of
# the same file are saved under a " (n)" suffix rather than overwritten.
os.listdir()

['.config',
 'chatbot.csv (1).txt',
 'chatbot.csv (2).txt',
 'chatbot.csv (3).txt',
 'chatbot.csv.txt',
 'sample_data']

# 3 - Load and Clean the Dataset

In [None]:
import pandas as pd
import re

# Pick the first uploaded file, e.g. 'chatbot.csv.txt'
filename = next(iter(uploaded), None)
if filename is None:
    raise ValueError("No file was uploaded - run the upload cell first.")


def _read_pairs(path, sep):
    """Read a headerless two-column text file into an input/response DataFrame.

    Lines that cannot be parsed with the given separator are skipped.
    """
    return pd.read_csv(path, sep=sep, names=['input', 'response'], on_bad_lines='skip')


# Try tab-separated first, else comma.
# A comma-separated file does NOT make the tab parse raise: each whole line
# simply lands in 'input' and 'response' is NaN everywhere. Detect that case
# explicitly instead of relying on an exception that never fires.
try:
    df = _read_pairs(filename, '\t')
except pd.errors.ParserError:
    df = None

if df is None or df['response'].isna().all():
    df = _read_pairs(filename, ',')

print("Raw dataset size:", len(df))
df.head()

Raw dataset size: 4856


Unnamed: 0,input,response
0,hi,Hello
1,who are you?,My name is Jarvis
2,What do you do?,answer you question.
3,"hi, how are you doing?",i'm fine. how about yourself?
4,i'm fine. how about yourself?,i'm pretty good. thanks for asking.


## Clean the Text

In [None]:
def clean_text(text):
    """Normalise one utterance for matching and training.

    The argument is converted with str() and lower-cased. Every run of
    characters other than letters, digits and ? . ! , ' is then replaced by a
    single space, and surrounding whitespace is stripped from the result.
    """
    lowered = str(text).lower()
    # Anything outside the allowed set (including whitespace) collapses to one space
    collapsed = re.sub(r"[^a-zA-Z0-9?.!,']+", " ", lowered)
    return collapsed.strip()

# Drop NaNs BEFORE cleaning: clean_text() calls str() on its argument, so a
# missing value would become the literal text "nan" and a later dropna()
# would no longer find anything to remove.
df = df.dropna().copy()

df['input'] = df['input'].apply(clean_text)
df['response'] = df['response'].apply(clean_text)

# Drop pairs where either side is empty after cleaning
df = df[(df['input'] != "") & (df['response'] != "")].reset_index(drop=True)

print("After cleaning:", len(df))
df.head(10)

After cleaning: 4856


Unnamed: 0,input,response
0,hi,hello
1,who are you?,my name is jarvis
2,what do you do?,answer you question.
3,"hi, how are you doing?",i'm fine. how about yourself?
4,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
5,i'm pretty good. thanks for asking.,no problem. so how have you been?
6,no problem. so how have you been?,i've been great. what about you?
7,i've been great. what about you?,i've been good. i'm in school right now.
8,i've been good. i'm in school right now.,what school do you go to?
9,what school do you go to?,i go to pcc.


# 4 – Split into train & validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the pairs for validation; the fixed random_state makes the split repeatable.
# NOTE(review): in the preview above each row's response is the next row's input,
# so a random row-level split presumably puts sentences from validation pairs
# into the training set as well - confirm whether that leakage matters here.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

print("Train size:", len(train_df))
print("Val size:", len(val_df))

Train size: 3884
Val size: 972


# 5 – Baseline model: TF-IDF + cosine similarity

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Fit the TF-IDF vocabulary on the training inputs only; X_train is the matrix
# that every incoming query is compared against.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df['input'])

def baseline_reply(user_input, fallback="sorry, i don't understand that yet."):
    """Return the stored response of the training input most similar to `user_input`.

    The query is cleaned with clean_text(), vectorised with the fitted TF-IDF
    vectorizer and compared to every training input by cosine similarity.

    Parameters
    ----------
    user_input : str
        Raw text typed by the user.
    fallback : str, optional
        Reply used when the query shares no vocabulary with the training
        inputs (for example an empty string or only unseen words).

    Returns
    -------
    str
        The response paired with the best-matching training input, or
        `fallback` when nothing matches at all.
    """
    user_input_clean = clean_text(user_input)
    user_vec = vectorizer.transform([user_input_clean])
    # One query row -> take the single row of similarity scores
    scores = cosine_similarity(user_vec, X_train)[0]
    best_idx = int(scores.argmax())
    # If every score is 0, argmax returns 0 and the first training row would
    # be returned even though it has nothing in common with the query.
    if scores[best_idx] <= 0.0:
        return fallback
    return train_df.iloc[best_idx]['response']

### Quick test:

In [None]:
# Sanity check of the retrieval pipeline on the first five TRAINING rows.
# These inputs are the same rows the TF-IDF matrix was built from, so each
# query can match itself; matching predictions here show the lookup works,
# not that the baseline generalises.
for i in range(5):
    print("Input:    ", train_df.iloc[i]['input'])
    print("Expected: ", train_df.iloc[i]['response'])
    print("Predicted:", baseline_reply(train_df.iloc[i]['input']))
    print("-"*60)

Input:     why is a 90 a b?
Expected:  i'm trying to challenge you guys.
Predicted: i'm trying to challenge you guys.
------------------------------------------------------------
Input:     have you attended school today?
Expected:  i attended school today. did you?
Predicted: i attended school today. did you?
------------------------------------------------------------
Input:     is it easy to learn?
Expected:  yes, it will only take about 30 minutes.
Predicted: yes, it will only take about 30 minutes.
------------------------------------------------------------
Input:     oh, so you took an art class?
Expected:  yeah, i loved that class.
Predicted: yeah, i loved that class.
------------------------------------------------------------
Input:     yes, there is.
Expected:  i feel safe here.
Predicted: i feel safe here.
------------------------------------------------------------


### You can also test interactively:

In [None]:
# Simple REPL for the retrieval baseline; type exit / quit / bye to stop.
while True:
    # Strip so that "bye " or " quit" still end the session
    user = input("You (baseline): ").strip()
    if not user:
        # An empty query matches nothing; skip it rather than answer at random
        continue
    if user.lower() in ['exit', 'quit', 'bye']:
        print("Bot: Goodbye 👋")
        break
    print("Bot:", baseline_reply(user))

You (baseline): i am angry
Bot: you should go to bed.
You (baseline): bye
Bot: Goodbye 👋


# 6 – Choose framework & architecture (Deep Learning model)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Pretrained checkpoint that is fine-tuned on the chat pairs in the following sections
model_name = "microsoft/DialoGPT-small"

# Download (or load from cache) the matching tokenizer and causal-LM weights
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Make sure we have a pad token: reuse EOS as the padding token, and tell the
# model config about it so padding to a fixed length works.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


# 7 – Build a Dataset class for fine-tuning

In [None]:
import torch
from torch.utils.data import Dataset

class ChatDataset(Dataset):
    """Torch dataset of tokenised ``input <eos> response <eos>`` sequences.

    Every row of `dataframe` (columns 'input' and 'response') is joined into
    one string, tokenised, truncated and padded to `max_length`. The raw
    tokenizer outputs are kept in `self.samples`; tensors and labels are
    created when an item is requested.
    """

    def __init__(self, dataframe, tokenizer, max_length=128):
        self.tokenizer = tokenizer
        self.max_length = max_length

        eos = tokenizer.eos_token
        # One encoding per (input, response) pair, in dataframe row order
        self.samples = [
            tokenizer(
                prompt + eos + reply + eos,
                truncation=True,
                max_length=max_length,
                padding="max_length",
            )
            for prompt, reply in zip(dataframe['input'], dataframe['response'])
        ]

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        encoded = self.samples[idx]
        item = {name: torch.tensor(values) for name, values in encoded.items()}

        # Causal-LM targets are the inputs themselves; positions the attention
        # mask marks as padding get -100 so the loss skips them.
        targets = item['input_ids'].clone()
        is_padding = item['attention_mask'] == 0
        targets[is_padding] = -100
        item['labels'] = targets
        return item

# Tokenise both splits up front (default max_length of 128 tokens per pair)
train_dataset = ChatDataset(train_df, tokenizer)
val_dataset = ChatDataset(val_df, tokenizer)

# Cell output: number of examples in each split
len(train_dataset), len(val_dataset)

(3884, 972)

# 8 – Training setup (Trainer API)

In [None]:
from transformers import Trainer, TrainingArguments

# Hyper-parameters for fine-tuning; checkpoints are written under output_dir.
training_args = TrainingArguments(
    output_dir="./chatbot-dialoGPT",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=50,
    logging_steps=20,

    # Evaluate and checkpoint once per epoch.
    # NOTE(review): `eval_strategy` is the newer spelling of `evaluation_strategy`;
    # presumably this requires a recent transformers release - confirm against
    # the installed version.
    eval_strategy="epoch",
    save_strategy="epoch",

    save_total_limit=2,
    logging_dir="./logs",
    # Empty list: no external experiment trackers
    report_to=[]
)

In [None]:
# Wire the model, the arguments and both dataset splits into a Trainer.
# No data collator is passed: ChatDataset already returns fixed-length
# tensors together with their labels.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# 9 – Train the deep learning model (Train the Model)

In [None]:
# Run fine-tuning; the returned TrainOutput (shown below) carries the step count and training loss.
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,2.6286,2.29673
2,2.0656,2.160639
3,1.7811,2.137514


TrainOutput(global_step=2913, training_loss=2.2682415431527145, metrics={'train_runtime': 551.6763, 'train_samples_per_second': 21.121, 'train_steps_per_second': 5.28, 'total_flos': 761143689216000.0, 'train_loss': 2.2682415431527145, 'epoch': 3.0})

### After training, save the model & tokenizer:

In [None]:
# Save the fine-tuned weights and the tokenizer files into the same directory
trainer.save_model("./chatbot_model")
tokenizer.save_pretrained("./chatbot_model")

('./chatbot_model/tokenizer_config.json',
 './chatbot_model/special_tokens_map.json',
 './chatbot_model/chat_template.jinja',
 './chatbot_model/vocab.json',
 './chatbot_model/merges.txt',
 './chatbot_model/added_tokens.json',
 './chatbot_model/tokenizer.json')

# 10 – Evaluate model (quantitatively + qualitatively)

### 10.1 Get validation loss & perplexity

In [None]:
import math

# Evaluate on the validation split passed to the Trainer
eval_results = trainer.evaluate()
print("Eval loss:", eval_results["eval_loss"])
# Perplexity is reported as exp(eval loss); lower is better
print("Perplexity:", math.exp(eval_results["eval_loss"]))

Eval loss: 2.137514352798462
Perplexity: 8.478337260429088


### 10.2 Compare baseline vs deep learning on a few examples

In [None]:
def generate_response(model, tokenizer, user_input, max_length=100):
    """Sample a single-turn reply to `user_input` from a causal language model.

    Parameters
    ----------
    model
        Causal LM exposing `eval()`, `to()` and `generate()`.
    tokenizer
        Tokenizer matching `model`; its EOS token terminates the prompt and is
        also used as the padding id during generation.
    user_input : str
        The user's message.
    max_length : int, optional
        Upper bound on the TOTAL sequence length (prompt + reply) in tokens.

    Returns
    -------
    str
        The decoded reply with special tokens removed. Sampling is enabled, so
        repeated calls can return different text.
    """
    model.eval()
    # Call the tokenizer (not .encode) so the attention mask is returned too.
    # Pad and EOS share one id, so generate() cannot infer the mask on its own
    # and warns about unreliable results when it is missing.
    encoded = tokenizer(user_input + tokenizer.eos_token, return_tensors="pt")
    input_ids = encoded["input_ids"]
    attention_mask = encoded["attention_mask"]

    # Move to GPU if available
    if torch.cuda.is_available():
        model.to("cuda")
        input_ids = input_ids.to("cuda")
        attention_mask = attention_mask.to("cuda")

    with torch.no_grad():
        chat_ids = model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )

    # generate() returns prompt + continuation; keep only the new tokens
    reply_ids = chat_ids[:, input_ids.shape[-1]:][0]
    response = tokenizer.decode(reply_ids, skip_special_tokens=True)
    return response

# Side-by-side comparison on the first five validation pairs. The deep model
# samples its output, so its replies can differ between runs.
for i in range(5):
    sample_inp = val_df.iloc[i]['input']
    sample_exp = val_df.iloc[i]['response']
    print(f"Input:      {sample_inp}")
    print(f"Expected:   {sample_exp}")
    print(f"Baseline:   {baseline_reply(sample_inp)}")
    print(f"Deep model: {generate_response(model, tokenizer, sample_inp)}")
    print("-"*80)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Input:      see an eye doctor.
Expected:   that's what i need to do.
Baseline:   my doctor said i need surgery.
Deep model: okay, i'll get the screen shots.
--------------------------------------------------------------------------------
Input:      i'm in love with that girl.
Expected:   have you told her?
Baseline:   i love you, too.
Deep model: who is that?
--------------------------------------------------------------------------------
Input:      he said he was thinking about it, but he didn't get around to it.
Expected:   he didn't get around to turning himself in, either.
Baseline:   did he say what time?
Deep model: he did put a stamp on the envelope.
--------------------------------------------------------------------------------
Input:      do you have a car?
Expected:   yes, i do.
Baseline:   yes i am having a car
Deep model: yes,i do.
--------------------------------------------------------------------------------
Input:      this trail is hard to climb.
Expected:   especia

# 11 – Interactive chat with your fine-tuned model

In [None]:
# Make sure model is on GPU if available
if torch.cuda.is_available():
    model.to("cuda")
# Inference mode: switch off dropout in case the model is still in training mode
model.eval()

# Total token budget per generate() call (prompt + reply).
MAX_TOTAL_TOKENS = 1000
# Keep only the most recent history tokens so there is always room left for a
# reply; without this the growing conversation eventually fills the whole
# budget and generation has no space to produce anything.
MAX_HISTORY_TOKENS = 800

chat_history_ids = None

while True:
    # Strip so that "bye " or " quit" still end the session
    user_input = input("You (deep model): ").strip()
    if not user_input:
        continue
    if user_input.lower() in ['exit', 'quit', 'bye']:
        print("Bot: Goodbye 👋")
        break

    # Prepare input tokens (we'll append to history each turn)
    new_input_ids = tokenizer.encode(user_input + tokenizer.eos_token, return_tensors="pt")
    new_input_ids = new_input_ids.to(model.device)

    if chat_history_ids is not None:
        bot_input_ids = torch.cat([chat_history_ids, new_input_ids], dim=-1)
    else:
        bot_input_ids = new_input_ids

    # Drop the oldest tokens once the conversation outgrows the history budget
    bot_input_ids = bot_input_ids[:, -MAX_HISTORY_TOKENS:]

    with torch.no_grad():
        chat_history_ids = model.generate(
            bot_input_ids,
            # Single unpadded sequence: every position is a real token. Passing
            # the mask explicitly is required because pad and EOS share an id.
            attention_mask=torch.ones_like(bot_input_ids),
            max_length=MAX_TOTAL_TOKENS,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )

    # generate() returns history + reply; decode only the newly generated part
    bot_response = tokenizer.decode(chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)
    print("Bot:", bot_response)

You (deep model): hey
Bot: hello
You (deep model): i am angry
Bot: who are you?
You (deep model): i am adil
Bot: what is that?
You (deep model): my name is adil
Bot: hello
You (deep model): who are you
Bot: i am fatale
You (deep model): how old are you
Bot: i'm malibu beach
You (deep model): bye
Bot: Goodbye 👋
