# SUMMATIVE ASSIGNMENT - BUILD DOMAIN-SPECIFIC A CHAT-BOT


In [1]:
!pip install transformers datasets evaluate nltk




In [2]:
# Import necessary libraries
from google.colab import drive
import pandas as pd
import json
import re
import torch
from transformers import BertTokenizerFast, BertForSequenceClassification, BertForQuestionAnswering, TrainingArguments, Trainer
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from nltk.translate.bleu_score import sentence_bleu
import evaluate
import numpy as np


**1. Data Loading**

In [3]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
import os
dataset_path = '/content/drive/MyDrive/Covid-19/Covid_Dataset.json'

df = pd.read_json(dataset_path)
df.head()  # View the first few records



Unnamed: 0,question,answer
0,Can I get COVID-19 from animals when travellin...,Although the current spread and growth of the ...
1,How can I protect myself and others?,The best way to prevent illness from COVID-19 ...
2,Where did COVID-19 come from?,"It was first found in Wuhan City, Hubei Provin..."
3,Can my pet or other animals get sick from COVI...,"However, livestock producers should follow nor..."
4,How can I protect my child from COVID-19?,Washing hands with soap and water for at least...


In [6]:
# Check column names and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24240 entries, 0 to 24239
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  23426 non-null  object
 1   answer    24240 non-null  object
dtypes: object(2)
memory usage: 378.9+ KB


In [9]:
df.isnull().sum()

Unnamed: 0,0
question,0
answer,0
anser,0


**2. Data Preprocessing**

In [7]:
# Remove duplicates (if any)
df.drop_duplicates(inplace=True)

# Lowercase conversion
df["question"] = df["question"].str.lower()
df["anser"] = df["answer"].str.lower()

# Remove unnecessary spaces
df["question"] = df["question"].str.strip()
df["answer"] = df["answer"].str.strip()


In [8]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove special characters, numbers, etc.
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    return text

df['question'] = df['question'].fillna('unknown')
df['answer'] = df['answer'].fillna('unknown')

df['question'] = df['question'].apply(clean_text)
df['answer'] = df['answer'].apply(clean_text)


**3. Tokenization for BERT**

In [10]:
# Load BERT Tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Function to tokenize dataset for BERT Q&A
def tokenize_function(examples):
    encoding = tokenizer(
        examples["question"],  # Question
        examples["answer"],  # Context (Answer)
        truncation=True,
        padding="max_length",
        max_length=256,
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    start_positions = []
    end_positions = []
    for i in range(len(examples["answer"])):
        answer_text = examples["answer"][i]
        start_char = answer_text.find(answer_text)
        end_char = start_char + len(answer_text)

        start_idx, end_idx = None, None
        for j, (start, end) in enumerate(encoding["offset_mapping"][i]):
            if start <= start_char < end:
                start_idx = j
            if start < end_char <= end:
                end_idx = j
                break

        start_positions.append(start_idx if start_idx is not None else 0)
        end_positions.append(end_idx if end_idx is not None else 0)

    encoding["start_positions"] = start_positions
    encoding["end_positions"] = end_positions
    return encoding


# Convert DataFrame to Hugging Face Dataset
dataset = Dataset.from_pandas(df)

# Apply tokenization
dataset = dataset.map(tokenize_function, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Map:   0%|          | 0/110 [00:00<?, ? examples/s]

In [11]:
print(df[['question', 'answer']].sample(10))  # Check random samples for correctness


                                                question  \
335        why are schools closing down becasue of covid   
25                                     what caused covid   
102                                what is selfisolating   
299                     are there enough tests for covid   
156    wont covid just go away over time why are we f...   
18     what can i do to reduce my and my familys risk...   
43                    im pregnant am i at risk for covid   
56                        how many covid cases are there   
12611                       what is flattening the curve   
913    will warm weather  summer  heat stop outbreak ...   

                                                  answer  
335    please keep up with local news media sources a...  
25     their symptoms are severe acute respiratory il...  
102                                                  nan  
299                                                  nan  
156                                         

In [12]:
# Check the dataset after tokenization to inspect the output
print(dataset[0])  # Display the first tokenized example


{'question': 'can i get covid from animals when travelling to other countries', 'answer': 'although the current spread and growth of the covid outbreak is primarily associated with spread from person to person experts agree that the virus likely originated from bats and may have passed through an intermediary animal source currently unknown in china before being transmitted to humansif animals are imported from an affected area they should be closely monitored for signs of illness you should contact a veterinarian if they become sick and call ahead to ensure they are aware of the circumstances', 'anser': 'although the current spread and growth of the covid-19 outbreak is primarily associated with spread from person to person, experts agree that the virus likely originated from bats and may have passed through an intermediary animal source (currently unknown) in china before being transmitted to humans.if animals are imported from an affected area: they should be closely monitored for s

In [13]:
# Split into training & validation
train_test_split = dataset.train_test_split(test_size=0.2, seed=42)
train_dataset = train_test_split["train"]
val_dataset = train_test_split["test"]

print(f"Training Samples: {len(train_dataset)}")
print(f"Validation Samples: {len(val_dataset)}")

Training Samples: 88
Validation Samples: 22


**4. Model Selection & Fine-Tuning**

In [14]:
# Load pre-trained BERT model for Question Answering
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Ensure model runs on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/Covid-19/results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="/content/drive/MyDrive/Covid-19/logs",
    logging_steps=1,
    logging_first_step=True,
)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Start Training
trainer.train()


[34m[1mwandb[0m: Currently logged in as: [33ma-ajani[0m ([33ma-ajani-african-leadership-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss
1,4.7981,4.458673
2,3.4492,3.488148
3,2.6779,2.79413
4,2.2322,2.361972
5,2.145,2.211546


TrainOutput(global_step=55, training_loss=3.3229071747172965, metrics={'train_runtime': 42.4015, 'train_samples_per_second': 10.377, 'train_steps_per_second': 1.297, 'total_flos': 57485286481920.0, 'train_loss': 3.3229071747172965, 'epoch': 5.0})

**5. Data Evaluation and Testing**

In [16]:
evaluation = trainer.evaluate()
print(evaluation)

{'eval_loss': 2.2115464210510254, 'eval_runtime': 0.1359, 'eval_samples_per_second': 161.847, 'eval_steps_per_second': 22.07, 'epoch': 5.0}


In [17]:
# Save the fine-tuned model
trainer.save_model("/content/drive/MyDrive/Covid-19/saved_model")

# Save the tokenizer
tokenizer.save_pretrained("/content/drive/MyDrive/Covid-19/saved_model")


('/content/drive/MyDrive/Covid-19/saved_model/tokenizer_config.json',
 '/content/drive/MyDrive/Covid-19/saved_model/special_tokens_map.json',
 '/content/drive/MyDrive/Covid-19/saved_model/vocab.txt',
 '/content/drive/MyDrive/Covid-19/saved_model/added_tokens.json',
 '/content/drive/MyDrive/Covid-19/saved_model/tokenizer.json')

In [18]:
# Calculate F1-score
predictions = trainer.predict(val_dataset)
predicted_start_logits = predictions.predictions[0]
predicted_end_logits = predictions.predictions[1]

predicted_start_positions = predicted_start_logits.argmax(axis=-1)
predicted_end_positions = predicted_end_logits.argmax(axis=-1)

true_start_positions = [example['start_positions'] for example in val_dataset]
true_end_positions = [example['end_positions'] for example in val_dataset]

f1_start = f1_score(true_start_positions, predicted_start_positions, average='weighted')
f1_end = f1_score(true_end_positions, predicted_end_positions, average='weighted')
print(f"F1 Score (Start Positions): {f1_start}")
print(f"F1 Score (End Positions): {f1_end}")

# Calculate BLEU score
bleu = evaluate.load("bleu")
references = [[example['answer']] for example in val_dataset]  # Use completion as reference

# Decode predicted tokens
decoded_predictions = []
for i in range(len(predicted_start_positions)):
  start = predicted_start_positions[i]
  end = predicted_end_positions[i]
  decoded_predictions.append(tokenizer.decode(val_dataset[i]["input_ids"][start:end+1]))

bleu_results = bleu.compute(predictions=decoded_predictions, references=references)
print(f"BLEU Score: {bleu_results['bleu']}")


# Calculate Perplexity
evaluation = trainer.evaluate()

loss = evaluation["eval_loss"]

perplexity = np.exp(loss)

print(f"Perplexity: {perplexity}")


F1 Score (Start Positions): 0.44999999999999996
F1 Score (End Positions): 0.9090909090909091


Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

BLEU Score: 0.8929811426966997


Perplexity: 9.129824034130822


**6. Hyperparameter Tuning Experimentation**

In [19]:
# Define hyperparameter grid
hyperparams = [
    {"batch_size": 8, "learning_rate": 2e-5, "epochs": 3, "weight_decay": 0.01},
    {"batch_size": 8, "learning_rate": 3e-5, "epochs": 5, "weight_decay": 0.01},
    {"batch_size": 8, "learning_rate": 5e-5, "epochs": 7, "weight_decay": 0.01},
    {"batch_size": 16, "learning_rate": 2e-5, "epochs": 5, "weight_decay": 0.01},
    {"batch_size": 16, "learning_rate": 3e-5, "epochs": 7, "weight_decay": 0.05},
    {"batch_size": 16, "learning_rate": 5e-5, "epochs": 3, "weight_decay": 0.05},
    {"batch_size": 32, "learning_rate": 2e-5, "epochs": 7, "weight_decay": 0.01},
    {"batch_size": 32, "learning_rate": 3e-5, "epochs": 3, "weight_decay": 0.05},
    {"batch_size": 32, "learning_rate": 5e-5, "epochs": 5, "weight_decay": 0.05},
    {"batch_size": 16, "learning_rate": 3e-5, "epochs": 5, "weight_decay": 0.01},
    {"batch_size": 8, "learning_rate": 2e-5, "epochs": 7, "weight_decay": 0.05},
    {"batch_size": 32, "learning_rate": 5e-5, "epochs": 7, "weight_decay": 0.01},
]

# Initialize result storage
results = []

# BLEU evaluator
bleu = evaluate.load("bleu")

In [20]:
# Loop through hyperparameter sets
for i, params in enumerate(hyperparams):
    print(f"\n🚀 Running Experiment {i+1}/{len(hyperparams)}: {params}")

    training_args = TrainingArguments(
        output_dir=f"/content/drive/MyDrive/Covid-19/results/exp_{i+1}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=params["batch_size"],
        per_device_eval_batch_size=params["batch_size"],
        num_train_epochs=params["epochs"],
        learning_rate=params["learning_rate"],
        weight_decay=params["weight_decay"],
        logging_dir=f"/content/drive/MyDrive/Covid-19/logs/exp_{i+1}",
        logging_steps=1,
        logging_first_step=True,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train model
    trainer.train()

    # Evaluate model
    evaluation = trainer.evaluate()
    loss = evaluation["eval_loss"]
    perplexity = np.exp(loss)

    # Make predictions
    predictions = trainer.predict(val_dataset)
    predicted_start_logits = predictions.predictions[0]
    predicted_end_logits = predictions.predictions[1]

    predicted_start_positions = predicted_start_logits.argmax(axis=-1)
    predicted_end_positions = predicted_end_logits.argmax(axis=-1)

    true_start_positions = [example["start_positions"] for example in val_dataset]
    true_end_positions = [example["end_positions"] for example in val_dataset]

    f1_start = f1_score(true_start_positions, predicted_start_positions, average='weighted')
    f1_end = f1_score(true_end_positions, predicted_end_positions, average='weighted')

    # Compute BLEU score
    references = [[example["answer"]] for example in val_dataset]
    decoded_predictions = [
        tokenizer.decode(val_dataset[i]["input_ids"][predicted_start_positions[i]:predicted_end_positions[i]+1])
        for i in range(len(predicted_start_positions))
    ]

    bleu_results = bleu.compute(predictions=decoded_predictions, references=references)

    # Store results
    results.append({
        "Experiment": i+1,
        "Batch Size": params["batch_size"],
        "Learning Rate": params["learning_rate"],
        "Epochs": params["epochs"],
        "Weight Decay": params["weight_decay"],
        "F1 Score (Start)": f1_start,
        "F1 Score (End)": f1_end,
        "BLEU Score": bleu_results["bleu"],
        "Perplexity": perplexity
    })

    # Save model if best performance
    if i == 0 or f1_start > max([r["F1 Score (Start)"] for r in results[:-1]]):
        trainer.save_model("/content/drive/MyDrive/Covid-19/best_model")
        tokenizer.save_pretrained("/content/drive/MyDrive/Covid-19/best_model")
        print("✅ Saved best model so far!")


🚀 Running Experiment 1/12: {'batch_size': 8, 'learning_rate': 2e-05, 'epochs': 3, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,1.3902,1.349511
2,0.5305,0.90679
3,0.3896,0.772988


✅ Saved best model so far!

🚀 Running Experiment 2/12: {'batch_size': 8, 'learning_rate': 3e-05, 'epochs': 5, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,0.1681,0.513018
2,0.0164,0.303378
3,0.0119,0.30165
4,0.0096,0.270512
5,0.0063,0.268434


✅ Saved best model so far!

🚀 Running Experiment 3/12: {'batch_size': 8, 'learning_rate': 5e-05, 'epochs': 7, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,0.0046,0.28763
2,0.0013,0.477144
3,0.0018,0.303587
4,0.0009,0.270628
5,0.0007,0.290412
6,0.0006,0.306167
7,0.001,0.310037



🚀 Running Experiment 4/12: {'batch_size': 16, 'learning_rate': 2e-05, 'epochs': 5, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,0.0003,0.327028
2,0.0002,0.281417
3,0.0002,0.31381
4,0.0001,0.37039
5,0.0001,0.38784



🚀 Running Experiment 5/12: {'batch_size': 16, 'learning_rate': 3e-05, 'epochs': 7, 'weight_decay': 0.05}




Epoch,Training Loss,Validation Loss
1,0.0001,0.477013
2,0.0001,0.435775
3,0.0001,0.387
4,0.0001,0.455082
5,0.0001,0.481021
6,0.0001,0.481765
7,0.0001,0.475595



🚀 Running Experiment 6/12: {'batch_size': 16, 'learning_rate': 5e-05, 'epochs': 3, 'weight_decay': 0.05}




Epoch,Training Loss,Validation Loss
1,0.0001,0.509926
2,0.0001,0.442889
3,0.0001,0.436966



🚀 Running Experiment 7/12: {'batch_size': 32, 'learning_rate': 2e-05, 'epochs': 7, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,0.0001,0.405599
2,0.1032,0.438189
3,0.0001,0.440254
4,0.0001,0.441251
5,0.0002,0.439575
6,0.0001,0.435891
7,0.0001,0.433815



🚀 Running Experiment 8/12: {'batch_size': 32, 'learning_rate': 3e-05, 'epochs': 3, 'weight_decay': 0.05}




Epoch,Training Loss,Validation Loss
1,0.0,0.420235
2,0.0,0.505972
3,0.0298,0.50526



🚀 Running Experiment 9/12: {'batch_size': 32, 'learning_rate': 5e-05, 'epochs': 5, 'weight_decay': 0.05}




Epoch,Training Loss,Validation Loss
1,0.0,0.346341
2,0.0109,0.324232
3,0.0001,0.382685
4,0.0858,0.400462
5,0.0001,0.400699



🚀 Running Experiment 10/12: {'batch_size': 16, 'learning_rate': 3e-05, 'epochs': 5, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,0.0,0.421671
2,0.0,0.548914
3,0.0,0.678344
4,0.0,0.67464
5,0.0,0.626044



🚀 Running Experiment 11/12: {'batch_size': 8, 'learning_rate': 2e-05, 'epochs': 7, 'weight_decay': 0.05}




Epoch,Training Loss,Validation Loss
1,0.0,0.492407
2,0.0,0.474903
3,0.0,0.454404
4,0.0,0.481566
5,0.0,0.517618
6,0.0,0.522689
7,0.0,0.521653



🚀 Running Experiment 12/12: {'batch_size': 32, 'learning_rate': 5e-05, 'epochs': 7, 'weight_decay': 0.01}




Epoch,Training Loss,Validation Loss
1,0.0,0.43821
2,0.0,0.433049
3,0.0,0.456997
4,0.0,0.496362
5,0.0,0.609168
6,0.0,0.633233
7,0.0,0.637469


In [21]:
# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results as CSV
results_df.to_csv("/content/drive/MyDrive/Covid-19/hyperparameter_results.csv", index=False)

# Print final results
print("\n📊 Final Hyperparameter Results:")
print(results_df)

# Find best experiment
best_experiment = results_df.loc[results_df["F1 Score (Start)"].idxmax()]
print("\n🏆 Best Experiment:", best_experiment)


📊 Final Hyperparameter Results:
    Experiment  Batch Size  Learning Rate  Epochs  Weight Decay  \
0            1           8        0.00002       3          0.01   
1            2           8        0.00003       5          0.01   
2            3           8        0.00005       7          0.01   
3            4          16        0.00002       5          0.01   
4            5          16        0.00003       7          0.05   
5            6          16        0.00005       3          0.05   
6            7          32        0.00002       7          0.01   
7            8          32        0.00003       3          0.05   
8            9          32        0.00005       5          0.05   
9           10          16        0.00003       5          0.01   
10          11           8        0.00002       7          0.05   
11          12          32        0.00005       7          0.01   

    F1 Score (Start)  F1 Score (End)  BLEU Score  Perplexity  
0           0.775758        0.96