# **Install Dependencies**

In [3]:
!pip install -q transformers datasets evaluate


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[?25h

# **Import Libraries**

In [4]:
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
from datasets import load_dataset
import evaluate

# **Load SQuAD v1.1 Dataset**

In [9]:
# Size of the validation slice used for evaluation further down the notebook.
# Raise it for a more reliable score at the cost of a longer run.
NUM_EVAL_EXAMPLES = 50

# Loads only the first NUM_EVAL_EXAMPLES records of the SQuAD validation split.
dataset = load_dataset("squad", split=f"validation[:{NUM_EVAL_EXAMPLES}]")

README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

# **Pretrained QA Model**

In [5]:
# Hub identifier of the checkpoint backing the demo and the CLI cells.
model_name = "distilbert-base-cased-distilled-squad"

# Load the weights and the matching tokenizer explicitly so both objects stay
# available as notebook-level names in addition to the pipeline itself.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Bundle model and tokenizer into a single callable for question answering.
qa_pipeline = pipeline(
    task="question-answering",
    model=model,
    tokenizer=tokenizer,
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/473 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/261M [00:00<?, ?B/s]

Device set to use cpu


# **Question Demo**

In [6]:
# A short hand-written passage and a question whose answer appears verbatim in it.
question = "Which countries does the Indus River flow through?"
context = """The Indus River is one of the longest rivers in Asia.
It flows through China, India, and Pakistan. It is the lifeline
of Pakistan's economy, supporting agriculture and industry."""

# Only the "answer" field of the pipeline result is shown here.
result = qa_pipeline(context=context, question=question)
print("Answer:", result["answer"])


Answer: China, India, and Pakistan


# **Evaluation Function**

In [10]:
# SQuAD metric object shared by every evaluate_model() call.
metric = evaluate.load("squad")


def evaluate_model(model_name, examples=None):
    """Score a question-answering checkpoint with the SQuAD metric.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the question-answering model.
        examples: Optional iterable of records, each providing the keys
            ``"id"``, ``"question"``, ``"context"`` and ``"answers"``.
            When omitted, the notebook-level ``dataset`` is used, which
            matches the original behaviour.

    Returns:
        The result of ``metric.compute`` for the collected predictions and
        references (in this notebook: a dict with ``exact_match`` and ``f1``).

    Raises:
        ValueError: If ``examples`` yields no records.
    """
    if examples is None:
        # Resolved at call time so the function follows whatever `dataset`
        # currently refers to in the notebook.
        examples = dataset

    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForQuestionAnswering.from_pretrained(model_name)
    qa = pipeline("question-answering", model=model, tokenizer=tokenizer)

    preds = []
    refs = []

    for example in examples:
        prediction = qa(question=example["question"], context=example["context"])
        # Predictions and references are paired through the shared example id.
        preds.append({
            "id": example["id"],
            "prediction_text": prediction["answer"]
        })
        refs.append({
            "id": example["id"],
            "answers": example["answers"]
        })

    if not preds:
        # Fail with a clear message instead of handing the metric an empty batch.
        raise ValueError("evaluate_model received no examples to score")

    return metric.compute(predictions=preds, references=refs)


# **Bonus: Command-Line Interface**

In [16]:
# Interactive loop: reads a context and a question, prints the model's answer.
# NOTE: this cell blocks on input(), so "Run All" pauses here until the user
# types 'exit' at the context prompt.
print("\n=== Simple QA CLI ===")
while True:
    # Strip surrounding whitespace so inputs like "exit " still end the loop.
    ctx = input("\nEnter context (or type 'exit' to quit): ").strip()
    if ctx.lower() == "exit":
        break
    if not ctx:
        # Re-prompt instead of passing an empty context to the pipeline.
        print("Context cannot be empty.")
        continue

    ques = input("Enter question: ").strip()
    if not ques:
        # Same guard for the question; start over with a fresh context prompt.
        print("Question cannot be empty.")
        continue

    answer = qa_pipeline(question=ques, context=ctx)
    print("Answer:", answer["answer"])


=== Simple QA CLI ===

Enter context (or type 'exit' to quit): Albert Einstein was a theoretical physicist who developed the theory of relativity.
Enter question: What did Albert Einstein develop?
Answer: the theory of relativity

Enter context (or type 'exit' to quit): exit


# **Bonus: Compare BERT, RoBERTa, ALBERT**

In [11]:
# Display label -> checkpoint identifier for each model in the comparison.
models_to_test = dict(
    BERT="bert-large-uncased-whole-word-masking-finetuned-squad",
    RoBERTa="deepset/roberta-base-squad2",
    ALBERT="twmkn9/albert-base-v2-squad2",
)

# Score every checkpoint in turn, keyed by its display label.
results = {}
for name in models_to_test:
    model_id = models_to_test[name]
    print(f"Evaluating {name}...")
    results[name] = evaluate_model(model_id)

# Bare expression so the notebook renders the collected scores.
results


Evaluating BERT...


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


Evaluating RoBERTa...


tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

Device set to use cpu


Evaluating ALBERT...


tokenizer_config.json:   0%|          | 0.00/39.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/716 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/760k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Some weights of the model checkpoint at twmkn9/albert-base-v2-squad2 were not used when initializing AlbertForQuestionAnswering: ['albert.pooler.bias', 'albert.pooler.weight']
- This IS expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


model.safetensors:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

{'BERT': {'exact_match': 88.0, 'f1': 94.99047619047619},
 'RoBERTa': {'exact_match': 96.0, 'f1': 98.45714285714286},
 'ALBERT': {'exact_match': 82.0, 'f1': 84.04444444444445}}

# **Results**

In [12]:
import pandas as pd

# `results` maps model label -> metric dict; building a frame from it puts the
# labels in the columns, so transpose to get one row per model.
df = pd.DataFrame(results).transpose()

# Bare expression so the notebook renders the table.
df


Unnamed: 0,exact_match,f1
BERT,88.0,94.990476
RoBERTa,96.0,98.457143
ALBERT,82.0,84.044444
