In [17]:
!pip install transformers torch datasets
from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AutoModelForQuestionAnswering
from datasets import load_dataset, Dataset
import torch
import torch.nn.functional as F
import re
from nltk.tokenize import sent_tokenize
import nltk
import json
import os
import numpy as np
os.environ["WANDB_DISABLED"] = "true"
nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import drive
drive.mount('/content/drive')


book_path = "/content/drive/My Drive/NLP_Project2/a study in scarlet.txt"





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def load_text(file_path):
    """
    Load text from a file.
    """
    print("Loading the text from the file...")
    with open(book_path, 'r', encoding='utf-8') as file:
        return file.read()

def remove_gutenberg_header_footer(text, start_marker, end_marker):
    """
    Remove the header and footer from Project Gutenberg text.

    Finds specified start and end markers and returns the content in between.

    Parameters:
        text (str): The text to clean.
        start_marker (str): Marker indicating the start of main content.
        end_marker (str): Marker indicating the end of main content.

    Returns:
        str: Text with header and footer removed, or original text if markers are not found.
    """
    start_index = text.find(start_marker)
    if start_index == -1:
        return text

    end_index = text.find(end_marker)
    if end_index == -1:
        return text

    cleaned_text = text[start_index + len(start_marker):end_index].strip()
    return cleaned_text

In [None]:
# here we are cleaning the text and removing the headers and footers
raw_text = load_text(book_path)

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"

cleaned_text = remove_gutenberg_header_footer(raw_text, start_marker, end_marker)

print("Cleaned text sample:\n")
print(cleaned_text[:500])

Loading the text from the file...
Cleaned text sample:

A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.

 PART I.
 CHAPTER I. MR. SHERLOCK HOLMES.
 CHAPTER II. THE SCIENCE OF DEDUCTION.
 CHAPTER III. THE LAURISTON GARDENS MYSTERY
 CHAPTER IV. WHAT JOHN RANCE HAD TO TELL.
 CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.
 CHAPTER VI. TOBIAS GREGSON SHOWS WHAT HE CAN DO.
 CHAPTER VII. LIGHT IN THE DARKNESS.

 PART II. THE COUNTRY OF THE SAINTS
 CHAPTER I. ON THE GREAT ALKALI PLAIN.
 CHAPTER II. THE FLOWER OF UTAH.
 CHAPTER III. J


In [None]:
def tokenize_sentences(text):
    """
    Tokenize the input text into sentences using NLTK.

    Parameters:
        text (str): The text to tokenize.

    Returns:
        list: A list of tokenized sentences.
    """
    print("Tokenizing the text into sentences...")
    sentences = sent_tokenize(text)
    print(f"Total sentences: {len(sentences)}")
    return sentences

sentences = tokenize_sentences(cleaned_text)

print(f"Number of tokenized sentences: {len(sentences)}")
if len(sentences) == 0:
    raise ValueError("No sentences were tokenized. Please check the input text.")

print("\nSample tokenized sentences:")
for i, sentence in enumerate(sentences[:5], 1):
    print(f"{i}: {sentence}")

Tokenizing the text into sentences...
Total sentences: 2208
Number of tokenized sentences: 2208

Sample tokenized sentences:
1: A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.
2: PART I.
3: CHAPTER I. MR. SHERLOCK HOLMES.
4: CHAPTER II.
5: THE SCIENCE OF DEDUCTION.


In [None]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("BERT model initialized successfully!")

def tokenize_for_bert(sentences, tokenizer, max_length=512):
    """
    Tokenize sentences for BERT and prepare input tensors.

    Parameters:
        sentences (list): List of sentences to tokenize.
        tokenizer (BertTokenizer): Pre-trained BERT tokenizer.
        max_length (int): Maximum sequence length for BERT.

    Returns:
        dict: Tokenized inputs with attention masks.
    """
    print("Tokenizing sentences for BERT...")
    encoded_inputs = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    )
    return encoded_inputs


bert_inputs = tokenize_for_bert(sentences, bert_tokenizer)

if not bert_inputs or "input_ids" not in bert_inputs:
    raise ValueError("BERT inputs were not created properly. Check the tokenizer function.")
print("BERT inputs created successfully!")

print("\nSample tokenized input IDs:")
print(bert_inputs["input_ids"][:2])
print("\nAttention masks:")
print(bert_inputs["attention_mask"][:2])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

BERT model initialized successfully!
Tokenizing sentences for BERT...
BERT inputs created successfully!

Sample tokenized input IDs:
tensor([[  101,  1037,  2817,  1999, 11862,  2011,  1037,  1012, 16608, 11294,
          8417,  1037,  2817,  1999, 11862,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0

In [None]:
bert_inputs = tokenize_for_bert(sentences, bert_tokenizer)
def generate_embeddings_in_batches(inputs, model, batch_size=32):
    """
    Generate embeddings for tokenized inputs using BERT in smaller batches.

    Parameters:
        inputs (dict): Tokenized inputs containing input_ids and attention_mask.
        model (BertModel): Pre-trained BERT model.
        batch_size (int): Number of sentences to process in each batch.

    Returns:
        torch.Tensor: Embeddings for all input sentences.
    """
    print("Generating embeddings in batches...")
    all_embeddings = []


    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    num_samples = input_ids.size(0)

    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)
        batch_input_ids = input_ids[start_idx:end_idx]
        batch_attention_mask = attention_mask[start_idx:end_idx]

        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            batch_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(batch_embeddings)

        print(f"Processed batch {start_idx // batch_size + 1}/{(num_samples + batch_size - 1) // batch_size}")


    return torch.cat(all_embeddings, dim=0)

bert_model = BertModel.from_pretrained('bert-base-uncased')

batch_size = 16
embeddings = generate_embeddings_in_batches(bert_inputs,bert_model,batch_size=batch_size)


print("Embeddings shape:", embeddings.shape)


torch.save(embeddings, '/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')
print("Embeddings saved successfully!")

Tokenizing sentences for BERT...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating embeddings in batches...
Processed batch 1/138
Processed batch 2/138
Processed batch 3/138
Processed batch 4/138
Processed batch 5/138
Processed batch 6/138
Processed batch 7/138
Processed batch 8/138
Processed batch 9/138
Processed batch 10/138
Processed batch 11/138
Processed batch 12/138
Processed batch 13/138
Processed batch 14/138
Processed batch 15/138
Processed batch 16/138
Processed batch 17/138
Processed batch 18/138
Processed batch 19/138
Processed batch 20/138
Processed batch 21/138
Processed batch 22/138
Processed batch 23/138
Processed batch 24/138
Processed batch 25/138
Processed batch 26/138
Processed batch 27/138
Processed batch 28/138
Processed batch 29/138
Processed batch 30/138
Processed batch 31/138
Processed batch 32/138
Processed batch 33/138
Processed batch 34/138
Processed batch 35/138
Processed batch 36/138
Processed batch 37/138
Processed batch 38/138
Processed batch 39/138
Processed batch 40/138
Processed batch 41/138
Processed batch 42/138
Process

In [None]:

embeddings = torch.load('/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')
print("Loaded embeddings with shape:", embeddings.shape)

Loaded embeddings with shape: torch.Size([2208, 768])


  embeddings = torch.load('/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')


In [None]:

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


questions = [
    "Who introduced Watson to Holmes?",
    "Where did Watson and Holmes live?",
    "Who was the murderer?",
]


question_inputs = bert_tokenizer(questions, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [None]:
def find_most_similar(embeddings, question_embedding):
    """
    Find the index of the most similar embedding based on cosine similarity.

    Parameters:
        embeddings (torch.Tensor): Sentence embeddings.
        question_embedding (torch.Tensor): Embedding of the question.

    Returns:
        int: Index of the most similar sentence.
    """
    similarities = F.cosine_similarity(embeddings, question_embedding, dim=1)
    return torch.argmax(similarities).item()


question_embeddings = bert_model(**question_inputs).last_hidden_state[:, 0, :]


for i, question in enumerate(questions):
    most_similar_idx = find_most_similar(embeddings, question_embeddings[i].unsqueeze(0))
    print(f"Q: {question}")
    print(f"Answer: {sentences[most_similar_idx]}\n")




qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")


context = """
Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford.
They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police
in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later
revealed as the murderer, seeking revenge for past wrongs.
"""


print("\nUsing the QA pipeline with pre-trained BERT for QA:\n")
for question in questions:
    result = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Score: {result['score']:.4f}")
    print("-" * 50)

Q: Who introduced Watson to Holmes?
Answer: What was that?

Q: Where did Watson and Holmes live?
Answer: What was that?

Q: Who was the murderer?
Answer: Where did the blood come from?



config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



Using the QA pipeline with pre-trained BERT for QA:

Q: Who introduced Watson to Holmes?
Answer: Stamford
Score: 0.9813
--------------------------------------------------
Q: Where did Watson and Holmes live?
Answer: 221B Baker Street
Score: 0.9657
--------------------------------------------------
Q: Who was the murderer?
Answer: Jefferson Hope
Score: 0.9951
--------------------------------------------------


In [19]:

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):

        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits


        start_positions = inputs["start_positions"]
        end_positions = inputs["end_positions"]


        loss_start = F.cross_entropy(start_logits, start_positions)
        loss_end = F.cross_entropy(end_logits, end_positions)


        loss = (loss_start + loss_end) / 2
        return (loss, outputs) if return_outputs else loss





dataset = {
    "data": [
        {
            "title": "A Study in Scarlet",
            "paragraphs": [
                {
                    "context": "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.",
                    "qas": [
                        {
                            "id": "1",
                            "question": "Who introduced Watson to Holmes?",
                            "answers": [{"text": "Stamford", "answer_start": 63}],
                            "is_impossible": False
                        },
                        {
                            "id": "2",
                            "question": "Where did Watson and Holmes live?",
                            "answers": [{"text": "221B Baker Street", "answer_start": 108}],
                            "is_impossible": False
                        },
                        {
                            "id": "3",
                            "question": "What word was written in blood at the crime scene?",
                            "answers": [{"text": "RACHE", "answer_start": 223}],
                            "is_impossible": False
                        },
                        {
                            "id": "4",
                            "question": "Who was revealed as the murderer?",
                            "answers": [{"text": "Jefferson Hope", "answer_start": 260}],
                            "is_impossible": False
                        }
                    ]
                },
                {
                    "context": "Sherlock Holmes has remarkable skills in observation and deduction. He surprises Watson by identifying that Watson had served in Afghanistan simply by observing his appearance and behavior. Holmes also has an eccentric personality, often engaging in chemical experiments and playing the violin.",
                    "qas": [
                        {
                            "id": "5",
                            "question": "How did Holmes deduce that Watson had served in Afghanistan?",
                            "answers": [{"text": "by observing his appearance and behavior", "answer_start": 123}],
                            "is_impossible": False
                        },
                        {
                            "id": "6",
                            "question": "What instrument does Holmes play?",
                            "answers": [{"text": "violin", "answer_start": 216}],
                            "is_impossible": False
                        }
                    ]
                },
                {
                    "context": "Holmes received a case involving the murder of Enoch J. Drebber at Lauriston Gardens. The scene included a mysterious word, 'RACHE,' written in blood on the wall. Holmes deduced the murder was motivated by revenge.",
                    "qas": [
                        {
                            "id": "7",
                            "question": "What was the location of the murder scene?",
                            "answers": [{"text": "Lauriston Gardens", "answer_start": 46}],
                            "is_impossible": False
                        },
                        {
                            "id": "8",
                            "question": "What did Holmes deduce was the motive for the murder?",
                            "answers": [{"text": "revenge", "answer_start": 126}],
                            "is_impossible": False
                        }
                    ]
                },
                {
                    "context": "Holmes explained his investigative methods to Watson, emphasizing the importance of observation and deduction. He stated that the human brain is like an attic and must be furnished only with useful knowledge to avoid clutter.",
                    "qas": [
                        {
                            "id": "9",
                            "question": "How did Holmes describe the human brain?",
                            "answers": [{"text": "like an attic", "answer_start": 104}],
                            "is_impossible": False
                        },
                        {
                            "id": "10",
                            "question": "What did Holmes believe about acquiring knowledge irrelevant to his work?",
                            "answers": [{"text": "it crowded out useful facts", "answer_start": 150}],
                            "is_impossible": False
                        }
                    ]
                },
                {
                    "context": "Holmes' investigation of Lauriston Gardens revealed a complex trail of evidence, including the size of the footprints, a woman's wedding ring, and the type of cigar ash left at the scene.",
                    "qas": [
                        {
                            "id": "11",
                            "question": "What object found at the scene suggested a woman might be involved?",
                            "answers": [{"text": "a woman's wedding ring", "answer_start": 91}],
                            "is_impossible": False
                        },
                        {
                            "id": "12",
                            "question": "What clue did the cigar ash provide?",
                            "answers": [{"text": "the brand of cigar, Trichinopoly", "answer_start": 130}],
                            "is_impossible": False
                        }
                    ]
                }
            ]
        }
    ]
}


with open('/content/drive/My Drive/NLP_Project2/qa_dataset.json', 'w') as f:
    json.dump(dataset, f)
print("Dataset saved successfully!")


dataset_path = "/content/drive/My Drive/NLP_Project2/qa_dataset.json"
raw_datasets = load_dataset("json", data_files={"train": dataset_path, "validation": dataset_path})
print("Raw datasets loaded successfully!")
print("Dataset structure:", raw_datasets)
print("First train entry:", raw_datasets["train"][0])


def flatten_dataset(dataset):
    flattened_data = {"context": [], "question": [], "answers": []}
    for entry in dataset:
        for paragraph in entry["data"][0]["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                flattened_data["context"].append(context)
                flattened_data["question"].append(qa["question"])
                flattened_data["answers"].append(qa["answers"])
    return flattened_data

flattened_train = flatten_dataset(raw_datasets["train"])
flattened_validation = flatten_dataset(raw_datasets["validation"])


train_dataset = Dataset.from_dict(flattened_train)
validation_dataset = Dataset.from_dict(flattened_validation)


model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


def preprocess_function(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples["answers"]):

        start_char = answer[0]["answer_start"]
        end_char = start_char + len(answer[0]["text"])


        start_token = inputs.char_to_token(i, start_char)
        end_token = inputs.char_to_token(i, end_char - 1)


        if start_token is None or start_token >= tokenizer.model_max_length:
            start_token = tokenizer.model_max_length - 1
        if end_token is None or end_token >= tokenizer.model_max_length:
            end_token = tokenizer.model_max_length - 1

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_validation = validation_dataset.map(preprocess_function, batched=True)


training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/NLP_Project2/qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
    logging_steps=500,
    fp16=True
)


trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer
)


trainer.train()


trainer.save_model("/content/drive/My Drive/NLP_Project2/qa_finetuned_model")
tokenizer.save_pretrained("/content/drive/My Drive/NLP_Project2/qa_finetuned_model")
print("Fine-tuned model saved successfully!")


Dataset saved successfully!


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Raw datasets loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['data'],
        num_rows: 1
    })
})
First train entry: {'data': [{'title': 'A Study in Scarlet', 'paragraphs': [{'context': "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.", 'qas': [{'id': '1', 'question': 'Who introduced Watson to Holmes?', 'answers': [{'text': 'Stamford', 'answer_start': 63}], 'is_impossible': False}, {'id': '2', 'question': 'Where did Watson and Holmes live?', 'answers': [{'text': '221B Baker Street', 'answer_start': 108}], 'is_impos

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Map:   0%|          | 0/12 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss
0,No log,6.223967
2,No log,6.143372


Fine-tuned model saved successfully!


In [18]:
#Load models (out of box and fine tuned) for evaluation and comparison
#read qa dataset
#calculate Exact match and f1 matrices
#compare models




fine_tuned_model_path = "/content/drive/My Drive/NLP_Project2/qa_finetuned_model"
dataset_path = "/content/drive/My Drive/NLP_Project2/qa_dataset.json"


print("Loading the fine-tuned model...")
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)


print("Loading the pre-trained model...")
pre_trained_model_name = "bert-large-uncased-whole-word-masking-finetuned-squad"
pre_trained_model = AutoModelForQuestionAnswering.from_pretrained(pre_trained_model_name)
pre_trained_tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)


print("Loading the dataset...")
raw_datasets = load_dataset("json", data_files={"validation": dataset_path})

validation_dataset = raw_datasets["validation"]
print(f"Loaded {len(validation_dataset)} examples.")
print(validation_dataset[0])
print(validation_dataset.column_names)


def flatten_evaluation_dataset(dataset):
    """Flatten the dataset for evaluation purposes."""
    flattened_data = []


    for entry in dataset:

        if "data" in entry:
            entry_data = entry["data"]
            for sub_entry in entry_data:
                for paragraph in sub_entry["paragraphs"]:
                    context = paragraph["context"]
                    for qa in paragraph["qas"]:
                        flattened_data.append({
                            "context": context,
                            "question": qa["question"],
                            "answers": qa["answers"]
                        })
        else:
            raise ValueError("Dataset entry does not contain a 'data' key.")

    return flattened_data


evaluation_data = flatten_evaluation_dataset(validation_dataset)


def calculate_em_and_f1(prediction, ground_truth):
    """Calculate Exact Match (EM) and F1 score."""
    prediction_tokens = prediction.split()
    ground_truth_tokens = ground_truth.split()


    em = int(prediction == ground_truth)


    common_tokens = set(prediction_tokens) & set(ground_truth_tokens)
    if len(common_tokens) == 0:
        f1 = 0
    else:
        precision = len(common_tokens) / len(prediction_tokens)
        recall = len(common_tokens) / len(ground_truth_tokens)
        f1 = 2 * (precision * recall) / (precision + recall)

    return em, f1


def evaluate_qa_model(model, tokenizer, dataset):
    """Evaluate the QA model on the dataset."""
    em_scores = []
    f1_scores = []


    for example in dataset:

        context = example["context"]
        question = example["question"]
        answers = example["answers"]


        if isinstance(answers, list) and len(answers) > 0:
            ground_truth = answers[0]["text"]
        else:
            raise ValueError(f"Invalid 'answers' format: {answers}")


        inputs = tokenizer(
            question, context, return_tensors="pt", truncation=True, padding=True, max_length=512
        )


        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits


        start_idx = torch.argmax(start_logits)
        end_idx = torch.argmax(end_logits) + 1
        predicted_answer = tokenizer.decode(inputs["input_ids"][0][start_idx:end_idx], skip_special_tokens=True)


        em, f1 = calculate_em_and_f1(predicted_answer, ground_truth)
        em_scores.append(em)
        f1_scores.append(f1)


    avg_em = np.mean(em_scores)
    avg_f1 = np.mean(f1_scores)
    return avg_em, avg_f1


print("Evaluating the fine-tuned model...")
fine_tuned_em, fine_tuned_f1 = evaluate_qa_model(fine_tuned_model, fine_tuned_tokenizer, evaluation_data)


print("Evaluating the pre-trained model...")
pre_trained_em, pre_trained_f1 = evaluate_qa_model(pre_trained_model, pre_trained_tokenizer, evaluation_data)


print("\nEvaluation Results:")
print(f"Fine-Tuned Model: EM = {fine_tuned_em:.4f}, F1 = {fine_tuned_f1:.4f}")
print(f"Pre-Trained Model: EM = {pre_trained_em:.4f}, F1 = {pre_trained_f1:.4f}")

Loading the fine-tuned model...
Loading the pre-trained model...


Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Loading the dataset...
Loaded 1 examples.
{'data': [{'title': 'A Study in Scarlet', 'paragraphs': [{'context': "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.", 'qas': [{'id': '1', 'question': 'Who introduced Watson to Holmes?', 'answers': [{'text': 'Stamford', 'answer_start': 63}], 'is_impossible': False}, {'id': '2', 'question': 'Where did Watson and Holmes live?', 'answers': [{'text': '221B Baker Street', 'answer_start': 108}], 'is_impossible': False}, {'id': '3', 'question': 'What word was written in blood at the crime scene?', 'answers': [{'text': 'RACHE', 'answer_start': 223}], 'is_impossible': False}, {'id': '4', 'question': 'Who was