In [1]:
!pip install transformers torch datasets
!pip install evaluate
from evaluate import load

from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AutoModelForQuestionAnswering
from datasets import load_dataset, Dataset
import torch
import torch.nn.functional as F
import re
from nltk.tokenize import sent_tokenize
import nltk
import json
import os
import numpy as np
os.environ["WANDB_DISABLED"] = "true"
nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import drive
drive.mount('/content/drive')


book_path = "/content/drive/My Drive/NLP_Project2/a study in scarlet.txt"





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
def load_text(file_path):
    """
    Load text from a file.
    """
    print("Loading the text from the file...")
    with open(book_path, 'r', encoding='utf-8') as file:
        return file.read()

def remove_gutenberg_header_footer(text, start_marker, end_marker):
    """
    Remove the header and footer from Project Gutenberg text.

    Finds specified start and end markers and returns the content in between.

    Parameters:
        text (str): The text to clean.
        start_marker (str): Marker indicating the start of main content.
        end_marker (str): Marker indicating the end of main content.

    Returns:
        str: Text with header and footer removed, or original text if markers are not found.
    """
    start_index = text.find(start_marker)
    if start_index == -1:
        return text

    end_index = text.find(end_marker)
    if end_index == -1:
        return text

    cleaned_text = text[start_index + len(start_marker):end_index].strip()
    return cleaned_text

In [3]:
# here we are cleaning the text and removing the headers and footers
raw_text = load_text(book_path)

start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"

cleaned_text = remove_gutenberg_header_footer(raw_text, start_marker, end_marker)

print("Cleaned text sample:\n")
print(cleaned_text[:500])

Loading the text from the file...
Cleaned text sample:

A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.

 PART I.
 CHAPTER I. MR. SHERLOCK HOLMES.
 CHAPTER II. THE SCIENCE OF DEDUCTION.
 CHAPTER III. THE LAURISTON GARDENS MYSTERY
 CHAPTER IV. WHAT JOHN RANCE HAD TO TELL.
 CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.
 CHAPTER VI. TOBIAS GREGSON SHOWS WHAT HE CAN DO.
 CHAPTER VII. LIGHT IN THE DARKNESS.

 PART II. THE COUNTRY OF THE SAINTS
 CHAPTER I. ON THE GREAT ALKALI PLAIN.
 CHAPTER II. THE FLOWER OF UTAH.
 CHAPTER III. J


In [4]:
def tokenize_sentences(text):
    """
    Tokenize the input text into sentences using NLTK.

    Parameters:
        text (str): The text to tokenize.

    Returns:
        list: A list of tokenized sentences.
    """
    print("Tokenizing the text into sentences...")
    sentences = sent_tokenize(text)
    print(f"Total sentences: {len(sentences)}")
    return sentences

sentences = tokenize_sentences(cleaned_text)

print(f"Number of tokenized sentences: {len(sentences)}")
if len(sentences) == 0:
    raise ValueError("No sentences were tokenized. Please check the input text.")

print("\nSample tokenized sentences:")
for i, sentence in enumerate(sentences[:5], 1):
    print(f"{i}: {sentence}")

Tokenizing the text into sentences...
Total sentences: 2208
Number of tokenized sentences: 2208

Sample tokenized sentences:
1: A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.
2: PART I.
3: CHAPTER I. MR. SHERLOCK HOLMES.
4: CHAPTER II.
5: THE SCIENCE OF DEDUCTION.


In [5]:
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
print("BERT model initialized successfully!")

def tokenize_for_bert(sentences, tokenizer, max_length=512):
    """
    Tokenize sentences for BERT and prepare input tensors.

    Parameters:
        sentences (list): List of sentences to tokenize.
        tokenizer (BertTokenizer): Pre-trained BERT tokenizer.
        max_length (int): Maximum sequence length for BERT.

    Returns:
        dict: Tokenized inputs with attention masks.
    """
    print("Tokenizing sentences for BERT...")
    encoded_inputs = tokenizer(
        sentences,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    )
    return encoded_inputs


bert_inputs = tokenize_for_bert(sentences, bert_tokenizer)

if not bert_inputs or "input_ids" not in bert_inputs:
    raise ValueError("BERT inputs were not created properly. Check the tokenizer function.")
print("BERT inputs created successfully!")

print("\nSample tokenized input IDs:")
print(bert_inputs["input_ids"][:2])
print("\nAttention masks:")
print(bert_inputs["attention_mask"][:2])

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

BERT model initialized successfully!
Tokenizing sentences for BERT...
BERT inputs created successfully!

Sample tokenized input IDs:
tensor([[  101,  1037,  2817,  1999, 11862,  2011,  1037,  1012, 16608, 11294,
          8417,  1037,  2817,  1999, 11862,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0

In [6]:
bert_inputs = tokenize_for_bert(sentences, bert_tokenizer)
def generate_embeddings_in_batches(inputs, model, batch_size=32):
    """
    Generate embeddings for tokenized inputs using BERT in smaller batches.

    Parameters:
        inputs (dict): Tokenized inputs containing input_ids and attention_mask.
        model (BertModel): Pre-trained BERT model.
        batch_size (int): Number of sentences to process in each batch.

    Returns:
        torch.Tensor: Embeddings for all input sentences.
    """
    print("Generating embeddings in batches...")
    all_embeddings = []


    input_ids = inputs["input_ids"]
    attention_mask = inputs["attention_mask"]
    num_samples = input_ids.size(0)

    for start_idx in range(0, num_samples, batch_size):
        end_idx = min(start_idx + batch_size, num_samples)
        batch_input_ids = input_ids[start_idx:end_idx]
        batch_attention_mask = attention_mask[start_idx:end_idx]

        with torch.no_grad():
            outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
            batch_embeddings = outputs.last_hidden_state[:, 0, :]
            all_embeddings.append(batch_embeddings)

        print(f"Processed batch {start_idx // batch_size + 1}/{(num_samples + batch_size - 1) // batch_size}")


    return torch.cat(all_embeddings, dim=0)

bert_model = BertModel.from_pretrained('bert-base-uncased')

batch_size = 16
embeddings = generate_embeddings_in_batches(bert_inputs,bert_model,batch_size=batch_size)


print("Embeddings shape:", embeddings.shape)


torch.save(embeddings, '/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')
print("Embeddings saved successfully!")

Tokenizing sentences for BERT...


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Generating embeddings in batches...
Processed batch 1/138
Processed batch 2/138
Processed batch 3/138
Processed batch 4/138
Processed batch 5/138
Processed batch 6/138
Processed batch 7/138
Processed batch 8/138
Processed batch 9/138
Processed batch 10/138
Processed batch 11/138
Processed batch 12/138
Processed batch 13/138
Processed batch 14/138
Processed batch 15/138
Processed batch 16/138
Processed batch 17/138
Processed batch 18/138
Processed batch 19/138
Processed batch 20/138
Processed batch 21/138
Processed batch 22/138
Processed batch 23/138
Processed batch 24/138
Processed batch 25/138
Processed batch 26/138
Processed batch 27/138
Processed batch 28/138
Processed batch 29/138
Processed batch 30/138
Processed batch 31/138
Processed batch 32/138
Processed batch 33/138
Processed batch 34/138
Processed batch 35/138
Processed batch 36/138
Processed batch 37/138
Processed batch 38/138
Processed batch 39/138
Processed batch 40/138
Processed batch 41/138
Processed batch 42/138
Process

In [7]:

embeddings = torch.load('/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')
print("Loaded embeddings with shape:", embeddings.shape)

Loaded embeddings with shape: torch.Size([2208, 768])


  embeddings = torch.load('/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')


In [8]:

bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


questions = [
    "Who introduced Watson to Holmes?",
    "Where did Watson and Holmes live?",
    "Who was the murderer?",
]


question_inputs = bert_tokenizer(questions, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [9]:
def find_most_similar(embeddings, question_embedding):
    """
    Find the index of the most similar embedding based on cosine similarity.

    Parameters:
        embeddings (torch.Tensor): Sentence embeddings.
        question_embedding (torch.Tensor): Embedding of the question.

    Returns:
        int: Index of the most similar sentence.
    """
    similarities = F.cosine_similarity(embeddings, question_embedding, dim=1)
    return torch.argmax(similarities).item()


question_embeddings = bert_model(**question_inputs).last_hidden_state[:, 0, :]


for i, question in enumerate(questions):
    most_similar_idx = find_most_similar(embeddings, question_embeddings[i].unsqueeze(0))
    print(f"Q: {question}")
    print(f"Answer: {sentences[most_similar_idx]}\n")




qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")


context = """
Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford.
They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police
in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later
revealed as the murderer, seeking revenge for past wrongs.
"""


print("\nUsing the QA pipeline with pre-trained BERT for QA:\n")
for question in questions:
    result = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Score: {result['score']:.4f}")
    print("-" * 50)

Q: Who introduced Watson to Holmes?
Answer: What was that?

Q: Where did Watson and Holmes live?
Answer: What was that?

Q: Who was the murderer?
Answer: Where did the blood come from?



Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.



Using the QA pipeline with pre-trained BERT for QA:

Q: Who introduced Watson to Holmes?
Answer: Stamford
Score: 0.9813
--------------------------------------------------
Q: Where did Watson and Holmes live?
Answer: 221B Baker Street
Score: 0.9657
--------------------------------------------------
Q: Who was the murderer?
Answer: Jefferson Hope
Score: 0.9951
--------------------------------------------------


In [17]:

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):

        outputs = model(**inputs)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits


        start_positions = inputs["start_positions"]
        end_positions = inputs["end_positions"]


        loss_start = F.cross_entropy(start_logits, start_positions)
        loss_end = F.cross_entropy(end_logits, end_positions)


        loss = (loss_start + loss_end) / 2
        return (loss, outputs) if return_outputs else loss





dataset = {
  "data": [
    {
      "title": "A Study in Scarlet",
      "paragraphs": [
        {
          "context": "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.",
          "qas": [
            {
              "id": "1",
              "question": "Who introduced Watson to Holmes?",
              "answers": [{"text": "Stamford", "answer_start": 63}],
              "is_impossible": False
            },
            {
              "id": "2",
              "question": "Where did Watson and Holmes live?",
              "answers": [{"text": "221B Baker Street", "answer_start": 108}],
              "is_impossible": False
            },
            {
              "id": "3",
              "question": "What word was written in blood at the crime scene?",
              "answers": [{"text": "RACHE", "answer_start": 223}],
              "is_impossible": False
            },
            {
              "id": "4",
              "question": "Who was revealed as the murderer?",
              "answers": [{"text": "Jefferson Hope", "answer_start": 260}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Sherlock Holmes has remarkable skills in observation and deduction. He surprises Watson by identifying that Watson had served in Afghanistan simply by observing his appearance and behavior. Holmes also has an eccentric personality, often engaging in chemical experiments and playing the violin.",
          "qas": [
            {
              "id": "5",
              "question": "How did Holmes deduce that Watson had served in Afghanistan?",
              "answers": [{"text": "by observing his appearance and behavior", "answer_start": 123}],
              "is_impossible": False
            },
            {
              "id": "6",
              "question": "What instrument does Holmes play?",
              "answers": [{"text": "violin", "answer_start": 216}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes received a case involving the murder of Enoch J. Drebber at Lauriston Gardens. The scene included a mysterious word, 'RACHE,' written in blood on the wall. Holmes deduced the murder was motivated by revenge.",
          "qas": [
            {
              "id": "7",
              "question": "What was the location of the murder scene?",
              "answers": [{"text": "Lauriston Gardens", "answer_start": 46}],
              "is_impossible": False
            },
            {
              "id": "8",
              "question": "What did Holmes deduce was the motive for the murder?",
              "answers": [{"text": "revenge", "answer_start": 126}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes explained his investigative methods to Watson, emphasizing the importance of observation and deduction. He stated that the human brain is like an attic and must be furnished only with useful knowledge to avoid clutter.",
          "qas": [
            {
              "id": "9",
              "question": "How did Holmes describe the human brain?",
              "answers": [{"text": "like an attic", "answer_start": 104}],
              "is_impossible": False
            },
            {
              "id": "10",
              "question": "What did Holmes believe about acquiring knowledge irrelevant to his work?",
              "answers": [{"text": "it crowded out useful facts", "answer_start": 150}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes' investigation of Lauriston Gardens revealed a complex trail of evidence, including the size of the footprints, a woman's wedding ring, and the type of cigar ash left at the scene.",
          "qas": [
            {
              "id": "11",
              "question": "What object found at the scene suggested a woman might be involved?",
              "answers": [{"text": "a woman's wedding ring", "answer_start": 91}],
              "is_impossible": False
            },
            {
              "id": "12",
              "question": "What clue did the cigar ash provide?",
              "answers": [{"text": "the brand of cigar, Trichinopoly", "answer_start": 130}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Dr. John Watson, after sustaining a jezail bullet wound at the battle of Maiwand and suffering from enteric fever, was sent back to England for recovery. He spent time in London, seeking affordable accommodations while reflecting on his time in the Afghan war.",
          "qas": [
            {
              "id": "13",
              "question": "What injury did Watson sustain in Afghanistan?",
              "answers": [{"text": "a jezail bullet wound", "answer_start": 53}],
              "is_impossible": False
            },
            {
              "id": "14",
              "question": "Why was Watson sent back to England?",
              "answers": [{"text": "suffering from enteric fever", "answer_start": 86}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Sherlock Holmes deduced that Watson had served in Afghanistan by noticing his tan skin, medical knowledge, and military posture. Holmes explains his method as a series of observations and logical connections.",
          "qas": [
            {
              "id": "15",
              "question": "How did Holmes deduce Watsonâ€™s military background?",
              "answers": [{"text": "noticing his military posture", "answer_start": 82}],
              "is_impossible": False
            },
            {
              "id": "16",
              "question": "What specific detail led Holmes to conclude Watson served in a tropical climate?",
              "answers": [{"text": "his tan skin", "answer_start": 47}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes discovered a reagent that reacts only with hemoglobin, making it a reliable test for blood stains. He demonstrated the test by adding chemicals to a solution containing a minute amount of blood, resulting in a clear reaction.",
          "qas": [
            {
              "id": "17",
              "question": "What did Holmes' reagent test for?",
              "answers": [{"text": "hemoglobin", "answer_start": 43}],
              "is_impossible": False
            },
            {
              "id": "18",
              "question": "Why is Holmes' test significant in forensic science?",
              "answers": [{"text": "it is a reliable test for blood stains", "answer_start": 88}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "At Lauriston Gardens, the body of Enoch Drebber was found with no visible wounds. Blood marks surrounded the scene, and the word 'RACHE' was scrawled in blood on the wall. Holmes deduced the word was a clue left by the murderer.",
          "qas": [
            {
              "id": "19",
              "question": "What word was written on the wall at the crime scene?",
              "answers": [{"text": "RACHE", "answer_start": 112}],
              "is_impossible": False
            },
            {
              "id": "20",
              "question": "What did Holmes deduce about the word 'RACHE'?",
              "answers": [{"text": "it was a clue left by the murderer", "answer_start": 153}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Watson noted Holmes' exceptional knowledge in chemistry and law, while observing his ignorance in astronomy, politics, and philosophy. Holmes emphasized that his focus was solely on practical knowledge.",
          "qas": [
            {
              "id": "21",
              "question": "Which area of knowledge is Holmes profoundly skilled in?",
              "answers": [{"text": "chemistry", "answer_start": 50}],
              "is_impossible": False
            },
            {
              "id": "22",
              "question": "What is Holmesâ€™ view on acquiring non-practical knowledge?",
              "answers": [{"text": "he avoids it to focus on practical knowledge", "answer_start": 143}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Sherlock Holmes emphasized the importance of avoiding premature conclusions and focused on gathering evidence before forming theories. He explained this principle while investigating the Lauriston Gardens murder case.",
          "qas": [
            {
              "id": "23",
              "question": "What principle did Holmes emphasize during the investigation?",
              "answers": [{"text": "avoiding premature conclusions", "answer_start": 50}],
              "is_impossible": False
            },
            {
              "id": "24",
              "question": "Why does Holmes prefer gathering evidence before forming theories?",
              "answers": [{"text": "to avoid bias in judgment", "answer_start": 110}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes deduced from the trail of evidence at the scene that the murderer was likely left-handed, based on the direction of the blood spatter and positioning of objects.",
          "qas": [
            {
              "id": "25",
              "question": "What led Holmes to believe the murderer was left-handed?",
              "answers": [{"text": "direction of the blood spatter", "answer_start": 68}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "In the year 1878, Dr. John Watson completed his studies to become a Doctor of Medicine at the University of London. Shortly after, he was attached to the Fifth Northumberland Fusiliers, but before he could join, the second Afghan war broke out.",
          "qas": [
            {
              "id": "26",
              "question": "What year did Watson complete his studies?",
              "answers": [{"text": "1878", "answer_start": 7}],
              "is_impossible": False
            },
            {
              "id": "27",
              "question": "What regiment was Watson attached to?",
              "answers": [{"text": "Fifth Northumberland Fusiliers", "answer_start": 126}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "At the Criterion Bar in London, Watson unexpectedly met Stamford, an old acquaintance. Stamford introduced Watson to Sherlock Holmes, who was searching for someone to share lodgings at 221B Baker Street.",
          "qas": [
            {
              "id": "28",
              "question": "Where did Watson meet Stamford?",
              "answers": [{"text": "Criterion Bar", "answer_start": 6}],
              "is_impossible": False
            },
            {
              "id": "29",
              "question": "Who introduced Watson to Sherlock Holmes?",
              "answers": [{"text": "Stamford", "answer_start": 53}],
              "is_impossible": False
            },
            {
              "id": "30",
              "question": "What address did Holmes propose for shared lodgings?",
              "answers": [{"text": "221B Baker Street", "answer_start": 111}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Sherlock Holmes demonstrated his deductive abilities when he first met Watson by deducing that Watson had recently returned from Afghanistan. Holmes explained that he noticed Watson's tan skin, military bearing, and signs of illness.",
          "qas": [
            {
              "id": "31",
              "question": "How did Holmes deduce Watson's return from Afghanistan?",
              "answers": [{"text": "Watson's tan skin, military bearing, and signs of illness", "answer_start": 142}],
              "is_impossible": False
            },
            {
              "id": "32",
              "question": "What did Holmes deduce about Watson during their first meeting?",
              "answers": [{"text": "that Watson had recently returned from Afghanistan", "answer_start": 75}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes used a newly discovered reagent to test for blood. The reagent reacts only with hemoglobin and produces a distinct reaction, making it a reliable test for detecting blood stains, even if the blood is old.",
          "qas": [
            {
              "id": "33",
              "question": "What does Holmes' reagent test for?",
              "answers": [{"text": "hemoglobin", "answer_start": 43}],
              "is_impossible": False
            },
            {
              "id": "34",
              "question": "Why is Holmes' reagent significant?",
              "answers": [{"text": "It is a reliable test for detecting blood stains, even if the blood is old", "answer_start": 99}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "The body of Enoch Drebber was discovered at Lauriston Gardens. Blood was found at the scene, along with the word 'RACHE' scrawled on the wall. Holmes deduced that the word was a clue left by the murderer.",
          "qas": [
            {
              "id": "35",
              "question": "Where was Enoch Drebber's body found?",
              "answers": [{"text": "Lauriston Gardens", "answer_start": 38}],
              "is_impossible": False
            },
            {
              "id": "36",
              "question": "What word was written on the wall at the crime scene?",
              "answers": [{"text": "RACHE", "answer_start": 86}],
              "is_impossible": False
            },
            {
              "id": "37",
              "question": "What did Holmes deduce about the word 'RACHE'?",
              "answers": [{"text": "It was a clue left by the murderer", "answer_start": 120}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Sherlock Holmes emphasized the importance of observation and deduction. He compared the human brain to an attic that must only be furnished with useful knowledge to avoid clutter.",
          "qas": [
            {
              "id": "38",
              "question": "How did Holmes describe the human brain?",
              "answers": [{"text": "like an attic", "answer_start": 83}],
              "is_impossible": False
            },
            {
              "id": "39",
              "question": "What did Holmes believe about irrelevant knowledge?",
              "answers": [{"text": "It crowds out useful facts", "answer_start": 129}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "During his investigation, Holmes deduced that the murderer of Enoch Drebber was a man over six feet tall who smoked a Trichinopoly cigar. Holmes based this on the ash found at the scene.",
          "qas": [
            {
              "id": "40",
              "question": "How tall did Holmes deduce the murderer was?",
              "answers": [{"text": "over six feet tall", "answer_start": 64}],
              "is_impossible": False
            },
            {
              "id": "41",
              "question": "What type of cigar did Holmes identify at the scene?",
              "answers": [{"text": "Trichinopoly cigar", "answer_start": 100}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Watson observed that Holmes had extraordinary knowledge in chemistry and criminal history but was ignorant of astronomy, politics, and philosophy. Holmes justified this by saying he only retained knowledge useful to his work.",
          "qas": [
            {
              "id": "42",
              "question": "Which subject did Watson note Holmes was skilled in?",
              "answers": [{"text": "chemistry", "answer_start": 54}],
              "is_impossible": False
            },
            {
              "id": "43",
              "question": "Why did Holmes ignore subjects like astronomy and politics?",
              "answers": [{"text": "He only retained knowledge useful to his work", "answer_start": 170}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes advised against forming theories before gathering evidence, emphasizing that premature conclusions can bias judgment. He applied this principle during the investigation of Lauriston Gardens.",
          "qas": [
            {
              "id": "44",
              "question": "What principle did Holmes emphasize during investigations?",
              "answers": [{"text": "Avoid forming theories before gathering evidence", "answer_start": 52}],
              "is_impossible": False
            },
            {
              "id": "45",
              "question": "Why did Holmes avoid premature conclusions?",
              "answers": [{"text": "They can bias judgment", "answer_start": 112}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Dr. John Watson, recovering from enteric fever and a jezail bullet wound from the battle of Maiwand, returned to England after being deemed unfit for further duty in Afghanistan.",
          "qas": [
            {
              "id": "46",
              "question": "What illness did Watson suffer from in Afghanistan?",
              "answers": [{"text": "enteric fever", "answer_start": 41}],
              "is_impossible": False
            },
            {
              "id": "47",
              "question": "What injury did Watson sustain at the battle of Maiwand?",
              "answers": [{"text": "a jezail bullet wound", "answer_start": 22}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Sherlock Holmes described his profession as a 'consulting detective,' assisting both government and private detectives when they were unable to solve a case.",
          "qas": [
            {
              "id": "48",
              "question": "What did Holmes call his profession?",
              "answers": [{"text": "consulting detective", "answer_start": 34}],
              "is_impossible": False
            },
            {
              "id": "49",
              "question": "Who did Holmes assist in his profession?",
              "answers": [{"text": "government and private detectives", "answer_start": 69}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes pointed out that the bloodstains at Lauriston Gardens likely belonged to the murderer, as there were no visible wounds on Enoch Drebber's body.",
          "qas": [
            {
              "id": "50",
              "question": "Whose blood did Holmes believe was found at Lauriston Gardens?",
              "answers": [{"text": "the murderer's", "answer_start": 48}],
              "is_impossible": False
            },
            {
              "id": "51",
              "question": "Why did Holmes believe the blood was not Drebber's?",
              "answers": [{"text": "there were no visible wounds on Enoch Drebber's body", "answer_start": 77}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes explained that the word 'RACHE' written in blood was not a name but the German word for 'revenge,' providing a clue about the murder's motive.",
          "qas": [
            {
              "id": "52",
              "question": "What does the word 'RACHE' mean in German?",
              "answers": [{"text": "revenge", "answer_start": 69}],
              "is_impossible": False
            },
            {
              "id": "53",
              "question": "What did the word 'RACHE' suggest about the motive?",
              "answers": [{"text": "providing a clue about the murder's motive", "answer_start": 95}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes deduced that the murderer had small feet for his height, smoked a specific type of cigar, and wore square-toed boots, based on the evidence at the scene.",
          "qas": [
            {
              "id": "54",
              "question": "What type of boots did the murderer wear?",
              "answers": [{"text": "square-toed boots", "answer_start": 108}],
              "is_impossible": False
            },
            {
              "id": "55",
              "question": "What did Holmes deduce about the murderer's feet?",
              "answers": [{"text": "small feet for his height", "answer_start": 45}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Watson observed Holmes' extraordinary ability to identify soils and splashes on his trousers, deducing exactly where he had been walking in London.",
          "qas": [
            {
              "id": "56",
              "question": "What skill did Holmes demonstrate with soil analysis?",
              "answers": [{"text": "identifying soils and splashes", "answer_start": 27}],
              "is_impossible": False
            },
            {
              "id": "57",
              "question": "What could Holmes deduce from splashes on his trousers?",
              "answers": [{"text": "exactly where he had been walking in London", "answer_start": 81}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes believed that knowledge not directly relevant to his work should be discarded, arguing that the brain has limited capacity, which should not be cluttered with irrelevant facts.",
          "qas": [
            {
              "id": "58",
              "question": "Why did Holmes believe irrelevant knowledge should be discarded?",
              "answers": [{"text": "the brain has limited capacity", "answer_start": 94}],
              "is_impossible": False
            },
            {
              "id": "59",
              "question": "What was Holmes' argument against cluttering the brain?",
              "answers": [{"text": "it should not be cluttered with irrelevant facts", "answer_start": 140}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "While inspecting the crime scene, Holmes collected a sample of cigar ash and determined it was from a Trichinopoly cigar, which narrowed down the list of suspects.",
          "qas": [
            {
              "id": "60",
              "question": "What type of cigar ash did Holmes find at the crime scene?",
              "answers": [{"text": "Trichinopoly cigar", "answer_start": 74}],
              "is_impossible": False
            },
            {
              "id": "61",
              "question": "Why was the cigar ash significant to Holmes?",
              "answers": [{"text": "it narrowed down the list of suspects", "answer_start": 104}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "Holmes demonstrated his violin-playing skills, often improvising pieces to match his thoughts, which Watson found both impressive and eccentric.",
          "qas": [
            {
              "id": "62",
              "question": "What instrument did Holmes play?",
              "answers": [{"text": "violin", "answer_start": 40}],
              "is_impossible": False
            },
            {
              "id": "63",
              "question": "How did Watson describe Holmes' violin playing?",
              "answers": [{"text": "impressive and eccentric", "answer_start": 114}],
              "is_impossible": False
            }
          ]
        },
        {
          "context": "At Lauriston Gardens, Holmes deduced that the murderer had arrived in a cab with his victim, noting specific wheel and hoof marks outside the house.",
          "qas": [
            {
              "id": "64",
              "question": "How did Holmes deduce the murderer arrived at Lauriston Gardens?",
              "answers": [{"text": "noting specific wheel and hoof marks outside the house", "answer_start": 89}],
              "is_impossible": False
            },
            {
              "id": "65",
              "question": "Who did Holmes believe arrived in the cab?",
              "answers": [{"text": "the murderer and his victim", "answer_start": 66}],
              "is_impossible": False
            }
          ]
        }
      ]
    }
  ]
}



with open('/content/drive/My Drive/NLP_Project2/qa_dataset.json', 'w') as f:
    json.dump(dataset, f)
print("Dataset saved successfully!")


dataset_path = "/content/drive/My Drive/NLP_Project2/qa_dataset.json"

raw_datasets = load_dataset(
    "json",
    data_files={"train": dataset_path, "validation": dataset_path},
    field="data"
)
print("Raw datasets loaded successfully!")
print("Dataset structure:", raw_datasets)
print("First train entry:", raw_datasets["train"][0])


def flatten_dataset(dataset):
    flattened_data = {"context": [], "question": [], "answers": []}
    for entry in dataset:
        for paragraph in entry["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                flattened_data["context"].append(context)
                flattened_data["question"].append(qa["question"])
                if len(qa["answers"]) > 0:

                    answer_texts = [answer["text"] for answer in qa["answers"]]
                    answer_starts = [answer["answer_start"] for answer in qa["answers"]]
                    flattened_data["answers"].append(
                        {"text": answer_texts, "answer_start": answer_starts}
                    )
                else:

                    flattened_data["answers"].append({"text": [], "answer_start": []})
    return flattened_data

flattened_train = flatten_dataset(raw_datasets["train"])
flattened_validation = flatten_dataset(raw_datasets["validation"])


train_dataset = Dataset.from_dict(flattened_train)
validation_dataset = Dataset.from_dict(flattened_validation)


model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)


def preprocess_function(examples):
    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):

        sequence_ids = tokenized_examples.sequence_ids(i)


        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]


        if len(answers["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:

            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])


            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1


            token_end_index = len(offsets) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1


            if offsets[token_start_index][0] > end_char or offsets[token_end_index][1] < start_char:

                start_positions.append(0)
                end_positions.append(0)
            else:

                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names
)
tokenized_validation = validation_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=validation_dataset.column_names
)


training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/NLP_Project2/qa_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=50,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
    fp16=True
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer
)


trainer.train()


trainer.save_model("/content/drive/My Drive/NLP_Project2/qa_finetuned_model")
tokenizer.save_pretrained("/content/drive/My Drive/NLP_Project2/qa_finetuned_model")
print("Fine-tuned model saved successfully!")

Dataset saved successfully!


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Raw datasets loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 1
    })
})
First train entry: {'title': 'A Study in Scarlet', 'paragraphs': [{'context': "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.", 'qas': [{'answers': [{'answer_start': 63, 'text': 'Stamford'}], 'id': '1', 'is_impossible': False, 'question': 'Who introduced Watson to Holmes?'}, {'answers': [{'answer_start': 108, 'text': '221B Baker Street'}], 'id': '2', 'is_impossible': False, 'question': 'W

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Map:   0%|          | 0/65 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,No log,3.847251
2,No log,3.176337
3,No log,2.566496
4,No log,2.11398
5,No log,1.626942
6,No log,1.406156
7,No log,0.933556
8,No log,0.864672
9,No log,0.616648
10,No log,0.40196


Fine-tuned model saved successfully!


In [18]:
#Load models (out of box and fine tuned) for evaluation and comparison
#read qa dataset
#calculate Exact match and f1 matrices
#compare models





squad_metric = load("squad")

fine_tuned_model_path = "/content/drive/My Drive/NLP_Project2/qa_finetuned_model"
dataset_path = "/content/drive/My Drive/NLP_Project2/qa_dataset.json"

print("Loading the fine-tuned model...")
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_path)

print("Loading the pre-trained model...")
pre_trained_model_name = "bert-base-uncased"
pre_trained_model = AutoModelForQuestionAnswering.from_pretrained(pre_trained_model_name)
pre_trained_tokenizer = AutoTokenizer.from_pretrained(pre_trained_model_name)

print("Loading the dataset...")
raw_datasets = load_dataset("json", data_files={"validation": dataset_path})

validation_dataset = raw_datasets["validation"]
print(f"Loaded {len(validation_dataset)} examples.")
print(validation_dataset[0])
print(validation_dataset.column_names)

def flatten_evaluation_dataset(dataset):
    """Flatten the dataset for evaluation purposes."""
    flattened_data = []

    for entry in dataset:
        if "data" in entry:
            entry_data = entry["data"]
            for sub_entry in entry_data:
                for paragraph in sub_entry["paragraphs"]:
                    context = paragraph["context"]
                    for qa in paragraph["qas"]:
                        flattened_data.append({
                            "context": context,
                            "question": qa["question"],
                            "answers": qa["answers"]
                        })
        else:
            raise ValueError("Dataset entry does not contain a 'data' key.")

    return flattened_data

evaluation_data = flatten_evaluation_dataset(validation_dataset)

def evaluate_qa_model(model, tokenizer, dataset):
    """Evaluate the QA model on the dataset using the SQuAD metric."""
    predictions = []
    references = []

    for idx, example in enumerate(dataset):
        context = example["context"]
        question = example["question"]
        answers = example["answers"]
        qas_id = str(idx)

        if isinstance(answers, list) and len(answers) > 0:
            ground_truth_texts = [ans["text"] for ans in answers]
        else:
            raise ValueError(f"Invalid 'answers' format: {answers}")

        inputs = tokenizer(
            question,
            context,
            return_tensors="pt",
            truncation=True,
            padding=True,
            max_length=512
        )

        with torch.no_grad():
            outputs = model(**inputs)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits


        start_index = torch.argmax(start_logits, dim=1).item()
        end_index = torch.argmax(end_logits, dim=1).item() + 1


        input_ids = inputs["input_ids"][0]
        predicted_answer = tokenizer.decode(
            input_ids[start_index:end_index],
            skip_special_tokens=True
        )


        predictions.append({
            "id": qas_id,
            "prediction_text": predicted_answer
        })

        references.append({
            "id": qas_id,
            "answers": {
                "text": ground_truth_texts,
                "answer_start": [ans["answer_start"] for ans in answers],
            }
        })


    results = squad_metric.compute(predictions=predictions, references=references)
    return results["exact_match"], results["f1"]

print("Evaluating the fine-tuned model...")
fine_tuned_em, fine_tuned_f1 = evaluate_qa_model(fine_tuned_model, fine_tuned_tokenizer, evaluation_data)

print("Evaluating the pre-trained model...")
pre_trained_em, pre_trained_f1 = evaluate_qa_model(pre_trained_model, pre_trained_tokenizer, evaluation_data)

print("\nEvaluation Results:")
print(f"Fine-Tuned Model: EM = {fine_tuned_em:.2f}, F1 = {fine_tuned_f1:.2f}")
print(f"Pre-Trained Model: EM = {pre_trained_em:.2f}, F1 = {pre_trained_f1:.2f}")

Loading the fine-tuned model...
Loading the pre-trained model...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loading the dataset...


Generating validation split: 0 examples [00:00, ? examples/s]

Loaded 1 examples.
{'data': [{'title': 'A Study in Scarlet', 'paragraphs': [{'context': "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.", 'qas': [{'id': '1', 'question': 'Who introduced Watson to Holmes?', 'answers': [{'text': 'Stamford', 'answer_start': 63}], 'is_impossible': False}, {'id': '2', 'question': 'Where did Watson and Holmes live?', 'answers': [{'text': '221B Baker Street', 'answer_start': 108}], 'is_impossible': False}, {'id': '3', 'question': 'What word was written in blood at the crime scene?', 'answers': [{'text': 'RACHE', 'answer_start': 223}], 'is_impossible': False}, {'id': '4', 'question': 'Who was revealed as the murder