In [None]:
# Install required libraries
!pip install transformers torch datasets

# Import necessary libraries
from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import torch
import torch.nn.functional as F
import re
from nltk.tokenize import sent_tokenize
import nltk
import json
import os
# Set before any Trainer is built; the training cell's output warns that this
# variable is deprecated in favour of the `report_to` setting.
os.environ["WANDB_DISABLED"] = "true"
# Tokenizer data for NLTK; sent_tokenize is used later in this notebook.
nltk.download('punkt')
nltk.download('punkt_tab')
# Colab-only: mount Google Drive so the book and all artefacts live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Location of the book's plain-text file on the mounted Drive


book_path = "/content/drive/My Drive/NLP_Project2/a study in scarlet.txt" # Update this path to match the uploaded file location





[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def load_text(file_path):
    """
    Load the full contents of a UTF-8 encoded text file.

    Parameters:
        file_path (str): Path of the file to read.

    Returns:
        str: The entire content of the file.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    print("Loading the text from the file...")
    # Read the path that was passed in; the previous version ignored the
    # argument and always opened the notebook-level `book_path` global.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.read()

def remove_gutenberg_header_footer(text, start_marker, end_marker):
    """
    Remove the header and footer from Project Gutenberg text.

    Finds the start marker, then the first end marker that follows it, and
    returns the stripped content in between.

    Parameters:
        text (str): The text to clean.
        start_marker (str): Marker indicating the start of main content.
        end_marker (str): Marker indicating the end of main content.

    Returns:
        str: Text with header and footer removed, or the original text if the
        start marker is missing or no end marker follows it.
    """
    start_index = text.find(start_marker)
    if start_index == -1:
        return text

    content_start = start_index + len(start_marker)

    # Search for the end marker only after the start marker. Searching from the
    # beginning of the text would pick up an earlier occurrence of the end
    # marker and make the slice below come out empty.
    end_index = text.find(end_marker, content_start)
    if end_index == -1:
        return text

    return text[content_start:end_index].strip()

In [None]:
# Load the raw text using the load_text function
raw_text = load_text(book_path)

# Define the markers for Project Gutenberg
# (exact marker strings; if either is not found the helper returns the text unchanged)
start_marker = "*** START OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"
end_marker = "*** END OF THE PROJECT GUTENBERG EBOOK A STUDY IN SCARLET ***"

# Remove the headers and footers
cleaned_text = remove_gutenberg_header_footer(raw_text, start_marker, end_marker)

# Display a sample of the cleaned text
# NOTE: as the sample output shows, the cleaned text still begins with the
# title page and table of contents; only the Gutenberg wrapper is removed.
print("Cleaned text sample:\n")
print(cleaned_text[:500])

Loading the text from the file...
Cleaned text sample:

A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.

 PART I.
 CHAPTER I. MR. SHERLOCK HOLMES.
 CHAPTER II. THE SCIENCE OF DEDUCTION.
 CHAPTER III. THE LAURISTON GARDENS MYSTERY
 CHAPTER IV. WHAT JOHN RANCE HAD TO TELL.
 CHAPTER V. OUR ADVERTISEMENT BRINGS A VISITOR.
 CHAPTER VI. TOBIAS GREGSON SHOWS WHAT HE CAN DO.
 CHAPTER VII. LIGHT IN THE DARKNESS.

 PART II. THE COUNTRY OF THE SAINTS
 CHAPTER I. ON THE GREAT ALKALI PLAIN.
 CHAPTER II. THE FLOWER OF UTAH.
 CHAPTER III. J


In [None]:
# Tokenize the cleaned text into sentences
def tokenize_sentences(text):
    """
    Split raw text into sentences with NLTK's sent_tokenize.

    Progress and the resulting sentence count are printed to stdout.

    Parameters:
        text (str): The text to split.

    Returns:
        list: The sentences, in the order sent_tokenize produced them.
    """
    print("Tokenizing the text into sentences...")
    split_result = sent_tokenize(text)
    sentence_count = len(split_result)
    print(f"Total sentences: {sentence_count}")
    return split_result

# Apply sentence tokenization
sentences = tokenize_sentences(cleaned_text)

# Verify sentence tokenization
# (the count is also printed inside tokenize_sentences, so it appears twice in the output)
print(f"Number of tokenized sentences: {len(sentences)}")
if len(sentences) == 0:
    raise ValueError("No sentences were tokenized. Please check the input text.")

# Display the first 5 sentences for verification
# NOTE: in the captured output the first "sentence" spans the title, author and
# contents heading, and chapter headings are split at the abbreviation periods.
print("\nSample tokenized sentences:")
for i, sentence in enumerate(sentences[:5], 1):
    print(f"{i}: {sentence}")

Tokenizing the text into sentences...
Total sentences: 2208
Number of tokenized sentences: 2208

Sample tokenized sentences:
1: A STUDY IN SCARLET

By A. Conan Doyle




CONTENTS

 A STUDY IN SCARLET.
2: PART I.
3: CHAPTER I. MR. SHERLOCK HOLMES.
4: CHAPTER II.
5: THE SCIENCE OF DEDUCTION.


In [None]:
# Initialize BERT tokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# NOTE(review): only the tokenizer has been loaded at this point; the BertModel
# itself is loaded in a later cell, so the message below is slightly misleading.
print("BERT model initialized successfully!")

def tokenize_for_bert(sentences, tokenizer, max_length=512):
    """
    Encode a list of sentences with the given tokenizer.

    The tokenizer is called once on the whole list, asking for PyTorch
    tensors ("pt"), padding, and truncation to `max_length`.

    Parameters:
        sentences (list): Sentences to encode.
        tokenizer (BertTokenizer): Callable tokenizer that accepts the keyword
            options used below.
        max_length (int): Upper bound on the encoded sequence length.

    Returns:
        dict: Whatever the tokenizer call returns (callers read "input_ids"
        and "attention_mask" from it).
    """
    print("Tokenizing sentences for BERT...")
    encoding_options = {
        "return_tensors": "pt",
        "padding": True,
        "truncation": True,
        "max_length": max_length,
    }
    return tokenizer(sentences, **encoding_options)

# Tokenize sentences
# All sentences are encoded in a single call, so every row is padded to the
# same length (see the zero-padded IDs in the sample output).
bert_inputs = tokenize_for_bert(sentences, bert_tokenizer)

# Verify BERT tokenizer inputs
if not bert_inputs or "input_ids" not in bert_inputs:
    raise ValueError("BERT inputs were not created properly. Check the tokenizer function.")
print("BERT inputs created successfully!")

# Display a sample of the tokenized inputs
print("\nSample tokenized input IDs:")
print(bert_inputs["input_ids"][:2])  # Display first two tokenized inputs
print("\nAttention masks:")
print(bert_inputs["attention_mask"][:2])  # Display first two attention masks

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Tokenizing sentences for BERT...
BERT inputs created successfully!

Sample tokenized input IDs:
tensor([[  101,  1037,  2817,  1999, 11862,  2011,  1037,  1012, 16608, 11294,
          8417,  1037,  2817,  1999, 11862,  1012,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0

In [None]:
# Re-create the encoded inputs; this repeats the call already made in the
# tokenization cell above with the same arguments.
bert_inputs = tokenize_for_bert(sentences, bert_tokenizer)
def generate_embeddings_in_batches(inputs, model, batch_size=32):
    """
    Run the model over the encoded inputs batch by batch and collect one
    vector per row.

    For each batch the model is called under torch.no_grad() and the hidden
    state at sequence position 0 (the [CLS] slot) is kept. A progress line is
    printed after every batch.

    Parameters:
        inputs (dict): Encoded inputs providing "input_ids" and "attention_mask".
        model (BertModel): Callable taking `input_ids` / `attention_mask`
            keywords and returning an object with `last_hidden_state`.
        batch_size (int): Number of rows sent to the model per call.

    Returns:
        torch.Tensor: The per-batch position-0 vectors concatenated along dim 0.
    """
    print("Generating embeddings in batches...")
    token_ids = inputs["input_ids"]
    mask = inputs["attention_mask"]
    num_samples = token_ids.size(0)

    cls_vectors = []
    batch_starts = range(0, num_samples, batch_size)
    for batch_no, lo in enumerate(batch_starts, start=1):
        hi = min(lo + batch_size, num_samples)

        # Inference only: no autograd graph is needed for feature extraction
        with torch.no_grad():
            hidden = model(
                input_ids=token_ids[lo:hi],
                attention_mask=mask[lo:hi],
            ).last_hidden_state
            cls_vectors.append(hidden[:, 0, :])

        print(f"Processed batch {batch_no}/{(num_samples + batch_size - 1) // batch_size}")

    # Stitch the per-batch results back into one tensor
    return torch.cat(cls_vectors, dim=0)

# Load the pre-trained encoder matching the 'bert-base-uncased' tokenizer used above
bert_model = BertModel.from_pretrained('bert-base-uncased')
# Generate embeddings in batches
batch_size = 16  # Adjust batch size based on available memory
embeddings = generate_embeddings_in_batches(bert_inputs,bert_model,batch_size=batch_size)

# Display the shape of the embeddings
print("Embeddings shape:", embeddings.shape)

# Save embeddings for future use
# (written to the mounted Drive so the next cell can reload them without recomputing)
torch.save(embeddings, '/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')
print("Embeddings saved successfully!")

Tokenizing sentences for BERT...
Generating embeddings in batches...
Processed batch 1/138
Processed batch 2/138
Processed batch 3/138
Processed batch 4/138
Processed batch 5/138
Processed batch 6/138
Processed batch 7/138
Processed batch 8/138
Processed batch 9/138
Processed batch 10/138
Processed batch 11/138
Processed batch 12/138
Processed batch 13/138
Processed batch 14/138
Processed batch 15/138
Processed batch 16/138
Processed batch 17/138
Processed batch 18/138
Processed batch 19/138
Processed batch 20/138
Processed batch 21/138
Processed batch 22/138
Processed batch 23/138
Processed batch 24/138
Processed batch 25/138
Processed batch 26/138
Processed batch 27/138
Processed batch 28/138
Processed batch 29/138
Processed batch 30/138
Processed batch 31/138
Processed batch 32/138
Processed batch 33/138
Processed batch 34/138
Processed batch 35/138
Processed batch 36/138
Processed batch 37/138
Processed batch 38/138
Processed batch 39/138
Processed batch 40/138
Processed batch 41/1

In [None]:
# Load the embeddings
# The file holds a single tensor written by torch.save, so restrict unpickling
# to tensor data with weights_only=True. This avoids executing arbitrary
# pickled code and is intended to silence the warning torch.load emits when
# the flag is left unset.
embeddings = torch.load('/content/drive/My Drive/NLP_Project2/bert_embeddings.pt', weights_only=True)
print("Loaded embeddings with shape:", embeddings.shape)

Loaded embeddings with shape: torch.Size([2208, 768])


  embeddings = torch.load('/content/drive/My Drive/NLP_Project2/bert_embeddings.pt')


In [None]:
# Initialize the tokenizer
# (same 'bert-base-uncased' checkpoint as the tokenizer created earlier; reloading
# it here rebinds bert_tokenizer to an equivalent object)
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example questions
questions = [
    "Who introduced Watson to Holmes?",
    "Where did Watson and Holmes live?",
    "Who was the murderer?",
]

# Encode the questions
# Same tokenizer options as tokenize_for_bert: PyTorch tensors, padding, truncation at 512.
question_inputs = bert_tokenizer(questions, return_tensors="pt", padding=True, truncation=True, max_length=512)

In [None]:
def find_most_similar(embeddings, question_embedding):
    """
    Return the row index of the embedding closest to the question.

    Closeness is cosine similarity computed along dim 1, so
    `question_embedding` must be broadcastable against `embeddings`.

    Parameters:
        embeddings (torch.Tensor): Sentence embeddings, one per row.
        question_embedding (torch.Tensor): Embedding of the question.

    Returns:
        int: Index of the row with the highest cosine similarity.
    """
    scores = F.cosine_similarity(embeddings, question_embedding, dim=1)
    best_match = scores.argmax()
    return best_match.item()

# Get the [CLS] embedding for each question
# Run under no_grad, like generate_embeddings_in_batches does for the sentences:
# this is pure inference, so building an autograd graph only wastes memory and
# leaves question_embeddings attached to the model's parameters.
with torch.no_grad():
    question_embeddings = bert_model(**question_inputs).last_hidden_state[:, 0, :]  # [num_questions, hidden_dim]

# Find and display the most relevant sentences for each question
for i, question in enumerate(questions):
    # unsqueeze(0) turns the [hidden_dim] vector into [1, hidden_dim] so it
    # broadcasts against the [num_sentences, hidden_dim] embedding matrix.
    most_similar_idx = find_most_similar(embeddings, question_embeddings[i].unsqueeze(0))
    print(f"Q: {question}")
    print(f"Answer: {sentences[most_similar_idx]}\n")



# Set up a QA pipeline with a pre-trained QA model
# (a BERT-large checkpoint already fine-tuned on SQuAD, per its name; the captured
# output shows a 1.34G weights download on first use)
qa_pipeline = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad")

# Example context (a chunk of the novel for QA testing)
# NOTE: this is a hand-written summary, not text taken from `cleaned_text`.
context = """
Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford.
They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police
in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later
revealed as the murderer, seeking revenge for past wrongs.
"""

# Ask questions
# Each result is read as a mapping with an 'answer' string and a numeric 'score'.
print("\nUsing the QA pipeline with pre-trained BERT for QA:\n")
for question in questions:
    result = qa_pipeline(question=question, context=context)
    print(f"Q: {question}")
    print(f"Answer: {result['answer']}")
    print(f"Score: {result['score']:.4f}")
    print("-" * 50)

Q: Who introduced Watson to Holmes?
Answer: What was that?

Q: Where did Watson and Holmes live?
Answer: What was that?

Q: Who was the murderer?
Answer: Where did the blood come from?



config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]


Using the QA pipeline with pre-trained BERT for QA:

Q: Who introduced Watson to Holmes?
Answer: Stamford
Score: 0.9813
--------------------------------------------------
Q: Where did Watson and Holmes live?
Answer: 221B Baker Street
Score: 0.9657
--------------------------------------------------
Q: Who was the murderer?
Answer: Jefferson Hope
Score: 0.9951
--------------------------------------------------


In [None]:

class CustomTrainer(Trainer):
    """Trainer whose loss is the mean of the start- and end-position cross-entropies."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """
        Compute the extractive-QA loss for one batch.

        Parameters:
            model: Model called with `**inputs`; its output must expose
                `start_logits` and `end_logits`.
            inputs (dict): Batch that includes "start_positions" and
                "end_positions" alongside the model inputs.
            return_outputs (bool): When true, also return the model outputs.
            **kwargs: Accepted and ignored.

        Returns:
            The loss tensor, or a `(loss, outputs)` tuple if `return_outputs` is set.
        """
        # Single forward pass over the whole batch
        outputs = model(**inputs)

        # Pair each logits tensor with its gold token index tensor
        logits_pair = (outputs.start_logits, outputs.end_logits)
        target_pair = (inputs["start_positions"], inputs["end_positions"])

        start_loss, end_loss = (
            F.cross_entropy(logits, target)
            for logits, target in zip(logits_pair, target_pair)
        )

        # Average the two span-boundary losses
        loss = (start_loss + end_loss) / 2
        if return_outputs:
            return (loss, outputs)
        return loss




# Step 1: Create a QA Dataset
def _build_qa_entry(qa_id, question, answer_text, context):
    """
    Build one SQuAD-style QA record for `context`.

    The character offset of the answer is looked up in the context instead of
    being typed by hand, so "answer_start" always points at `answer_text`.

    Raises:
        ValueError: If `answer_text` does not occur in `context`.
    """
    answer_start = context.find(answer_text)
    if answer_start == -1:
        raise ValueError(f"Answer {answer_text!r} does not occur in its context.")
    return {
        "id": qa_id,
        "question": question,
        "answers": [{"text": answer_text, "answer_start": answer_start}],
        "is_impossible": False,
    }


_PLOT_CONTEXT = "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs."
_HOLMES_CONTEXT = "Sherlock Holmes has remarkable skills in observation and deduction. He surprises Watson by identifying that Watson had served in Afghanistan simply by observing his appearance and behavior. Holmes also has an eccentric personality, often engaging in chemical experiments and playing the violin."

# Same structure and content as before; only the answer_start values differ.
# The previously hard-coded offsets did not point at the answer text
# (e.g. "Stamford" starts at character 89 of its context, not 63).
dataset = {
    "data": [
        {
            "title": "A Study in Scarlet",
            "paragraphs": [
                {
                    "context": _PLOT_CONTEXT,
                    "qas": [
                        _build_qa_entry("1", "Who introduced Watson to Holmes?", "Stamford", _PLOT_CONTEXT),
                        _build_qa_entry("2", "Where did Watson and Holmes live?", "221B Baker Street", _PLOT_CONTEXT),
                        _build_qa_entry("3", "What word was written in blood at the crime scene?", "RACHE", _PLOT_CONTEXT),
                        _build_qa_entry("4", "Who was revealed as the murderer?", "Jefferson Hope", _PLOT_CONTEXT),
                    ]
                },
                {
                    "context": _HOLMES_CONTEXT,
                    "qas": [
                        _build_qa_entry("5", "How did Holmes deduce that Watson had served in Afghanistan?", "by observing his appearance and behavior", _HOLMES_CONTEXT),
                        _build_qa_entry("6", "What instrument does Holmes play?", "violin", _HOLMES_CONTEXT),
                    ]
                }
            ]
        }
    ]
}

# Save the dataset
# Written as a single JSON document to the mounted Drive; the next step reads it back.
with open('/content/drive/My Drive/NLP_Project2/qa_dataset.json', 'w') as f:
    json.dump(dataset, f)
print("Dataset saved successfully!")

# Step 2: Inspect Dataset Structure
dataset_path = "/content/drive/My Drive/NLP_Project2/qa_dataset.json"
# NOTE: "train" and "validation" both point at the same file, so the validation
# loss reported during training is measured on the training examples.
raw_datasets = load_dataset("json", data_files={"train": dataset_path, "validation": dataset_path})
print("Raw datasets loaded successfully!")
# The captured output shows each split as one row with a single 'data' column.
print("Dataset structure:", raw_datasets)
print("First train entry:", raw_datasets["train"][0])

# Step 3: Flatten the Dataset
def flatten_dataset(dataset):
    """
    Flatten SQuAD-style nested rows into parallel column lists.

    Parameters:
        dataset (iterable): Rows, each a mapping whose "data" value is a list
            of articles; every article has "paragraphs", each paragraph a
            "context" string and a "qas" list of question records.

    Returns:
        dict: {"context": [...], "question": [...], "answers": [...]} with one
        entry per question, the paragraph context repeated for each of its
        questions.
    """
    flattened_data = {"context": [], "question": [], "answers": []}
    for entry in dataset:  # Iterate directly over the dataset rows
        # Walk every article in the row; previously only entry["data"][0] was
        # read, which silently dropped any additional articles.
        for article in entry["data"]:
            for paragraph in article["paragraphs"]:
                context = paragraph["context"]
                for qa in paragraph["qas"]:
                    flattened_data["context"].append(context)
                    flattened_data["question"].append(qa["question"])
                    flattened_data["answers"].append(qa["answers"])
    return flattened_data

# Turn the nested rows of each split into flat context/question/answers columns
flattened_train = flatten_dataset(raw_datasets["train"])
flattened_validation = flatten_dataset(raw_datasets["validation"])

# Convert to Dataset format
train_dataset = Dataset.from_dict(flattened_train)
validation_dataset = Dataset.from_dict(flattened_validation)

# Step 4: Load Pre-trained Tokenizer and Model
# NOTE: this is the base checkpoint, not a QA-tuned one; the captured output
# reports that the qa_outputs weights are newly initialized.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Tokenize the flattened dataset
def preprocess_function(examples):
    """
    Tokenize question/context pairs and attach answer-span token labels.

    Uses the notebook-level `tokenizer`. Every pair is truncated/padded to 512
    tokens, and the character span of the first answer is converted into start
    and end token indices.

    Parameters:
        examples (dict): Batched columns "question", "context" and "answers".
            Each "answers" item is a list whose first element has "text" and
            "answer_start" (a character offset into the context).

    Returns:
        The tokenizer output with "start_positions" and "end_positions" added,
        one integer per example.
    """
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation=True,
        padding="max_length",
        max_length=512,
    )
    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples["answers"]):
        # Character span of the answer inside the context (end is exclusive)
        start_char = answer[0]["answer_start"]
        end_char = start_char + len(answer[0]["text"])

        # The offsets refer to the context, which is the SECOND sequence of the
        # encoded pair, so sequence_index=1 is required. With the default
        # (sequence_index=0) the offsets were resolved against the question,
        # giving wrong tokens or None.
        start_token = inputs.char_to_token(i, start_char, sequence_index=1)
        end_token = inputs.char_to_token(i, end_char - 1, sequence_index=1)

        # char_to_token returns None when the character has no token (e.g. the
        # answer was truncated away). Label such examples with index 0, the
        # [CLS] slot for this BERT-style tokenizer, instead of the last
        # position, which is a padding token here.
        if start_token is None or end_token is None:
            start_token = 0
            end_token = 0

        start_positions.append(start_token)
        end_positions.append(end_token)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply the preprocessing to whole batches of rows at once (batched=True)
tokenized_train = train_dataset.map(preprocess_function, batched=True)
tokenized_validation = validation_dataset.map(preprocess_function, batched=True)

# Step 5: Fine-tuning Arguments
training_args = TrainingArguments(
    output_dir="/content/drive/My Drive/NLP_Project2/qa_model",
    evaluation_strategy="epoch",  # evaluate once per epoch
    learning_rate=2e-5,
    per_device_train_batch_size=4,  # Reduced batch size
    per_device_eval_batch_size=4,  # Reduced batch size
    gradient_accumulation_steps=2,  # Simulate batch size of 8
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    save_total_limit=1,
    save_steps=500,
    # NOTE(review): the captured run shows "No log" for training loss in every
    # epoch; presumably 500 steps is never reached on this tiny dataset - confirm.
    logging_steps=500,
    # NOTE(review): presumably requires a GPU runtime - confirm.
    fp16=True  # Enable mixed precision training
)

# Trainer
# CustomTrainer overrides compute_loss with the averaged start/end cross-entropy.
# NOTE(review): the captured output contains a warning that points at this
# constructor call (its text is cut off) - check which argument it refers to.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer
)

# Step 6: Train the Model
trainer.train()

# Save the fine-tuned model and tokenizer
# Both go to the same directory on the mounted Drive.
trainer.save_model("/content/drive/My Drive/NLP_Project2/qa_finetuned_model")
tokenizer.save_pretrained("/content/drive/My Drive/NLP_Project2/qa_finetuned_model")
print("Fine-tuned model saved successfully!")


Dataset saved successfully!


Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Raw datasets loaded successfully!
Dataset structure: DatasetDict({
    train: Dataset({
        features: ['data'],
        num_rows: 1
    })
    validation: Dataset({
        features: ['data'],
        num_rows: 1
    })
})
First train entry: {'data': [{'title': 'A Study in Scarlet', 'paragraphs': [{'context': "Dr. John Watson, recently returned from Afghanistan, is introduced to Sherlock Holmes by Stamford. They decide to share an apartment at 221B Baker Street. Holmes demonstrates his deductive skills and assists the police in solving the murder of Enoch Drebber. The word 'RACHE' is written in blood at the crime scene. Jefferson Hope is later revealed as the murderer, seeking revenge for past wrongs.", 'qas': [{'id': '1', 'question': 'Who introduced Watson to Holmes?', 'answers': [{'text': 'Stamford', 'answer_start': 63}], 'is_impossible': False}, {'id': '2', 'question': 'Where did Watson and Holmes live?', 'answers': [{'text': '221B Baker Street', 'answer_start': 108}], 'is_impos

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Map:   0%|          | 0/6 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = CustomTrainer(


Epoch,Training Loss,Validation Loss
1,No log,6.169132
2,No log,6.115574
3,No log,6.089988


Fine-tuned model saved successfully!
