In [None]:
!pip install transformers torch datasets
!pip install evaluate

from evaluate import load
import evaluate
from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AutoModelForQuestionAnswering
from datasets import load_dataset, Dataset
import torch
import torch.nn.functional as F
import re
from nltk.tokenize import sent_tokenize
import nltk
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
os.environ["WANDB_DISABLED"] = "true"
nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import drive
drive.mount('/content/drive')


book_path = "/content/drive/My Drive/NLP_Project2/a study in scarlet.txt"

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive


In [36]:
dataset_path = '/content/drive/My Drive/NLP_Project2/validated_qa_dataset1.json'

with open(dataset_path, 'r') as f:
    all_qa_pairs = json.load(f)
print("Dataset loaded successfully!")

train_data, val_data = train_test_split(all_qa_pairs, test_size=0.2, random_state=42)

train_path = '/content/drive/My Drive/NLP_Project2/qa_train.json'
val_path = '/content/drive/My Drive/NLP_Project2/qa_validation.json'

with open(train_path, 'w') as f:
    json.dump(train_data, f)

with open(val_path, 'w') as f:
    json.dump(val_data, f)

print("Train and validation datasets saved successfully!")

df_train = pd.DataFrame(train_data)
hf_train_dataset = Dataset.from_pandas(df_train)
print("First training sample:")
print(hf_train_dataset[0])  # Debug first training sample
print("Training dataset structure:")
print(hf_train_dataset)

df_val = pd.DataFrame(val_data)
hf_val_dataset = Dataset.from_pandas(df_val)

print("First validation sample:")
print(hf_val_dataset[0])  # Debug first validation sample
print("Validation dataset structure:")
print(hf_val_dataset)


Dataset loaded successfully!
Train and validation datasets saved successfully!
First training sample:
{'context': "Watson observed Holmes' extraordinary ability to identify soils and splashes on his trousers, deducing exactly where he had been walking in London.", 'question': 'What skill did Holmes demonstrate with soil analysis?', 'answer': 'identify soils and splashes', 'start_position': 49, 'end_position': 75}
Training dataset structure:
Dataset({
    features: ['context', 'question', 'answer', 'start_position', 'end_position'],
    num_rows: 54
})
First validation sample:
{'context': "Sherlock Holmes described his profession as a 'consulting detective,' assisting both government and private detectives when they were unable to solve a case.", 'question': 'What did Holmes call his profession?', 'answer': 'consulting detective', 'start_position': 47, 'end_position': 66}
Validation dataset structure:
Dataset({
    features: ['context', 'question', 'answer', 'start_position', 'end_posit

In [37]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def prepare_train_features(examples):

    tokenized_examples = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # truncate context, not the question
        max_length=512,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")

    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):

        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answer = examples["answer"][sample_index]
        answer_start_position = examples["start_position"][sample_index]
        answer_end_position = examples["end_position"][sample_index]

        # If no answers are given, set the cls_index as answer.
        if answer_start_position == -1:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answer_start_position
            end_char = answer_end_position

            # Start token index of the current span in the text.
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # End token index of the current span in the text.
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                tokenized_examples["end_positions"].append(token_end_index + 1)

    return tokenized_examples

# Tokenize the dataset
tokenized_dataset_train = hf_train_dataset.map(prepare_train_features, batched=True)
tokenized_dataset_val = hf_val_dataset.map(prepare_train_features, batched=True)

print("Tokenized training dataset structure:")
print(tokenized_dataset_train)

# Load the model
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=3e-5,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset_train,
    eval_dataset=tokenized_dataset_val,  # Use separate dataset for evaluation in practice
)

# Fine-tune the model
trainer.train()
drive_path = '/content/drive/My Drive/NLP_Project2'
fine_tuned_model_save_path = os.path.join(drive_path, 'fine_tuned_model')
trainer.save_model(fine_tuned_model_save_path)
print("Fine-tuned model saved successfully!")



Map:   0%|          | 0/54 [00:00<?, ? examples/s]

Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Tokenized training dataset structure:
Dataset({
    features: ['context', 'question', 'answer', 'start_position', 'end_position', 'input_ids', 'token_type_ids', 'attention_mask', 'start_positions', 'end_positions'],
    num_rows: 54
})


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,5.007095
2,No log,3.708181
3,No log,3.338447
4,No log,3.25476


Fine-tuned model saved successfully!


In [42]:
# Define paths
drive_path = '/content/drive/My Drive/NLP_Project2'
fine_tuned_model_save_path = os.path.join(drive_path, 'fine_tuned_model')
val_path = os.path.join(drive_path, 'Inference_QA_2.json')

# Load pre-trained model and tokenizer directly from Hugging Face
pretrained_model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
pretrained_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the fine-tuned model and tokenizer from the saved path
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_save_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_save_path)

print("Pre-trained and fine-tuned models loaded successfully.")

# Initialize QA pipelines for both models
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

pretrained_qa_pipeline = pipeline(
    "question-answering",
    model=pretrained_model,
    tokenizer=pretrained_tokenizer,
    device=device
)

fine_tuned_qa_pipeline = pipeline(
    "question-answering",
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    device=device
)

print("QA Pipelines initialized successfully.")

# Load the validation dataset
with open(val_path, 'r') as f:
    val_data = json.load(f)
print("Validation dataset loaded successfully!")

# Prepare lists to store predictions
pretrained_predictions = []
fine_tuned_predictions = []

# Generate predictions with the pre-trained model
print("Generating predictions with the pre-trained model...")
for i, qa in enumerate(val_data):
    result = pretrained_qa_pipeline(question=qa["question"], context=qa["context"])
    pretrained_predictions.append({
        "id": str(i),
        "prediction_text": result['answer']
    })

    # Print question, predicted answer, and expected answer for verification
    print(f"Pre-trained Model - Question: {qa['question']}")
    print(f"Predicted Answer: {result['answer']}")
    print(f"Expected Answer: {qa['answer']}\n")

# Generate predictions with the fine-tuned model
print("Generating predictions with the fine-tuned model...")
for i, qa in enumerate(val_data):
    result = fine_tuned_qa_pipeline(question=qa["question"], context=qa["context"])
    fine_tuned_predictions.append({
        "id": str(i),
        "prediction_text": result['answer']
    })

    # Print question, predicted answer, and expected answer for verification
    print(f"Fine-tuned Model - Question: {qa['question']}")
    print(f"Predicted Answer: {result['answer']}")
    print(f"Expected Answer: {qa['answer']}\n")

# Prepare references in SQuAD format
references = [{
    "id": str(i),
    "answers": {
        "text": [qa["answer"]],
        "answer_start": [qa["start_position"]]
    }
} for i, qa in enumerate(val_data)]

# Load the SQuAD metric
squad_metric = evaluate.load("squad")

# Compute metrics for the pre-trained model
print("Computing SQuAD metrics for the pre-trained model...")
pretrained_metrics = squad_metric.compute(predictions=pretrained_predictions, references=references)
print("Pre-trained BERT SQuAD Metrics:")
print(pretrained_metrics)

# Compute metrics for the fine-tuned model
print("Computing SQuAD metrics for the fine-tuned model...")
fine_tuned_metrics = squad_metric.compute(predictions=fine_tuned_predictions, references=references)
print("Fine-tuned BERT SQuAD Metrics:")
print(fine_tuned_metrics)

# Compare Exact Match (EM) and F1 scores
comparison_detailed = pd.DataFrame({
    "Metric": ["exact_match", "f1"],
    "Pre-trained BERT": [pretrained_metrics.get("exact_match", 0), pretrained_metrics.get("f1", 0)],
    "Fine-tuned BERT": [fine_tuned_metrics.get("exact_match", 0), fine_tuned_metrics.get("f1", 0)]
})

print("\nDetailed Comparison of Pre-trained and Fine-tuned BERT Models:")
print(comparison_detailed)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-trained and fine-tuned models loaded successfully.
QA Pipelines initialized successfully.
Validation dataset loaded successfully!
Generating predictions with the pre-trained model...
Pre-trained Model - Question: Who was revealed to be the murderer?
Predicted Answer: cab driver, was revealed to be the
Expected Answer: Jefferson Hope

Pre-trained Model - Question: What motivated Jefferson Hope's actions?
Predicted Answer: Drebber
Expected Answer: revenge against Enoch Drebber

Pre-trained Model - Question: How did Lucy Ferrier die?
Predicted Answer: father
Expected Answer: fleeing persecution

Pre-trained Model - Question: What did Holmes use to stimulate his mind?
Predicted Answer: Sherlock Holmes used his violin
Expected Answer: violin playing

Pre-trained Model - Question: What evidence did Holmes use to solve the case?
Predicted Answer: footprints, cigarette ash, and cab
Expected Answer: footprints, cigarette ash, and cab tracks

Pre-trained Model - Question: Where did Dr. Watso