In [1]:
!pip install transformers torch datasets
!pip install evaluate

from evaluate import load
import evaluate
from transformers import BertTokenizer, BertModel, pipeline, AutoTokenizer, AutoModelForQuestionAnswering, Trainer, TrainingArguments, AutoModelForQuestionAnswering
from datasets import load_dataset, Dataset
import torch
import torch.nn.functional as F
import re
from nltk.tokenize import sent_tokenize
import nltk
import json
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
os.environ["WANDB_DISABLED"] = "true"
nltk.download('punkt')
nltk.download('punkt_tab')
from google.colab import drive
drive.mount('/content/drive')


book_path = "/content/drive/My Drive/NLP_Project2/a study in scarlet.txt"

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec (from torch)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m31.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (179 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


Mounted at /content/drive


In [3]:
dataset_path = '/content/drive/My Drive/NLP_Project2/QA_Corrected.json'

with open(dataset_path, 'r') as f:
    all_qa_pairs = json.load(f)
print("Dataset loaded successfully!")

train_data, val_data = train_test_split(all_qa_pairs, test_size=0.2, random_state=42)

train_path = '/content/drive/My Drive/NLP_Project2/qa_train.json'
val_path = '/content/drive/My Drive/NLP_Project2/qa_validation.json'

with open(train_path, 'w') as f:
    json.dump(train_data, f)

with open(val_path, 'w') as f:
    json.dump(val_data, f)

print("Train and validation datasets saved successfully!")

df_train = pd.DataFrame(train_data)
hf_train_dataset = Dataset.from_pandas(df_train)
print("First training sample:")
print(hf_train_dataset[0])  # Debug first training sample
print("Training dataset structure:")
print(hf_train_dataset)

df_val = pd.DataFrame(val_data)
hf_val_dataset = Dataset.from_pandas(df_val)

print("First validation sample:")
print(hf_val_dataset[0])  # Debug first validation sample
print("Validation dataset structure:")
print(hf_val_dataset)


Dataset loaded successfully!
Train and validation datasets saved successfully!
First training sample:
{'context': 'Holmes believed that knowledge not directly relevant to his work should be discarded, arguing that the brain has limited capacity, which should not be cluttered with irrelevant facts.', 'question': "What was Holmes' argument against cluttering the brain?", 'answer': 'which should not be cluttered with irrelevant facts.', 'start_position': 131, 'end_position': 183}
Training dataset structure:
Dataset({
    features: ['context', 'question', 'answer', 'start_position', 'end_position'],
    num_rows: 55
})
First validation sample:
{'context': 'Sherlock Holmes emphasized the importance of avoiding premature conclusions and focused on gathering evidence before forming theories. He explained this principle while investigating the Lauriston Gardens murder case.', 'question': 'What principle did Holmes emphasize during the investigation?', 'answer': 'avoiding premature conclusions'

In [15]:

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def preprocess_function(examples):
    tokenized_inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,
    )

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(tokenized_inputs["offset_mapping"]):
        start_char = examples["start_position"][i]
        end_char = examples["end_position"][i]
        sequence_ids = tokenized_inputs.sequence_ids(i)

        # Map character positions to token positions
        context_start = sequence_ids.index(1) # 1 corresponds to context, 0 to question
        # context_start gives the starting position of context
        context_end = len(sequence_ids) - sequence_ids[::-1].index(1) - 1 # end position of context

        start_token = end_token = None
        for idx, (start, end) in enumerate(offsets): # within context, trying to find
        # token id of the answer
            if start == start_char:
                start_token = idx # find token id corresponding to answer
            if end == end_char:
                end_token = idx
                break

        if start_token is None:
            start_token = 0 # or another appropriate default value
        if end_token is None:
            end_token = 0

        # finding relative position of answer with
        # respect to the starting position of context
        start_positions.append(start_token - context_start)
        end_positions.append(end_token - context_start)

    tokenized_inputs["start_positions"] = start_positions
    tokenized_inputs["end_positions"] = end_positions
    tokenized_inputs.pop("offset_mapping")  # Remove unnecessary information
    return tokenized_inputs

# Tokenize the dataset
tokenized_dataset = hf_train_dataset.map(preprocess_function, batched=True)

# Load the model
model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    save_steps=1000,
    save_total_limit=2,
    learning_rate=3e-5,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset,  # Use separate dataset for evaluation in practice
)

# Fine-tune the model
trainer.train()
drive_path = '/content/drive/My Drive/NLP_Project2'
fine_tuned_model_save_path = os.path.join(drive_path, 'fine_tuned_model')
trainer.save_model(fine_tuned_model_save_path)
print("Fine-tuned model saved successfully!")



Map:   0%|          | 0/55 [00:00<?, ? examples/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss
1,No log,4.803004
2,No log,3.675366
3,No log,3.254762
4,No log,2.878564
5,No log,2.562125
6,No log,2.295991
7,No log,2.096707
8,No log,1.961978
9,No log,1.881263
10,No log,1.846656


Fine-tuned model saved successfully!


In [16]:
# Define paths
drive_path = '/content/drive/My Drive/NLP_Project2'
fine_tuned_model_save_path = os.path.join(drive_path, 'fine_tuned_model')
val_path = os.path.join(drive_path, 'qa_validation.json')

# Load pre-trained model and tokenizer directly from Hugging Face
pretrained_model = AutoModelForQuestionAnswering.from_pretrained("bert-base-uncased")
pretrained_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load the fine-tuned model and tokenizer from the saved path
fine_tuned_model = AutoModelForQuestionAnswering.from_pretrained(fine_tuned_model_save_path)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained(fine_tuned_model_save_path)

print("Pre-trained and fine-tuned models loaded successfully.")

# Initialize QA pipelines for both models
device = 0 if torch.cuda.is_available() else -1  # Use GPU if available

pretrained_qa_pipeline = pipeline(
    "question-answering",
    model=pretrained_model,
    tokenizer=pretrained_tokenizer,
    device=device
)

fine_tuned_qa_pipeline = pipeline(
    "question-answering",
    model=fine_tuned_model,
    tokenizer=fine_tuned_tokenizer,
    device=device
)

print("QA Pipelines initialized successfully.")

# Load the validation dataset
with open(val_path, 'r') as f:
    val_data = json.load(f)
print("Validation dataset loaded successfully!")

# Prepare lists to store predictions
pretrained_predictions = []
fine_tuned_predictions = []

# Generate predictions with the pre-trained model
print("Generating predictions with the pre-trained model...")
for i, qa in enumerate(val_data):
    result = pretrained_qa_pipeline(question=qa["question"], context=qa["context"])
    pretrained_predictions.append({
        "id": str(i),
        "prediction_text": result['answer']
    })

# Generate predictions with the fine-tuned model
print("Generating predictions with the fine-tuned model...")
for i, qa in enumerate(val_data):
    result = fine_tuned_qa_pipeline(question=qa["question"], context=qa["context"])
    fine_tuned_predictions.append({
        "id": str(i),
        "prediction_text": result['answer']
    })

# Prepare references in SQuAD format
references = [{
    "id": str(i),
    "answers": {
        "text": [qa["answer"]],
        "answer_start": [qa["start_position"]]
    }
} for i, qa in enumerate(val_data)]

# Load the SQuAD metric
squad_metric = evaluate.load("squad")

# Compute metrics for the pre-trained model
print("Computing SQuAD metrics for the pre-trained model...")
pretrained_metrics = squad_metric.compute(predictions=pretrained_predictions, references=references)
print("Pre-trained BERT SQuAD Metrics:")
print(pretrained_metrics)

# Compute metrics for the fine-tuned model
print("Computing SQuAD metrics for the fine-tuned model...")
fine_tuned_metrics = squad_metric.compute(predictions=fine_tuned_predictions, references=references)
print("Fine-tuned BERT SQuAD Metrics:")
print(fine_tuned_metrics)

# Compare Exact Match (EM) and F1 scores
comparison_detailed = pd.DataFrame({
    "Metric": ["exact_match", "f1"],
    "Pre-trained BERT": [pretrained_metrics.get("exact_match", 0), pretrained_metrics.get("f1", 0)],
    "Fine-tuned BERT": [fine_tuned_metrics.get("exact_match", 0), fine_tuned_metrics.get("f1", 0)]
})

print("\nDetailed Comparison of Pre-trained and Fine-tuned BERT Models:")
print(comparison_detailed)


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Pre-trained and fine-tuned models loaded successfully.
QA Pipelines initialized successfully.
Validation dataset loaded successfully!
Generating predictions with the pre-trained model...
Generating predictions with the fine-tuned model...
Computing SQuAD metrics for the pre-trained model...
Pre-trained BERT SQuAD Metrics:
{'exact_match': 7.142857142857143, 'f1': 17.3015873015873}
Computing SQuAD metrics for the fine-tuned model...
Fine-tuned BERT SQuAD Metrics:
{'exact_match': 0.0, 'f1': 16.57996531946112}

Detailed Comparison of Pre-trained and Fine-tuned BERT Models:
        Metric  Pre-trained BERT  Fine-tuned BERT
0  exact_match          7.142857         0.000000
1           f1         17.301587        16.579965
