In [None]:
#############################
# Install and Import Packages
#############################
%pip install transformers datasets TorchCRF  # Install required packages

import os
import json
import random
import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

# Set WandB API key (if you wish to use wandb logging; otherwise, set report_to="none")
os.environ["WANDB_API_KEY"] = "9005f3d03485025996bc83adb773e446b2887451"

# Import CRF from TorchCRF (this version does NOT support the 'batch_first' argument)
from TorchCRF import CRF
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    AutoModel,
    TrainingArguments,
    Trainer
)


Collecting TorchCRF
  Downloading TorchCRF-1.1.0-py3-none-any.whl.metadata (2.3 kB)
Downloading TorchCRF-1.1.0-py3-none-any.whl (5.2 kB)
Installing collected packages: TorchCRF
Successfully installed TorchCRF-1.1.0


In [2]:


#############################
# 1. Load Local SQuAD v2 Data
#############################

train_path = "/kaggle/input/squad-v2/transformers/default/1/train-v2.0.json"
dev_path   = "/kaggle/input/squad-v2/transformers/default/1/dev-v2.0.json"

def load_squad_v2(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        squad_dict = json.load(f)

    flattened = []
    for article in squad_dict["data"]:
        for paragraph in article["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                if "answers" in qa and len(qa["answers"]) > 0:
                    answers = {
                        "text": [ans["text"] for ans in qa["answers"]],
                        "answer_start": [ans["answer_start"] for ans in qa["answers"]]
                    }
                else:
                    answers = {"text": [], "answer_start": []}
                
                flattened.append({
                    "context": context,
                    "question": question,
                    "answers": answers
                })
    return flattened

train_data = load_squad_v2(train_path)
dev_data   = load_squad_v2(dev_path)

# Convert to Hugging Face Datasets
train_dataset_full = Dataset.from_list(train_data)
official_dev_dataset = Dataset.from_list(dev_data)

dataset = DatasetDict({
    "train_full": train_dataset_full,
    "test_official": official_dev_dataset
})

print("Loaded local SQuAD v2 dataset:")
print(dataset)

Loaded local SQuAD v2 dataset:
DatasetDict({
    train_full: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 130319
    })
    test_official: Dataset({
        features: ['context', 'question', 'answers'],
        num_rows: 11873
    })
})


In [3]:

#############################
# 2. 80:20 Split & Basic Setup
#############################

seed_val = 42
# Subsample 15k examples from the full training set (for speed)
train_dataset_full = train_dataset_full.shuffle(seed=seed_val).select(range(15000))

# Split 80:20 into training and validation sets
split_dataset = train_dataset_full.train_test_split(test_size=0.2, seed=seed_val)
train_dataset = split_dataset["train"]   # ~80%
val_dataset   = split_dataset["test"]    # ~20%

# Reserve the official dev set for final evaluation
test_dataset  = dataset["test_official"]

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)

#############################
# 3. Tokenizer
#############################

tokenizer = AutoTokenizer.from_pretrained("SpanBERT/spanbert-base-cased")

Using device: cuda


config.json:   0%|          | 0.00/413 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

In [4]:
#############################
# 4. Preprocessing for Baseline QA
#############################

def preprocess_qa_examples(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    
    start_positions = []
    end_positions   = []
    
    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        if len(answers["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            answer_start = answers["answer_start"][0]
            answer_text  = answers["text"][0]
            answer_end   = answer_start + len(answer_text)
            
            token_start_index = 0
            token_end_index   = len(offsets) - 1
            
            while token_start_index < len(offsets) and offsets[token_start_index][0] <= 0:
                token_start_index += 1
            while token_end_index >= 0 and offsets[token_end_index][1] == 0:
                token_end_index -= 1
            
            start_index, end_index = token_start_index, token_end_index
            while start_index < len(offsets) and offsets[start_index][0] < answer_start:
                start_index += 1
            while end_index >= 0 and offsets[end_index][1] > answer_end:
                end_index -= 1
            
            if start_index >= len(offsets) or end_index < 0 or start_index > end_index:
                start_positions.append(0)
                end_positions.append(0)
            else:
                start_positions.append(start_index)
                end_positions.append(end_index)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"]   = end_positions
    return inputs

# Preprocess the baseline datasets
tokenized_train = train_dataset.map(preprocess_qa_examples, batched=True, remove_columns=train_dataset.column_names)
tokenized_val   = val_dataset.map(preprocess_qa_examples,   batched=True, remove_columns=val_dataset.column_names)
tokenized_test  = test_dataset.map(preprocess_qa_examples,  batched=True, remove_columns=test_dataset.column_names)

#############################
# 5. Exact Match Metric
#############################

def exact_match_score(predictions, references):
    assert len(predictions) == len(references), "Lists must have the same length"
    matches = sum(p == r for p, r in zip(predictions, references))
    return matches / len(references) * 100  # percentage


Map:   0%|          | 0/12000 [00:00<?, ? examples/s]

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [None]:
#############################
# 6. Baseline Model: SpanBERT
#############################

baseline_model = AutoModelForQuestionAnswering.from_pretrained("SpanBERT/spanbert-base-cased")
baseline_model.to(device)

baseline_training_args = TrainingArguments(
    output_dir="./results_baseline",
    eval_strategy="epoch",        # Use eval_strategy instead of evaluation_strategy
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="wandb"  # Use "none" to disable WandB logging
)

# (Optional) A dummy compute_metrics function so that Trainer prints evaluation metrics
def compute_metrics(eval_pred):
    return {}

baseline_trainer = Trainer(
    model=baseline_model,
    args=baseline_training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

print("Training baseline SpanBERT model...")
baseline_train_result = baseline_trainer.train()
baseline_trainer.save_model()

pytorch_model.bin:   0%|          | 0.00/215M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  baseline_trainer = Trainer(


Training baseline SpanBERT model...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mamartya22062[0m ([33mamartya22062-indraprastha-institute-of-information-techn[0m). Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,2.0054,1.797424
2,1.8518,2.357016
3,1.5967,1.871988
4,1.0939,2.090552
5,0.7512,2.169038


In [None]:
def create_crf_labels(offset_mapping, answer_start, answer_text):
    labels = [0] * len(offset_mapping)  # 0=O, 1=B, 2=I
    if answer_text == "":
        return labels
    answer_end = answer_start + len(answer_text)
    found_b = False
    for i, (start, end) in enumerate(offset_mapping):
        if start is None or end is None or (start == 0 and end == 0):
            continue
        if start >= answer_start and end <= answer_end:
            if not found_b:
                labels[i] = 1  # B
                found_b = True
            else:
                labels[i] = 2  # I
    return labels

def preprocess_crf_examples(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length"
    )
    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    
    all_labels = []
    for i, offsets in enumerate(offset_mapping):
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        if len(answers["answer_start"]) == 0:
            label_ids = [0] * len(offsets)
        else:
            answer_start = answers["answer_start"][0]
            answer_text  = answers["text"][0]
            label_ids = create_crf_labels(offsets, answer_start, answer_text)
        all_labels.append(label_ids)
    
    inputs["labels"] = all_labels
    return inputs

# Preprocess for CRF model
tokenized_train_crf = train_dataset.map(preprocess_crf_examples, batched=True, remove_columns=train_dataset.column_names)
tokenized_val_crf   = val_dataset.map(preprocess_crf_examples,   batched=True, remove_columns=val_dataset.column_names)
tokenized_test_crf  = test_dataset.map(preprocess_crf_examples,  batched=True, remove_columns=test_dataset.column_names)

# Ensure datasets return PyTorch tensors
tokenized_train_crf.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_val_crf.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
tokenized_test_crf.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

class SpanBERTCRFForQA(nn.Module):
    def __init__(self, model_name):
        super(SpanBERTCRFForQA, self).__init__()
        self.spanbert = AutoModel.from_pretrained(model_name)
        # Increase dropout to mitigate overfitting
        self.dropout  = nn.Dropout(0.3)
        self.classifier = nn.Linear(self.spanbert.config.hidden_size, 3)
        # Initialize CRF; our TorchCRF expects input in shape (batch, seq_len, num_tags)
        self.crf = CRF(3)
        
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.spanbert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs.last_hidden_state)  # (batch, seq_len, hidden_size)
        emissions = self.classifier(sequence_output)               # (batch, seq_len, 3)
        # Do NOT transpose: keep emissions as (batch, seq_len, num_tags)
        mask = attention_mask.bool()  # (batch, seq_len)
        
        if labels is not None:
            labels = labels.long()
            loss = -self.crf(emissions, labels, mask=mask)
            return {"loss": loss, "emissions": emissions}
        else:
            pred_tags = self.crf.decode(emissions, mask=mask)
            return pred_tags

crf_model = SpanBERTCRFForQA("SpanBERT/spanbert-base-cased")
crf_model.to(device)

# Define a custom Trainer to override compute_loss.
from transformers import Trainer

class CustomTrainer(Trainer):
    def compute_loss(self, model, inputs, num_items_in_batch=None, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs, labels=labels)
        loss = outputs["loss"]
        return (loss, outputs) if return_outputs else loss

crf_training_args = TrainingArguments(
    output_dir="./results_crf",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="epoch",
    push_to_hub=False,
    report_to="wandb"  # Set to "none" to disable wandb logging if desired
)

# Use CustomTrainer for CRF model training.
crf_trainer = CustomTrainer(
    model=crf_model,
    args=crf_training_args,
    train_dataset=tokenized_train_crf,
    eval_dataset=tokenized_val_crf,
    tokenizer=tokenizer
)

print("Training SpanBERT-CRF model...")
crf_train_result = crf_trainer.train()
crf_trainer.save_model()

In [None]:
#############################
# 8. Plot Training/Validation Loss
#############################

def plot_trainer_logs(trainer, title):
    logs = trainer.state.log_history
    train_loss = []
    eval_loss  = []
    epochs_tl  = []
    epochs_el  = []
    for log in logs:
        if "loss" in log and "epoch" in log:
            train_loss.append(log["loss"])
            epochs_tl.append(log["epoch"])
        if "eval_loss" in log and "epoch" in log:
            eval_loss.append(log["eval_loss"])
            epochs_el.append(log["epoch"])
    
    plt.figure(figsize=(7,5))
    plt.plot(epochs_tl, train_loss, label="Train Loss")
    plt.plot(epochs_el, eval_loss, label="Validation Loss")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title(title)
    plt.legend()
    plt.show()

print("Plotting Baseline (SpanBERT) Loss Curves...")
plot_trainer_logs(baseline_trainer, "Baseline SpanBERT QA")

print("Plotting CRF Model (SpanBERT-CRF) Loss Curves...")
plot_trainer_logs(crf_trainer, "SpanBERT-CRF QA")


In [None]:
#############################
# 9. Evaluation: Exact Match Score
#############################

def postprocess_qa_predictions(features, raw_predictions):
    start_logits, end_logits = raw_predictions
    predictions = []
    for i in range(len(features["input_ids"])):
        input_ids = features["input_ids"][i]
        start_idx = int(np.argmax(start_logits[i]))
        end_idx   = int(np.argmax(end_logits[i]))
        if start_idx > end_idx:
            predictions.append("")
        else:
            pred_ids = input_ids[start_idx : end_idx+1]
            prediction = tokenizer.decode(
                pred_ids,
                skip_special_tokens=True,
                clean_up_tokenization_spaces=True
            )
            predictions.append(prediction)
    return predictions

# Evaluate on the official dev set (test_dataset)
tokenized_test_baseline = tokenized_test
baseline_raw_preds = baseline_trainer.predict(tokenized_test_baseline)
baseline_preds = postprocess_qa_predictions(
    tokenized_test_baseline,
    (baseline_raw_preds.predictions[0], baseline_raw_preds.predictions[1])
)

baseline_refs = [
    ex["answers"]["text"][0] if len(ex["answers"]["text"]) > 0 else ""
    for ex in test_dataset
]
baseline_em = exact_match_score(baseline_preds, baseline_refs)
print(f"\nBaseline SpanBERT Exact Match (Official Dev): {baseline_em:.2f}%")

# CRF Model Evaluation on official dev set
tokenized_test_crf_final = tokenized_test_crf
crf_model.eval()
crf_predictions = []
for i in range(len(tokenized_test_crf_final)):
    input_ids      = torch.tensor(tokenized_test_crf_final[i]["input_ids"]).unsqueeze(0).to(device)
    attention_mask = torch.tensor(tokenized_test_crf_final[i]["attention_mask"]).unsqueeze(0).to(device)
    
    with torch.no_grad():
        pred_tags = crf_model(input_ids=input_ids, attention_mask=attention_mask)
    
    # Reconstruct answer text from the predicted tags
    answer_tokens = []
    found_b = False
    for tag, tid in zip(pred_tags[0], tokenized_test_crf_final[i]["input_ids"]):
        if tag == 1:  # B tag
            found_b = True
            answer_tokens.append(tid)
        elif found_b and tag == 2:  # I tag
            answer_tokens.append(tid)
        elif found_b:
            break
    if answer_tokens:
        answer_text = tokenizer.decode(
            answer_tokens,
            skip_special_tokens=True,
            clean_up_tokenization_spaces=True
        )
    else:
        answer_text = ""
    crf_predictions.append(answer_text)

crf_refs = [
    ex["answers"]["text"][0] if len(ex["answers"]["text"]) > 0 else ""
    for ex in test_dataset
]
crf_em = exact_match_score(crf_predictions, crf_refs)
print(f"SpanBERT-CRF Exact Match (Official Dev): {crf_em:.2f}%")

#############################
# 10. Final Notes
#############################
# 1) The training set was subsampled to 15k examples and split 80:20 for training vs. validation.
# 2) The official dev set is used only for final evaluation.
# 3) Loss curves are plotted locally and also logged to WandB if enabled.
# 4) The CRF model now transposes emissions and mask to match TorchCRF's expected input shape.
# 5) "evaluation_strategy" was replaced with "eval_strategy" to avoid deprecation warnings.
