In [None]:
# # This Python 3 environment comes with many helpful analytics libraries installed
# # It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# # For example, here's several helpful packages to load

# import numpy as np # linear algebra
# import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# # Input data files are available in the read-only "../input/" directory
# # For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# # You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# # You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [9]:
print("HI")

HI


In [10]:
!pip install pytorch-crf




In [None]:
import os
import logging
import random
import numpy as np
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import matplotlib.pyplot as plt
from transformers import (AutoTokenizer, AutoModelForQuestionAnswering,
                          AutoModel, AdamW, get_linear_schedule_with_warmup, AutoConfig)
from datasets import load_dataset
from torchcrf import CRF
print("CELL_1")
logging.basicConfig(format="%(asctime)s - %(levelname)s - %(message)s", level=logging.INFO)
logger = logging.getLogger(__name__)

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed(42)

def exact_match_score(predictions, references):
    assert len(predictions) == len(references), "Lists must have the same length"
    matches = sum(p == r for p, r in zip(predictions, references))
    return matches / len(references) * 100 

def preprocess_standard(example, tokenizer, max_length=384, doc_stride=128):
    """
    Tokenizes the question and context, then computes start/end token indices for the answer.
    For unanswerable questions, positions are set to 0.
    """
    question = example['question']
    context = example['context']
    if len(example['answers']['answer_start']) == 0:
        answer_text = ""
        start_char = 0
    else:
        answer_text = example['answers']['text'][0]
        start_char = example['answers']['answer_start'][0]
    tokenized = tokenizer.encode_plus(
        question,
        context,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
    )
    tokenized = {key: val[0] if isinstance(val, list) else val for key, val in tokenized.items()}
    offsets = tokenized.pop("offset_mapping")
    if answer_text == "":
        tokenized['start_positions'] = 0
        tokenized['end_positions'] = 0
    else:
        end_char = start_char + len(answer_text)
        start_token, end_token = 0, 0
        for i, (start, end) in enumerate(offsets):
            if start <= start_char < end:
                start_token = i
            if start < end_char <= end:
                end_token = i
                break
        tokenized['start_positions'] = start_token
        tokenized['end_positions'] = end_token
    return tokenized

def preprocess_crf(example, tokenizer, max_length=384, doc_stride=128):
    question = example['question']
    context = example['context']
    if len(example['answers']['answer_start']) == 0:
        answer_text = ""
        start_char = None
        end_char = None
    else:
        answer_text = example['answers']['text'][0]
        start_char = example['answers']['answer_start'][0]
        end_char = start_char + len(answer_text)
    tokenized = tokenizer.encode_plus(
        question,
        context,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=False,
        return_offsets_mapping=True,
    )
    offsets = tokenized.pop("offset_mapping")
    labels = [0] * len(offsets)
    if answer_text != "":
        started = False
        for i, (start, end) in enumerate(offsets):
            if start == 0 and end == 0:
                continue
            if start >= start_char and end <= end_char:
                if not started:
                    labels[i] = 1  # B
                    started = True
                else:
                    labels[i] = 2  # I
    tokenized['labels'] = labels
    return tokenized

class QADataset(Dataset):
    def __init__(self, dataset, tokenizer, preprocess_fn):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.preprocess_fn = preprocess_fn

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        example = self.dataset[idx]
        processed = self.preprocess_fn(example, self.tokenizer)
        return processed

def collate_fn(batch, tokenizer):
    batch = tokenizer.pad(batch, return_tensors="pt")
    return batch

class SpanBERTCRFForQA(nn.Module):
    def __init__(self, model_name, num_labels=3):
        super(SpanBERTCRFForQA, self).__init__()
        self.config = AutoConfig.from_pretrained(model_name)
        self.spanbert = AutoModel.from_pretrained(model_name, config=self.config)
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.config.hidden_size, num_labels)
        self.crf = CRF(num_labels, batch_first=True)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.spanbert(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = self.dropout(outputs[0])
        emissions = self.classifier(sequence_output)  # (batch, seq_len, num_labels) ! ! ! ! 
        if labels is not None:
            loss = -self.crf(emissions, labels, mask=attention_mask.bool(), reduction='mean')
            return loss
        else:
            prediction = self.crf.decode(emissions, mask=attention_mask.bool())
            return prediction

def bio_tags_to_answer(pred_tags, input_ids, tokenizer, example):
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    answer_tokens = []
    found = False
    for tag, token in zip(pred_tags, tokens):
        if tag == 1:
            found = True
            answer_tokens.append(token)
        elif found and tag == 2:
            answer_tokens.append(token)
        elif found:
            break
    if answer_tokens:
        answer = tokenizer.convert_tokens_to_string(answer_tokens)
        return answer.strip()
    else:
        return ""

def train_standard(model, train_loader, val_loader, optimizer, scheduler, device, epochs=6):
    train_losses = []
    val_losses = []
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        logger.info(f"[Standard QA] Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}")
        
        # Validation!!!
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                                start_positions=start_positions, end_positions=end_positions)
                loss = outputs.loss
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        logger.info(f"[Standard QA] Epoch {epoch+1}/{epochs} - Validation Loss: {avg_val_loss:.4f}")
    return train_losses, val_losses

def train_crf(model, train_loader, val_loader, optimizer, scheduler, device, epochs=6):
    train_losses = []
    val_losses = []
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        logger.info(f"[CRF QA] Epoch {epoch+1}/{epochs} - Training Loss: {avg_train_loss:.4f}")
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        logger.info(f"[CRF QA] Epoch {epoch+1}/{epochs} - Validation Loss: {avg_val_loss:.4f}")
    return train_losses, val_losses

def evaluate_standard(model, loader, tokenizer, device):
    model.eval()
    predictions = []
    references = []
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            start_logits = outputs.start_logits
            end_logits = outputs.end_logits
            for i in range(input_ids.size(0)):
                start_index = torch.argmax(start_logits[i]).item()
                end_index = torch.argmax(end_logits[i]).item()
                pred_ids = input_ids[i][start_index:end_index+1]
                pred_answer = tokenizer.decode(pred_ids, skip_special_tokens=True)
                predictions.append(pred_answer.strip())
            for i in range(input_ids.size(0)):
                start_pos = batch['start_positions'][i].item()
                end_pos = batch['end_positions'][i].item()
                ref_ids = input_ids[i][start_pos:end_pos+1]
                ref_answer = tokenizer.decode(ref_ids, skip_special_tokens=True)
                references.append(ref_answer.strip())
    score = exact_match_score(predictions, references)
    logger.info(f"[Standard QA] Exact Match Score: {score:.2f}%")
    return score

def evaluate_crf(model, loader, tokenizer, device, original_dataset):
    model.eval()
    predictions = []
    references = []
    idx = 0
    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            batch_preds = model(input_ids=input_ids, attention_mask=attention_mask)
            for i in range(len(batch_preds)):
                pred_tags = batch_preds[i]
                example = original_dataset[idx]
                answer = bio_tags_to_answer(pred_tags, input_ids[i].cpu().tolist(), tokenizer, example)
                predictions.append(answer)
                if len(example['answers']['text']) > 0:
                    references.append(example['answers']['text'][0].strip())
                else:
                    references.append("")
                idx += 1
    score = exact_match_score(predictions, references)
    logger.info(f"[CRF QA] Exact Match Score: {score:.2f}%")
    return score

def plot_losses(train_losses, val_losses, title, filename):
    epochs = range(1, len(train_losses) + 1)
    plt.figure()
    plt.plot(epochs, train_losses, label='Training Loss')
    plt.plot(epochs, val_losses, label='Validation Loss')
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend()
    plt.savefig(filename)
    plt.close()

CELL_1


In [None]:
print("CELL_2")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")
print(f"Using device: {device}")
model_name = "SpanBERT/spanbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
print("Loading SQuAD v2 dataset...")
squad = load_dataset("squad_v2")
train_subset = squad['train'].shuffle(seed=42).select(range(15000))
val_subset = squad['validation'].shuffle(seed=42).select(range(1000))
print("Dataset loaded. Training samples:", len(train_subset), "Validation samples:", len(val_subset))

print("Setting up Standard QA datasets and dataloaders...")
train_dataset_standard = QADataset(train_subset, tokenizer, preprocess_standard)
val_dataset_standard = QADataset(val_subset, tokenizer, preprocess_standard)
train_loader_standard = DataLoader(train_dataset_standard, batch_size=8, shuffle=True,
                                   collate_fn=lambda x: collate_fn(x, tokenizer))
val_loader_standard = DataLoader(val_dataset_standard, batch_size=8, shuffle=False,
                                 collate_fn=lambda x: collate_fn(x, tokenizer))

print("Initializing Standard QA model...")
standard_model = AutoModelForQuestionAnswering.from_pretrained(model_name)
standard_model.to(device)

# Use weight decay for regularization
optimizer_std = AdamW(standard_model.parameters(), lr=3e-5, weight_decay=0.01)
total_steps_std = len(train_loader_standard) * 6  # 6 epochs
scheduler_std = get_linear_schedule_with_warmup(optimizer_std, num_warmup_steps=0, num_training_steps=total_steps_std)


def train_standard(model, train_loader, val_loader, optimizer, scheduler, device, epochs=6):
    train_losses = []
    val_losses = []
    model.to(device)
    for epoch in range(epochs):
        print(f"Standard QA - Epoch {epoch+1} training started.")
        model.train()
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                            start_positions=start_positions, end_positions=end_positions)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()
        avg_train_loss = total_loss / len(train_loader)
        train_losses.append(avg_train_loss)
        logger.info(f"[Standard QA] Epoch {epoch+1}/6 - Training Loss: {avg_train_loss:.4f}")
        model.eval()
        total_val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)
                outputs = model(input_ids=input_ids, attention_mask=attention_mask,
                                start_positions=start_positions, end_positions=end_positions)
                loss = outputs.loss
                total_val_loss += loss.item()
        avg_val_loss = total_val_loss / len(val_loader)
        val_losses.append(avg_val_loss)
        logger.info(f"[Standard QA] Epoch {epoch+1}/6 - Validation Loss: {avg_val_loss:.4f}")

        em_score = evaluate_standard(model, val_loader, tokenizer, device)
        print(f"Standard QA - Epoch {epoch+1} completed. Training Loss: {avg_train_loss:.4f}, "
              f"Validation Loss: {avg_val_loss:.4f}, EM Score: {em_score:.2f}%")
    return train_losses, val_losses

print("Starting training for Standard SpanBERT QA model...")
train_losses_std, val_losses_std = train_standard(standard_model, train_loader_standard, val_loader_standard,
                                                  optimizer_std, scheduler_std, device, epochs=6)
logger.info("Standard QA model training completed.")
print("Standard QA training completed.")

std_exact_match = evaluate_standard(standard_model, val_loader_standard, tokenizer, device)
os.makedirs("models", exist_ok=True)
standard_model.save_pretrained("models/standard_spanbert_qa")
tokenizer.save_pretrained("models/standard_spanbert_qa")
logger.info("Standard QA model saved in models/standard_spanbert_qa.")
print("Standard QA model saved in models/standard_spanbert_qa.")

plot_losses(train_losses_std, val_losses_std, "Standard SpanBERT QA Training", "standard_loss.png")
logger.info("Standard QA training plot saved as standard_loss.png.")
print("Standard QA training plot saved as standard_loss.png.")


CELL_2
Using device: cuda
Loading SQuAD v2 dataset...
Dataset loaded. Training samples: 15000 Validation samples: 1000
Setting up Standard QA datasets and dataloaders...
Initializing Standard QA model...


Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at SpanBERT/spanbert-base-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Error during conversion: ChunkedEncodingError(ProtocolError('Response ended prematurely'))
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Starting training for Standard SpanBERT QA model...
Standard QA - Epoch 1 training started.
Standard QA - Epoch 1 completed. Training Loss: 2.3790, Validation Loss: 1.9067, EM Score: 39.30%
Standard QA - Epoch 2 training started.
Standard QA - Epoch 2 completed. Training Loss: 1.5690, Validation Loss: 1.7594, EM Score: 46.10%
Standard QA - Epoch 3 training started.
Standard QA - Epoch 3 completed. Training Loss: 1.1832, Validation Loss: 1.8329, EM Score: 51.60%
Standard QA - Epoch 4 training started.
Standard QA - Epoch 4 completed. Training Loss: 0.9324, Validation Loss: 1.9935, EM Score: 53.20%
Standard QA - Epoch 5 training started.
Standard QA - Epoch 5 completed. Training Loss: 0.7720, Validation Loss: 2.2132, EM Score: 53.40%
Standard QA - Epoch 6 training started.
Standard QA - Epoch 6 completed. Training Loss: 0.6873, Validation Loss: 2.2892, EM Score: 54.60%
Standard QA training completed.
Standard QA model saved in models/standard_spanbert_qa.
Standard QA training plot saved 