In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-question-answering-dataset/train-v1.1.json
/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json


In [7]:
from datasets import load_dataset

squad = load_dataset("squad", split="train[:5000]")

Downloading readme:   0%|          | 0.00/7.62k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [8]:
squad = squad.train_test_split(test_size=0.2)

In [10]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [11]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [12]:
tokenized_squad = squad.map(preprocess_function, batched=True, remove_columns=squad["train"].column_names)

Map:   0%|          | 0/4000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [13]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [14]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert/distilbert-base-uncased")

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
import os
os.environ["WANDB_DISABLED"] = "true"  # Disable WandB logging

training_args = TrainingArguments(
    output_dir="./my_awesome_qa_model",
    eval_strategy="epoch",  # Updated to eval_strategy
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",   # Optional for logging
    save_total_limit=1
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["test"],  # Use validation set for evaluation
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics  # Pass the custom metric function here
)


trainer.train()


Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Epoch,Training Loss,Validation Loss,Token Level Iou
1,No log,1.630697,0.526942
2,1.040400,1.729835,0.542635
3,1.040400,1.671823,0.548313
4,0.586200,1.825051,0.546232
5,0.586200,1.871797,0.550826


TrainOutput(global_step=1250, training_loss=0.7386638549804687, metrics={'train_runtime': 442.2205, 'train_samples_per_second': 45.226, 'train_steps_per_second': 2.827, 'total_flos': 1959796500480000.0, 'train_loss': 0.7386638549804687, 'epoch': 5.0})

In [24]:
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")


Evaluation results: {'eval_loss': 1.8717974424362183, 'eval_token_level_iou': 0.5508263698992213, 'eval_runtime': 7.2469, 'eval_samples_per_second': 137.989, 'eval_steps_per_second': 8.693, 'epoch': 5.0}


In [26]:
def compute_token_level_iou(pred_start, pred_end, true_start, true_end):
    pred_tokens = set(range(pred_start, pred_end + 1))
    true_tokens = set(range(true_start, true_end + 1))

    intersection = len(pred_tokens.intersection(true_tokens))
    union = len(pred_tokens.union(true_tokens))

    iou = intersection / union if union != 0 else 0
    return iou

def compute_metrics(eval_pred):
    start_preds, end_preds = eval_pred.predictions
    start_labels, end_labels = eval_pred.label_ids

    ious = []
    for i in range(len(start_preds)):
        iou = compute_token_level_iou(
            start_preds[i].argmax(), end_preds[i].argmax(),
            start_labels[i], end_labels[i]
        )
        ious.append(iou)

    mean_iou = sum(ious) / len(ious)
    return {"token_level_iou": mean_iou}

# Re-run evaluation with the custom metric
eval_results = trainer.evaluate(metric_key_prefix="eval")
print(f"Evaluation results with custom metric: {eval_results}")


Evaluation results with custom metric: {'eval_loss': 1.8717974424362183, 'eval_token_level_iou': 0.5508263698992213, 'eval_runtime': 7.2289, 'eval_samples_per_second': 138.335, 'eval_steps_per_second': 8.715, 'epoch': 5.0}


In [29]:
def answer_question(question, context):
    inputs = tokenizer.encode_plus(question, context, return_tensors="pt")

    # Move inputs to the same device as the model
    inputs = {key: value.to(device) for key, value in inputs.items()}

    # Forward pass through the model
    outputs = model(**inputs)
    answer_start = torch.argmax(outputs.start_logits)
    answer_end = torch.argmax(outputs.end_logits) + 1

    input_ids = inputs["input_ids"].tolist()[0]
    answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    return answer


# Example usage:
question = "What is the name of the repository?"
context = "The Hugging Face library provides an easy way to use transformer models."
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")


Question: What is the name of the repository?
Answer: hugging face library


In [38]:
import torch

# Check if GPU is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the correct device
model.to(device)


DistilBertForQuestionAnswering(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
      

In [35]:
model.save_pretrained("./my_awesome_qa_model")
tokenizer.save_pretrained("./my_awesome_qa_model")


('./my_awesome_qa_model/tokenizer_config.json',
 './my_awesome_qa_model/special_tokens_map.json',
 './my_awesome_qa_model/vocab.txt',
 './my_awesome_qa_model/added_tokens.json')

In [36]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering

# Reload the model and tokenizer
tokenizer = DistilBertTokenizer.from_pretrained("./my_awesome_qa_model")
model = DistilBertForQuestionAnswering.from_pretrained("./my_awesome_qa_model")


In [42]:


# Testing the inference pipeline again
question = "Which country contains the majority of the Amazon rainforest?"
context = "The Amazon rainforest is the largest tropical rainforest in the world, with 60% of it located in Brazil."
answer = answer_question(question, context)
print(f"Question: {question}")
print(f"Answer: {answer}")


Question: Which country contains the majority of the Amazon rainforest?
Answer: brazil
