In [20]:
!pip install transformers datasets evaluate torch streamlit


Collecting evaluate
  Downloading evaluate-0.4.5-py3-none-any.whl.metadata (9.5 kB)
Collecting streamlit
  Downloading streamlit-1.49.1-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading evaluate-0.4.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading streamlit-1.49.1-py3-none-any.whl (10.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.0/10.0 MB[0m [31m83.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m110.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit, evaluate
Successfully installed evaluate-0.4.5 pydeck-0.9.1 streamlit-1.49.1


In [2]:
#!pip install datasets

In [18]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
from datasets import load_dataset
import numpy as np

In [4]:
squad = load_dataset("json", data_files={"train": "/content/train-v1.1.json", "validation": "/content/dev-v1.1.json"}, field="data")

print(squad)

DatasetDict({
    train: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 442
    })
    validation: Dataset({
        features: ['title', 'paragraphs'],
        num_rows: 48
    })
})


In [5]:
from datasets import Dataset, DatasetDict

def flatten_squad(dataset):
    contexts, questions, answers = [], [], []
    for item in dataset:
        for paragraph in item["paragraphs"]:
            context = paragraph["context"]
            for qa in paragraph["qas"]:
                question = qa["question"]
                ans_texts = [a["text"] for a in qa["answers"]]
                ans_starts = [a["answer_start"] for a in qa["answers"]]
                contexts.append(context)
                questions.append(question)
                answers.append({"text": ans_texts, "answer_start": ans_starts})
    return Dataset.from_dict({"context": contexts, "question": questions, "answers": answers})

train_dataset = flatten_squad(squad["train"])
valid_dataset = flatten_squad(squad["validation"])

squad = DatasetDict({
    "train": train_dataset,
    "validation": valid_dataset
})

print(squad["train"][0])


{'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'answer_start': [515], 'text': ['Saint Bernadette Soubirous']}}


In [6]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [7]:
from transformers import AutoModelForQuestionAnswering, AutoTokenizer

model_checkpoint = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)

        sequence_ids = inputs.sequence_ids(i)
        sample_index = sample_mapping[i]

        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            # find token start
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            # find token end
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs


In [9]:
tokenized_squad = squad.map(
    preprocess_function,
    batched=True,
    remove_columns=squad["train"].column_names
)

print(tokenized_squad)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 88524
    })
    validation: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 10784
    })
})


In [10]:
model = AutoModelForQuestionAnswering.from_pretrained(model_checkpoint)


Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#!pip install --upgrade transformers


In [12]:
import transformers
print(transformers.__version__)


4.56.0


In [13]:
args = TrainingArguments(
    "qa-checkpoint",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    report_to=[]
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_squad["train"],
    eval_dataset=tokenized_squad["validation"],
    tokenizer=tokenizer,
)

  trainer = Trainer(


In [14]:
import os
os.environ["WANDB_DISABLED"] = "true"
os.environ["HF_DISABLE_TRACKING"] = "1"


In [15]:
#!pip uninstall -y wandb


In [16]:
trainer.train()


Step,Training Loss
500,2.9307
1000,1.7513
1500,1.6033
2000,1.5195
2500,1.4766
3000,1.3609
3500,1.3484
4000,1.3287
4500,1.2629
5000,1.2611


Step,Training Loss
500,2.9307
1000,1.7513
1500,1.6033
2000,1.5195
2500,1.4766
3000,1.3609
3500,1.3484
4000,1.3287
4500,1.2629
5000,1.2611


TrainOutput(global_step=11066, training_loss=1.2773933224494476, metrics={'train_runtime': 6579.7048, 'train_samples_per_second': 26.908, 'train_steps_per_second': 1.682, 'total_flos': 1.7348902540849152e+16, 'train_loss': 1.2773933224494476, 'epoch': 2.0})

In [21]:
from evaluate import load
metric = load("squad")

def compute_metrics(eval_preds):
    start_logits, end_logits = eval_preds
    predictions = []
    references = []
    for i in range(len(start_logits)):
        start = np.argmax(start_logits[i])
        end = np.argmax(end_logits[i]) + 1
        context = squad["validation"][i]["context"]
        offset_mapping = tokenized_squad["validation"][i]["offset_mapping"]
        pred_answer = tokenizer.decode(tokenized_squad["validation"][i]["input_ids"][start:end])
        predictions.append({"id": squad["validation"][i]["id"], "prediction_text": pred_answer})
        references.append({"id": squad["validation"][i]["id"], "answers": squad["validation"][i]["answers"]})
    return metric.compute(predictions=predictions, references=references)

results = trainer.evaluate()
print("Evaluation Results:", results)

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

Evaluation Results: {'eval_loss': 1.1512441635131836, 'eval_runtime': 117.5903, 'eval_samples_per_second': 91.708, 'eval_steps_per_second': 5.732, 'epoch': 2.0}
