In [2]:
# !pip install transformers[torch]

In [4]:
# !pip install transformers datasets

In [66]:
# !pip install gradio

In [1]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
from datasets import load_dataset

squad_v2 = load_dataset("squad_v2", split="train[:5000]")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading readme:   0%|          | 0.00/8.92k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/16.4M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/130319 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/11873 [00:00<?, ? examples/s]

In [3]:
squad_v2

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 5000
})

In [4]:
# Squad_v2 train test split
squad_v2 = squad_v2.train_test_split(test_size=0.2)

In [5]:
squad_v2

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 4000
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1000
    })
})

In [6]:
from datasets import load_dataset

parent_dict_key = "answers"
subdict_key = "answer_start"

filtered_dataset = squad_v2.filter(lambda example: example[parent_dict_key] is not None and bool(example[parent_dict_key][subdict_key]))

Filter:   0%|          | 0/4000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [7]:
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 3727
    })
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 925
    })
})

In [8]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [9]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        answer = answers[i]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)
        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [10]:
# Squad_v2 preprocess
tokenized_squad_v2 = filtered_dataset.map(preprocess_function, batched=True, remove_columns=filtered_dataset["train"].column_names)

Map:   0%|          | 0/3727 [00:00<?, ? examples/s]

Map:   0%|          | 0/925 [00:00<?, ? examples/s]

In [11]:
tokenized_squad_v2

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 3727
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'start_positions', 'end_positions'],
        num_rows: 925
    })
})

In [12]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [13]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

In [59]:
from sklearn.metrics import f1_score

def compute_metrics(pred):

    start_preds, end_preds = pred.predictions
    start_positions, end_positions = pred.label_ids

    start_f1 = f1_score(start_positions, start_preds.argmax(axis=1), average='micro')
    end_f1 = f1_score(end_positions, end_preds.argmax(axis=1), average='micro')
    f1 = (start_f1 + end_f1) / 2  # Average F1 score for start and end positions

    return {"f1": f1}

In [61]:
training_args = TrainingArguments(
    output_dir="my_bert_qa_model_04",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    push_to_hub=True,
  )

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_squad_v2["train"],
    eval_dataset=tokenized_squad_v2["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [21]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,0.827426
2,0.153600,0.837485
3,0.120200,0.933214


Checkpoint destination directory my_bert_qa_model_04/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory my_bert_qa_model_04/checkpoint-1000 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=1398, training_loss=0.12352363029774678, metrics={'train_runtime': 934.8198, 'train_samples_per_second': 11.961, 'train_steps_per_second': 1.495, 'total_flos': 2191169277798912.0, 'train_loss': 0.12352363029774678, 'epoch': 3.0})

In [62]:
# Evaluate the model
results = trainer.evaluate()

print("F1 Score:", results["eval_f1"])

[ 21  39  41  91  68  27 227  37  46  30  77  38 114 102  15  98 208 112
  46  80  73 137 139 210 149   0 234 225  89  81 296  24  40  44 108  19
 317  57  88 122  87 177  43  48 104 135  23  27  32  16 102 125  45  88
  93  58  33  45  15  32 233  36 134 136  14  15  97  88  59  21  14  55
 326  22  13  44 106 129  86 175  98  25  90  44  91  64 110 137  97  13
  54 104  33  27  48  44  46  49 204  85  65  48  74 106  34  46  77  16
  99  58 137  67  20  18  26  69 118  17 156  51  17 159  12  47  43 142
 329  57  55 251  78 106  30 167  39 110  58  15  58 138  42  44  48  67
  18 173  41  52  64 110  62 215  74  63  37  72  90  14  67  95  28  18
  66  19 101  21 157  11  32  16  75  93  19 312  24  99  91  67  39  45
 184 100  98 149  72 128  80  27  29  68  45  23  35  47 121  18  22 234
 152  69 127  45 251  19 149  89  54  19  52  85  41 168  33  57  23  29
  32  14 111  40 176 257  46 132  15  21  33  23  11  18 187  72  61  15
  99  44 117  24 110  22 318  36  62  51  16  66 10

In [63]:
question = "How many programming languages does BLOOM support?"
context = "BLOOM has 176 billion parameters and can generate text in 46 languages natural languages and 13 programming languages."

In [64]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="my_bert_qa_model_04")
question_answerer(question=question, context=context)

{'score': 0.9999839067459106, 'start': 93, 'end': 95, 'answer': '13'}

In [68]:
# Gradio Deployment
import gradio as gr
from transformers import pipeline

def qa_system(context, question):

  question_answerer = pipeline("question-answering", model="my_bert_qa_model_04")
  answer = question_answerer(question=question, context=context)
  return answer['answer']

iface = gr.Interface(fn=qa_system, inputs=["text", "text"], outputs="text")
iface.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://d791f8fb9f04d27c58.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


