In [1]:
!pip install datasets transformers evaluate
!pip install -U accelerate



In [2]:
from torch.utils.data import Dataset
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from torch import nn
import torch
from transformers import AutoTokenizer


In [3]:
dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

def get_answer_start(row):
  return row["annotations"]["answer_start"][0]

def get_answer(row):
  return row["annotations"]["answer_text"][0]

def get_document(row):
  return row["document_plaintext"]

def get_question(row):
  return row["question_text"]

def oracle(answer, document):
  return answer != "" and answer in document

def get_language(dataset, lang):
  return dataset.filter(lambda x: x["language"] == lang)#[row for row in dataset if row['language'] == lang]

In [4]:
train_arabic = get_language(train_set, "arabic")
val_arabic = get_language(validation_set, "arabic")

train_bengali = get_language(train_set, "bengali")
val_bengali = get_language(validation_set, "bengali")

train_indonesian = get_language(train_set, "indonesian")
val_indonesian = get_language(validation_set, "indonesian")

In [5]:
train_arabic_tt = train_arabic.train_test_split(test_size=0.2)
train_bengali_tt = train_bengali.train_test_split(test_size=0.2)
train_indonesian_tt = train_indonesian.train_test_split(test_size=0.2)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [7]:
def preprocess(examples):

    questions = [q.strip() for q in examples["question_text"]]

    inputs = tokenizer(

        questions,

        examples["document_plaintext"],

        max_length=512,

        truncation="only_second",

        return_offsets_mapping=True,

        padding="max_length",

    )

    offset_mapping = inputs.pop("offset_mapping")

    answers = examples["annotations"]

    start_positions = []

    end_positions = []

    for i, offset in enumerate(offset_mapping):

        answer = answers[i]

        start_char = answer["answer_start"][0]

        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context

        idx = 0

        while sequence_ids[idx] != 1:

            idx += 1

        context_start = idx

        while sequence_ids[idx] == 1:

            idx += 1

        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:

            start_positions.append(0)

            end_positions.append(0)

        else:

            # Otherwise it's the start and end token positions

            idx = context_start

            while idx <= context_end and offset[idx][0] <= start_char:

                idx += 1

            start_positions.append(idx - 1)

            idx = context_end

            while idx >= context_start and offset[idx][1] >= end_char:

                idx -= 1

            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions

    inputs["end_positions"] = end_positions

    return inputs

In [8]:
tokenized_train_bn = train_bengali_tt.map(preprocess, batched=True, remove_columns=train_bengali_tt["train"].column_names)

Map:   0%|          | 0/3823 [00:00<?, ? examples/s]

Map:   0%|          | 0/956 [00:00<?, ? examples/s]

In [9]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-multilingual-cased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
training_args = TrainingArguments(

    output_dir="my_awesome_qa_model",

    evaluation_strategy="epoch",

    learning_rate=2e-5,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=16,

    num_train_epochs=3,

    weight_decay=0.01,

    push_to_hub=True

)

trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_train_bn["train"],

    eval_dataset=tokenized_train_bn["test"],

    tokenizer=tokenizer,

    data_collator=data_collator,

)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,1.700044
2,No log,1.734977
3,1.029600,1.78305


TrainOutput(global_step=717, training_loss=0.9339619826738472, metrics={'train_runtime': 655.9566, 'train_samples_per_second': 17.484, 'train_steps_per_second': 1.093, 'total_flos': 1498460404267008.0, 'train_loss': 0.9339619826738472, 'epoch': 3.0})

In [20]:
from transformers import pipeline

question_answerer = pipeline("question-answering", model="my_awesome_qa_model")


  question = get_question(row)
  document = get_document(row)
  print(row["annotations"])
  print(question_answerer(question=question, context=document))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'answer_start': [274], 'answer_text': ['স্যার জর্জ সিডেনহাম ক্লার্ক']}
{'score': 0.06585487723350525, 'start': 274, 'end': 301, 'answer': 'স্যার জর্জ সিডেনহাম ক্লার্ক'}
{'answer_start': [209], 'answer_text': ['ঝাঁসির রানি']}
{'score': 0.09006297588348389, 'start': 209, 'end': 233, 'answer': 'ঝাঁসির রানি ঝাঁসির রানির'}
{'answer_start': [345], 'answer_text': ['১৯৮৪ সালে']}
{'score': 0.015746712684631348, 'start': 685, 'end': 705, 'answer': '১৯৮৬ সালের ২৯ এপ্রিল'}
{'answer_start': [81], 'answer_text': ['১৪৭৩ সালের ১৮ ফেব্রুয়ারী']}
{'score': 0.19014008343219757, 'start': 81, 'end': 106, 'answer': '১৪৭৩ সালের ১৮ ফেব্রুয়ারী'}
{'answer_start': [42], 'answer_text': ['২৯শে আগস্ট, ১৯৭১ সালে']}
{'score': 0.3674885034561157, 'start': 54, 'end': 58, 'answer': '১৯৭১'}
{'answer_start': [19], 'answer_text': ['মুর্থি']}
{'score': 0.00033903244184330106, 'start': 19, 'end': 26, 'answer': 'মুর্থির'}
{'answer_start': [516], 'answer_text': ['গুগলপ্লেক্স নামে মাউন্টেইন ভিউতে']}
{'score': 0.12454741448163

KeyboardInterrupt: ignored