In [1]:
!pip install datasets transformers evaluate
!pip install -U accelerate



In [2]:
from torch.utils.data import Dataset
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from torch import nn
import torch
from transformers import AutoTokenizer

In [3]:
dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

def get_answer_start(row):
  return row["annotations"]["answer_start"][0]

def get_answer(row):
  return row["annotations"]["answer_text"][0]

def get_document(row):
  return row["document_plaintext"]

def get_question(row):
  return row["question_text"]

def oracle(answer, document):
  return answer != "" and answer in document

def get_language(dataset, lang, answerable=False):
  lamb = lambda x: x["language"] == lang and x["annotations"]["answer_text"][0] != "" if answerable else lambda x: x["language"] == lang
  return dataset.filter(lamb)

In [4]:
train_arabic = get_language(train_set, "arabic", True)
val_arabic = get_language(validation_set, "arabic", True)

train_bengali = get_language(train_set, "bengali", True)
val_bengali = get_language(validation_set, "bengali", True)

train_indonesian = get_language(train_set, "indonesian", True)
val_indonesian = get_language(validation_set, "indonesian", True)

In [5]:
train_arabic_tt = train_arabic.train_test_split(test_size=0.2)
train_bengali_tt = train_bengali.train_test_split(test_size=0.2)
train_indonesian_tt = train_indonesian.train_test_split(test_size=0.2)

In [6]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

In [7]:
def preprocess(examples):

    questions = [q.strip() for q in examples["question_text"]]

    inputs = tokenizer(

        questions,

        examples["document_plaintext"],

        max_length=512,

        truncation="only_second",

        return_offsets_mapping=True,

        padding="max_length",

    )

    offset_mapping = inputs.pop("offset_mapping")

    answers = examples["annotations"]

    start_positions = []

    end_positions = []

    for i, offset in enumerate(offset_mapping):

        answer = answers[i]

        start_char = answer["answer_start"][0]

        end_char = answer["answer_start"][0] + len(answer["answer_text"][0])

        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context

        idx = 0

        while sequence_ids[idx] != 1:

            idx += 1

        context_start = idx

        while sequence_ids[idx] == 1:

            idx += 1

        context_end = idx - 1

        # If the answer is not fully inside the context, label it (0, 0)

        if offset[context_start][0] > end_char or offset[context_end][1] < start_char:

            start_positions.append(0)

            end_positions.append(0)

        else:

            # Otherwise it's the start and end token positions

            idx = context_start

            while idx <= context_end and offset[idx][0] <= start_char:

                idx += 1

            start_positions.append(idx - 1)

            idx = context_end

            while idx >= context_start and offset[idx][1] >= end_char:

                idx -= 1

            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions

    inputs["end_positions"] = end_positions

    return inputs

In [8]:
tokenized_train_bn = train_bengali_tt.map(preprocess, batched=True, remove_columns=train_bengali_tt["train"].column_names)

Map:   0%|          | 0/1912 [00:00<?, ? examples/s]

Map:   0%|          | 0/478 [00:00<?, ? examples/s]

In [9]:
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator()

In [10]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-multilingual-cased")

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
training_args = TrainingArguments(

    output_dir="my_awesome_qa_model",

    evaluation_strategy="epoch",

    learning_rate=2e-5,

    per_device_train_batch_size=16,

    per_device_eval_batch_size=16,

    num_train_epochs=3,

    weight_decay=0.01,

    push_to_hub=True

)

trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_train_bn["train"],

    eval_dataset=tokenized_train_bn["test"],

    tokenizer=tokenizer,

    data_collator=data_collator,

)

trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,2.743646
2,No log,2.461924
3,No log,2.431629


TrainOutput(global_step=360, training_loss=2.7214324951171873, metrics={'train_runtime': 326.4017, 'train_samples_per_second': 17.573, 'train_steps_per_second': 1.103, 'total_flos': 749426181783552.0, 'train_loss': 2.7214324951171873, 'epoch': 3.0})

In [13]:
!pip install torchmetrics



In [38]:
from torchmetrics.text import BLEUScore

from transformers import pipeline

question_answerer = pipeline("question-answering", model="my_awesome_qa_model")

preds = []
target = []

bleu = BLEUScore(n_gram=1)

for row in val_bengali:
  pred = question_answerer(question=get_question(row), context=get_document(row))['answer']
  answer = get_answer(row)
  print(pred, "-----", answer)
  print(bleu([pred], [[answer]]))
  preds.append(pred)
  target.append([answer])

bleu(preds, target)




কিং জর্জ ফাইভ ----- স্যার জর্জ সিডেনহাম ক্লার্ক
tensor(0.2388)
ঝাঁসির রানি ঝাঁসির রানির ----- ঝাঁসির রানি
tensor(0.5000)
১৯৮৬ সালের ২৯ এপ্রিল ----- ১৯৮৪ সালে
tensor(0.)
১৪৭৩ সালের ১৮ ফেব্রুয়ারী ----- ১৪৭৩ সালের ১৮ ফেব্রুয়ারী
tensor(1.)
২৯শে আগস্ট, ১৯৭১ ----- ২৯শে আগস্ট, ১৯৭১ সালে
tensor(0.7165)
টেক্সাস অঙ্গরাজ্যের হিউস্টনে ----- মুর্থি
tensor(0.)
ল্যারি পেজকে ----- গুগলপ্লেক্স নামে মাউন্টেইন ভিউতে
tensor(0.)
রাজমনি সেন ----- রাজমনি সেন
tensor(1.)
বাঙালি ----- পয়লা বৈশাখ
tensor(0.)
প্রায় ৫৭ একর ----- ৫৭ একর
tensor(0.6667)
আল-মদিনা ----- ইয়াসরিব
tensor(0.)
আগামী ----- আগামী
tensor(1.)
বহরমপুর ----- বহরমপুর
tensor(1.)
১লা জানুয়ারি, ১৯১৪ ----- তৎকালীন কুমিল্লা জেলার অধীনে ব্রাহ্মণবাড়ীয়া মহকুমার গোকর্ণঘাট গ্রামে
tensor(0.)
১৪৩ মিনিট ----- ১৪৩ মিনিট
tensor(1.)
ঋষভনাথ[19] ইক্ষ্বাকু ----- ঋষভনাথ
tensor(0.)
সাং চিয়েন ----- সাং চিয়েন
tensor(1.)
রিন-ছেন-ব্জাং-পো ----- রিন-ছেন-ব্জাং-পো
tensor(1.)
জর্জটাউন ----- গায়ানা সহযোগিতামূলক প্রজাতন্ত্র
tensor(0.)
১৯৮৩ সালে ----- দুই
tensor(0.)
কা

tensor(0.2675)