In [1]:
!pip install datasets transformers evaluate
!pip install -U accelerate

Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m98.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8

In [2]:
from torch.utils.data import Dataset
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from torch import nn
import torch
from transformers import AutoTokenizer


In [3]:
# Load the copenlu/answerable_tydiqa dataset and keep handles to its two splits.
dataset = load_dataset("copenlu/answerable_tydiqa")

train_set = dataset["train"]
validation_set = dataset["validation"]

def get_answer_start(row):
  """Return the first entry of the row's annotated ``answer_start`` list."""
  annotations = row["annotations"]
  start_positions = annotations["answer_start"]
  return start_positions[0]

def get_answer(row):
  """Return the first entry of the row's annotated ``answer_text`` list."""
  annotations = row["annotations"]
  answer_texts = annotations["answer_text"]
  return answer_texts[0]

def get_document(row):
  """Return the value stored under the row's ``document_plaintext`` key."""
  document_text = row["document_plaintext"]
  return document_text

def get_question(row):
  """Return the value stored under the row's ``question_text`` key."""
  question_text = row["question_text"]
  return question_text

def oracle(answer, document):
  """Return True when `answer` is a non-empty string that occurs in `document`.

  An empty answer always yields False, even though "" is technically a
  substring of every string.
  """
  if answer == "":
    return False
  return answer in document

def row_oracle(row):
  """Apply `oracle` to a row's first annotated answer text and its document text."""
  answer_text = get_answer(row)
  document_text = get_document(row)
  return oracle(answer_text, document_text)

def get_language(dataset, lang):
  """Return the result of filtering `dataset` to rows whose "language" equals `lang`.

  `dataset` only needs a ``filter(predicate)`` method; the predicate receives
  one row at a time.
  """
  def is_requested_language(example):
    return example["language"] == lang

  return dataset.filter(is_requested_language)

Downloading readme:   0%|          | 0.00/4.94k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/71.6M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/7.49M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/116067 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [4]:
# Per-language subsets of both splits, selected on the "language" column by get_language.
train_arabic = get_language(train_set, "arabic")
val_arabic = get_language(validation_set, "arabic")

train_bengali = get_language(train_set, "bengali")
val_bengali = get_language(validation_set, "bengali")

train_indonesian = get_language(train_set, "indonesian")
val_indonesian = get_language(validation_set, "indonesian")

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/116067 [00:00<?, ? examples/s]

Filter:   0%|          | 0/13325 [00:00<?, ? examples/s]

In [5]:
# Hold out 20% of each language's training rows. The split shuffles the rows, so a
# fixed seed is passed to get the same partition on every Restart & Run All.
train_arabic_tt = train_arabic.train_test_split(test_size=0.2, seed=42)
train_bengali_tt = train_bengali.train_test_split(test_size=0.2, seed=42)
train_indonesian_tt = train_indonesian.train_test_split(test_size=0.2, seed=42)

In [6]:
import nltk

# Download the NLTK 'stopwords' corpus so stopwords.words(...) below can read it.
nltk.download('stopwords')
from nltk.corpus import stopwords

# One de-duplicated stop-word list per language; going through set() means the
# order of each resulting list is arbitrary.
bengali_removal_list = list(set(stopwords.words('bengali')))
arabic_removal_list = list(set(stopwords.words('arabic')))
indonesian_removal_list = list(set(stopwords.words('indonesian')))

def remove_stop_words(sentence, stopwords):
  """Drop every whitespace-separated token of `sentence` that is in `stopwords`.

  Args:
    sentence: text to filter; it is split on any run of whitespace.
    stopwords: collection of tokens to remove. Matching is exact and
      case-sensitive. (The parameter name shadows the module-level
      `nltk.corpus.stopwords` import inside this function only.)

  Returns:
    The surviving tokens joined by single spaces, so original whitespace
    runs are collapsed. An empty string if nothing survives.
  """
  # Membership tests against a list/tuple are linear per word; hashing the
  # collection once per call makes each lookup constant-time. Other containers
  # (e.g. sets) are used as given.
  if isinstance(stopwords, (list, tuple)):
    stopwords = frozenset(stopwords)
  return ' '.join(word for word in sentence.split() if word not in stopwords)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Validation

In [32]:
import evaluate
import numpy as np

# Metric object whose .compute(predictions=..., references=...) is used by every evaluation cell below.
accuracy = evaluate.load("accuracy")

In [13]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login widget.
# NOTE(review): presumably needed to download the Axel-0087 checkpoints loaded below —
# confirm whether those repositories are public, in which case this cell can be skipped.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [15]:
# Prefer the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Bengali

### Load Model

In [14]:
from transformers import AutoModelForSequenceClassification
# Tokenizer and sequence-classification weights are loaded from the same Hub repository.
tokenizer_bengali = AutoTokenizer.from_pretrained("Axel-0087/my_awesome_clas_model_bn")

model_bengali = AutoModelForSequenceClassification.from_pretrained("Axel-0087/my_awesome_clas_model_bn")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

### test


### Validate on Bengali

In [16]:
# Run model_bengali over the Bengali validation rows, collecting one predicted
# class id and one oracle label per row.
preds = []
labels = []

for row in val_bengali:
  # Question and document get the same stop-word filtering.
  document = remove_stop_words(get_document(row), bengali_removal_list)
  question = remove_stop_words(get_question(row), bengali_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_bengali(text, return_tensors="pt", truncation=True).to(model_bengali.device)
  with torch.no_grad():
    logits = model_bengali(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels.append(int(row_oracle(row)))

In [17]:
# Fraction of rows where the prediction equals the oracle label (model_bengali on val_bengali, per the preceding loop).
accuracy.compute(predictions=preds, references=labels)

{'accuracy': 0.8303571428571429}

### Cross-lingual validation (Bengali model on the other languages)

#### Arabic

In [18]:
# Run model_bengali (with its own tokenizer) over the Arabic validation rows,
# collecting one predicted class id and one oracle label per row.
preds_arabic = []
labels_arabic = []

for row in val_arabic:
  # Stop words follow the language of the data, not of the model.
  document = remove_stop_words(get_document(row), arabic_removal_list)
  question = remove_stop_words(get_question(row), arabic_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_bengali(text, return_tensors="pt", truncation=True).to(model_bengali.device)
  with torch.no_grad():
    logits = model_bengali(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds_arabic.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels_arabic.append(int(row_oracle(row)))

In [19]:
# Fraction of rows where the prediction equals the oracle label (model_bengali on val_arabic, per the preceding loop).
accuracy.compute(predictions=preds_arabic, references=labels_arabic)

{'accuracy': 0.8454258675078864}

#### Indonesian

In [20]:
# Run model_bengali (with its own tokenizer) over the Indonesian validation rows,
# collecting one predicted class id and one oracle label per row.
preds_indonesian = []
labels_indonesian = []

for row in val_indonesian:
  # Stop words follow the language of the data, not of the model.
  document = remove_stop_words(get_document(row), indonesian_removal_list)
  question = remove_stop_words(get_question(row), indonesian_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_bengali(text, return_tensors="pt", truncation=True).to(model_bengali.device)
  with torch.no_grad():
    logits = model_bengali(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds_indonesian.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels_indonesian.append(int(row_oracle(row)))

In [21]:
# Fraction of rows where the prediction equals the oracle label (model_bengali on val_indonesian, per the preceding loop).
accuracy.compute(predictions=preds_indonesian, references=labels_indonesian)

{'accuracy': 0.7959697732997482}

## Arabic

### Load model

In [22]:
from transformers import AutoModelForSequenceClassification
# Tokenizer and sequence-classification weights are loaded from the same Hub repository.
tokenizer_arabic = AutoTokenizer.from_pretrained("Axel-0087/my_awesome_clas_model_ar")

model_arabic = AutoModelForSequenceClassification.from_pretrained("Axel-0087/my_awesome_clas_model_ar")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

### Validation on Arabic

In [None]:
# Run model_arabic over the Arabic validation rows, collecting one predicted
# class id and one oracle label per row. Note this reuses the names `preds`
# and `labels`, overwriting any earlier contents.
preds = []
labels = []

for row in val_arabic:
  # Question and document get the same stop-word filtering.
  document = remove_stop_words(get_document(row), arabic_removal_list)
  question = remove_stop_words(get_question(row), arabic_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_arabic(text, return_tensors="pt", truncation=True).to(model_arabic.device)
  with torch.no_grad():
    logits = model_arabic(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels.append(int(row_oracle(row)))

In [None]:
# Fraction of rows where the prediction equals the oracle label (model_arabic on val_arabic, per the preceding loop).
accuracy.compute(predictions=preds, references=labels)

{'accuracy': 0.9227129337539433}

### Cross-lingual validation (Arabic model on the other languages)

#### Bengali

In [23]:
# Run model_arabic (with its own tokenizer) over the Bengali validation rows,
# collecting one predicted class id and one oracle label per row.
preds_bengali = []
labels_bengali = []

for row in val_bengali:
  # Stop words follow the language of the data, not of the model.
  document = remove_stop_words(get_document(row), bengali_removal_list)
  question = remove_stop_words(get_question(row), bengali_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_arabic(text, return_tensors="pt", truncation=True).to(model_arabic.device)
  with torch.no_grad():
    logits = model_arabic(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds_bengali.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels_bengali.append(int(row_oracle(row)))

In [24]:
# Fraction of rows where the prediction equals the oracle label (model_arabic on val_bengali, per the preceding loop).
accuracy.compute(predictions=preds_bengali, references=labels_bengali)

{'accuracy': 0.8169642857142857}

#### Indonesian

In [25]:
# Run model_arabic (with its own tokenizer) over the Indonesian validation rows,
# collecting one predicted class id and one oracle label per row.
preds_indonesian = []
labels_indonesian = []

for row in val_indonesian:
  # Stop words follow the language of the data, not of the model.
  document = remove_stop_words(get_document(row), indonesian_removal_list)
  question = remove_stop_words(get_question(row), indonesian_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_arabic(text, return_tensors="pt", truncation=True).to(model_arabic.device)
  with torch.no_grad():
    logits = model_arabic(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds_indonesian.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels_indonesian.append(int(row_oracle(row)))

In [26]:
# Fraction of rows where the prediction equals the oracle label (model_arabic on val_indonesian, per the preceding loop).
accuracy.compute(predictions=preds_indonesian, references=labels_indonesian)

{'accuracy': 0.8303946263643996}

## Indonesian

### Load Model

In [27]:
from transformers import AutoModelForSequenceClassification
# Tokenizer and sequence-classification weights are loaded from the same Hub repository.
tokenizer_indonesian = AutoTokenizer.from_pretrained("Axel-0087/my_awesome_clas_model_in")

model_indonesian = AutoModelForSequenceClassification.from_pretrained("Axel-0087/my_awesome_clas_model_in")

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.92M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/895 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/711M [00:00<?, ?B/s]

### Validation on Indonesian

In [None]:
# Run model_indonesian over the Indonesian validation rows, collecting one
# predicted class id and one oracle label per row. Note this reuses the names
# `preds` and `labels`, overwriting any earlier contents.
preds = []
labels = []

for row in val_indonesian:
  # Question and document get the same stop-word filtering.
  document = remove_stop_words(get_document(row), indonesian_removal_list)
  question = remove_stop_words(get_question(row), indonesian_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_indonesian(text, return_tensors="pt", truncation=True).to(model_indonesian.device)
  with torch.no_grad():
    logits = model_indonesian(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels.append(int(row_oracle(row)))

In [None]:
# Fraction of rows where the prediction equals the oracle label (model_indonesian on val_indonesian, per the preceding loop).
accuracy.compute(predictions=preds, references=labels)

{'accuracy': 0.8706968933669186}

### Cross-lingual validation (Indonesian model on the other languages)

#### Arabic

In [30]:
# Run model_indonesian (with its own tokenizer) over the Arabic validation rows,
# collecting one predicted class id and one oracle label per row.
preds_arabic = []
labels_arabic = []

for row in val_arabic:
  # Stop words follow the language of the data, not of the model.
  document = remove_stop_words(get_document(row), arabic_removal_list)
  question = remove_stop_words(get_question(row), arabic_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_indonesian(text, return_tensors="pt", truncation=True).to(model_indonesian.device)
  with torch.no_grad():
    logits = model_indonesian(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds_arabic.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels_arabic.append(int(row_oracle(row)))

In [31]:
# Fraction of rows where the prediction equals the oracle label (model_indonesian on val_arabic, per the preceding loop).
accuracy.compute(predictions=preds_arabic, references=labels_arabic)

{'accuracy': 0.8980021030494216}

#### Bengali

In [28]:
# Run model_indonesian (with its own tokenizer) over the Bengali validation rows,
# collecting one predicted class id and one oracle label per row.
preds_bengali = []
labels_bengali = []

for row in val_bengali:
  # Stop words follow the language of the data, not of the model.
  document = remove_stop_words(get_document(row), bengali_removal_list)
  question = remove_stop_words(get_question(row), bengali_removal_list)

  text = question + ' <SEP> ' + document

  # Keep the encoded batch on whatever device the model currently lives on.
  inputs = tokenizer_indonesian(text, return_tensors="pt", truncation=True).to(model_indonesian.device)
  with torch.no_grad():
    logits = model_indonesian(**inputs).logits
  # A single text is encoded, so argmax over the class axis yields one value;
  # .item() stores it as a plain int rather than a 1-element tensor.
  preds_bengali.append(logits.argmax(dim=-1).item())

  # Oracle label as 0/1 so it has the same type as the predictions.
  # NOTE(review): assumes class index 1 means "answer present" — confirm against the model's label mapping.
  labels_bengali.append(int(row_oracle(row)))

In [29]:
# Fraction of rows where the prediction equals the oracle label (model_indonesian on val_bengali, per the preceding loop).
accuracy.compute(predictions=preds_bengali, references=labels_bengali)

{'accuracy': 0.78125}