In [None]:
import nltk
import string
from datasets import load_dataset
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

<a href="https://colab.research.google.com/github/Axel0087/NLP2023/blob/main/project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install datasets



In [None]:
dataset = load_dataset("copenlu/answerable_tydiqa")

In [None]:
train_set = dataset["train"]
validation_set = dataset["validation"]

In [None]:
def get_answer_start(row):
  return row["annotations"]["answer_start"][0]

def get_answer(row):
  return row["annotations"]["answer_text"][0]

def get_document(row):
  return row["document_plaintext"]

def get_question(row):
  return row["question_text"]

def oracle(answer, document):
  return answer != "" and answer in document

def get_language(dataset, lang):
  return [row for row in dataset if row['language'] == lang]

In [None]:


train_arabic = get_language(train_set, "arabic")
val_arabic = get_language(validation_set, "arabic")

train_bengali = get_language(train_set, "bengali")
val_bengali = get_language(validation_set, "bengali")

train_indonesian = get_language(train_set, "indonesian")
val_indonesian = get_language(validation_set, "indonesian")

In [1]:
def ratio_string(train, val):
  val_ratio = round(len(val)/len(train)*100)
  train_ratio = 100-val_ratio
  return f"{train_ratio} / {val_ratio}"

def answerable_ratio(ds):
  answerable = round(sum([1 for row in ds if get_answer_start(row) == -1])/len(ds)*100)
  nonansw = 100-answerable
  return f"{answerable} / {nonansw}"

print(f"""
Dataset features:

{train_set.column_names}

Dataset sizes:

(Arabic) Training set:                                          {len(train_arabic)}
(Arabic) Validation set:                                        {len(val_arabic)}
(Arabic) Ratio (Training/Val):                                  {ratio_string(train_arabic, val_arabic)}
(Arabic) Training balance (Answerable / Not answerable):        {answerable_ratio(train_arabic)}
(Arabic) Validation balance (Answerable / Not answerable):      {answerable_ratio(val_arabic)}

(Bengali) Training set:                                         {len(train_bengali)}
(Bengali) Validation set:                                       {len(val_bengali)}
(Bengali) Ratio (Training/Val):                                 {ratio_string(train_bengali, val_bengali)}
(Bengali) Training balance (Answerable / Not answerable):       {answerable_ratio(train_bengali)}
(Bengali) Validation balance (Answerable / Not answerable):     {answerable_ratio(val_bengali)}

(Indonesian) Training set:                                      {len(train_indonesian)}
(Indonesian) Validation set:                                    {len(val_indonesian)}
(Indonesian) Ratio (Training/Val):                              {ratio_string(train_indonesian, val_indonesian)}
(Indonesian) Training balance (Answerable / Not answerable):    {answerable_ratio(train_indonesian)}
(Indonesian) Validation balance (Answerable / Not answerable):  {answerable_ratio(val_indonesian)}
""")

NameError: ignored

In [None]:
def bag_of_words(dataset, column):
  bag = {}
  for row in dataset:
    tokens = nltk.word_tokenize(row[column])

    for token in tokens:

      if not token in bag:
        bag[token] = 0

      bag[token] += 1
      #print(bag)
  return sorted(bag.items(), key=lambda item: item[1], reverse=True)

#def sort_bag(bag):
#  return sorted(bag.items(), key=lambda item: item[1], reverse=True)

In [None]:
arabic_doc_bow = bag_of_words(train_arabic, "document_plaintext")
arabic_question_bow = bag_of_words(train_arabic, "question_text")

bengali_doc_bow = bag_of_words(train_bengali, "document_plaintext")
bengali_question_bow = bag_of_words(train_bengali, "question_text")

indonesian_doc_bow = bag_of_words(train_indonesian, "document_plaintext")
indonesian_question_bow = bag_of_words(train_indonesian, "question_text")

In [None]:
print(f"""

Most common words:

(Arabic) Documents: {arabic_doc_bow[0:5]}
(Arabic) Questions: {arabic_question_bow[0:5]}

(Bengali) Documents: {bengali_doc_bow[0:5]}
(Bengali) Questions: {bengali_question_bow[0:5]}

(Indonesian) Documents: {indonesian_doc_bow[0:5]}
(Indonesian) Questions: {indonesian_question_bow[0:5]}
""")



Most common words:

(Arabic) Documents: [('في', 89705), ('.', 88299), ('من', 61719), ('[', 38120), (']', 38119)]
(Arabic) Questions: [('؟', 10061), ('ما', 7451), ('متى', 7130), ('هو', 6760), ('من', 6309)]

(Bengali) Documents: [(',', 12184), (']', 7123), ('[', 7120), ('ও', 5195), ('এবং', 5102)]
(Bengali) Questions: [('?', 4777), ('কী', 940), ('নাম', 837), ('কত', 802), ('হয়', 800)]

(Indonesian) Documents: [(',', 54165), ('.', 43063), ('yang', 24077), ('dan', 23741), ('di', 16604)]
(Indonesian) Questions: [('?', 11368), ('yang', 1814), ('Kapan', 1811), ('Apa', 1633), ('Apakah', 1227)]



In [None]:
def get_ratio(question, document, stop_words):
  tokens = nltk.word_tokenize(question)
  count = 0
  stripped_tokens = set(tokens) - stop_words
  for token in stripped_tokens:
    if token in document:
      count += 1
  return count/len(stripped_tokens)


def avg(lst):
  return sum(lst)/len(lst)

def get_average_ratios(training, stop_words):
  answerable_ratios = []
  nonanswerable_ratios = []
  for row in training:
    ratio = get_ratio(get_question(row), get_document(row), stop_words)
    lst = answerable_ratios if oracle(get_answer(row), get_document(row)) else nonanswerable_ratios
    lst.append(ratio)
  return avg(answerable_ratios), avg(nonanswerable_ratios)

class NaiveModel:
  def __init__(self, stop_words):
    self.stop_words = stop_words
    self.ratio = -1

  def train(self, training):
    answerable_ratio, nonanswerable_ratio = get_average_ratios(training, self.stop_words)
    self.ratio = (answerable_ratio + nonanswerable_ratio)/2

  def classify(self, question, document):
    return get_ratio(question, document, self.stop_words) > self.ratio

def evaluate(validation, model):
  res = [int(oracle(get_answer(row), get_document(row)) == model.classify(get_question(row), get_document(row))) for row in validation]
  acc = avg(res)

  ### Manual generation of confusion matrix for scores like Balanced Accuray and F-score
  #tp, fp, tn, fn = 0, 0, 0, 0
  #for row in validation:
  #  gt = oracle(get_answer(row), get_document(row))
  #  cl = model.classify(get_question(row), get_document(row))
  #  if (cl):
  #    if (gt):
  #      tp += 1
  #    else:
  #      fp += 1
  #  else:
  #    if (gt):
  #      fn += 1
  #    else:
  #      tn += 1
  #tpr = tp / (tp + fn)
  #tnr = tn / (tn + fp)
  #ba = (tpr + tnr) / 2

  print(f"Accuracy: {round(acc*100, 4)}%\n")

In [None]:
from nltk.corpus import stopwords

nltk.download('stopwords')

arabic_stop_words = set(stopwords.words('indonesian')) | set(string.punctuation) | set("؟")
bengali_stop_words = set(stopwords.words('bengali')) | set(string.punctuation)
indonesian_stop_words = set(stopwords.words('indonesian')) | set(string.punctuation)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
print("Evaluating arabic:")

arabic_model = NaiveModel(arabic_stop_words)
arabic_model.train(train_arabic)
evaluate(val_arabic, arabic_model)

print("Evaluating bengali:")

bengali_model = NaiveModel(bengali_stop_words)
bengali_model.train(train_bengali)
evaluate(val_bengali, bengali_model)

print("Evaluating indonesian:")

indonesian_model = NaiveModel(indonesian_stop_words)
indonesian_model.train(train_indonesian)
evaluate(val_indonesian, indonesian_model)

Evaluating arabic:
Accuracy: 71.6614%

Evaluating bengali:
Accuracy: 72.3214%

Evaluating indonesian:
Accuracy: 71.2007%

