In [1]:
!pip install transformers

Successfully installed huggingface-hub-0.5.1 pyyaml-6.0 sacremoses-0.0.53 tokenizers-0.12.1 transformers-4.18.0


In [2]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertForQuestionAnswering, AutoTokenizer
from collections import Counter
import string
import regex as re

In [190]:
#Model
model1 = 'salti/bert-base-multilingual-cased-finetuned-squad' # Just multilingual on squad : F1 Score:  0.4282051282051282
model2 = 'mrm8488/bert-base-spanish-wwm-cased-finetuned-spa-squad2-es' # Spanish bert on spanish squad : F1 Score:  0.2
model3 = 'jcblaise/bert-tagalog-base-cased' # Tagalog Bert, not fine-tuned on squad : F1 Score:  0.065


model4 = '/content/drive/MyDrive/mnlp/final/tagalog_wikipedia' # multilingual trained on wikipedia manual 54 in tagalog : F1 Score:  0.1831179956580414
model5 = '/content/drive/MyDrive/mnlp/final/spanish_bert_manual_wikipedia' # spanish bert trained on wikipedia manual 54 in tagalog : 0.1499
model6 = '/content/drive/MyDrive/mnlp/final/tagalog_bert_manual_wikipedia' # tagalog bert trained on wikipedia manual 


model7 = '/content/drive/MyDrive/mnlp/final/mbert_tagalog_from_ind_squad' # multilingual bert on indonesian->tagalog squad
model8 = '/content/drive/MyDrive/mnlp/final/spanish_bert_tagalog_from_ind_squad' # spanish bert on indonesian->tagalog squad
model9 = '/content/drive/MyDrive/mnlp/final/tagalog_bert_tagalog_from_ind_squad' # tagalog bert on indonesian->tagalog squad


model10 = '/content/drive/MyDrive/mnlp/final/mbert_spanish_from_ind_squad' # multilingual bert on spanish->tagalog squad
model11 = '/content/drive/MyDrive/mnlp/final/spanish_bert_spanish_from_ind_squad' # spanish bert on spanish->tagalog squad
model12 = '/content/drive/MyDrive/mnlp/final/tagalog_bert_spanish_from_ind_squad' # tagalog bert on spanish->tagalog squad


#NLP POWER

model = BertForQuestionAnswering.from_pretrained('/content/drive/MyDrive/mnlp/final/power_120')


#Tokenizer
tokenizer1 = 'bert-base-multilingual-cased'
tokenizer2 = 'dccuchile/bert-base-spanish-wwm-uncased'

tokenizer = BertTokenizer.from_pretrained(tokenizer1)

#tokenizer = AutoTokenizer.from_pretrained('jcblaise/bert-tagalog-base-cased')

In [199]:
df = pd.read_csv('/content/qa_120.csv')

In [200]:
def get_answer(context, question):
  context = context.replace("\n", "").strip()
  question = question.replace("\n", "").strip()

  # encoding = tokenizer.encode_plus(text=question,text_pair=context, add_special=True)
  encoding = tokenizer.encode_plus(text=question,text_pair=context)

  inputs = encoding['input_ids']  #Token embeddings
  sentence_embedding = encoding['token_type_ids']  #Segment embeddings
  tokens = tokenizer.convert_ids_to_tokens(inputs) #input tokens

  output = model(input_ids=torch.tensor([inputs]), token_type_ids=torch.tensor([sentence_embedding]))
  start_index = torch.argmax(output.start_logits)
  end_index = torch.argmax(output.end_logits)

  answer = ' '.join(tokens[start_index:end_index+1])


  corrected_answer = ''

  for word in answer.split():
    
    #If it's a subword token
    if word[0:2] == '##':
        corrected_answer += word[2:]
    else:
        corrected_answer += ' ' + word
  
  return corrected_answer

In [201]:
df['predicted'] = df.apply(lambda row: get_answer(row['Context'], row['Question']), axis=1)

In [202]:
df

Unnamed: 0,Question,Context,Answer,predicted
0,Saan ang tagpuan?,Bagaman pangkalahatang pinakakahuluganan nila ...,sa Asya na napapangibabawan,[CLS] Sa
1,Saang mga bansa pumupunta ang mga Pilipino nur...,Maraming umalis na mga nurses mula sa Pilipina...,"Ang mga bansang ito ay Inglatira, Germanya, Es...",[CLS] Saang mga bansa
2,Ano ang pinakamalaking korporasyon sa bansa sa...,Ang NAPOCOR ay ang pinakamalaking korporasyon ...,Ang NAPOCOR ay ang pinakamalaking korporasyon,[CLS] Ano ang pinakamalaking korporasyon sa b...
3,Ano ang sukat ng tainga at anong kulay ng marka?,May habang 13.5 sentimetro ang mga tainga na m...,May habang 13.5 sentimetro ang mga tainga na m...,[CLS] Ano ang sukat ng
4,Ilang ang populasyong ng tamaraw noong 1953?,"Noong mga unang panahon ng mga dekada ng 1900,...","Noong 1953, kulang sa ang tinatayang nabubuhay...",[CLS] Il
...,...,...,...,...
115,Ilan ang panlalaro ang kumuha ng double digit?,Subalit hindi ito naging dahilan upang panghin...,ang limang manlalaro,[CLS] Ilan ang panlalaro ang kumuha ng double...
116,Sa aeral rural ano ang pag aari ng babae?,"Sa mga areang rural, pag-aari ng tahanan ang b...",ng tahanan ang babaeng Pilipino,[CLS] Sa
117,Sino ang nakatuklas ng keso?,Hindi alam kung kailan eksaktong sinimulan ang...,at nomadikong Turko ng Gitnang Asya,[CLS] Sino
118,Sino ang ang naginf diktadura?,munlad ang ekonomiya ng Pilipinas noong dekada...,Pangulong Ferdinand Marcos na nagpahayag,[CLS] Sino ang ang naginf diktadura ? [SEP] m...


In [203]:
def normalize_answer(s):
    def remove_articles(text):
        return re.sub(r"\b(a|an|the)\b", " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [204]:
def get_f1(ground_truth, answer):
  prediction_tokens = normalize_answer(answer).split()
  ground_truth_tokens = normalize_answer(ground_truth).split()
  common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
  num_same = sum(common.values())
  if num_same == 0:
      return 0
  precision = 1.0 * num_same / len(prediction_tokens)
  recall = 1.0 * num_same / len(ground_truth_tokens)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

In [205]:
df['f1'] = df.apply(lambda row: get_f1(row['predicted'], row['Answer']), axis=1)

In [206]:
f1_score = df['f1'].mean()
print("F1 Score: ", f1_score)

F1 Score:  0.14433413021523528
