In [1]:
import os
# Expose only GPU index 3 to CUDA-based libraries in this process; this is set
# before torch is imported in the next cell.
os.environ['CUDA_VISIBLE_DEVICES'] ='3'

In [2]:
import pandas as pd
import fasttext
from gensim.models import FastText
from gensim.similarities import WmdSimilarity
from pyemd import emd
import stanza
import string
import re
import nltk
from nltk.tokenize import wordpunct_tokenize, word_tokenize
import nltk.data
import matplotlib.pyplot as plt
import numpy as np
from numpy import inf
import seaborn as sns
import ast
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification
import torch

# Do not truncate DataFrame output: show every column and the full cell text.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)

In [3]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the libraries used below.
warnings.filterwarnings('ignore')

In [4]:
import logging
# Only let WARNING and above through from the 'gensim' logger.
logging.getLogger('gensim').setLevel(logging.WARNING)

# Load Dataset

In [694]:
# Load the three test sets. The first CSV column is used as the index and the
# 'answer' column is parsed from its Python-literal text form with ast.literal_eval.
squad_test = pd.read_csv("squad-test.csv",converters = {'answer': ast.literal_eval},index_col=0)
tydiqa_test = pd.read_csv("tydiqa-test.csv", converters = {'answer': ast.literal_eval},index_col=0)
idkmrc_test = pd.read_csv("idkmrc-test.csv",converters = {'answer': ast.literal_eval},index_col=0)

# Util

## Preprocess

In [None]:
# Stanza pipeline for Indonesian ('id') with tokenisation and lemmatisation only.
nlp = stanza.Pipeline('id', processors='tokenize,lemma')

def preprocess_text(text):
    """Lemmatise `text` and drop punctuation-only tokens.

    Args:
        text: raw string passed to the module-level Stanza pipeline `nlp`.

    Returns:
        list[str]: the lemma of every word that has one, in document order,
        excluding empty lemmas and lemmas made up solely of characters from
        `string.punctuation` (so multi-character punctuation such as "..."
        is removed as well as single marks).
    """
    doc = nlp(text)
    lemmas = [
        word.lemma
        for sentence in doc.sentences
        for word in sentence.words
        if word.lemma is not None
    ]
    # Test every character individually: a plain `lemma in string.punctuation`
    # is a substring test and lets tokens such as "..." through.
    return [
        lemma
        for lemma in lemmas
        if lemma and not all(ch in string.punctuation for ch in lemma)
    ]

## WordMoverDistance

In [641]:
# First fastText model (.bin); get_distance below tries this one first.
model_wm_1 = FastText.load_fasttext_format("fasttext-4B-id-uncased/fasttext.4B.id.300.epoch5_uncased_no-oov_pos-idn_uncased.bin")

INFO:gensim.models._fasttext_bin:loading 457946 words for fastText model from fasttext-4B-id-uncased/fasttext.4B.id.300.epoch5_uncased_no-oov_pos-idn_uncased.bin
INFO:gensim.utils:FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-07-01T09:32:48.467754', 'gensim': '4.3.1', 'python': '3.8.10 (default, Mar 13 2023, 10:26:41) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-90-generic-x86_64-with-glibc2.29', 'event': 'created'}
INFO:gensim.models.word2vec:Updating model with new vocabulary
INFO:gensim.utils:FastText lifecycle event {'msg': 'added 457946 new unique words (100.00% of original 457946) and increased the count of 0 pre-existing words (0.00% of original 457946)', 'datetime': '2023-07-01T09:32:50.136800', 'gensim': '4.3.1', 'python': '3.8.10 (default, Mar 13 2023, 10:26:41) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-90-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}
INFO:gensim.models.word2vec:deleting the raw counts dictiona

In [642]:
# Rebind the name to the model's `.wv` attribute, the object `wmdistance` is called on below.
# NOTE(review): not idempotent - re-running this cell without reloading the model fails.
model_wm_1 = model_wm_1.wv

In [643]:
# Second fastText model (.bin); used by get_distance only when the first model's distance is infinite.
model_wm_2 = FastText.load_fasttext_format("fasttext.4B.id.300.epoch5.uncased.bin")

INFO:gensim.models._fasttext_bin:loading 5196198 words for fastText model from fasttext.4B.id.300.epoch5.uncased.bin
INFO:gensim.utils:FastText lifecycle event {'params': 'FastText<vocab=0, vector_size=100, alpha=0.025>', 'datetime': '2023-07-01T09:33:10.949393', 'gensim': '4.3.1', 'python': '3.8.10 (default, Mar 13 2023, 10:26:41) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-90-generic-x86_64-with-glibc2.29', 'event': 'created'}
INFO:gensim.models.word2vec:Updating model with new vocabulary
INFO:gensim.utils:FastText lifecycle event {'msg': 'added 5196198 new unique words (100.00% of original 5196198) and increased the count of 0 pre-existing words (0.00% of original 5196198)', 'datetime': '2023-07-01T09:33:33.032344', 'gensim': '4.3.1', 'python': '3.8.10 (default, Mar 13 2023, 10:26:41) \n[GCC 9.4.0]', 'platform': 'Linux-5.4.0-90-generic-x86_64-with-glibc2.29', 'event': 'prepare_vocab'}
INFO:gensim.utils:FastText lifecycle event {'msg': 'loaded (5196198, 100) weight matrix for fastText m

In [646]:
# Rebind the name to the model's `.wv` attribute, the object `wmdistance` is called on below.
# NOTE(review): not idempotent - re-running this cell without reloading the model fails.
model_wm_2 = model_wm_2.wv

In [647]:
def get_distance(text_1, text_2, model=model_wm_1, model_2=model_wm_2):
    """Word Mover's Distance between two texts, with a fallback embedding model.

    Both texts are lemmatised with `preprocess_text` first.

    Args:
        text_1: first raw text.
        text_2: second raw text.
        model: object exposing `wmdistance(tokens_a, tokens_b)`; tried first.
            Defaults to `model_wm_1`.
        model_2: fallback object with the same method, used only when `model`
            returns infinity. Defaults to `model_wm_2`.

    Returns:
        The distance from `model`, or the distance from `model_2` when the
        first one is infinite.
    """
    tokens_1 = preprocess_text(text_1)
    tokens_2 = preprocess_text(text_2)
    # Use the models passed in; the globals are only the defaults.
    distance = model.wmdistance(tokens_1, tokens_2)
    if distance == inf:
        # gensim reports inf when a document has no in-vocabulary words; retry with the second model.
        return model_2.wmdistance(tokens_1, tokens_2)
    return distance

In [None]:
# Quick check: the same two words in swapped order.
get_distance("kipas angin","angin kipas")

## Split Sentence

In [16]:
# Download the NLTK 'punkt' and 'stopwords' packages; split_sentences below loads a Punkt model.
nltk.download('punkt')
nltk.download('stopwords')

def split_sentences(text):
    """Split `text` into a list of sentences with NLTK's pre-trained Punkt tokenizer.

    The original comment described this as the Punkt tokenizer for Bahasa
    Indonesia; the resource actually loaded is the English Punkt model.
    """
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    return punkt.tokenize(text)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## NER, TIMEX, POSTAG, Chunking

In [17]:
# Load one tokenizer / token-classification model pair per tagging task from the
# local `model/` directory: NER, POS, chunking and TIMEX (in that order).
tokenizer_ner = AutoTokenizer.from_pretrained("model/indobert-large-p2-finetuned-ner")
modul_ner = AutoModelForTokenClassification.from_pretrained("model/indobert-large-p2-finetuned-ner")
tokenizer_pos = AutoTokenizer.from_pretrained("model/indobert-large-p2-finetuned-pos")
modul_pos = AutoModelForTokenClassification.from_pretrained("model/indobert-large-p2-finetuned-pos")
tokenizer_chunking = AutoTokenizer.from_pretrained("model/indobert-large-p2-finetuned-chunking")
modul_chunking = AutoModelForTokenClassification.from_pretrained("model/indobert-large-p2-finetuned-chunking")
tokenizer_timex = AutoTokenizer.from_pretrained("model/indobert-large-p2-finetuned-indotimex")
modul_timex = AutoModelForTokenClassification.from_pretrained("model/indobert-large-p2-finetuned-indotimex")

In [18]:
def predict(model, tokenizer, sentence):
    """Tag every whitespace-separated word of `sentence` with a token-classification model.

    Args:
        model: token-classification model exposing `num_labels`,
            `config.id2label`, `.to(device)`, `.eval()` and a forward call that
            returns the logits as its first output.
        tokenizer: matching tokenizer; called with pre-split words and asked
            for offset mappings.
        sentence: raw sentence string; it is split on whitespace.

    Returns:
        tuple(list[str], list[str]): the words of the sentence and the label
        predicted for the first word piece of each word. Input is truncated to
        512 word pieces, so the label list can be shorter than the word list
        for very long sentences.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    words = sentence.split()
    inputs = tokenizer(words,
                       is_split_into_words=True,
                       return_offsets_mapping=True,
                       return_tensors="pt",
                       padding='max_length',
                       truncation=True,
                       max_length=512)

    model.to(device)
    model.eval()
    ids = inputs["input_ids"].to(device)
    mask = inputs["attention_mask"].to(device)

    # Pure inference: do not record an autograd graph for every sentence.
    with torch.no_grad():
        logits = model(ids, attention_mask=mask)[0]

    label_ids = torch.argmax(logits.view(-1, model.num_labels), dim=1).cpu().tolist()
    offsets = inputs["offset_mapping"].squeeze().tolist()

    # Keep only positions whose offset starts at 0 and is non-empty: the first
    # word piece of each word. Special and padding tokens have offset (0, 0).
    prediction = [
        model.config.id2label[label_id]
        for label_id, offset in zip(label_ids, offsets)
        if offset[0] == 0 and offset[1] != 0
    ]

    return words, prediction

In [19]:
def get_tag_predictions(model, tokenizer, sentences):
    """Run `predict` on every sentence and pair each word with its label.

    Returns:
        list[list[tuple]]: one list of (word, label) pairs per input sentence.
    """
    return [
        list(zip(*predict(model, tokenizer, sentence)))
        for sentence in sentences
    ]

## Dependency Parsing

In [20]:
import sys
# Add the dependency-parsing sources to the import path (presumably where `dep_parser` lives).
# NOTE(review): absolute, machine-specific path.
sys.path.append("/workspace/Tugas Akhir/indolem/dependency_parsing")

In [16]:
from dep_parser import get_dependency

In [17]:
def predict_dependency(words, tags):
    """Convert (word, tag) pairs to the parser's POS inventory and parse them.

    The first two characters of every tag are dropped (presumably a "B-"/"I-"
    prefix - TODO confirm). The remainder is translated through a small
    fine-to-coarse table, and anything not in the parser's inventory becomes
    "X". The resulting (word, pos) list is passed to `get_dependency`.
    """
    known_pos = {
        "_PAD_POS",
        "_ROOT_POS",
        "_END_POS",
        "PROPN",
        "AUX",
        "DET",
        "NOUN",
        "PRON",
        "VERB",
        "ADP",
        "PUNCT",
        "ADV",
        "CCONJ",
        "SCONJ",
        "NUM",
        "ADJ",
        "PART",
        "SYM",
        "X",
    }
    fine_to_coarse = {
        "PRP": "PRON",
        "NN": "NOUN",
        "NNP": "PROPN",
        "CD": "NUM",
        "VB": "VERB",
        "RB": "ADV",
        "DT": "DET",
    }

    word_pos_pairs = []
    for word, tag in zip(words, tags):
        stripped = tag[2:]
        coarse = fine_to_coarse.get(stripped, stripped)
        word_pos_pairs.append((word, coarse if coarse in known_pos else "X"))

    return get_dependency(word_pos_pairs)

## Coref

In [21]:
import sys
# Add the coreference sources to the import path (presumably where `core_2` lives).
sys.path.append("irwanto/indocoref/src")

In [22]:
from core_2 import predict as predict_coref

In [None]:
x = predict_coref('{M1:jenis="" Orang Eropa pertama} yang melakukan perjalanan sepanjang {M2:jenis="" Sungai Amazon} adalah {M3:jenis="" Francisco de Orellana} pada tahun 1542. {M6:jenis="" Dia} lahir di Semarang. {M4:jenis="" Wartawan BBC Unnatural Histories} menyajikan bukti bahwa Orellana, bukannya membesar-besarkan klaimnya seperti yang diduga sebelumnya, adalah benar dalam pengamatannya bahwa peradaban kompleks berkembang di sepanjang {M5:jenis="" Amazon}. di tahun 1540-an. Diyakini bahwa peradaban itu kemudian dihancurkan oleh penyebaran penyakit dari Eropa, seperti cacar. Sejak tahun 1970-an, banyak geoglyph telah ditemukan di tanah gundul yang berasal dari tahun 0-1250 M, melanjutkan klaim tentang peradaban Pra-Kolombia. Ondemar Dias terakreditasi dengan pertama kali menemukan geoglyph pada tahun 1977 dan Alceu Ranzi dengan melanjutkan penemuan mereka setelah terbang di atas Acre. Wartawan BBC Unnatural Histories menyajikan bukti bahwa hutan hujan Amazon, daripada menjadi hutan belantara yang murni, telah dibentuk oleh manusia setidaknya selama 11.000 tahun melalui praktik-praktik seperti berkebun dan terra preta.', 'Orang Eropa pertama yang melakukan perjalanan sepanjang Sungai Amazon adalah Francisco de Orellana pada tahun 1542. Wartawan BBC Unnatural Histories menyajikan bukti bahwa Orellana, bukannya membesar-besarkan klaimnya seperti yang diduga sebelumnya, adalah benar dalam pengamatannya bahwa peradaban kompleks berkembang di sepanjang Amazon. di tahun 1540-an. Diyakini bahwa peradaban itu kemudian dihancurkan oleh penyebaran penyakit dari Eropa, seperti cacar. Sejak tahun 1970-an, banyak geoglyph telah ditemukan di tanah gundul yang berasal dari tahun 0-1250 M, melanjutkan klaim tentang peradaban Pra-Kolombia. Ondemar Dias terakreditasi dengan pertama kali menemukan geoglyph pada tahun 1977 dan Alceu Ranzi dengan melanjutkan penemuan mereka setelah terbang di atas Acre. 
Wartawan BBC Unnatural Histories menyajikan bukti bahwa hutan hujan Amazon, daripada menjadi hutan belantara yang murni, telah dibentuk oleh manusia setidaknya selama 11.000 tahun melalui praktik-praktik seperti berkebun dan terra preta.')

print(x)

### Mention Detection

In [128]:
def get_mentions(row):
    """Collect candidate mention strings from the tagged context and question.

    Reads `row["context_features"] + row["question_features"]` as a sequence of
    sentences, each a sequence of token dicts with "kata", "pos" and "ner"
    entries, and groups tokens with a small per-sentence state machine:

    * state 0 - outside any mention;
    * state 1 - inside a run of tokens that share one POS label out of
      NNP / PRP / NN / PR and whose NER label is "O";
    * state 2 - inside an NER span opened by a "B..." label.

    Returns:
        list of (text, kind) tuples. `text` is the mention's words, each
        followed by a single space (so it ends with a trailing space); `kind`
        is "" or one of the "named-entity ..." strings below.
    """
    mentions = []
    for idx_s, sentence in enumerate(row["context_features"]+row["question_features"]):
        # State is reset for every sentence; mentions never span sentences.
        state = 0
        text = ""
        prev_pos_tag = None
        for idx_w, word in enumerate(sentence):
            # Drop the first two characters of the POS tag (presumably the "B-"/"I-" prefix).
            label_pos = word["pos"][2:]
            label_ner = word["ner"]
            if state == 0:
                if label_pos in ["NNP", "PRP", "NN", "PR"] and label_ner == "O":
                    state = 1
                elif label_ner[0] == "B":
                    state = 2

            if state == 1:
                if text == "":
                    # First token of a POS run.
                    text += word["kata"] + " "
                    prev_pos_tag = label_pos
                    continue
                if label_pos != prev_pos_tag:
                    # POS changed: emit the run.
                    # NOTE(review): the token that ended the run is not examined
                    # again, so it can never start a new mention itself.
                    mentions.append((text, ""))
                    state = 0
                    text = ""
                    prev_pos_tag = None
                else:
                    text += word["kata"] + " "
            elif state == 2:
                if label_ner[0] == "B":
                    if text == "":
                        text += word["kata"] + " "
                    else:
                        # A new entity starts right after another one: emit the
                        # previous span (without a kind) and start over.
                        mentions.append((text, ""))
                        text = word["kata"] + " "
                if label_ner[0] == "I":
                    text += word["kata"] + " "
                if label_ner[0] == "O" and text != "":       
                    # The span ended on the previous token; use that token's NER label to pick the kind.
                    # NOTE(review): if the label contains none of PER/PLA/ORG the span is dropped silently.
                    prev_label = sentence[idx_w-1]["ner"]
                    if "PER" in prev_label:
                        mentions.append((text, "named-entity person"))
                    elif "PLA" in prev_label:
                        mentions.append((text, "named-entity place"))
                    elif "ORG" in prev_label:
                        mentions.append((text, "named-entity organisasi"))
                    text = ""
                    state = 0

        # Flush a mention still open at the end of the sentence. It is emitted
        # with an empty kind even when it was an NER span.
        if text != "":
            mentions.append((text, ""))
    
    return mentions

In [24]:
import regex as re
def annotate_mentions(text_ori, mentions):
    """Wrap the first free occurrence of each mention in `{M<i>:jenis="<kind>" <mention>}`.

    Args:
        text_ori: the text to annotate.
        mentions: iterable of (mention, jenis) pairs. The last character of
            `mention` is dropped before matching (get_mentions produces strings
            with a trailing space).

    Returns:
        str: a copy of `text_ori` with at most one occurrence of every mention
        replaced. The index `i` in `M<i>` is the position of the mention in
        `mentions`, whether or not it was found in the text.

    Note:
        The pattern uses a variable-length look-behind, which needs the
        third-party `regex` module; this file binds it to the name `re`.
    """
    annotated = text_ori
    for index, (mention, jenis) in enumerate(mentions):
        surface = mention[:-1]
        # Look-behind: not preceded by "{" with no "}" in between (already inside
        # an annotation). Look-ahead: not followed by a "}" later on the same line.
        pattern = fr"(?<!\{{[^}}]*){re.escape(surface)}(?!.*?\}})"
        markup = f"{{M{index}:jenis=\"{jenis}\" {surface}}}"
        # A callable replacement inserts `markup` verbatim. A plain string would
        # be treated as a template, so backslashes in a mention would raise or be rewritten.
        annotated = re.sub(pattern, lambda _match, markup=markup: markup, annotated, count=1)

    return annotated

In [144]:
def replace_coref(row):
    """Resolve coreference in the context and return the rewritten context text.

    Steps:
      1. Detect mentions with `get_mentions` and annotate both
         "context + question" and the context alone with `annotate_mentions`.
      2. Run `predict_coref` on the annotated "context + question" text.
      3. Replace every annotation `{M<id>:...}` in the annotated context. For
         the first cluster containing a mention that includes one of
         `row["content_entities"]` (case-insensitive), all its mentions become
         that entity with punctuation removed. Every other cluster's mentions
         are restored to their own text.

    Returns:
        str: the rewritten context, or the unmodified `row["context"]` when
        `predict_coref` raises.

    NOTE(review): only the first matching cluster is replaced by an entity;
    once one is found, later clusters are never checked against the entities.
    Annotations whose id is in no predicted cluster are left in the text.
    """
    context_and_question = row["context"] + " " + row["question"]
    context_only = row["context"]
    mentions = get_mentions(row)
    annotated_full = annotate_mentions(context_and_question, mentions)
    annotated_context = annotate_mentions(context_only, mentions)
    try:
        predicted = predict_coref(annotated_full, context_and_question)
    except Exception:
        # Best effort: fall back to the raw context when coreference fails.
        # `Exception` rather than a bare except so Ctrl-C still interrupts.
        return context_only

    entities = row["content_entities"]
    entity_cluster_found = False
    resolved = annotated_context
    for _cluster_id, cluster in predicted.items():
        is_included_ner = False
        ner = None
        if not entity_cluster_found:
            for item in cluster:
                for entity in entities:
                    if entity.lower() in item[1].lower():
                        is_included_ner = True
                        ner = remove_punctuation(entity)
                        entity_cluster_found = True
                        break
                if is_included_ner:
                    break

        for item in cluster:
            # item[0] is the mention id used in the annotation, item[1] its text.
            pattern = r"(\{M" + str(item[0]) + r":.*?\})"
            replacement = ner if is_included_ner else item[1]
            # Callable replacement: insert the text verbatim, no template expansion.
            resolved = re.sub(pattern, lambda _match, text=replacement: text, resolved)

    return resolved

## Get EAT

In [26]:
# Labels for the expected answer type (EAT) of a question.
PER = "PERSON"
PLA = "PLACE"
ORG = "ORGANISATION"
TIME = "TIME"
# Question word -> list of expected answer types / tags. "lainnya" ("other") is
# the fallback entry and is kept as the last key. filter_wh also uses the keys
# of this dict as the list of question words to drop.
eat_dict = {
    "kapan": [TIME],
    "kapankah": [TIME],
    "dimana": [PLA, "NP"],
    "mana": [PLA, "NP"],
    "darimana": [PLA, "NP"],
    "darimanakah": [PLA, "NP"],
    "dimanakah": [PLA, "NP"],
    "manakah": [PLA, "NP"],
    "siapa": [PER, ORG],
    "siapakah": [PER, ORG],
    "apa": ["NP"],
    "apakah": ["NP"],
    "kenapa": ["NP", "VP"],
    "mengapa": ["NP", "VP"],
    "berapa": ["CD"],
    "berapakah": ["CD"],
    "seberapa": ["CD"],
    "beberapa": ["VP", "NP"],
    "bagaimana": ["VP", "NP"],
    "bagaimanakah": ["VP", "NP"],
    "lainnya": ["NP", "PLA", "ORG", "PER", TIME, "CD"]
}

# Variant limited to when / where / who questions, mapped to entity and time
# types only. Includes two-word spellings ("di mana"). The fallback is None.
eat_dict_2 = {
    "kapan": [TIME],
    "kapankah": [TIME],
    "dimana": [PLA],
    "di mana": [PLA],
    "darimanakah": [PLA],
    "dari manakah": [PLA],
    "darimana": [PLA],
    "dari mana": [PLA],
    "dimanakah": [PLA],
    "di manakah": [PLA],
    "manakah": [PLA],
    "kemana": [PLA],
    "ke mana": [PLA],
    "kemanakah": [PLA],
    "ke manakah": [PLA],
    "siapa": [PER, ORG],
    "siapakah": [PER, ORG],
    "lainnya": None
}

# Same question words as eat_dict_2 (plus "mana"), with chunk / POS tags added
# as further candidate types. The lists defined here are the ones get_f1_avg
# compares against to bucket results.
eat_dict_3 = {
    "kapan": [TIME, "CD", "NP"],
    "kapankah": [TIME, "CD", "NP"],
    "dimana": [PLA, "NP"],
    "di mana": [PLA, "NP"],
    "darimanakah": [PLA, "NP"],
    "dari manakah": [PLA, "NP"],
    "darimana": [PLA, "NP"],
    "dari mana": [PLA, "NP"],
    "dimanakah": [PLA, "NP"],
    "di manakah": [PLA, "NP"],
    "manakah": [PLA, "NP"],
    "mana": [PLA, "NP"],
    "kemana": [PLA, "NP"],
    "ke mana": [PLA, "NP"],
    "kemanakah": [PLA, "NP"],
    "ke manakah": [PLA, "NP"],
    "siapa": [PER, ORG, "NP"],
    "siapakah": [PER, ORG, "NP"],
    "lainnya": None
}

# The remaining question words (what / why / how many / how) mapped to chunk
# and POS tags. The content-word helpers also use the keys of this dict to
# skip question words.
eat_dict_4 = {
    "apa": ["NP",],
    "apakah": ["NP",],
    "kenapa": ["NP", "VP",],
    "mengapa": ["NP", "VP"],
    "berapa": ["CD", "OD", "NP"],
    "berapakah": ["CD", "OD", "NP"],
    "seberapa": ["CD", "OD", "NP"],
    "beberapa": ["VP", "NP"],
    "bagaimana": ["VP", "NP", "VB"],
    "bagaimanakah": ["VP", "NP"],
    "lainnya": None
}

# Union of eat_dict_3 and eat_dict_4 with a non-empty fallback.
# NOTE(review): dict merging keeps a key at the position of its first
# insertion, so "lainnya" stays where eat_dict_3 put it (before the eat_dict_4
# keys) and is NOT the last key of this dict.
eat_dict_5 = {**eat_dict_3, **eat_dict_4,
    "lainnya": ["NP", "PLA", "ORG", "PER", TIME, "CD"]
} 

# When / where / who question words mapped to a single chunk or POS tag
# instead of entity types. The fallback is None.
eat_dict_6 = {
    "kapan": ["CD"],
    "kapankah": ["CD"],
    "dimana": ["NP"],
    "di mana": ["NP"],
    "darimanakah": ["NP"],
    "dari manakah": ["NP"],
    "darimana": ["NP"],
    "dari mana": ["NP"],
    "dimanakah": ["NP"],
    "di manakah": ["NP"],
    "kemana": ["NP"],
    "ke mana": ["NP"],
    "kemanakah": ["NP"],
    "ke manakah": ["NP"],
    "siapa": ["NP"],
    "siapakah": ["NP"],
    "lainnya": None
}

def remove_punctuation(text):
    """Replace every character that is neither a word character nor whitespace with a space."""
    return re.sub(r'[^\w\s]', ' ', text)

def get_eat(question, eat_dict):
    """Find the question word in `question` and its expected answer types.

    The question is lower-cased and stripped of punctuation. The keys of
    `eat_dict` are then tried in insertion order as whole-word matches, and the
    first key that matches wins.

    Args:
        question: the question text.
        eat_dict: mapping from question word (may contain spaces) to a list of
            expected answer types, usually with a "lainnya" fallback entry.

    Returns:
        tuple: (matched key, eat_dict[matched key]). When no key matches the
        result is ("lainnya", eat_dict.get("lainnya")), wherever the "lainnya"
        entry sits in the dict and even if it is absent (the value is then None).
    """
    normalized = remove_punctuation(question).lower()
    for keyword, answer_types in eat_dict.items():
        if re.search(r'\b' + re.escape(keyword) + r'\b', normalized):
            return keyword, answer_types

    # Explicit fallback. The previous version relied on the loop variable still
    # holding the last key, which only worked when "lainnya" was the last key.
    return "lainnya", eat_dict.get("lainnya")

In [27]:
def get_f1_avg(i, j, k):
    """Print the mean score and the number of perfect scores per question category.

    Args:
        i: iterable of expected-answer-type lists (compared against the lists
           used in the EAT dictionaries).
        j: iterable of question words, aligned with `i`.
        k: iterable of numeric scores, aligned with `i` (presumably F1 values -
           TODO confirm against the caller).

    For every category with at least one item, one line
    "<category>: <mean score> <number of scores equal to 1>" is printed.
    Nothing is returned.
    """
    def category_of(eat, question_word):
        # The order of these tests matters: EAT-based categories come first.
        if eat == [TIME, "CD", "NP"]:
            return "kapan"
        if eat == [PLA, "NP"]:
            return "dimana"
        if eat == [PER, ORG, "NP"]:
            return "siapa"
        if eat == ["NP"]:
            return "apa"
        if question_word == "mengapa":
            return "mengapa"
        if question_word == "kenapa":
            return "kenapa"
        if eat == ["CD", "OD", "NP"]:
            return "berapa"
        if question_word == "beberapa":
            return "beberapa"
        if question_word in ["bagaimana", "bagaimanakah"]:
            return "bagaimana"
        return "lainnya"

    # Per category: [number of items, sum of scores, number of scores == 1].
    stats = {
        name: [0, 0, 0]
        for name in (
            "kapan", "dimana", "siapa", "apa", "mengapa",
            "kenapa", "berapa", "bagaimana", "lainnya", "beberapa",
        )
    }

    for eat, question_word, score in zip(i, j, k):
        bucket = stats[category_of(eat, question_word)]
        bucket[0] += 1
        bucket[1] += score
        if score == 1:
            bucket[2] += 1

    for name, (n_items, total, n_perfect) in stats.items():
        if n_items != 0:
            print(f"{name}: {total/n_items} {n_perfect}")

## Get Candidate Answer

In [28]:
# Tag inventories used by get_candidate_answer_2 to decide which token field an
# expected answer type refers to: chunk tags, NER types, or POS tags.
list_chunk_tag = ["ADJP", "ADVP", "INTJ", "NP", "PP", "PRT", "SBAR", "UCP", "VP"]
list_ner = ["PERSON", "PLACE", "ORGANISATION"]
list_pos = ["CD", "OD", "VB"]

In [29]:
def get_candidate_answer_2(eat, feature):
    """Collect every token whose tags match one of the expected answer types.

    Args:
        eat: list of expected answer types, or None.
        feature: list of sentences, each a list of token dicts with "kata",
            "chunk", "timex", "ner" and "pos" entries.

    Returns:
        list of (word, tag, word_index, sentence_index) tuples, grouped by the
        order of the entries in `eat`, each group in document order.

    Note:
        The chunk scan accepts any chunk tag contained in the whole `eat` list,
        and it runs once per chunk-type entry of `eat`. With several chunk
        types in `eat` the same tokens are therefore emitted more than once.
    """
    def scan(tag_field, accept):
        # One pass over the passage, keeping tokens whose `tag_field` value is accepted.
        return [
            (token["kata"], token[tag_field], word_idx, sent_idx)
            for sent_idx, sentence in enumerate(feature)
            for word_idx, token in enumerate(sentence)
            if accept(token[tag_field])
        ]

    found = []
    if eat is None:
        return found

    for wanted in eat:
        if wanted in list_chunk_tag:
            found.extend(scan("chunk", lambda tag: tag[2:] in eat))
        if wanted == TIME:
            found.extend(scan("timex", lambda tag: tag != "O"))
        if wanted in list_ner:
            found.extend(scan("ner", lambda tag: wanted in tag))
        if wanted in list_pos:
            found.extend(scan("pos", lambda tag: wanted in tag))
    return found

## Get Final Candidate Answer

In [30]:
def get_longest_span(lst):
    """Return the first longest string in `lst`, or '' when nothing is longer than ''."""
    # Seeding with '' reproduces the empty-input result, and max() keeps the
    # first of several equally long items.
    return max([''] + list(lst), key=len)

In [31]:
def get_final_candidate_answers_2(candidates, features):
    """Merge token-level candidates into multi-word candidate spans.

    Args:
        candidates: sequence of tuples indexed as i[0] = word, i[1] = tag
            (its first character is tested for "B"/"I"), i[2] = word index,
            i[3] = sentence index - the shape produced by get_candidate_answer_2.
        features: passage features indexed as features[sentence][word], read
            for "pos" and "kata" when bridging a conjunction.

    Returns:
        dict produced by split_dictionary: keys are
        (sentence index, start word index, start + number of words in the span)
        and values are the span texts.
    """
    final = {}
    text = ""
    index_temp = None      # word index where the span being built starts
    index_s_temp = None    # sentence index of the span being built
    # presumably the number of word positions covered by the current span so
    # far, so index_temp + count is the next expected word index - TODO confirm
    count = 0

    for i in candidates:
        if i[1][0] == "B":
            if text == "":
                # Nothing open yet: start a new span at this token.
                index_temp = i[2]
                index_s_temp = i[3]
                text += i[0]
                count = 0
            else:
                # A "B" token directly adjacent to the open span (same sentence) extends it.
                if i[2] == index_temp + count and i[3] == index_s_temp:
                    text += " " + i[0]
                    count += 1
                    continue
                # A "B" token one position further, with a token tagged "B-CC"
                # in between: absorb the conjunction and this token.
                if i[2] == index_temp + count + 1 and features[i[3]][i[2]-1]["pos"] == "B-CC" and i[3] == index_s_temp:
                    text += " " + features[i[3]][i[2]-1]["kata"] + " " + i[0]
                    count += 2
                    continue
                # Otherwise close the open span and start a new one here.
                final.setdefault((index_s_temp, index_temp), []).append(text)
                text = ""
                text += i[0]
                index_temp = i[2]
                index_s_temp = i[3]
                count = 0
        elif i[1][0] == "I" and i[3] == index_s_temp and i[2] == index_temp + count:
            # Continuation token directly following the open span.
            text += " " + i[0]
        elif i[1][0] == "I":
            # An "I" token that does not continue the open span is treated as the start of a new span.
            if text == "":
                index_temp = i[2]
                index_s_temp = i[3]
                text += i[0]
                count = 0
            else:
                final.setdefault((index_s_temp, index_temp), []).append(text)
                text = ""
                text += i[0]
                index_temp = i[2]
                index_s_temp = i[3]
                count = 0
        # Reached by every path that did not `continue` above.
        count += 1

    
    # Flush the span still open after the last candidate.
    if text != "":
        final.setdefault((index_s_temp, index_temp), []).append(text)
        
    final = split_dictionary(final)
    return final

In [32]:
def split_dictionary(dictionary):
    """Flatten {(sentence, start): [span, ...]} into {(sentence, start, end): span}.

    `end` is `start` plus the number of whitespace-separated words in the span.
    When two spans under the same key have the same word count, the later one
    replaces the earlier one.
    """
    return {
        (sent_idx, start, start + len(span.split())): span
        for (sent_idx, start), spans in dictionary.items()
        for span in spans
    }

# Example usage
dictionary = {(0, 1): ["Kata"], (1, 2): ["Kata Siapa", "Kata Saya Kah"]}
new_dictionary = split_dictionary(dictionary)

## Filter WH from Question

In [33]:
def filter_wh(pair_words_tags):
    """Drop question words and certain tagged tokens from (word, tag) pairs.

    A pair is removed when its tag contains "WH", when the lower-cased word is
    a key of `eat_dict`, or when its tag contains "Z". The remaining pairs are
    returned in their original order.
    """
    return [
        (word, tag)
        for word, tag in pair_words_tags
        if not ("WH" in tag or word.lower() in eat_dict.keys() or "Z" in tag)
    ]

## Get Question Key

In [34]:
def get_question_key(pairs_pos):
    """Join the words of (word, tag) pairs into one space-separated string."""
    words = []
    for word, _tag in pairs_pos:
        words.append(word)
    return " ".join(words)

In [35]:
# Linking word -> question words it belongs to. get_final_answer inserts the
# linking word between the question key and a candidate answer.
conjuction_dict = {
    "adalah": ["apa", "apakah", "siapa", "siapakah"],
    "di": ["dimana", "dimanakah", "di mana", "di manakah"],
    "dari": ["darimanakah", "darimana", "dari manakah", "dari mana"],
    "karena": ["mengapa", "kenapa"]
}

## Get Final Answer

In [36]:
def get_conjuction(kata_tanya):
    """Return the linking word registered for `kata_tanya` in `conjuction_dict`, or "" if none."""
    matching = (
        conjunction
        for conjunction, question_words in conjuction_dict.items()
        if kata_tanya in question_words
    )
    return next(matching, "")
        
def get_final_answer(passage_sentences, kata_tanya, question_key, candidate_answer, debug, max_distance=1.3):
    """Pick the candidate whose hypothesis sentence is closest to its source sentence.

    For every candidate the string "<question_key> <conjunction> <candidate>"
    is compared, via `get_distance`, with the passage sentence the candidate
    came from. The candidate with the smallest distance not exceeding
    `max_distance` wins.

    Args:
        passage_sentences: sequence indexed by sentence index; each item is
            passed to `get_distance` as the second text.
        kata_tanya: question word, used to look up the linking word.
        question_key: the question with its question word removed.
        candidate_answer: dict mapping a key whose first element is the
            sentence index to the candidate text. A non-dict value (anything
            without `.items()`) is accepted as a fallback: if it has exactly
            one element, that element is returned.
        debug: when truthy, print both compared strings and the distance.
        max_distance: largest distance still accepted (default 1.3, the value
            previously hard-coded).

    Returns:
        The selected candidate text, or "" when nothing qualifies.
    """
    # Guard only the dict access. Previously the whole loop sat inside the try,
    # which also swallowed AttributeErrors raised by the distance computation.
    try:
        items = list(candidate_answer.items())
    except AttributeError:
        if len(candidate_answer) == 1:
            return list(candidate_answer)[0]
        return ""

    if not items:
        return ""

    # The linking word depends only on the question word: compute it once.
    conj = get_conjuction(kata_tanya)
    best_distance = float("inf")
    best_answer = ""
    for key, val in items:
        target = question_key + " " + conj + " " + val
        target_2 = passage_sentences[key[0]]
        dis = get_distance(target, target_2)
        if dis < best_distance and dis <= max_distance:
            best_distance = dis
            best_answer = val
        if debug:
            print('\033[1m' + "Target 1:" + "\033[0m")
            print(target)
            print('\033[1m' + "Target 2:" + "\033[0m")
            print(target_2)
            print(f"Distance: {dis}\n")
    return best_answer
        
        

## Get Content Word

In [37]:
def merge_tags(tagged_list):
    """Merge BIO-tagged words into phrases.

    Args:
        tagged_list: iterable of (word, tag) pairs with tags like "B-NP",
            "I-NP" or "O".

    Returns:
        list[str]: one space-joined phrase per "B-" tag, extended by the
        directly following "I-" tags that carry the same label. Any other tag
        ("O", or an "I-" tag with a different label) closes the current phrase,
        so a later "I-" tag is never attached to a phrase that already ended.
    """
    merged_list = []
    current_phrase = ""
    current_tag = None  # label of the phrase being built; None when outside a phrase

    for word, tag in tagged_list:
        if tag.startswith("B-"):
            if current_phrase:
                merged_list.append(current_phrase.strip())
            current_phrase = word
            current_tag = tag.split("-")[1]
        elif tag.startswith("I-") and current_tag == tag.split("-")[1]:
            current_phrase += " " + word
        else:
            # The phrase is interrupted: emit it and wait for the next "B-" tag.
            if current_phrase:
                merged_list.append(current_phrase.strip())
            current_phrase = ""
            current_tag = None

    if current_phrase:
        merged_list.append(current_phrase.strip())

    return merged_list

def get_content_word_entity_and_np(row):
    """Return the question words that are entity tokens or noun-phrase content tokens.

    Only the first sentence of `row["question_features"]` is read. A token is
    kept when its NER tag is not "O". It is also kept when it sits in an NP
    chunk, is not tagged "B-WH", and is not a question word listed in `eat_dict_4`.
    """
    def is_content_token(token):
        if token["ner"] != "O":
            return True
        return (
            "NP" in token["chunk"]
            and token["pos"] != "B-WH"
            and token["kata"].lower() not in eat_dict_4.keys()
        )

    return [
        token["kata"]
        for token in row["question_features"][0]
        if is_content_token(token)
    ]

def get_content_word_entity(row):
    """Return the words of the first question sentence whose NER tag is not "O"."""
    first_sentence = row["question_features"][0]
    return [token["kata"] for token in first_sentence if token["ner"] != "O"]

def get_content_word_np(row):
    """Return the lemmas of non-entity noun-phrase tokens in the first question sentence.

    A token contributes when it sits in an NP chunk, is not tagged "B-WH", is
    not a question word listed in `eat_dict_4`, and has NER tag "O". Its word
    is passed through `preprocess_text` and the resulting lemmas are appended.
    """
    lemmas = []
    for token in row["question_features"][0]:
        if "NP" not in token["chunk"]:
            continue
        if token["pos"] == "B-WH":
            continue
        if token["kata"].lower() in eat_dict_4.keys():
            continue
        if token["ner"] != "O":
            continue
        lemmas.extend(preprocess_text(token["kata"]))
    return lemmas
    

## Filter Answer By Content Word

In [38]:
def filter_by_content_word(row, key_context, key_content_word, key_candidate_ans):
    """Keep the candidates that come from sentences containing ALL content words.

    Args:
        row: mapping holding the context, the content words and the candidates.
        key_context: key of the context in `row`. For "context_coref" the value
            is a text that is split with `split_sentences` and matched as-is
            (lower-cased). For any other key the value is iterated as
            sentences, each joined with spaces and stripped of punctuation
            before matching (presumably lists of words - TODO confirm).
        key_content_word: key of the list of content words; each is lower-cased
            and stripped of punctuation before the substring test.
        key_candidate_ans: key of the candidate dict, whose keys start with the
            sentence index.

    Returns:
        dict: the candidates whose sentence index belongs to a sentence that
        contains every content word. An empty dict is returned when there are
        no content words or when the candidates are not a dict.
    """
    entities = row[key_content_word]
    if not len(entities):
        return {}

    is_coref = key_context == "context_coref"
    sentences = split_sentences(row[key_context]) if is_coref else row[key_context]

    # Normalise the content words once instead of once per sentence.
    needles = [remove_punctuation(entity.lower()).strip() for entity in entities]

    kept_sentences = set()
    for idx, sentence in enumerate(sentences):
        if is_coref:
            haystack = sentence.lower().strip()
        else:
            haystack = remove_punctuation(" ".join(sentence).lower()).strip()
        if all(needle in haystack for needle in needles):
            kept_sentences.add(idx)

    candidates = row[key_candidate_ans]
    # Explicit check instead of a bare `except:`: missing or malformed
    # candidates mean "nothing to filter".
    if not isinstance(candidates, dict):
        return {}
    return {key: value for key, value in candidates.items() if key[0] in kept_sentences}

In [39]:
def filter_by_content_word_c(row, key_context, key_content_word, key_candidate_ans):
    """Alias of `filter_by_content_word`, kept for existing callers.

    The body used to be a line-for-line copy of `filter_by_content_word`.
    Delegating keeps the two from drifting apart.
    """
    return filter_by_content_word(row, key_context, key_content_word, key_candidate_ans)

In [40]:
def filter_by_content_word_or(row, key_context, key_content_word, key_candidate_ans):
    """Keep the candidates that come from sentences containing ANY content word.

    Args:
        row: mapping holding the context, the content words and the candidates.
        key_context: key of the context in `row`. For "context_coref" the value
            is a text that is split with `split_sentences` and matched as-is
            (lower-cased). For any other key the value is iterated as
            sentences, each joined with spaces and stripped of punctuation
            before matching (presumably lists of words - TODO confirm).
        key_content_word: key of the list of content words; each is lower-cased
            and stripped of punctuation before the substring test.
        key_candidate_ans: key of the candidate dict, whose keys start with the
            sentence index.

    Returns:
        dict: the candidates whose sentence index belongs to a sentence that
        contains at least one content word. An empty dict is returned when
        there are no content words or when the candidates are not a dict.
    """
    entities = row[key_content_word]
    if not len(entities):
        return {}

    is_coref = key_context == "context_coref"
    sentences = split_sentences(row[key_context]) if is_coref else row[key_context]

    # Normalise the content words once instead of once per sentence.
    needles = [remove_punctuation(entity.lower()).strip() for entity in entities]

    kept_sentences = set()
    for idx, sentence in enumerate(sentences):
        if is_coref:
            haystack = sentence.lower().strip()
        else:
            haystack = remove_punctuation(" ".join(sentence).lower()).strip()
        # One substring test per content word; the old inner loop over the
        # sentence repeated the same test without using its loop variable.
        if any(needle in haystack for needle in needles):
            kept_sentences.add(idx)

    candidates = row[key_candidate_ans]
    # Explicit check instead of a bare `except:`: missing or malformed
    # candidates mean "nothing to filter".
    if not isinstance(candidates, dict):
        return {}
    return {key: value for key, value in candidates.items() if key[0] in kept_sentences}

## Get Passage Feature (*get_feature*(passage))

In [41]:
def get_feature(passage):
    """Tag every token of ``passage`` with NER, POS, chunk and TIMEX labels.

    The passage is split with ``split_sentences`` and the same sentences are
    sent through the four taggers via ``get_tag_predictions``. Each tagger
    result is indexed as ``[sentence][token]``, with item ``[0]`` read as the
    token text and item ``[1]`` as the tag. The NER output drives the
    iteration, so the other taggers must yield at least as many sentences and
    tokens (an ``IndexError`` is raised otherwise).

    Returns:
        list[list[dict]]: one list per sentence, one dict per token with the
        keys ``"kata"``, ``"ner"``, ``"pos"``, ``"chunk"`` and ``"timex"``.
    """
    sentence_texts = split_sentences(passage)
    ner_tags = get_tag_predictions(modul_ner, tokenizer_ner, sentence_texts)
    pos_tags = get_tag_predictions(modul_pos, tokenizer_pos, sentence_texts)
    chunk_tags = get_tag_predictions(modul_chunking, tokenizer_chunking, sentence_texts)
    timex_tags = get_tag_predictions(modul_timex, tokenizer_timex, sentence_texts)

    def token_features(s_idx, t_idx):
        # Merge the four parallel tag sequences into one record for a token.
        return {
            "kata": ner_tags[s_idx][t_idx][0],
            "ner": ner_tags[s_idx][t_idx][1],
            "pos": pos_tags[s_idx][t_idx][1],
            "chunk": chunk_tags[s_idx][t_idx][1],
            "timex": timex_tags[s_idx][t_idx][1],
        }

    return [
        [token_features(s_idx, t_idx) for t_idx in range(len(ner_tags[s_idx]))]
        for s_idx in range(len(ner_tags))
    ]

# Eval Util

In [42]:
def normalize_text(s):
    """Normalise an answer string for exact-match / F1 comparison.

    Steps, in order: lower-case, drop every character in
    ``string.punctuation``, replace the stand-alone English articles
    ``a`` / ``an`` / ``the`` with a space, collapse whitespace.

    Args:
        s: the text to normalise (anything exposing ``.lower()``).

    Returns:
        str: the normalised text.

    Raises:
        TypeError: if ``s`` has no ``lower`` method (e.g. ``None`` or a float
            NaN). The previous version printed the value, returned ``None``
            from its helper and then failed with an unrelated
            "'NoneType' object is not iterable" TypeError.
    """
    import re
    import string

    try:
        lowered = s.lower()
    except AttributeError as exc:
        raise TypeError(
            f"normalize_text expects a string, got {type(s).__name__}: {s!r}"
        ) from exc

    punctuation = set(string.punctuation)
    without_punctuation = "".join(ch for ch in lowered if ch not in punctuation)
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)
    # split()/join collapses runs of whitespace and trims both ends.
    return " ".join(without_articles.split())

def exact_match(prediction, truth):
    """Return True when both strings are identical after ``normalize_text``."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return bool(normalized_prediction == normalized_truth)

def compute_f1(prediction, truth):
    """Token-level F1 between a predicted and a gold answer.

    Both strings are normalised with ``normalize_text`` and split on
    whitespace. The overlap is counted as a multiset (``Counter``)
    intersection, so repeated tokens are credited as many times as they appear
    in both answers. The previous set-based overlap scored identical answers
    containing a repeated token below 1.

    Returns:
        int | float: ``1``/``0`` when either side has no tokens (1 only if both
        are empty), ``0`` when nothing overlaps, otherwise the F1 score rounded
        to two decimals.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # If either side is "no answer", F1 is 1 when they agree and 0 otherwise.
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    # Multiset intersection: min(count in prediction, count in truth) per token.
    num_common = sum((Counter(pred_tokens) & Counter(truth_tokens)).values())
    if num_common == 0:
        return 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return round(2 * (prec * rec) / (prec + rec), 2)

def eval_all(df, pred_key):
    """Print exact-match and F1 scores of ``df[pred_key]`` against ``df["answer"]``.

    Gold answers are taken from ``answer["text"]`` when every entry of the
    ``answer`` column supports that lookup; otherwise the column values are
    used directly. A gold answer with length 0, or one that has no length at
    all (e.g. NaN), counts as unanswerable and is compared as ``""``.

    Args:
        df: DataFrame with an ``answer`` column and the prediction column.
        pred_key: name of the column holding the predicted answer strings.

    Returns:
        None. Results are printed. Empty groups are reported with a message
        instead of raising ``ZeroDivisionError``.
    """
    try:
        ground_truth = [i["text"] for i in df["answer"]]
    except Exception:
        # Answers are not dict-like: fall back to the raw column values.
        ground_truth = df["answer"]

    total_question = len(ground_truth)
    if total_question == 0:
        print("No question to evaluate")
        return

    total_exact = 0
    total_f1 = 0
    total_ans = 0
    total_unans = 0
    total_exact_ans = 0
    total_f1_ans = 0
    total_exact_unans = 0

    for p, t in zip(df[pred_key], ground_truth):
        try:
            is_unanswerable = len(t) == 0
        except TypeError:
            # Gold answer without a length (e.g. float NaN) means "no answer".
            t = ""
            is_unanswerable = True

        # Compute each metric once per question and reuse it for every bucket.
        em = exact_match(p, t)
        f1 = compute_f1(p, t)

        if is_unanswerable:
            total_unans += 1
            total_exact_unans += em
        else:
            total_ans += 1
            total_exact_ans += em
            total_f1_ans += f1

        total_f1 += f1
        total_exact += em

    print(f"Exact match all: {total_exact/total_question}")
    print(f"F1 all: {total_f1/total_question}")
    if total_ans:
        print(f"Exact match answerable: {total_exact_ans/total_ans}")
        print(f"F1 answerable: {total_f1_ans/total_ans}")
    else:
        print("No answerable question")
    if total_unans:
        print(f"Exact match unanswerebale: {total_exact_unans/total_unans}")
    else:
        print("No unnswerable question")
    

In [43]:
def eval_candidate_answers(df, key):
    """Print the upper-bound EM/F1 reachable from the candidate answers in ``df[key]``.

    For every question the best F1 among its candidates (plus an implicit
    empty "no answer" candidate) is taken; a best F1 of 1 counts as an exact
    match.

    Side effect: the empty candidate is stored in each candidate dict under
    the key ``(-1, -1, -1)``, i.e. the dicts held by ``df[key]`` are mutated.
    This is kept from the original implementation.

    Args:
        df: DataFrame with an ``answer`` column and the candidate column.
        key: name of the column holding ``{candidate_key: answer_text}`` dicts.

    Returns:
        list: best F1 per question, in row order.
    """
    try:
        ground_truth = [i["text"] for i in df["answer"]]
    except Exception:
        # Answers are not dict-like: fall back to the raw column values.
        ground_truth = df["answer"]

    total_question = len(ground_truth)

    total_exact = 0
    total_f1 = 0
    total_ans = 0
    total_unans = 0
    total_exact_ans = 0
    total_f1_ans = 0
    total_exact_unans = 0
    f1s = []

    for candidates, truth in zip(df[key], ground_truth):
        try:
            answerable = len(truth) > 0
        except TypeError:
            # Gold answer without a length (e.g. float NaN) means "no answer".
            truth = ""
            answerable = False

        if answerable:
            total_ans += 1
        else:
            total_unans += 1

        # Always offer the empty string so unanswerable questions can be matched.
        candidates[(-1, -1, -1)] = ""

        # Reset per question. The original read max_f1 before assigning it,
        # which raises UnboundLocalError on the first comparison.
        max_f1 = 0
        for candidate_text in candidates.values():
            f1_now = compute_f1(candidate_text, truth)
            if f1_now > max_f1:
                max_f1 = f1_now

        if max_f1 == 1:
            if answerable:
                total_exact_ans += 1
                total_f1_ans += 1
            else:
                total_exact_unans += 1
            total_exact += 1
        else:
            total_f1_ans += max_f1

        f1s.append(max_f1)
        total_f1 += max_f1

    if total_question == 0:
        print("No question to evaluate")
        return f1s

    print(f"Exact match all: {total_exact/total_question}")
    print(f"F1 all: {total_f1/total_question}")
    if total_ans:
        print(f"Exact match answerable: {total_exact_ans/total_ans}")
        print(f"F1 answerable: {total_f1_ans/total_ans}")
    else:
        print("No answerable question")
    if total_unans:
        print(f"Exact match unanswerebale: {total_exact_unans/total_unans}")
    else:
        print("No unnswerable question")
    print("Total Question: ", total_question)
    print("Total Ans Question: ", total_ans)
    print("Total Unans Question: ", total_unans)

    return f1s

# Eksperimen

In [44]:
tqdm.pandas()

## Get Feature Passage and Question

In [45]:
def get_feature_all(df):
    """Add ``question_features`` and ``context_features`` columns to ``df`` in place.

    Each value is the output of ``get_feature`` for the question text and for
    the passage text. The passage is read from the ``context`` column when it
    exists and from ``passage`` otherwise.

    The column is now chosen explicitly. Previously a bare ``except`` retried
    on ``passage`` after ANY failure while tagging ``context``, which hid the
    real error behind the retry.

    Requires ``tqdm.pandas()`` to have been called (``progress_apply``).
    """
    df["question_features"] = df["question"].progress_apply(get_feature)

    passage_column = "context" if "context" in df.columns else "passage"
    df["context_features"] = df[passage_column].progress_apply(get_feature)

In [None]:
get_feature_all(tydiqa_test)

In [None]:
get_feature_all(idkmrc_test)

In [None]:
get_feature_all(squad_test)

## Get Kata Tanya dan EAT

In [42]:
def get_kata_tanya_eat(df):
    """Add six pairs of columns derived from ``get_eat`` to ``df`` in place.

    For each lookup table, ``get_eat(question, table)`` is wrapped in a
    ``pd.Series`` and spread over the two named columns. Judging by the column
    names, the first is the question word ("kata tanya") and the second the
    expected answer type (EAT) - confirm against ``get_eat``.
    """
    column_plan = (
        # NER + TIMEX
        (['closed_question_1', 'eat_1'], eat_dict_2),
        # NER + TIMEX + POS
        (['closed_question_2', 'eat_2'], eat_dict_3),
        # Chunk + POS
        (['open_question', 'eat_3'], eat_dict_4),
        # All
        (['all_question', 'eat_4'], eat_dict_5),
        # (variant left undescribed in the original)
        (['closed_question_3', 'eat_5'], eat_dict_6),
        # All old
        (['all_question_2', 'eat_6'], eat_dict),
    )

    for target_columns, eat_table in column_plan:
        df[target_columns] = df['question'].apply(
            lambda x, eat_table=eat_table: pd.Series(get_eat(x, eat_table))
        )

In [702]:
# Add the question-word / expected-answer-type columns to each test set
# (get_kata_tanya_eat assigns the new columns on the frame it receives).
get_kata_tanya_eat(squad_test)
get_kata_tanya_eat(tydiqa_test)
get_kata_tanya_eat(idkmrc_test)

## Get Candidate Answer

In [45]:
def get_candidate_ans(df):
    """Build the ``candidate_answers_1`` .. ``candidate_answers_6`` columns in place.

    For variant ``n`` the column is first filled with
    ``get_candidate_answer_2(row["eat_n"], row["context_features"])`` and then
    overwritten with ``get_final_candidate_answers_2`` applied to that first
    result. Rows whose ``eat_n`` value is falsy get ``{}`` in both passes.

    Requires ``tqdm.pandas()`` to have been called (``progress_apply``).
    """
    for variant in range(1, 7):
        eat_column = f"eat_{variant}"
        candidate_column = f"candidate_answers_{variant}"

        def extract(row):
            # First pass: raw candidates for this expected answer type.
            if row[eat_column]:
                return get_candidate_answer_2(row[eat_column], row["context_features"])
            return {}

        def finalize(row):
            # Second pass: post-process the raw candidates stored by `extract`.
            if row[eat_column]:
                return get_final_candidate_answers_2(row[candidate_column], row["context_features"])
            return {}

        df[candidate_column] = df.progress_apply(extract, axis=1)
        df[candidate_column] = df.progress_apply(finalize, axis=1)

In [None]:
# Build the candidate_answers_1..6 columns for each test set. get_candidate_ans reads
# the eat_* and context_features columns, so the cells that create them must run first.
get_candidate_ans(squad_test)
get_candidate_ans(tydiqa_test)
get_candidate_ans(idkmrc_test)

## Eval Candidate Ans

In [None]:
# Upper-bound evaluation of the unfiltered candidates: store the best F1 per question
# in f1_1..f1_5 for every test set (eval_candidate_answers also prints the aggregates).
# NOTE(review): candidate_answers_6 is built by get_candidate_ans but not evaluated here - confirm this is intended.
idkmrc_test["f1_1"] = eval_candidate_answers(idkmrc_test, "candidate_answers_1" ) 
idkmrc_test["f1_2"] = eval_candidate_answers(idkmrc_test, "candidate_answers_2" ) 
idkmrc_test["f1_3"] = eval_candidate_answers(idkmrc_test, "candidate_answers_3" ) 
idkmrc_test["f1_4"] = eval_candidate_answers(idkmrc_test, "candidate_answers_4" ) 
idkmrc_test["f1_5"] = eval_candidate_answers(idkmrc_test, "candidate_answers_5" ) 

squad_test["f1_1"] = eval_candidate_answers(squad_test, "candidate_answers_1" ) 
squad_test["f1_2"] = eval_candidate_answers(squad_test, "candidate_answers_2" ) 
squad_test["f1_3"] = eval_candidate_answers(squad_test, "candidate_answers_3" ) 
squad_test["f1_4"] = eval_candidate_answers(squad_test, "candidate_answers_4" ) 
squad_test["f1_5"] = eval_candidate_answers(squad_test, "candidate_answers_5" ) 

tydiqa_test["f1_1"] = eval_candidate_answers(tydiqa_test, "candidate_answers_1" ) 
tydiqa_test["f1_2"] = eval_candidate_answers(tydiqa_test, "candidate_answers_2" ) 
tydiqa_test["f1_3"] = eval_candidate_answers(tydiqa_test, "candidate_answers_3" ) 
tydiqa_test["f1_4"] = eval_candidate_answers(tydiqa_test, "candidate_answers_4" ) 
tydiqa_test["f1_5"] = eval_candidate_answers(tydiqa_test, "candidate_answers_5" )

## Ekstrak Content Word

In [48]:
def preprocess_list(lst):
    """Return the first element of ``preprocess_text(item)`` for every item of ``lst``.

    Items for which ``preprocess_text`` yields nothing contribute no element,
    so the result can be shorter than ``lst``.
    """
    return [token for item in lst for token in preprocess_text(item)[0:1]]

In [47]:
def get_preprocessed(row, key_feature):
    """Build a per-sentence token list from the features in ``row[key_feature]``.

    A token whose chunk tag contains ``"NP"`` and whose NER tag is ``"O"`` is
    replaced by the first element of ``preprocess_text(token)`` (or dropped
    when that yields nothing). Every other token is copied unchanged from its
    ``"kata"`` field.

    Returns:
        list[list[str]]: one token list per sentence.
    """
    processed_sentences = []
    for sentence in row[key_feature]:
        tokens = []
        for word in sentence:
            is_plain_noun_phrase = "NP" in word["chunk"] and word["ner"] == "O"
            if not is_plain_noun_phrase:
                tokens.append(word["kata"])
                continue
            tokens.extend(preprocess_text(word["kata"])[0:1])
        processed_sentences.append(tokens)

    return processed_sentences

In [49]:
def get_preprocessed_df(df):
    """Add a ``lemma`` column to ``df`` in place: ``get_preprocessed`` over ``context_features``."""
    def lemmatize_row(row):
        return get_preprocessed(row, "context_features")

    df["lemma"] = df.progress_apply(lemmatize_row, axis=1)

In [None]:
get_preprocessed_df(squad_test)

In [None]:
get_preprocessed_df(tydiqa_test)

In [None]:
get_preprocessed_df(idkmrc_test)

In [54]:
def get_content_entities(df):
    """Extract content words and filter ``candidate_answers_4`` by them, in place.

    Adds ``content_entities``, ``content_np`` and their concatenation
    ``content_entities_np``. It then adds six ``filtered_candidate_answer_*``
    columns, obtained by running ``filter_by_content_word`` (no suffix) or
    ``filter_by_content_word_or`` (``_or`` suffix) on the ``lemma`` column
    with each of the three content-word columns.
    """
    df["content_entities"] = df.progress_apply(get_content_word_entity, axis=1)
    df["content_np"] = df.progress_apply(get_content_word_np, axis=1)
    df["content_entities_np"] = df["content_entities"] + df["content_np"]

    # (target column, filter function, content-word column) - order fixes the column order.
    filter_plan = (
        ("filtered_candidate_answer_entities", filter_by_content_word, "content_entities"),
        ("filtered_candidate_answer_entities_np", filter_by_content_word, "content_entities_np"),
        ("filtered_candidate_answer_entities_or", filter_by_content_word_or, "content_entities"),
        ("filtered_candidate_answer_entities_np_or", filter_by_content_word_or, "content_entities_np"),
        ("filtered_candidate_answer_np", filter_by_content_word, "content_np"),
        ("filtered_candidate_answer_np_or", filter_by_content_word_or, "content_np"),
    )
    for target_column, filter_fn, content_key in filter_plan:
        df[target_column] = df.progress_apply(
            lambda row: filter_fn(row, "lemma", content_key, "candidate_answers_4"),
            axis=1,
        )
    

In [57]:
def get_content_entities_2(df):
    """Filter ``candidate_answers_4`` against the ``context_coref`` passage, in place.

    Adds six ``filtered_candidate_answer_*_2`` columns: one per combination of
    filter function (``filter_by_content_word`` / ``filter_by_content_word_or``)
    and content-word column.
    """
    # (column tag, filter function, content-word column) - order fixes the column order.
    variants = (
        ("entities", filter_by_content_word, "content_entities"),
        ("entities_np", filter_by_content_word, "content_entities_np"),
        ("entities_or", filter_by_content_word_or, "content_entities"),
        ("entities_np_or", filter_by_content_word_or, "content_entities_np"),
        ("np", filter_by_content_word, "content_np"),
        ("np_or", filter_by_content_word_or, "content_np"),
    )
    for tag, filter_fn, content_key in variants:
        df[f"filtered_candidate_answer_{tag}_2"] = df.progress_apply(
            lambda row: filter_fn(row, "context_coref", content_key, "candidate_answers_4"),
            axis=1,
        )

In [96]:
def get_filter_acc(df, key_1, key_2):
    """Print the fraction of rows where ``df[key_2]`` is >= ``df[key_1]``."""
    reference_values = df[key_1]
    compared_values = df[key_2]
    hits = sum(
        1
        for reference, compared in zip(reference_values, compared_values)
        if compared >= reference
    )

    print(hits / len(reference_values))

In [None]:
get_content_entities(squad_test)

In [None]:
get_content_entities(tydiqa_test)

In [None]:
get_content_entities(idkmrc_test)

In [83]:
def get_coreferential_case(df):
    """Select the rows whose answer sentence mentions none of the content entities.

    For every answerable row with at least one content entity, the sentence
    holding the answer is located: the first sentence whose cumulative end
    offset lies beyond ``answer["answer_start"]`` and which contains
    ``answer["text"]``. The row is kept when no entity from
    ``content_entities`` occurs in that sentence (case-insensitive, after
    ``remove_punctuation``).

    Fix: the sentence scan now stops at the first hit. Previously it kept
    going, so a later sentence repeating the answer text replaced the real
    answer sentence.

    Returns:
        pd.DataFrame: the selected rows (empty frame when none qualify).
    """
    filtered_rows = []
    for _, row in df.iterrows():
        answer = row["answer"]
        entities = row["content_entities"]
        # Cheap guards first: skip unanswerable rows and rows without entities.
        if answer["text"] == "":
            continue
        if not len(entities):
            continue

        sentences = split_sentences(row["context"])

        # Walk the sentences, accumulating character lengths, to find the
        # sentence located at the answer's character offset.
        answer_sentence = None
        chars_seen = 0
        for sentence in sentences:
            # +1 presumably accounts for the separator dropped by split_sentences - TODO confirm.
            chars_seen += len(sentence) + 1
            if answer["answer_start"] < chars_seen and answer["text"] in sentence:
                answer_sentence = sentence
                break
        if answer_sentence is None:
            continue

        haystack = remove_punctuation(answer_sentence.lower())
        mentions_entity = any(
            remove_punctuation(entity.lower()) in haystack for entity in entities
        )
        if not mentions_entity:
            filtered_rows.append(row)

    return pd.DataFrame(filtered_rows)

In [None]:
# Evaluate the six content-word-filtered candidate sets on IDK-MRC; the best F1 per
# question is stored in f1_t_1..f1_t_6 and the aggregate scores are printed.
idkmrc_test["f1_t_1"] = eval_candidate_answers(idkmrc_test, "filtered_candidate_answer_entities")
idkmrc_test["f1_t_2"] = eval_candidate_answers(idkmrc_test, "filtered_candidate_answer_entities_or")
idkmrc_test["f1_t_3"] = eval_candidate_answers(idkmrc_test, "filtered_candidate_answer_np")
idkmrc_test["f1_t_4"] = eval_candidate_answers(idkmrc_test, "filtered_candidate_answer_np_or")
idkmrc_test["f1_t_5"] = eval_candidate_answers(idkmrc_test, "filtered_candidate_answer_entities_np")
idkmrc_test["f1_t_6"] = eval_candidate_answers(idkmrc_test, "filtered_candidate_answer_entities_np_or")

In [None]:
# Evaluate the six content-word-filtered candidate sets on SQuAD; the best F1 per
# question is stored in f1_t_1..f1_t_6 and the aggregate scores are printed.
squad_test["f1_t_1"] = eval_candidate_answers(squad_test, "filtered_candidate_answer_entities")
squad_test["f1_t_2"] = eval_candidate_answers(squad_test, "filtered_candidate_answer_entities_or")
squad_test["f1_t_3"] = eval_candidate_answers(squad_test, "filtered_candidate_answer_np")
squad_test["f1_t_4"] = eval_candidate_answers(squad_test, "filtered_candidate_answer_np_or")
squad_test["f1_t_5"] = eval_candidate_answers(squad_test, "filtered_candidate_answer_entities_np")
squad_test["f1_t_6"] = eval_candidate_answers(squad_test, "filtered_candidate_answer_entities_np_or")

In [None]:
# Evaluate the six content-word-filtered candidate sets on TyDiQA; the best F1 per
# question is stored in f1_t_1..f1_t_6 and the aggregate scores are printed.
tydiqa_test["f1_t_1"] = eval_candidate_answers(tydiqa_test, "filtered_candidate_answer_entities")
tydiqa_test["f1_t_2"] = eval_candidate_answers(tydiqa_test, "filtered_candidate_answer_entities_or")
tydiqa_test["f1_t_3"] = eval_candidate_answers(tydiqa_test, "filtered_candidate_answer_np")
tydiqa_test["f1_t_4"] = eval_candidate_answers(tydiqa_test, "filtered_candidate_answer_np_or")
tydiqa_test["f1_t_5"] = eval_candidate_answers(tydiqa_test, "filtered_candidate_answer_entities_np")
tydiqa_test["f1_t_6"] = eval_candidate_answers(tydiqa_test, "filtered_candidate_answer_entities_np_or")

## Get Passage Coref

In [89]:
def get_passage_coref(df):
    """Add a ``context_coref`` column to ``df`` in place: ``replace_coref`` applied to every row."""
    coref_passages = df.progress_apply(replace_coref, axis=1)
    df["context_coref"] = coref_passages

In [None]:
# Add the "context_coref" column (output of replace_coref) to each test set.
get_passage_coref(idkmrc_test)
get_passage_coref(tydiqa_test)
get_passage_coref(squad_test)

In [108]:
def get_content_entities_after_coref(df):
    """Filter ``candidate_answers_4`` against the ``context_coref`` passage, in place.

    Adds six ``filtered_candidate_answer_*_coref`` columns: one per combination
    of filter function (``filter_by_content_word`` /
    ``filter_by_content_word_or``) and content-word column.
    """
    # (column tag, filter function, content-word column) - order fixes the column order.
    variants = (
        ("entities", filter_by_content_word, "content_entities"),
        ("entities_np", filter_by_content_word, "content_entities_np"),
        ("entities_or", filter_by_content_word_or, "content_entities"),
        ("entities_np_or", filter_by_content_word_or, "content_entities_np"),
        ("np", filter_by_content_word, "content_np"),
        ("np_or", filter_by_content_word_or, "content_np"),
    )
    for tag, filter_fn, content_key in variants:
        df[f"filtered_candidate_answer_{tag}_coref"] = df.progress_apply(
            lambda row: filter_fn(row, "context_coref", content_key, "candidate_answers_4"),
            axis=1,
        )
    
#     df["filtered_candidate_answer_entities_coref"] = df.progress_apply(lambda row: filter_by_content_word(row, "context_coref", "content_entities", "candidate_answers_4"), axis=1)
#     df["filtered_candidate_answer_entities_np_coref"] = df.progress_apply(lambda row: filter_by_content_word(row, "context_coref", "content_entities_np", "candidate_answers_4"), axis=1)

In [None]:
# Re-run the content-word filters on the "context_coref" passages of each test set.
get_content_entities_after_coref(idkmrc_test)
get_content_entities_after_coref(tydiqa_test)
get_content_entities_after_coref(squad_test)

## Get Dependency Parsing

In [493]:
def get_dependency_graph(row):
    """Parse only the sentences that still hold a candidate answer.

    Sentence indices are taken from position 0 of the keys of
    ``row["filtered_candidate_answer_entities_np_or"]``. Those sentences are
    passed (tokens and POS tags) to ``predict_dependency``; every other
    sentence gets an empty list.

    Returns:
        list: one entry per sentence of ``row["context_features"]``.
    """
    token_lists = [[word["kata"] for word in sentence] for sentence in row["context_features"]]
    tag_lists = [[word["pos"] for word in sentence] for sentence in row["context_features"]]
    candidate_sentences = {k[0] for k in row["filtered_candidate_answer_entities_np_or"].keys()}

    deps = []
    for sentence_idx, (tokens, tags) in enumerate(zip(token_lists, tag_lists)):
        if sentence_idx in candidate_sentences:
            deps.append(predict_dependency(tokens, tags))
        else:
            deps.append([])

    return deps

def get_df_dep(df):
    """Add a ``dependencies`` column to ``df`` in place: ``get_dependency_graph`` per row."""
    parsed = df.progress_apply(get_dependency_graph, axis=1)
    df["dependencies"] = parsed
    
def get_dependency_graph_q(row):
    """Dependency-parse the question after dropping its question words.

    All sentences of ``row["question_features"]`` are flattened into one token
    sequence. A token is skipped when its POS tag contains ``"WH"`` or
    ``"Z"``, or when its lower-cased text is a key of ``eat_dict_4``.

    Returns:
        The output of ``predict_dependency`` for the remaining tokens and tags.
    """
    tokens = []
    tags = []
    for sentence in row["question_features"]:
        for word in sentence:
            if "WH" in word["pos"]:
                continue
            if word["kata"].lower() in eat_dict_4.keys():
                continue
            if "Z" in word["pos"]:
                continue
            tokens.append(word["kata"])
            tags.append(word["pos"])

    return predict_dependency(tokens, tags)

def get_df_dep_q(df):
    """Add a ``q_dependencies`` column to ``df`` in place: ``get_dependency_graph_q`` per row."""
    parsed_questions = df.progress_apply(get_dependency_graph_q, axis=1)
    df["q_dependencies"] = parsed_questions

In [634]:
def get_dependency_graph_2(row):
    """Fill in the dependency parses that are still missing for candidate sentences.

    Starts from the existing ``row["dependencies"]`` and calls
    ``predict_dependency`` only for sentences that (a) have an empty parse and
    (b) hold a candidate in ``row["filtered_candidate_answer_entities_np_or"]``
    (sentence index = position 0 of the candidate key). Existing parses are
    reused.

    Fixes over the previous version:
      * the stored list is copied instead of being mutated in place;
      * no extra ``[]`` entry is appended for every skipped sentence, so the
        result no longer grows beyond one entry per sentence.

    Returns:
        list: one parse (possibly empty) per sentence. Entries already present
        beyond the number of sentences are kept untouched.
    """
    features = row["context_features"]
    candidate_sentences = {k[0] for k in row["filtered_candidate_answer_entities_np_or"].keys()}

    deps = list(row["dependencies"])
    # Make sure every sentence has a slot, even if the stored list was shorter.
    deps.extend([] for _ in range(len(features) - len(deps)))

    for sentence_idx, sentence in enumerate(features):
        if len(deps[sentence_idx]) != 0:
            continue  # already parsed
        if sentence_idx not in candidate_sentences:
            continue  # no candidate here: keep the empty placeholder
        tokens = [word["kata"] for word in sentence]
        tags = [word["pos"] for word in sentence]
        deps[sentence_idx] = predict_dependency(tokens, tags)

    return deps

def get_df_dep_2(df):
    """Overwrite ``df["dependencies"]`` in place with ``get_dependency_graph_2`` per row."""
    completed = df.progress_apply(get_dependency_graph_2, axis=1)
    df["dependencies"] = completed

In [None]:
get_df_dep(squad_test)

In [None]:
get_df_dep(tydiqa_test)

In [None]:
get_df_dep(idkmrc_test)

In [None]:
get_df_dep_q(squad_test)

In [None]:
get_df_dep_q(tydiqa_test)

In [None]:
get_df_dep_q(idkmrc_test)

## Build Graph

In [247]:
def remove_numbers_regex(string):
    """Return ``string`` with every run of digits (regex ``\\d+``) removed."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', string)

In [525]:
import networkx as nx

def create_directed_graph(nodes):
    """Build an ``nx.DiGraph`` from ``(index, name, label, parent)`` tuples.

    Every tuple becomes a node ``index`` with attribute ``name``. Unless
    ``parent`` is ``-1``, an edge ``parent -> index`` carrying ``label`` is
    added as well.
    """
    digraph = nx.DiGraph()
    for index, name, label, parent in nodes:
        digraph.add_node(index, name=name)
        if parent == -1:
            continue  # -1 marks a node without a parent
        digraph.add_edge(parent, index, label=label)

    return digraph

def create_undirected_graph(nodes):
    """Build an ``nx.Graph`` from ``(index, name, label, parent)`` tuples.

    Every tuple becomes a node ``index`` with attribute ``name``. Unless
    ``parent`` is ``-1``, an undirected edge between ``parent`` and ``index``
    carrying ``label`` is added as well.
    """
    undirected = nx.Graph()
    for index, name, label, parent in nodes:
        undirected.add_node(index, name=name)
        if parent == -1:
            continue  # -1 marks a node without a parent
        undirected.add_edge(parent, index, label=label)
    return undirected

def calculate_distance(graph, source, target):
    """Shortest-path length from ``source`` to ``target``; ``9999.0`` when no path exists."""
    try:
        distance = nx.shortest_path_length(graph, source, target)
    except nx.NetworkXNoPath:
        # Sentinel used for "unreachable".
        distance = float('9999')
    return distance
    
# def get_target_idx(row, key_candidate, key_content_word):
#     idxs = {}
#     for k in row[key_candidate].keys():
#         if k[0] in idxs.keys():
#             continue
#         for idx, word in enumerate(row["context_features"][k[0]]):
#             for target in row[key_content_word]:
#                 if remove_punctuation(word["kata"]).lower().strip().maketrans('', '', string.digits) == remove_punctuation(target).lower().strip().maketrans('', '', string.digits):
#                     idxs.setdefault(k[0], []).append(idx)
                    
#     return idxs

def get_target_idx(row, key_candidate, key_content_word):
    """Locate the content words inside every sentence that holds a candidate.

    Words from ``row["lemma"][sentence]`` and the targets from
    ``row[key_content_word]`` are compared after ``remove_punctuation``,
    lower-casing, stripping and digit removal.

    Each candidate sentence is scanned once and each target is normalised
    once. The previous version re-normalised every target for every word and
    rescanned sentences without matches for every candidate. The result is
    unchanged.

    Args:
        row: mapping with ``key_candidate``, ``key_content_word`` and ``"lemma"``.
        key_candidate: column with the candidate dict; keys carry the sentence
            index at position 0 (negative indices are ignored).
        key_content_word: column with the list of target words.

    Returns:
        dict: ``{sentence_index: [word positions...]}``, only for sentences
        with at least one match. A position appears once per matching target.
    """
    def _normalize(text):
        return remove_numbers_regex(remove_punctuation(text).lower().strip())

    # Unique, non-negative sentence indices in first-seen order.
    sentence_ids = []
    for k in row[key_candidate].keys():
        if k[0] < 0 or k[0] in sentence_ids:
            continue
        sentence_ids.append(k[0])
    if not sentence_ids:
        return {}

    targets = [_normalize(target) for target in row[key_content_word]]
    if not targets:
        return {}

    idxs = {}
    for sentence_id in sentence_ids:
        for idx, word in enumerate(row["lemma"][sentence_id]):
            normalized_word = _normalize(word)
            for target in targets:
                if normalized_word == target:
                    idxs.setdefault(sentence_id, []).append(idx)

    return idxs

def get_target_idx_2(row, key_candidate, key_content_word, kata):
    """Locate a single word (``kata``) inside every sentence that holds a candidate.

    ``kata`` is reduced to the first element of ``preprocess_text(kata)``, or
    used as-is when that yields nothing. It is then compared with each word of
    ``row["lemma"][sentence]`` after ``remove_punctuation``, lower-casing,
    stripping and digit removal.

    ``preprocess_text`` now runs once per call instead of once per word, and
    each candidate sentence is scanned once. The result is unchanged.

    Args:
        row: mapping with ``key_candidate`` and ``"lemma"``.
        key_candidate: column with the candidate dict; keys carry the sentence
            index at position 0 (negative indices are ignored).
        key_content_word: unused; kept for interface compatibility.
        kata: the word to look for.

    Returns:
        dict: ``{sentence_index: [word positions...]}``, only for sentences
        with at least one match.
    """
    def _normalize(text):
        return remove_numbers_regex(remove_punctuation(text).lower().strip())

    # Unique, non-negative sentence indices in first-seen order.
    sentence_ids = []
    for k in row[key_candidate].keys():
        if k[0] < 0 or k[0] in sentence_ids:
            continue
        sentence_ids.append(k[0])
    if not sentence_ids:
        return {}

    try:
        needle_source = preprocess_text(kata)[0]
    except IndexError:
        # Preprocessing produced no token: fall back to the raw word.
        needle_source = kata
    needle = _normalize(needle_source)

    idxs = {}
    for sentence_id in sentence_ids:
        for idx, word in enumerate(row["lemma"][sentence_id]):
            if _normalize(word) == needle:
                idxs.setdefault(sentence_id, []).append(idx)

    return idxs

In [83]:
def visualize_directed_graph(graph):
    """Draw ``graph`` with arrows, labelling each node as ``"<node>: <name attribute>"``."""
    layout = nx.spring_layout(graph)
    node_labels = {}
    for node, data in graph.nodes(data=True):
        node_labels[node] = f"{node}: {data['name']}"

    draw_options = dict(
        with_labels=True,
        labels=node_labels,
        node_color='lightblue',
        node_size=800,
        font_size=10,
        arrows=True,
        arrowstyle='->',
    )

    plt.figure(figsize=(8, 6))
    nx.draw_networkx(graph, layout, **draw_options)
    plt.title("Directed Graph Visualization")
    plt.axis('off')
    plt.show()

def visualize_undirected_graph(graph):
    """Draw ``graph`` with gray edges, labelling each node as ``"<node>: <name attribute>"``."""
    layout = nx.spring_layout(graph)
    node_labels = {}
    for node, data in graph.nodes(data=True):
        node_labels[node] = f"{node}: {data['name']}"

    draw_options = dict(
        with_labels=True,
        labels=node_labels,
        node_color='lightblue',
        node_size=800,
        font_size=10,
        edge_color='gray',
        linewidths=0.5,
    )

    plt.figure(figsize=(8, 6))
    nx.draw_networkx(graph, layout, **draw_options)
    plt.title("Undirected Graph Visualization")
    plt.axis('off')
    plt.show()


In [723]:
def get_lowest_key(my_dict):
    """Return every key of ``my_dict`` whose value equals the minimum value.

    Keys are returned in the dict's iteration order. An empty dict yields an
    empty list (previously ``min()`` raised ``ValueError``).
    """
    if not my_dict:
        return []
    min_value = min(my_dict.values())
    return [key for key, value in my_dict.items() if value == min_value]

def get_answer(row, key_candidate, key_word_content):
    """Keep the candidates whose dependency label fits the question word.

    For every candidate ``(sentence, start, ...) -> text`` the tokens of
    ``text`` are mapped to nodes ``start, start + 1, ...`` of the sentence's
    dependency graph. A token's label is the label of the edge from its first
    predecessor, or ``"root"`` when it has none. The candidate is kept when
    at least one token carries a label allowed for ``row["all_question"]``.
    Question words outside the table accept every label. A candidate with no
    tokens (empty text) is never kept.

    Cleanup over the previous version: the distance-based ranking that
    followed the ``return`` statement was unreachable and has been removed,
    together with the values computed only for it (target indices and
    undirected graphs). Each sentence's graph is built once instead of once
    per candidate. The returned value is unchanged.

    Args:
        row: mapping with ``key_candidate``, ``"dependencies"`` and ``"all_question"``.
        key_candidate: column with the candidate dict.
        key_word_content: unused; kept for interface compatibility.

    Returns:
        dict: the surviving candidates, or ``{}`` when there are none.
    """
    candidates = row[key_candidate]
    if len(candidates) == 0:
        return {}

    # (question words, dependency labels accepted for them)
    allowed_labels_by_question = (
        (("apa", "apakah"),
         ("nsubj", "root", "obj")),
        (("kapan", "kapankah"),
         ("nmod", "nummod", "appos", "root")),
        (("dimana", "di mana", "darimanakah", "dari manakah", "darimana", "dari mana",
          "dimanakah", "di manakah", "manakah", "mana", "kemana", "ke mana",
          "kemanakah", "ke manakah"),
         ("nmod", "obl", "root")),
        (("siapa", "siapakah"),
         ("flat", "root", "nsubj")),
        (("berapa", "berapakah", "seberapa"),
         ("nmod", "root", "nummod", "punct")),
        (("kenapa", "mengapa", "beberapa", "bagaimanakah"),
         ("compound", "obj")),
        (("bagaimana",),
         ("compound", "obj", "root")),
    )
    question_word = row["all_question"]
    allowed_labels = None  # None = unknown question word, accept every label
    for question_words, labels in allowed_labels_by_question:
        if question_word in question_words:
            allowed_labels = labels
            break

    # One directed dependency graph per sentence that holds a candidate.
    directed_graphs = {}
    for k in candidates:
        if k[0] not in directed_graphs:
            directed_graphs[k[0]] = create_directed_graph(row["dependencies"][k[0]])

    filter_by_label = {}
    for k, v in candidates.items():
        graph = directed_graphs[k[0]]
        for offset, _token in enumerate(v.split()):
            node_index = k[1] + offset
            try:
                parent_index = list(graph.predecessors(node_index))[0]
                label = graph.get_edge_data(parent_index, node_index)['label']
            except IndexError:
                # No incoming edge: the token is the root of the sentence.
                label = "root"

            if allowed_labels is None or label in allowed_labels:
                filter_by_label[k] = v

    return filter_by_label

In [636]:
# A winning candidate whose best distance equals this value is rejected.
# NOTE(review): presumably the "no path" value returned by calculate_distance — confirm.
_REJECTED_DISTANCE = 9999


def _mentions_any(phrases, candidate_text):
    """Return True if at least one phrase (punctuation removed, lowercased) occurs in the candidate text."""
    return any(
        remove_punctuation(phrase).lower().strip() in candidate_text.lower()
        for phrase in phrases
    )


def _mentions_all(phrases, candidate_text):
    """Return True if every phrase occurs in the candidate text (vacuously True when there are no phrases)."""
    return all(
        remove_punctuation(phrase).lower().strip() in candidate_text.lower()
        for phrase in phrases
    )


def _score_candidates(scores, candidates, graphs, anchors_by_sentence, is_excluded, reverse=False):
    """Record, for every eligible candidate, its smallest graph distance to an anchor.

    Args:
        scores: dict updated in place, candidate key -> smallest distance found.
        candidates: dict mapping candidate keys to candidate text. Keys are indexed
            as key[0] (sentence index), key[1] and key[2] (bounds of the range of
            positions belonging to the candidate).
        graphs: dict mapping sentence index -> directed dependency graph.
        anchors_by_sentence: dict mapping sentence index -> iterable of anchor positions.
        is_excluded: optional predicate on the candidate text; candidates for which
            it returns True are not scored. ``None`` disables the check.
        reverse: when False the distance is measured anchor -> candidate position,
            when True candidate position -> anchor.
    """
    for key, text in candidates.items():
        sentence_idx = key[0]
        # Negative sentence indices never get a graph, so they cannot be scored.
        if sentence_idx < 0:
            continue
        if is_excluded is not None and is_excluded(text):
            continue
        try:
            graph = graphs[sentence_idx]
            distances = [
                calculate_distance(graph, position, anchor)
                if reverse
                else calculate_distance(graph, anchor, position)
                for anchor in anchors_by_sentence[sentence_idx]
                for position in range(key[1], key[2])
            ]
        except KeyError:
            # No graph / no anchors for this sentence (or a lookup failed while
            # measuring): leave the candidate unscored, best effort.
            continue
        # Without anchors or with an empty span there is nothing to minimise;
        # skipping avoids min() raising ValueError on an empty sequence.
        if not distances:
            continue
        scores[key] = min(distances)


def _unique_winner(scores):
    """Return the single lowest-scoring key, or None when there is no unambiguous, accepted winner."""
    if not scores:
        return None
    lowest = get_lowest_key(scores)
    if len(lowest) == 1 and scores[lowest[0]] != _REJECTED_DISTANCE:
        return lowest[0]
    return None


def get_answer_dep(row, key_candidate, key_entities, key_np):
    """Pick an answer candidate using distances in the dependency graphs.

    Candidates are taken from ``row["filter_by_label"]`` and scored in a
    sequence of passes; after each pass the candidate with the single lowest
    score is returned if its score is not ``_REJECTED_DISTANCE``. Scores are
    accumulated in one dict across passes, so a candidate that is not
    re-scored in a later pass keeps the score from an earlier one.

    Passes, in order:
        1. entity anchor -> candidate, skipping candidates containing any entity
        2. noun-phrase anchor -> candidate, skipping candidates containing any noun phrase
        3. question-root anchor (from ``get_target_idx_2``) -> candidate, no skipping
        4. sentence root -> candidate, skipping candidates containing every
           entity and noun phrase
        5. candidate -> entity anchor, skipping as in pass 1
        6. candidate -> noun-phrase anchor, skipping as in pass 2

    Args:
        row: mapping/Series providing ``key_candidate``, ``key_entities``,
            ``key_np``, ``"filter_by_label"``, ``"dependencies"`` and
            ``"q_dependencies"``.
        key_candidate: column holding the candidate dict; the returned text is
            looked up here. The calls in this notebook pass ``"filter_by_label"``,
            so it is the same mapping that is being scored.
        key_entities: column holding the list of entity phrases.
        key_np: column holding the list of noun phrases.

    Returns:
        The text of the selected candidate, or ``""`` when there are no
        candidates or no pass yields a unique winner.
    """
    idx_target_entities = get_target_idx(row, key_candidate, key_entities)
    idx_target_np = get_target_idx(row, key_candidate, key_np)
    target_entities = row[key_entities]
    target_np = row[key_np]
    target_all = target_entities + target_np
    filter_by_label = row["filter_by_label"]

    if len(row[key_candidate]) == 0:
        return ""

    # One directed graph (and root position) per sentence that hosts a candidate.
    directed_graphs = {}
    idx_root = {}
    for key in row[key_candidate]:
        sentence_idx = key[0]
        if sentence_idx < 0 or sentence_idx in directed_graphs:
            continue
        sentence_deps = row["dependencies"][sentence_idx]
        for position, dep in enumerate(sentence_deps):
            if dep[2] == "root":
                # When several entries are labelled "root" the last one wins.
                idx_root[sentence_idx] = position
        directed_graphs[sentence_idx] = create_directed_graph(sentence_deps)

    # Second field of the first question dependency labelled "root" ("" if none).
    q_root = next(
        (kata[1] for kata in row["q_dependencies"] if kata[2] == "root"),
        "",
    )
    idx_target_from_q = get_target_idx_2(row, key_candidate, key_entities, q_root)

    # Expose each sentence root as a one-element anchor list so every pass
    # can share the same scoring routine.
    root_anchor = {sentence_idx: [pos] for sentence_idx, pos in idx_root.items()}

    def has_entity(text):
        return _mentions_any(target_entities, text)

    def has_np(text):
        return _mentions_any(target_np, text)

    def has_all_targets(text):
        return _mentions_all(target_all, text)

    # (anchors, exclusion predicate, reverse direction?)
    passes = (
        (idx_target_entities, has_entity, False),
        (idx_target_np, has_np, False),
        (idx_target_from_q, None, False),
        (root_anchor, has_all_targets, False),
        (idx_target_entities, has_entity, True),
        (idx_target_np, has_np, True),
    )

    dis_directed = {}
    for anchors, is_excluded, reverse in passes:
        _score_candidates(
            dis_directed, filter_by_label, directed_graphs, anchors, is_excluded, reverse
        )
        winner = _unique_winner(dis_directed)
        if winner is not None:
            return row[key_candidate][winner]

    return ""
    

In [None]:
# Fill "ans_dep" for the SQuAD test set by applying get_answer_dep to every row.
# NOTE(review): get_answer_dep reads row["filter_by_label"], and that column is
# assigned in a cell further down this notebook — on a fresh kernel run that cell first.
squad_test["ans_dep"] = squad_test.progress_apply(lambda row: get_answer_dep(row, "filter_by_label", "content_entities", "content_np"), axis=1)

In [None]:
# Fill "ans_dep" for the TyDi QA test set by applying get_answer_dep to every row.
# NOTE(review): requires the "filter_by_label" column, which is assigned in a cell
# further down this notebook.
tydiqa_test["ans_dep"] = tydiqa_test.progress_apply(lambda row: get_answer_dep(row, "filter_by_label", "content_entities", "content_np"), axis=1)

In [None]:
# Fill "ans_dep" for the IDK-MRC test set by applying get_answer_dep to every row.
# NOTE(review): requires the "filter_by_label" column, which is assigned in a cell
# further down this notebook.
idkmrc_test["ans_dep"] = idkmrc_test.progress_apply(lambda row: get_answer_dep(row, "filter_by_label", "content_entities", "content_np"), axis=1)

In [None]:
# Build the "filter_by_label" column of each test set by applying get_answer row-wise
# to the "filtered_candidate_answer_entities_np_or" candidates.
# NOTE(review): get_answer_dep and get_answer_ts both read row["filter_by_label"], yet the
# "ans_dep" cells are placed before this one — this cell must run first on a fresh kernel.
idkmrc_test["filter_by_label"] = idkmrc_test.progress_apply(lambda row: get_answer(row, "filtered_candidate_answer_entities_np_or", "content_np"), axis=1)
tydiqa_test["filter_by_label"] = tydiqa_test.progress_apply(lambda row: get_answer(row, "filtered_candidate_answer_entities_np_or", "content_np"), axis=1)
squad_test["filter_by_label"] = squad_test.progress_apply(lambda row: get_answer(row, "filtered_candidate_answer_entities_np_or", "content_np"), axis=1)

In [564]:
def get_filtered_not_null(df, key):
    """Return the rows of ``df`` whose ``key`` column holds a non-empty value.

    Emptiness is decided with ``len()``, so the column is expected to contain
    sized objects (lists, dicts, strings, ...).
    """
    has_content = df[key].apply(lambda value: len(value) > 0)
    return df[has_content]

## Text Similarity

In [734]:
def get_answer_ts(row, debug = False):
    """Select an answer for one row with the text-similarity strategy.

    Args:
        row: mapping/Series providing the text under ``"passage"`` (or
            ``"context"`` as a fallback), ``"question_features"`` (its first
            element is iterated as dicts with ``"kata"`` and ``"pos"`` keys),
            ``"all_question"`` and ``"filter_by_label"``.
        debug: forwarded unchanged to ``get_final_answer``.

    Returns:
        Whatever ``get_final_answer`` returns for this row.
    """
    # Datasets name the passage column differently; fall back to "context".
    try:
        passage = row["passage"]
    except KeyError:
        passage = row["context"]

    passage_sentences = split_sentences(passage)

    # [word, POS] pairs of the question, passed through filter_wh
    # (presumably dropping the question word — see filter_wh).
    question_pos_wo_wh = filter_wh(
        [[w["kata"], w["pos"]] for w in row["question_features"][0]]
    )
    question_key = get_question_key(question_pos_wo_wh)

    return get_final_answer(
        passage_sentences,
        row["all_question"],
        question_key,
        row["filter_by_label"],
        debug,
    )

In [None]:
# Fill "ans_ts" for the SQuAD test set by applying get_answer_ts to every row
# (get_answer_ts reads the "filter_by_label" column, so it must already exist).
squad_test["ans_ts"] = squad_test.progress_apply(lambda row: get_answer_ts(row), axis=1)

In [None]:
# Fill "ans_ts" for the TyDi QA test set by applying get_answer_ts to every row
# (get_answer_ts reads the "filter_by_label" column, so it must already exist).
tydiqa_test["ans_ts"] = tydiqa_test.progress_apply(lambda row: get_answer_ts(row), axis=1)

In [None]:
# Fill "ans_ts" for the IDK-MRC test set by applying get_answer_ts to every row
# (get_answer_ts reads the "filter_by_label" column, so it must already exist).
idkmrc_test["ans_ts"] = idkmrc_test.progress_apply(lambda row: get_answer_ts(row), axis=1)

In [None]:
# Run eval_all on the "ans_ts" predictions of each of the three test sets.
eval_all(squad_test, "ans_ts")
eval_all(tydiqa_test, "ans_ts")
eval_all(idkmrc_test, "ans_ts")