In [1]:
%load_ext autoreload
%autoreload 2
%load_ext autotime

In [2]:
import json
import pandas as pd
from tqdm.notebook import tqdm

from libs.dataset_reader.utils import parse_raw_question, answser_to_index
from libs.knowledge_base.worldtree_KB import WorldTreeKB
from libs.knowledge_base.tablestore import load_single_table, load_tables_from_dir
from libs.knowledge_base.utils import remove_punctuations, preprocess, WorldTreeLemmatizer

time: 3.7 s


In [3]:
# Read the training split and convert every row into a question record.

file_path = "data/questions/questions.train.tsv"

df_train = pd.read_csv(file_path, sep="\t")
print(f"Train: {len(df_train)} questions.")

train_questions = []
for _, row in df_train.iterrows():
    # parse_raw_question yields two values from the raw text; they are
    # stored below as the question and its choices.
    question, choices = parse_raw_question(row["question"])
    train_questions.append(
        {
            "id": row["QuestionID"],
            "question": question,
            "choices": choices,
            # The answer key is stored as whatever answser_to_index maps it to.
            "answer": answser_to_index(row["AnswerKey"]),
        }
    )

Train: 2207 questions.
time: 388 ms


In [4]:
# Load the tables

# Directory holding the v2.1 tablestore files.
tables_dir = "data/tablestore/v2.1/"

# `facts` is iterated by later cells as a collection of dict-like entries that
# each expose a "fact" text field.
facts = load_tables_from_dir(tables_dir)

time: 1.16 s


In [5]:
# Collect the corpus used to tune the TFIDF/BM25 parameters:
# every fact sentence first, followed by every training question.

corpus = [fact["fact"] for fact in facts]
corpus.extend(question["question"] for question in train_questions)

# Load the knowledge base: it is fitted on the combined corpus, then given the
# facts as its documents (presumably the items that queries retrieve).
lemmatizer = WorldTreeLemmatizer("others/lemmatization-en.txt")
KB = WorldTreeKB(lemmatizer=lemmatizer)
KB.fit_to_corpus(corpus)
KB.set_documents(facts)


time: 26 s


In [6]:
# Smoke test: query the knowledge base for "tree". The second argument (3)
# appears to cap the number of results -- three facts come back in the output.
KB.query_relevant_facts("tree", 3)

[{'id': '0d3a-8739-9f42-5e1c',
  'table': 'KINDOF',
  'fact': 'a tree is a kind of plant',
  'processed_fact': 'tree kind plant',
  'relevance_score': 0.7652165901641027},
 {'id': 'e818-cf4a-2d3a-c8c6',
  'table': 'KINDOF',
  'fact': 'trees are a kind of resource',
  'processed_fact': 'tree kind resource',
  'relevance_score': 0.7043037663202094},
 {'id': '0340-14ae-a023-ac50',
  'table': 'KINDOF',
  'fact': 'a tree is a kind of matter',
  'processed_fact': 'tree kind matt',
  'relevance_score': 0.6737261327908123}]

time: 14.7 ms


In [7]:
# Attach retrieved supporting facts to each training question, then save.

for question in tqdm(train_questions):
    # Ask the knowledge base for 10 results using the question text as the query.
    hits = KB.query_relevant_facts(question["question"], 10)
    # Keep only the fact text, terminated with a period.
    question["supports"] = [hit["fact"] + "." for hit in hits]

with open("train.json", "w") as f:
    json.dump(train_questions, f, indent=4)

HBox(children=(FloatProgress(value=0.0, max=2207.0), HTML(value='')))


time: 11.8 s


In [8]:
# Prepare the supporting facts for the dev data:
# load the dev split, build one record per question, attach the retrieved
# facts to each record, and write everything to dev.json.

file_path = "data/questions/questions.dev.tsv"

df_dev = pd.read_csv(file_path, delimiter="\t")
# The label names the split that is actually being loaded here.
print(f"Dev: {len(df_dev)} questions.")

dev_questions = []
for _, row in df_dev.iterrows():
    # parse_raw_question yields two values from the raw text; they are
    # stored below as the question and its choices.
    question, choices = parse_raw_question(row["question"])
    dev_questions.append({
        "id": row["QuestionID"],
        "question": question,
        "choices": choices,
        # The answer key is stored as whatever answser_to_index maps it to.
        "answer": answser_to_index(row["AnswerKey"]),
    })

for question in tqdm(dev_questions):
    # Ask the knowledge base for 10 results using the question text as the query.
    top10_results = KB.query_relevant_facts(question["question"], 10)
    # Keep only the fact text, terminated with a period.
    question["supports"] = [res["fact"] + "." for res in top10_results]

with open("dev.json", "w") as f:
    json.dump(dev_questions, f, indent=4)

Train: 496 questions.


HBox(children=(FloatProgress(value=0.0, max=496.0), HTML(value='')))


time: 2.76 s


In [9]:
# Prepare the supporting facts for the test data:
# load the test split, build one record per question, attach the retrieved
# facts to each record, and write everything to test.json.

file_path = "data/questions/questions.test.tsv"

df_test = pd.read_csv(file_path, sep="\t")
print(f"Test: {len(df_test)} questions.")

test_questions = []
for _, row in df_test.iterrows():
    # parse_raw_question yields two values from the raw text; they are
    # stored below as the question and its choices.
    question, choices = parse_raw_question(row["question"])
    test_questions.append(
        {
            "id": row["QuestionID"],
            "question": question,
            "choices": choices,
            # The answer key is stored as whatever answser_to_index maps it to.
            "answer": answser_to_index(row["AnswerKey"]),
        }
    )

for question in tqdm(test_questions):
    # Ask the knowledge base for 10 results using the question text as the query.
    hits = KB.query_relevant_facts(question["question"], 10)
    # Keep only the fact text, terminated with a period.
    question["supports"] = [hit["fact"] + "." for hit in hits]

with open("test.json", "w") as f:
    json.dump(test_questions, f, indent=4)

Test: 1664 questions.


HBox(children=(FloatProgress(value=0.0, max=1664.0), HTML(value='')))


time: 9.1 s
