In [1]:
import os
import json
from sqlitedict import SqliteDict
import shutil
from tqdm import tnrange

from utils_db import dict_save_json, dict_load_json, load_jsonl
from text_database import TextDatabase, Text
from vocabulary import Vocabulary
from vocabulary import count_n_grams
from tfidf_database import TFIDFDatabase

import config


In [2]:
def write_jsonl(filename, dic_list):
    # description: only use for wikipedia dump
    output_file = open(filename, 'w', encoding='utf-8')
    for dic in dic_list:
        json.dump(dic, output_file) 
        output_file.write("\n")
        

In [23]:
class Claim:
    def __init__(self, claim_dictionary):
        self.id = claim_dictionary['id']
        self.verifiable = claim_dictionary['verifiable']
        self.label = claim_dictionary['label']
        self.claim = claim_dictionary['claim']
        self.evidence = claim_dictionary['evidence']
        if 'docs_selected' in claim_dictionary:
            self.docs_selected = claim_dictionary['docs_selected']
    def get_tokenized_claim(self, method_tokenization):
        claim_without_dot = self.claim[:-1]  # remove . at the end
        text = Text(claim_without_dot, "claim")
        tokenized_claim = text.process(method_tokenization)
        return tokenized_claim
    def get_n_grams(self, method_tokenization, n_gram):
        return count_n_grams(self.get_tokenized_claim(method_tokenization), n_gram, 'str')
    

In [4]:
__root__ = os.path.dirname(os.path.dirname(os.getcwd()))
__data_dir__ = "_01_data"
__wiki_dir__ = "_02_wikipedia_pages"
__dep_dir__ = "_90_dependencies"
__raw_data_dir__ = "_01_raw_data"
__jupyter_notebook_dir__ = "_98_jupyter_notebook"
path_mydict_tf_idf = "mydict_tf_idf.sqlite"
path_mydict_ids = "mydict_ids.sqlite"


In [12]:
experiment_nr = 3
file_name = 'experiment_%.2d.json'%(experiment_nr)
path_experiment = os.path.join(config.ROOT, config.CONFIG_DIR, file_name)
with open(path_experiment) as json_data_file:
    data = json.load(json_data_file)

vocab = Vocabulary(path_wiki_database = os.path.join(config.ROOT, data['path_large_wiki_database']), 
    table_name_wiki = data['table_name_wiki'], n_gram = data['n_gram'],
    method_tokenization = data['method_tokenization'], source = data['vocabulary_source'])

tf_idf_db = TFIDFDatabase(vocabulary = vocab, method_tf = data['method_tf'], method_df = data['method_df'],
    delimiter = data['delimiter'], threshold = data['threshold'], source = data['tf_idf_source'])

path_title_unigrams_ids = tf_idf_db.path_ids_dict
path_title_unigrams_tf_idf = tf_idf_db.path_tf_idf_dict


/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_01_data/_03_database/wiki.db
/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_text_2_t_mlc
Load existing settings file
Word count count dictionary already exists
Document count dictionary already exists
Load title_2_id and id_2_title dictionaries
Directory  /home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_text_2_t_mlc/thr_0.001_  already exists
Directory  /home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_text_2_t_mlc/thr_0.001_/ex_raw_count_inverse_document_frequency_title  already exists
selected vocabulary dictionary already exists
empty database already exists
database already filled


In [45]:
path_dev_set = os.path.join(__root__, __data_dir__, __raw_data_dir__, "dev.jsonl")
results = load_jsonl(path_dev_set)
results[0:5]

[{'id': 91198,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Colin Kaepernick became a starting quarterback during the 49ers 63rd season in the National Football League.',
  'evidence': [[[108548, None, None, None]]]},
 {'id': 194462,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Tilda Swinton is a vegan.',
  'evidence': [[[227768, None, None, None]]]},
 {'id': 137334,
  'verifiable': 'VERIFIABLE',
  'label': 'SUPPORTS',
  'claim': 'Fox 2000 Pictures released the film Soul Food.',
  'evidence': [[[289914, 283015, 'Soul_Food_-LRB-film-RRB-', 0]],
   [[291259, 284217, 'Soul_Food_-LRB-film-RRB-', 0]],
   [[293412, 285960, 'Soul_Food_-LRB-film-RRB-', 0]],
   [[337212, 322620, 'Soul_Food_-LRB-film-RRB-', 0]],
   [[337214, 322622, 'Soul_Food_-LRB-film-RRB-', 0]]]},
 {'id': 166626,
  'verifiable': 'NOT VERIFIABLE',
  'label': 'NOT ENOUGH INFO',
  'claim': 'Anne Rice was born in New Jersey.',
  'evidence': [[[191656, None, None, None]

In [43]:
# === constants === #
delimiter ='\k'

# === variables === #
K = 20

# === load claims === #
path_dev_set = os.path.join(__root__, __data_dir__, __raw_data_dir__, "dev.jsonl")
results = load_jsonl(path_dev_set)

# === process === #
method_tokenization = vocab.method_tokenization #["tokenize", "make_lower_case"]

mydict_ids = SqliteDict(path_title_bigrams_ids)
mydict_tf_idf = SqliteDict(path_title_bigrams_tf_idf)

file_name = 'predicted_labels_' + str(K) + '.json'
path_predicted_documents = os.path.join(tf_idf_db.base_dir, file_name)

if os.path.isfile(path_predicted_documents):
    print('file already exists')
else:
    for i in tnrange(len(results)):
        claim = Claim(results[i])

        dictionary = {}

        n_grams, nr_words = claim.get_n_grams(method_tokenization, vocab.n_gram)

        for word in n_grams:
            try:
                word_id_list = mydict_ids[word].split(delimiter)[1:]
                word_tf_idf_list = mydict_tf_idf[word].split(delimiter)[1:]
            except KeyError:
                word_id_list = []
                word_tf_idf_list = []
            for j in range(len(word_id_list)):
                id = int(word_id_list[j])       
                tf_idf = float(word_tf_idf_list[j])
                try:
                    dictionary[id] = dictionary[id] + tf_idf
                except KeyError:
                    dictionary[id] = tf_idf

        keys_list = list(dictionary.keys())
        tf_idf_list = list(dictionary.values())

        dictionary = {}

        # make K best selection based on score
        selected_ids = sorted(range(len(tf_idf_list)), key=lambda l: tf_idf_list[l])[-K:]
        selected_ids = [keys_list[l] for l in selected_ids]

        results[i]['docs_selected'] = selected_ids
        
    write_jsonl(path_predicted_documents, results)


file already exists


In [42]:
results = load_jsonl(path_predicted_documents)

nr_correct = 0
nr_claims = 0
nr_no_evidence = 0
nr_title_not_in_dict = 0

method_list = ["min_one", "overall_score"]
method = method_list[0]

for i in tnrange(len(results)):
    claim = Claim(results[i])
    
    # scoring the selection
    score = "incorrect"  # 'correct', 'no_evidence'
    if method == "min_one":
        for interpreter in claim.evidence:
            for proof in interpreter:
                title_proof = proof[2]
                if title_proof == None:
                    score = "no_evidence"
                else:
                    try:
                        id_proof = vocab.title_2_id_dict[title_proof]
                        if id_proof in claim.docs_selected:
                            score = "correct"
                    except KeyError:
                        score = "title_not_in_dictionary"
#                         print("title not in dictionary, check out", title_proof)
                        break
    nr_claims += 1
    if score == 'title_not_in_dictionary':
        nr_title_not_in_dict += 1
    elif score == "no_evidence":
        nr_no_evidence += 1
    elif score == "correct":
        nr_correct += 1
print(nr_correct / float(nr_claims - nr_no_evidence - nr_title_not_in_dict + 0.000001))

HBox(children=(IntProgress(value=0, max=19998), HTML(value='')))


0.6465595634550955


# Experiment

In [None]:
# from text_database import TextDatabase

# path_large_wiki_database = '/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_01_data/_03_database/wiki.db' 
# # path_wiki_database = 'wiki.db'
# table_name_wiki = 'wikipages'

# text_db = TextDatabase(path_large_wiki_database, table_name_wiki)
# path_dev_set = os.path.join(__root__, __data_dir__, __raw_data_dir__, "dev.jsonl")
# results = load_jsonl(path_dev_set)
# i = 5
# method_tokenization = ["tokenize", "make_lower_case"]
# claim = Claim(results[i])
# print(claim.claim)
# print(claim.evidence)
# id = title_2_id_dict['Damon_Albarn']
# print(text_db.get_tokenized_title_from_id(id))
# print(text_db.get_tokenized_text_from_id(id))

In [None]:
# import os
# import shutil
# import json
# import config

# from vocabulary import Vocabulary

# from tfidf_database import TFIDFDatabase

# # === unigram titles === #
# experiment_nr = 4
# file_name = 'experiment_%.2d.json'%(experiment_nr)
# path_experiment = os.path.join(config.ROOT, config.CONFIG_DIR, file_name)
# with open(path_experiment) as json_data_file:
#     data = json.load(json_data_file)

# vocab_1 = Vocabulary(path_wiki_database = os.path.join(config.ROOT, data['path_large_wiki_database']), 
#     table_name_wiki = data['table_name_wiki'], n_gram = data['n_gram'],
#     method_tokenization = data['method_tokenization'], source = data['vocabulary_source'])

# tf_idf_db_1 = TFIDFDatabase(vocabulary = vocab_1, method_tf = data['method_tf'], method_df = data['method_df'],
#     delimiter = data['delimiter'], threshold = data['threshold'], source = data['tf_idf_source'])

# # === bigram titles === #
# experiment_nr = 3
# file_name = 'experiment_%.2d.json'%(experiment_nr)
# path_experiment = os.path.join(config.ROOT, config.CONFIG_DIR, file_name)
# with open(path_experiment) as json_data_file:
#     data = json.load(json_data_file)

# vocab_2 = Vocabulary(path_wiki_database = os.path.join(config.ROOT, data['path_large_wiki_database']), 
#     table_name_wiki = data['table_name_wiki'], n_gram = data['n_gram'],
#     method_tokenization = data['method_tokenization'], source = data['vocabulary_source'])

# tf_idf_db_2 = TFIDFDatabase(vocabulary = vocab_2, method_tf = data['method_tf'], method_df = data['method_df'],
#     delimiter = data['delimiter'], threshold = data['threshold'], source = data['tf_idf_source'])
# path_title_unigrams_ids = tf_idf_db_1.path_ids_dict
# path_title_unigrams_tf_idf = tf_idf_db_1.path_tf_idf_dict
# path_title_bigrams_ids = tf_idf_db_2.path_ids_dict
# path_title_bigrams_tf_idf = tf_idf_db_2.path_tf_idf_dict


In [107]:
path_large_wiki_database = '/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_01_data/_03_database/wiki.db' 
# path_wiki_database = 'wiki.db'
table_name_wiki = 'wikipages'
# table_name_tf_idf = 'tf_idf'
# path_tf_idf_database = 'tf_idf.db'
# path_mydict_tf_idf = 'mydict_tf_idf.sqlite'
# path_mydict_ids = 'mydict_ids.sqlite'

# === settings experiment === #
n_gram = 2
# method_tokenization = ['tokenize', 'remove_space', 'make_lower_case', 'lemmatization_get_nouns']
method_tokenization = ['tokenize', 'make_lower_case']
threshold = 0.001
method_tf = 'raw_count' # raw_count term_frequency
method_df = 'inverse_document_frequency' # 
delimiter = '\k'

vocab = Vocabulary(path_large_wiki_database, table_name_wiki, n_gram, method_tokenization, 'text')
# tf_idf_db = TFIDFDatabase(vocab, method_tf, method_df, delimiter, threshold, 'text')

NameError: name 'Vocabulary' is not defined

In [10]:
id_nr=1

vocab.text_database.get_tokenized_text_from_id(id_nr, method_tokenization)

['the',
 'following',
 'are',
 'the',
 'football',
 '-lrb-',
 'soccer',
 '-rrb-',
 'events',
 'of',
 'the',
 'year',
 '1928',
 'throughout',
 'the',
 'world',
 '.',
 '']