In [78]:
import os
import shutil
import json
from sqlitedict import SqliteDict

from utils_db import load_jsonl
from vocabulary import VocabularySqlite
from tfidf_database import TFIDFDatabaseSqlite
from wiki_database import WikiDatabaseSqlite
# from text_database import TextDatabaseSqlite

import config


In [None]:
import os
import shutil
import json
from sqlitedict import SqliteDict

from utils_db import load_jsonl
from vocabulary import VocabularySqlite
from tfidf_database import TFIDFDatabaseSqlite
from wiki_database import WikiDatabaseSqlite
# from text_database import TextDatabaseSqlite
from utils_doc_results import Claim, ClaimDocTokenizer

import config

def get_vocab_tf_idf_from_exp(experiment_nr):
    file_name = 'experiment_%.2d.json'%(experiment_nr)
    path_experiment = os.path.join(config.ROOT, config.CONFIG_DIR, file_name)

    with open(path_experiment) as json_data_file:
        data = json.load(json_data_file)

    vocab = VocabularySqlite(wiki_database = wiki_database, n_gram = data['n_gram'],
        method_tokenization = data['method_tokenization'], tags_in_db_flag = data['tags_in_db_flag'], 
        source = data['vocabulary_source'], tag_list_selected = data['tag_list_selected'])

    tf_idf_db = TFIDFDatabaseSqlite(vocabulary = vocab, method_tf = data['method_tf'], method_df = data['method_df'],
        delimiter = data['delimiter'], threshold = data['threshold'], source = data['tf_idf_source'])
    return vocab, tf_idf_db

def get_dict_from_n_gram(n_gram_list, mydict_ids, mydict_tf_idf, tf_idf_db):
    
    dictionary = {}

    for word in n_gram_list:
        try:
            word_id_list = mydict_ids[word].split(tf_idf_db.delimiter)[1:]
            word_tf_idf_list = mydict_tf_idf[word].split(tf_idf_db.delimiter)[1:]
        except KeyError:
            print('KeyError', word)
            word_id_list = []
            word_tf_idf_list = []
        for j in range(len(word_id_list)):
            id = int(word_id_list[j])       
            tf_idf = float(word_tf_idf_list[j])
            try:
                dictionary[id] = dictionary[id] + tf_idf
            except KeyError:
                dictionary[id] = tf_idf
    return dictionary

def get_empty_dict_claim():
    empty_dict = {}
    empty_dict['nr_words_title'] = None
    empty_dict['nr_words_per_pos'] = None
    empty_dict['tokenize'] = {}
    empty_dict['tokenize']['matches_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize']['tf_idf'] = get_empty_tag_dict()
    empty_dict['tokenize']['raw_count_idf'] = get_empty_tag_dict()
    empty_dict['tokenize']['idf'] = get_empty_tag_dict()
#     empty_dict['tokenize']['raw_count_sum_per_pos'] = get_empty_tag_dict()
#     empty_dict['tokenize']['max_sum_idf'] = 0
    empty_dict['tokenize_lemma'] = {}
    empty_dict['tokenize_lemma']['matches_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['tf_idf_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['raw_count_idf_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['idf_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['raw_count_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['max_sum_idf'] = 0
    return empty_dict
       
def get_empty_tag_dict():
    tag_2_id_dict = get_tag_2_id_dict()
    empty_tag_dict = {}
    for pos_id in range(len(tag_2_id_dict)):
        empty_tag_dict[pos_id] = 0
    return empty_tag_dict

def get_tag_2_id_dict():
    tag_2_id_dict = {}
    tag_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']

    for i in range(len(tag_list)):
        tag = tag_list[i]
        tag_2_id_dict[tag] = i
    return tag_2_id_dict

def get_tf_idf_name(experiment_nr):
    if experiment_nr in [31,34, 37]:
        return 'tf_idf'
    elif experiment_nr in [32,35, 38]:
        return 'raw_count_idf'
    elif experiment_nr in [33,36]:
        return 'idf'
    else:
        raise ValueError('experiment_nr not in selection', experiment_nr)

In [153]:
tag_2_id_dict = get_tag_2_id_dict()


# === process claim === #
experiment_nr = 37

with HiddenPrints():
    vocab, tf_idf_db = get_vocab_tf_idf_from_exp(experiment_nr)

doc = vocab.wiki_database.nlp(claim.claim)
claim_doc_tokenizer = ClaimDocTokenizer(doc, vocab.delimiter_words)
n_grams, nr_words = claim_doc_tokenizer.get_n_grams(vocab.method_tokenization, tf_idf_db.vocab.n_gram)

claim_dict = {}

claim_dict['claim'] = {}
claim_dict['claim']['nr_words'] = sum(n_grams.values())
claim_dict['claim']['nr_words_per_pos'] = get_empty_tag_dict()

tag_list = []
for key, count in n_grams.items():
    tag, word = get_tag_word_from_wordtag(key, vocab.delimiter_tag_word)
    tag_list.append(tag)
    pos_id = tag_2_id_dict[tag]
    claim_dict['claim']['nr_words_per_pos'][pos_id] += count

# === process titles === #
experiment_nr = 31

with HiddenPrints():
    vocab, tf_idf_db = get_vocab_tf_idf_from_exp(experiment_nr)

doc = vocab.wiki_database.nlp(claim.claim)
claim_doc_tokenizer = ClaimDocTokenizer(doc, vocab.delimiter_words)
n_grams, nr_words = claim_doc_tokenizer.get_n_grams(vocab.method_tokenization, tf_idf_db.vocab.n_gram)

path_title_ids = tf_idf_db.path_ids_dict
path_title_tf_idf = tf_idf_db.path_tf_idf_dict
mydict_ids = SqliteDict(path_title_ids)
mydict_tf_idf = SqliteDict(path_title_tf_idf)

idx = 0
for key, count in n_grams.items():
    tag = tag_list[idx]

    dictionary = get_dict_from_n_gram([key], mydict_ids, mydict_tf_idf, tf_idf_db)

    tf_idf_name = get_tf_idf_name(experiment_nr)
    tag_nr = tag_2_id_dict[tag]

    for id, tf_idf_value in dictionary.items():
        if id in claim_dict['title']['ids'].keys():
            claim_dict['title']['ids'][id][vocab.method_tokenization[0]][tf_idf_name][tag_nr] += tf_idf_value
        else:
            claim_dict['title']['ids'][id] = get_empty_dict_claim()
            claim_dict['title']['ids'][id][vocab.method_tokenization[0]][tf_idf_name][tag_nr] += tf_idf_value

    idx += 1

KeyError: 'title'

In [2]:
path_wiki_pages = os.path.join(config.ROOT, config.DATA_DIR, config.WIKI_PAGES_DIR, 'wiki-pages')
path_wiki_database_dir = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)

wiki_database = WikiDatabaseSqlite(path_wiki_database_dir, path_wiki_pages)


Load existing settings file
Load title dictionary


In [3]:
experiment_nr = 37
file_name = 'experiment_%.2d.json'%(experiment_nr)
path_experiment = os.path.join(config.ROOT, config.CONFIG_DIR, file_name)

with open(path_experiment) as json_data_file:
    data = json.load(json_data_file)

# === run === #
vocab = VocabularySqlite(wiki_database = wiki_database, n_gram = data['n_gram'],
    method_tokenization = data['method_tokenization'], tags_in_db_flag = data['tags_in_db_flag'], 
    source = data['vocabulary_source'], tag_list_selected = data['tag_list_selected'])

tf_idf_db = TFIDFDatabaseSqlite(vocabulary = vocab, method_tf = data['method_tf'], method_df = data['method_df'],
    delimiter = data['delimiter'], threshold = data['threshold'], source = data['tf_idf_source'])


/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_title_1_lp
Load existing settings file
Document count dictionary already exists
Directory  /home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_title_1_lp/thr_0.01_  already exists
Directory  /home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_title_1_lp/thr_0.01_/ex_term_frequency_inverse_document_frequency_title  already exists
load total TF-IDF dictionary
/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_title_1_lp/thr_0.01_/ex_term_frequency_inverse_document_frequency_title/title_total_tf_idf.json
empty database already exists
database already filled


In [4]:
claim_data_set_names = ['dev']
claim_data_set = claim_data_set_names[0]
path_dev_set = os.path.join(config.ROOT, config.DATA_DIR, config.RAW_DATA_DIR, claim_data_set + ".jsonl")
results = load_jsonl(path_dev_set)

In [5]:
from utils_doc_results import Claim, ClaimDocTokenizer

results
i = 0
claim = Claim(results[i])


In [7]:
from utils_doc_results import get_tag_word_from_wordtag
for key, count in n_grams.items():
    tag, word = get_tag_word_from_wordtag(key, vocab.delimiter_tag_word)
    print(tag, word)


PROPN Colin
PROPN Kaepernick
VERB become
DET a
VERB start
NOUN quarterback
ADP during
DET the
NOUN 49er
NUM 63rd
NOUN season
ADP in
PROPN National
PROPN Football
PROPN League
PUNCT .


In [None]:
def count_pos_tags_claim(n_grams):

In [69]:
tag_2_id_dict = get_tag_2_id_dict()

experiment_nr = 37

with HiddenPrints():
    vocab, tf_idf_db = get_vocab_tf_idf_from_exp(experiment_nr)

doc = vocab.wiki_database.nlp(claim.claim)
claim_doc_tokenizer = ClaimDocTokenizer(doc, vocab.delimiter_words)
n_grams, nr_words = claim_doc_tokenizer.get_n_grams(vocab.method_tokenization, tf_idf_db.vocab.n_gram)

claim_dict = {}

# claim features
claim_dict['claim'] = {}
claim_dict['claim']['nr_words'] = sum(n_grams.values())
claim_dict['claim']['nr_words_per_pos'] = get_empty_tag_dict()

for key, count in n_grams.items():
    tag, word = get_tag_word_from_wordtag(key, vocab.delimiter_tag_word)
    pos_id = tag_2_id_dict[tag]
    claim_dict['claim']['nr_words_per_pos'][pos_id] += count
    



In [143]:
# title features
claim_dict['title'] = {}
claim_dict['title']['ids'] = {}
# get_empty_dict_claim()
# tf idf per tag
experiment_nr = 31
vocab, tf_idf_db = get_vocab_tf_idf_from_exp(experiment_nr)

doc = vocab.wiki_database.nlp(claim.claim)
claim_doc_tokenizer = ClaimDocTokenizer(doc, vocab.delimiter_words)
n_grams, nr_words = claim_doc_tokenizer.get_n_grams(vocab.method_tokenization, tf_idf_db.vocab.n_gram)

# === initialise databases === #
path_title_ids = tf_idf_db.path_ids_dict
path_title_tf_idf = tf_idf_db.path_tf_idf_dict
mydict_ids = SqliteDict(path_title_ids)
mydict_tf_idf = SqliteDict(path_title_tf_idf)

dictionary = get_dict_from_n_gram(mydict_ids, mydict_tf_idf, tf_idf_db)

tf_idf_name = get_tf_idf_name(experiment_nr)
tag_nr = get_tag_2_id_dict()

for id, tf_idf_value in dictionary.items():
    if id in claim_dict['title']['ids'].keys():
        claim_dict['title']['ids'][id][vocab.method_tokenization[0]][tf_idf_name][] += tf_idf_value
    else:
        claim_dict['title']['ids'][id] = get_empty_dict_claim()
        claim_dict['title']['ids'][id][vocab.method_tokenization[0]][tf_idf_name][] += tf_idf_value



SyntaxError: invalid syntax (<ipython-input-143-b597943da182>, line 22)

In [148]:
def get_tf_idf_name(experiment_nr):
    if experiment_nr in [31,34, 37]:
        return 'tf_idf'
    elif experiment_nr in [32,35, 38]:
        return 'raw_count_idf'
    elif experiment_nr in [33,36]:
        return 'idf'
    else:
        raise ValueError('experiment_nr not in selection', experiment_nr)

In [137]:
def get_dict_from_n_gram(mydict_ids, mydict_tf_idf, tf_idf_db):
    
    dictionary = {}

    for word in n_grams:
        try:
            word_id_list = mydict_ids[word].split(tf_idf_db.delimiter)[1:]
            word_tf_idf_list = mydict_tf_idf[word].split(tf_idf_db.delimiter)[1:]
        except KeyError:
            print('KeyError', word)
            word_id_list = []
            word_tf_idf_list = []
        for j in range(len(word_id_list)):
            id = int(word_id_list[j])       
            tf_idf = float(word_tf_idf_list[j])
            try:
                dictionary[id] = dictionary[id] + tf_idf
            except KeyError:
                dictionary[id] = tf_idf
    return dictionary

def get_empty_dict_claim():
    empty_dict = {}
    empty_dict['nr_words_title'] = None
    empty_dict['nr_words_per_pos'] = None
    empty_dict['tokenize'] = {}
    empty_dict['tokenize']['matches_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize']['tf_idf'] = get_empty_tag_dict()
    empty_dict['tokenize']['raw_count_idf'] = get_empty_tag_dict()
    empty_dict['tokenize']['idf'] = get_empty_tag_dict()
#     empty_dict['tokenize']['raw_count_sum_per_pos'] = get_empty_tag_dict()
#     empty_dict['tokenize']['max_sum_idf'] = 0
    empty_dict['tokenize_lemma'] = {}
    empty_dict['tokenize_lemma']['matches_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['tf_idf_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['raw_count_idf_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['idf_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['raw_count_sum_per_pos'] = get_empty_tag_dict()
    empty_dict['tokenize_lemma']['max_sum_idf'] = 0
    return empty_dict
       
def get_empty_tag_dict():
    tag_2_id_dict = get_tag_2_id_dict()
    empty_tag_dict = {}
    for pos_id in range(len(tag_2_id_dict)):
        empty_tag_dict[pos_id] = 0
    return empty_tag_dict

def get_tag_2_id_dict():
    tag_2_id_dict = {}
    tag_list = ['ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE']

    for i in range(len(tag_list)):
        tag = tag_list[i]
        tag_2_id_dict[tag] = i
    return tag_2_id_dict

In [29]:
def get_vocab_tf_idf_from_exp(experiment_nr):
    file_name = 'experiment_%.2d.json'%(experiment_nr)
    path_experiment = os.path.join(config.ROOT, config.CONFIG_DIR, file_name)

    with open(path_experiment) as json_data_file:
        data = json.load(json_data_file)

    vocab = VocabularySqlite(wiki_database = wiki_database, n_gram = data['n_gram'],
        method_tokenization = data['method_tokenization'], tags_in_db_flag = data['tags_in_db_flag'], 
        source = data['vocabulary_source'], tag_list_selected = data['tag_list_selected'])

    tf_idf_db = TFIDFDatabaseSqlite(vocabulary = vocab, method_tf = data['method_tf'], method_df = data['method_df'],
        delimiter = data['delimiter'], threshold = data['threshold'], source = data['tf_idf_source'])
    return vocab, tf_idf_db

In [37]:
import sys, os

# Disable
def blockPrint():
    sys.stdout = open(os.devnull, 'w')

# Restore
def enablePrint():
    sys.stdout = sys.__stdout__


# print 'This will print'

# blockPrint()
# print "This won't"

# enablePrint()
# print "This will too"

In [39]:
import os, sys

class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, 'w')

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

Collecting spacy
  Using cached https://files.pythonhosted.org/packages/a1/5b/0fab3fa533229436533fb504bb62f4cf7ea29541a487a9d1a0749876fc23/spacy-2.1.4-cp36-cp36m-manylinux1_x86_64.whl
Requirement already up-to-date: requests<3.0.0,>=2.13.0 in /home/bmelman/C_disk/03_environment/03_fever/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: murmurhash<1.1.0,>=0.28.0 in /home/bmelman/C_disk/03_environment/03_fever/lib/python3.6/site-packages (from spacy)
Collecting wasabi<1.1.0,>=0.2.0 (from spacy)
  Using cached https://files.pythonhosted.org/packages/f4/c1/d76ccdd12c716be79162d934fe7de4ac8a318b9302864716dde940641a79/wasabi-0.2.2-py3-none-any.whl
Collecting blis<0.3.0,>=0.2.2 (from spacy)
  Using cached https://files.pythonhosted.org/packages/34/46/b1d0bb71d308e820ed30316c5f0a017cb5ef5f4324bcbc7da3cf9d3b075c/blis-0.2.4-cp36-cp36m-manylinux1_x86_64.whl
Requirement already up-to-date: jsonschema<3.1.0,>=2.6.0 in /home/bmelman/C_disk/03_environment/03_fever/lib/python3.6

# Experiment