In [1]:
import os, sys
from tqdm import tqdm
from sqlitedict import SqliteDict

from wiki_database import WikiDatabaseSqlite, Text
from utils_doc_results_db import get_empty_tag_dict, get_tag_2_id_dict, get_tag_dict, get_tf_idf_from_exp, get_tf_idf_name, get_vocab_tf_idf_from_exp
from utils_doc_results_db import get_dict_from_n_gram
from utils_db import dict_load_json, dict_save_json, HiddenPrints, load_jsonl
from utils_doc_results import Claim, ClaimDocTokenizer, get_tag_word_from_wordtag, ClaimDatabase

import config

In [2]:
# === constants === #

# === variables === #
n_gram = 1
claim_data_set = 'dev'
folder_name_score_combination = 'score_combination'

# === process === #
path_dir_results = os.path.join(config.ROOT, config.RESULTS_DIR, folder_name_score_combination)
path_tags = os.path.join(path_dir_results, 'tags_' + claim_data_set + '_n_gram_' + str(n_gram) + '.json')

path_wiki_pages = os.path.join(config.ROOT, config.DATA_DIR, config.WIKI_PAGES_DIR, 'wiki-pages')
path_wiki_database_dir = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)

path_claim_data_set = os.path.join(config.ROOT, config.DATA_DIR, config.RAW_DATA_DIR, claim_data_set + ".jsonl")
path_dir_claim_database = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)
path_raw_data = os.path.join(config.ROOT, config.DATA_DIR, config.RAW_DATA_DIR)

path_dir_claims = os.path.join(path_dir_results, claim_data_set)

try:
    os.makedirs(path_dir_results, exist_ok=True)
except FileExistsError:
    print('folder already exists:', path_dir_results)

try:
    os.makedirs(path_dir_claims, exist_ok=True)
except FileExistsError:
    print('folder already exists:', path_dir_claims)

claim_database = ClaimDatabase(path_dir_database = path_dir_claim_database, path_raw_data = path_raw_data, claim_data_set = claim_data_set)


claim database
- claim database already exists


In [12]:
id = 4
claim = claim_database.get_claim_from_id(id)
document_name_list = claim

In [73]:
class ClaimFile:
    """A sample Employee class"""
    def __init__(self, id, path_dir_files):
        self.path_claim = os.path.join(path_dir_files, str(id) + '.json')
        if os.path.isfile(self.path_claim):
            self.claim_dict = dict_load_json(self.path_claim)
        else:
            self.claim_dict = {}
            self.claim_dict['claim'] = {}
            self.claim_dict['claim']['1_gram'] = {}
            self.claim_dict['claim']['1_gram']['nr_words'] = None
            self.claim_dict['claim']['1_gram']['nr_words_per_pos'] = get_empty_tag_dict()
            self.claim_dict['title'] = {}
            self.claim_dict['title']['1_gram'] = {}
#             self.claim_dict['title']['1_gram']['nr_words'] = None
#             self.claim_dict['title']['1_gram']['nr_words_per_pos'] = get_empty_tag_dict()
#             self.claim_dict['title']['ids'] = {}
            self.save_claim()
    
    def process_claims_selected(self, claim_dictionary):
        # add ids to selected dictionary which are the proof
        claim = Claim(claim_dictionary)
        if 'ids_selected' not in self.claim_dict:
            interpreter_list = claim.evidence
            id_list = []
            for interpreter in interpreter_list:
                for proof in interpreter:
                    title = proof[2]
                    if title is not None:
                        id = wiki_database.get_id_from_title(title)
                        id_list.append(id)
            self.claim_dict['ids_selected'] = id_list
        
        # === add from selected_ids in claim_dictionary === #    
        if 'docs_selected' in claim_dictionary:
            self.claim_dict['ids_selected'] += claim['docs_selected']
            
        # === save === #
        self.save_claim()
        
    def process_claim(self, claim):
        self.claim_dict['claim']['text'] = claim
        self.save_claim()

    def process_tags(self, tag_list, n_gram):
#         self.claim_dict['claim'][str(n_gram) +'_gram'] = {}
        if n_gram == 1:
            self.claim_dict['claim'][str(n_gram) +'_gram']['tag_list'] = tag_list
        else:
            raise ValueError('written for n_gram == 1')
        self.save_claim()
    
    def process_tf_idf_experiment(self, tag_2_id_dict, tf_idf_db, mydict_ids, mydict_tf_idf):
        tf_idf_name = get_tf_idf_name(experiment_nr)
        if tf_idf_db.n_gram == 1:
            doc = tf_idf_db.vocab.wiki_database.nlp(self.claim_dict['claim']['text'])
            
            tag_list = [word.pos_ for word in doc]        
            # === write tf-idf values === #
            claim_text = Text(doc)
            tokenized_claim_list = claim_text.process(tf_idf_db.vocab.method_tokenization)
#             idx = 0
            for i in range(len(tag_list)):
                tag = tag_list[i]
                word = tokenized_claim_list[i]
            
                pos_id = tag_2_id_dict[tag]
                
                with HiddenPrints():
                    dictionary = get_dict_from_n_gram([word], mydict_ids, mydict_tf_idf, tf_idf_db)
                    
                for id in self.claim_dict['ids_selected']:
                    # === create dictionary if does not exist === #
                    if str(id) not in self.claim_dict['title']['1_gram']:
                        self.claim_dict['title']['1_gram'][str(id)] = {}
                        title = tf_idf_db.vocab.wiki_database.get_title_from_id(id)

                        doc = tf_idf_db.vocab.wiki_database.nlp(title)
                        claim_doc_tokenizer = ClaimDocTokenizer(doc, tf_idf_db.vocab.delimiter_words)
                        n_grams_dict_title, nr_words_title = claim_doc_tokenizer.get_n_grams(tf_idf_db.vocab.method_tokenization, tf_idf_db.vocab.n_gram)

                        self.claim_dict['title']['1_gram'][str(id)]['nr_words'] = nr_words_title
                    
                    if tf_idf_db.vocab.method_tokenization[0] not in self.claim_dict['title']['1_gram'][str(id)].keys():
                        self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]] = {}
                        if tf_idf_name not in self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]].keys():
                            self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]][tf_idf_name] = get_empty_tag_dict()
                    
                    if id in dictionary:
                        tf_idf_value = dictionary[id]
                    else:
                        tf_idf_value = 0.0
                        
                    self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]][tf_idf_name][str(pos_id)] += tf_idf_value   
                    
#                 for id, tf_idf_value in dictionary.items():
#                     if id in self.claim_dict['ids_selected']:
#                         # === create dictionary if does not exist === #
#                         if str(id) not in self.claim_dict['title']['1_gram']:
#                             self.claim_dict['title']['1_gram'][str(id)] = {}
#                             title = tf_idf_db.vocab.wiki_database.get_title_from_id(id)

#                             doc = tf_idf_db.vocab.wiki_database.nlp(title)
#                             claim_doc_tokenizer = ClaimDocTokenizer(doc, tf_idf_db.vocab.delimiter_words)
#                             n_grams_dict_title, nr_words_title = claim_doc_tokenizer.get_n_grams(tf_idf_db.vocab.method_tokenization, tf_idf_db.vocab.n_gram)

#                             self.claim_dict['title']['1_gram'][str(id)]['nr_words'] = nr_words_title

#                         if tf_idf_db.vocab.method_tokenization[0] not in self.claim_dict['title']['1_gram'][str(id)].keys():
#                             self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]] = {}
#                             if tf_idf_name not in self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]].keys():
#                                 self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]][tf_idf_name] = get_empty_tag_dict()

#                         self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]][tf_idf_name][str(pos_id)] += tf_idf_value   
#                 idx += 1
            self.save_claim()
        else:
            raise ValueError('Adapt function for bigrams')
        
        self.save_claim()
    
    def process_nr_words_per_pos(self, tf_idf_db, tag_2_id_dict):
        if tf_idf_db.n_gram == 1:
            doc = tf_idf_db.vocab.wiki_database.nlp(self.claim_dict['claim']['text'])
            claim_doc_tokenizer = ClaimDocTokenizer(doc, tf_idf_db.vocab.delimiter_words)
            n_grams_dict, nr_words = claim_doc_tokenizer.get_n_grams(tf_idf_db.vocab.method_tokenization, tf_idf_db.vocab.n_gram)

            self.claim_dict['claim']['1_gram']['nr_words'] = sum(n_grams_dict.values())
            
            for key, count in n_grams_dict.items():
                tag, word = get_tag_word_from_wordtag(key, tf_idf_db.vocab.delimiter_tag_word)
                pos_id = tag_2_id_dict[tag]
                self.claim_dict['claim']['1_gram']['nr_words_per_pos'][str(pos_id)] += count
            self.save_claim()
        else:
            raise ValueError('Adapt function for bigrams')

    def save_claim(self):
        with HiddenPrints():
            dict_save_json(self.claim_dict, self.path_claim)

In [58]:
# === constants === #

# === variables === #
n_gram = 1
claim_data_set = 'dev'
folder_name_score_combination = 'score_combination'

# === process === #
path_dir_results = os.path.join(config.ROOT, config.RESULTS_DIR, folder_name_score_combination)
path_tags = os.path.join(path_dir_results, 'tags_' + claim_data_set + '_n_gram_' + str(n_gram) + '.json')

path_wiki_pages = os.path.join(config.ROOT, config.DATA_DIR, config.WIKI_PAGES_DIR, 'wiki-pages')
path_wiki_database_dir = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)

path_claim_data_set = os.path.join(config.ROOT, config.DATA_DIR, config.RAW_DATA_DIR, claim_data_set + ".jsonl")
path_dir_claim_database = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)
path_raw_data = os.path.join(config.ROOT, config.DATA_DIR, config.RAW_DATA_DIR)

path_dir_claims = os.path.join(path_dir_results, claim_data_set)

try:
    os.makedirs(path_dir_results, exist_ok=True)
except FileExistsError:
    print('folder already exists:', path_dir_results)

try:
    os.makedirs(path_dir_claims, exist_ok=True)
except FileExistsError:
    print('folder already exists:', path_dir_claims)

claim_database = ClaimDatabase(path_dir_database = path_dir_claim_database, path_raw_data = path_raw_data, claim_data_set = claim_data_set)

wiki_database = WikiDatabaseSqlite(path_wiki_database_dir, path_wiki_pages)
tag_2_id_dict = get_tag_2_id_dict()

tag_dict = get_tag_dict(claim_data_set, n_gram, path_tags, wiki_database)

nr_claims = 1000 # len(results)

print('claim database: insert claim\'s text and claim\'s tag_list')

for str_id, tag_list in tqdm(tag_dict.items(), total = len(tag_dict), desc = 'tag'):
    id = int(str_id)
    if id < nr_claims:
        file = ClaimFile(id = id, path_dir_files = path_dir_claims)
        file.process_tags(tag_list, n_gram)
        claim_dict = claim_database.get_claim_from_id(id)
        claim = Claim(claim_dict)
        file.process_claim(claim.claim)
        file.process_claims_selected(claim_dict)

print('claim database: insert nr words per tag for claim')

experiment_nr = 37
with HiddenPrints():
    tf_idf_db = get_tf_idf_from_exp(experiment_nr, wiki_database)

for id in tqdm(range(nr_claims), desc = 'nr words per pos'):
    file = ClaimFile(id = id, path_dir_files = path_dir_claims)
    file.process_nr_words_per_pos(tf_idf_db, tag_2_id_dict)

print('claim database: insert selected ids')


claim database
- claim database already exists
wiki_database
- Load existing settings file
- Load title dictionary






tag:   0%|          | 0/19998 [00:00<?, ?it/s][A[A[A[A



tag:   0%|          | 41/19998 [00:00<00:49, 406.17it/s][A[A[A[A

tags file already exists
claim database: insert claim's text and claim's tag_list






tag:   0%|          | 77/19998 [00:00<00:51, 387.97it/s][A[A[A[A



tag:   1%|          | 114/19998 [00:00<00:52, 382.20it/s][A[A[A[A



tag:   1%|          | 150/19998 [00:00<00:53, 373.49it/s][A[A[A[A



tag:   1%|          | 187/19998 [00:00<00:53, 369.72it/s][A[A[A[A



tag:   1%|          | 223/19998 [00:00<00:53, 366.28it/s][A[A[A[A



tag:   1%|▏         | 261/19998 [00:00<00:53, 367.82it/s][A[A[A[A



tag:   1%|▏         | 298/19998 [00:00<00:53, 365.67it/s][A[A[A[A



tag:   2%|▏         | 336/19998 [00:00<00:53, 368.02it/s][A[A[A[A



tag:   2%|▏         | 372/19998 [00:01<00:54, 360.21it/s][A[A[A[A



tag:   2%|▏         | 409/19998 [00:01<00:54, 360.50it/s][A[A[A[A



tag:   2%|▏         | 446/19998 [00:01<00:54, 359.94it/s][A[A[A[A



tag:   2%|▏         | 483/19998 [00:01<00:53, 361.85it/s][A[A[A[A



tag:   3%|▎         | 519/19998 [00:01<00:54, 357.10it/s][A[A[A[A



tag:   3%|▎         | 555/19998 [00:01<00:54, 35

claim database: insert nr words per tag for claim






nr words per pos:   0%|          | 0/1000 [00:00<?, ?it/s][A[A[A[A



nr words per pos:   1%|▏         | 13/1000 [00:00<00:08, 119.33it/s][A[A[A[A



nr words per pos:   2%|▎         | 25/1000 [00:00<00:08, 119.45it/s][A[A[A[A



nr words per pos:   4%|▎         | 37/1000 [00:00<00:08, 118.56it/s][A[A[A[A



nr words per pos:   5%|▍         | 49/1000 [00:00<00:08, 117.77it/s][A[A[A[A



nr words per pos:   6%|▌         | 60/1000 [00:00<00:08, 114.93it/s][A[A[A[A



nr words per pos:   7%|▋         | 71/1000 [00:00<00:08, 113.21it/s][A[A[A[A



nr words per pos:   8%|▊         | 84/1000 [00:00<00:07, 117.67it/s][A[A[A[A



nr words per pos:  10%|█         | 101/1000 [00:00<00:06, 129.39it/s][A[A[A[A



nr words per pos:  12%|█▏        | 117/1000 [00:00<00:06, 136.04it/s][A[A[A[A



nr words per pos:  14%|█▎        | 136/1000 [00:01<00:05, 147.20it/s][A[A[A[A



nr words per pos:  15%|█▌        | 151/1000 [00:01<00:06, 140.54it/s][A[A[A

claim database: insert selected ids


In [96]:
from sqlitedict import SqliteDict
from utils_doc_results_db import get_tf_idf_from_exp
from utils_db import HiddenPrints
from tqdm import tqdm

# === run experiment === #
experiment_list = [31, 37]

nr_claims_selected = 100 #nr_claims

for experiment_nr in experiment_list:
    print('experiment:', experient_nr)
    print('load tf_idf nr_words_pos')
    with HiddenPrints():
        tf_idf_db = get_tf_idf_from_exp(experiment_nr, wiki_database)

    mydict_ids = SqliteDict(tf_idf_db.path_ids_dict)
    mydict_tf_idf = SqliteDict(tf_idf_db.path_tf_idf_dict)

    for id in tqdm(range(nr_claims_selected), desc = 'nr words per pos'):
        file = ClaimFile(id = id, path_dir_files = path_dir_claims)
        file.process_tf_idf_experiment(tag_2_id_dict, tf_idf_db, mydict_ids, mydict_tf_idf)

experiment: [31, 37]
load tf_idf nr_words_pos






nr words per pos:   0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



nr words per pos:   1%|          | 1/100 [00:00<00:23,  4.30it/s][A[A[A[A



nr words per pos:   4%|▍         | 4/100 [00:00<00:17,  5.57it/s][A[A[A[A



nr words per pos:   6%|▌         | 6/100 [00:00<00:14,  6.53it/s][A[A[A[A



nr words per pos:   9%|▉         | 9/100 [00:00<00:11,  8.27it/s][A[A[A[A



nr words per pos:  11%|█         | 11/100 [00:00<00:08,  9.97it/s][A[A[A[A



nr words per pos:  13%|█▎        | 13/100 [00:00<00:07, 11.43it/s][A[A[A[A



nr words per pos:  16%|█▌        | 16/100 [00:01<00:06, 12.96it/s][A[A[A[A



nr words per pos:  19%|█▉        | 19/100 [00:01<00:05, 14.77it/s][A[A[A[A



nr words per pos:  22%|██▏       | 22/100 [00:01<00:04, 16.14it/s][A[A[A[A



nr words per pos:  25%|██▌       | 25/100 [00:01<00:04, 18.14it/s][A[A[A[A



nr words per pos:  28%|██▊       | 28/100 [00:01<00:03, 20.50it/s][A[A[A[A



nr words per pos:  31%|██

experiment: [31, 37]
load tf_idf nr_words_pos






nr words per pos:   0%|          | 0/100 [00:00<?, ?it/s][A[A[A[A



nr words per pos:   1%|          | 1/100 [00:00<00:19,  5.00it/s][A[A[A[A



nr words per pos:   4%|▍         | 4/100 [00:00<00:14,  6.41it/s][A[A[A[A



nr words per pos:   6%|▌         | 6/100 [00:00<00:12,  7.55it/s][A[A[A[A



nr words per pos:   9%|▉         | 9/100 [00:00<00:09,  9.39it/s][A[A[A[A



nr words per pos:  11%|█         | 11/100 [00:00<00:08, 10.92it/s][A[A[A[A



nr words per pos:  13%|█▎        | 13/100 [00:00<00:07, 12.32it/s][A[A[A[A



nr words per pos:  16%|█▌        | 16/100 [00:01<00:05, 14.22it/s][A[A[A[A



nr words per pos:  19%|█▉        | 19/100 [00:01<00:05, 16.10it/s][A[A[A[A



nr words per pos:  22%|██▏       | 22/100 [00:01<00:04, 16.68it/s][A[A[A[A



nr words per pos:  25%|██▌       | 25/100 [00:01<00:04, 18.17it/s][A[A[A[A



nr words per pos:  28%|██▊       | 28/100 [00:01<00:03, 20.14it/s][A[A[A[A



nr words per pos:  31%|██

In [233]:
import collections

def get_list_properties(dict, temp_key_list, list_keys_list, list_values):
    for key, value in dict.items():
        if isinstance(value, collections.Mapping):
            list_keys_list, list_values = get_list_properties(dict[key], temp_key_list + [key], list_keys_list, list_values)
        else:
            if type(value) is not str and type(value) is not list:
                list_values.append(value)
                list_keys_list.append(temp_key_list + [key])
    return list_keys_list, list_values

def get_value_if_exists(dict, list_keys):
    tmp = dict
    for key in list_keys:
        try:
            tmp = tmp[key]
        except KeyError:
            return 0
    if isinstance(tmp, collections.Mapping):
        raise ValueError('Get a Dictionary whereas we expect a value')
    value = tmp
    return value

In [263]:
import numpy as np
id = 5

file = ClaimFile(id = id, path_dir_files = path_dir_claims)

id_list = list(file.claim_dict['title']['1_gram'].keys())

observation_key_list_claim, _ = get_list_properties(file.claim_dict['claim']['1_gram'], [], [], [])
observation_key_list_title, _ = get_list_properties(file.claim_dict['title']['1_gram'][id_list[0]], [], [], [])

nr_claims = 10
nr_variables = len(observation_key_list_claim) + len(observation_key_list_title)

data_matrix = np.zeros((nr_claims, nr_variables))

for id in range(nr_claims):
    _, values_claim = get_list_properties(file.claim_dict['claim']['1_gram'], [], [], [])
    _, values_title = get_list_properties(file.claim_dict['title']['1_gram'][id_list[0]], [], [], [])
    data_matrix[id, :] = values_claim + values_title
    



In [274]:
import numpy as np
data_matrix = np.array([1,2,3,4,5])
data_matrix

array([1, 2, 3, 4, 5])

In [249]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        # 1 input image channel, 6 output channels, 3x3 square convolution
        # kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        self.conv2 = nn.Conv2d(6, 16, 3)
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the size is a square you can only specify a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        size = x.size()[1:]  # all dimensions except the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features


net = Net()
print(net)


Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


In [252]:
params = list(net.parameters())
print(len(params))
print(params[0].size()) 

10
torch.Size([6, 1, 3, 3])


In [253]:
input = torch.randn(1, 1, 32, 32)
out = net(input)
print(out)

tensor([[-0.0270,  0.0295, -0.0137,  0.1104, -0.1436, -0.0219, -0.0160, -0.0985,
          0.0642, -0.0511]], grad_fn=<ThAddmmBackward>)


In [254]:
net.zero_grad()
out.backward(torch.randn(1, 10))

In [255]:
output = net(input)
target = torch.randn(10)  # a dummy target, for example
target = target.view(1, -1)  # make it the same shape as output
criterion = nn.MSELoss()

loss = criterion(output, target)
print(loss)

tensor(0.4623, grad_fn=<MseLossBackward>)


In [256]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.01)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)
loss = criterion(output, target)
loss.backward()
optimizer.step()    # Does the update

In [259]:
import torch.nn as nn
import torch.nn.functional as F

class TwoLayerNet(nn.Module):
    def __init__(self, D_in, H, D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.

        D_in: input dimension
        H: dimension of hidden layer
        D_out: output dimension
        """
        super(TwoLayerNet, self).__init__()
        self.linear1 = nn.Linear(D_in, H) 
        self.linear2 = nn.Linear(H, D_out)

    def forward(self, x):
        """
        In the forward function we accept a Variable of input data and we must 
        return a Variable of output data. We can use Modules defined in the 
        constructor as well as arbitrary operators on Variables.
        """
        h_relu = F.relu(self.linear1(x))
        y_pred = self.linear2(h_relu)
        return y_pred


In [262]:
from torch.autograd import Variable
# N is batch size; D_in is input dimension;
# H is the dimension of the hidden layer; D_out is output dimension.
N, D_in, H, D_out = 32, 100, 50, 10

# Create random Tensors to hold inputs and outputs, and wrap them in Variables
x = Variable(torch.randn(N, D_in))  # dim: 32 x 100

# Construct our model by instantiating the class defined above
model = TwoLayerNet(D_in, H, D_out)

# Forward pass: Compute predicted y by passing x to the model
y_pred = model(x)   # dim: 32 x 10

In [None]:
import torch
from torch.utils import data

class Dataset(data.Dataset):
    'Characterizes a dataset for PyTorch'
    def __init__(self, list_IDs, labels):
        'Initialization'
        self.labels = labels
        self.list_IDs = list_IDs

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.list_IDs)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        ID = self.list_IDs[index]

        # Load data and get label
        X = torch.load('data/' + ID + '.pt')
        y = self.labels[ID]

        return X, y

In [None]:
import torch
from torch.utils import data

from my_classes import Dataset


# CUDA for PyTorch
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
cudnn.benchmark = True

# Parameters
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
max_epochs = 100

# Datasets
partition = # IDs
labels = # Labels

# Generators
training_set = Dataset(partition['train'], labels)
training_generator = data.DataLoader(training_set, **params)

validation_set = Dataset(partition['validation'], labels)
validation_generator = data.DataLoader(validation_set, **params)

# Loop over epochs
for epoch in range(max_epochs):
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)

        # Model computations
        [...]

    # Validation
    with torch.set_grad_enabled(False):
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.to(device), local_labels.to(device)

            # Model computations
            [...]

In [57]:
from utils_db import load_jsonl

path_results = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_title_1_lp/thr_0.01_/ex_term_frequency_inverse_document_frequency_title/01_results/predicted_labels_20.json'
dict_results = load_jsonl(path_results)
claim_nr = 9
interpreter_list = dict_results[claim_nr]['evidence']
print(interpreter_list)

id_list = []
for interpreter in interpreter_list:
    for proof in interpreter:
        title = proof[2]
        if title is not None:
            id = wiki_database.get_id_from_title(title)
            id_list.append(id)


[[[127089, 141573, 'Andrew_Kevin_Walker', 0]]]


In [97]:
from utils_db import load_jsonl

path_results = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_04_results/vocab_title_1_lp/thr_0.01_/ex_term_frequency_inverse_document_frequency_title/01_results/predicted_labels_20.json'
dict_results = load_jsonl(path_results)
claim_nr = 9
interpreter_list = dict_results[claim_nr]
interpreter_list

{'id': 108281,
 'verifiable': 'VERIFIABLE',
 'label': 'REFUTES',
 'claim': 'Andrew Kevin Walker is only Chinese.',
 'evidence': [[[127089, 141573, 'Andrew_Kevin_Walker', 0]]],
 'docs_selected': [3680518,
  3680749,
  3682302,
  4041240,
  4323813,
  4548726,
  4566445,
  4617659,
  4724012,
  2611161,
  2703062,
  531639,
  480771,
  484565,
  465538,
  2666840,
  2654995,
  5182741,
  1001990,
  3614807],
 'results': {'e_score': 1.0, 'f_score': 1, 'f_score_labelled': 1}}

In [100]:
from utils_db import dict_save_json, dict_load_json
path_dict = 'tmp.json'
dict = {}
dict['Simón_Bolívar'] = 1
dict_save_json(dict, path_dict)
dict['Simón_Bolívar']

overwriting file: tmp.json


1

In [78]:
with SqliteDict('tmp.sqlite') as dict_tmp:
    dict_tmp['Simón_Bolívar'] = 1
    dict_tmp.commit()
with SqliteDict('tmp.sqlite') as dict_tmp:
    print(dict_tmp['Simón_Bolívar'])

In [30]:
import unicodedata
unicodedata.normalize('NFD', 'Simón_Bolívar')

'Simón_Bolívar'

In [None]:
list_titles = list(wiki_database.title_2_id_db.keys())

In [17]:
from tqdm import tqdm
id_list = []
for key, value in tqdm(wiki_database.title_2_id_db.items()):
    if 'n_Bol' in key:
        id_list.append(key)

5416536it [01:13, 74033.34it/s]


In [32]:
start = 460
unicodedata.normalize('NFD', id_list[460]) == unicodedata.normalize('NFD', 'Simón_Bolívar')


True

In [95]:
from utils_db import load_jsonl

path_text = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_01_data/_02_wikipedia_pages/wiki-pages/wiki-090.jsonl'

text_list = load_jsonl(path_text)
id = 2001
print(text_list[id])
print(text_list[id])

{'id': 'St._Francis_de_Sales_High_School_-LRB-Detroit,_Michigan-RRB-', 'text': 'St. Francis de Sales High School was a coeducational Catholic high school in Detroit , Michigan . The school closed in 1971 . Since 1994 , the school building has been home to Loyola High School . ', 'lines': '0\tSt. Francis de Sales High School was a coeducational Catholic high school in Detroit , Michigan .\tDetroit\tDetroit\tMichigan\tMichigan\tCatholic\tCatholic\n1\tThe school closed in 1971 .\n2\tSince 1994 , the school building has been home to Loyola High School .\tLoyola High School\tLoyola High School (Detroit)\n3\t'}
{'id': 'St._Francis_de_Sales_High_School_-LRB-Detroit,_Michigan-RRB-', 'text': 'St. Francis de Sales High School was a coeducational Catholic high school in Detroit , Michigan . The school closed in 1971 . Since 1994 , the school building has been home to Loyola High School . ', 'lines': '0\tSt. Francis de Sales High School was a coeducational Catholic high school in Detroit , Michiga

In [177]:
claim_db_simon = claim_db.get_claim_from_id(26)['evidence'][0][0][2]
# wiki_db_simon = unicodedata.normalize('NFD', normalise_text(id_list[460]))
# claim_db_simon == normalise_text(id_list[460])

In [183]:
with SqliteDict('tmp.sqlite') as dict_tmp:
    dict_tmp[normalise_text(id_list[460])] = normalise_text(id_list[460])
    dict_tmp.commit()
with SqliteDict('tmp.sqlite') as dict_tmp:
    print(dict_tmp[claim_db_simon])
    print(dict_tmp[claim_db_simon] == claim_db_simon)

Simón Bolívar
True


In [156]:
from tqdm import tqdm
id_list = []
for key, value in tqdm(wiki_database.title_2_id_db.items()):
    if 'n_Bol' in key:
        id_list.append(key)

5416536it [02:07, 42616.98it/s]


In [184]:
path_dict = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_01_data/_03_database/id_2_title.json'
dict = dict_load_json(path_dict)

In [186]:
dict['1']

'1986 NBA Finals'

In [149]:
import config 

claim_data_set = 'dev'
path_dir_database = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)
path_raw_data = os.path.join(config.ROOT, config.DATA_DIR, config.RAW_DATA_DIR)

claim_db = ClaimDatabase(path_dir_database = path_dir_database, path_raw_data = path_raw_data, claim_data_set = claim_data_set)

for i in tnrange(1000):
    claim_db.get_claim_from_id(i)

claim database already exists


HBox(children=(IntProgress(value=0, max=1000), HTML(value='')))




In [139]:
import os
import unicodedata
from utils_db import dict_load_json, dict_save_json, load_jsonl
from tqdm import tqdm

class ClaimDatabase:
    def __init__(self, path_dir_database, path_raw_data, claim_data_set):
        self.path_dir_database = path_dir_database
        self.path_raw_data = path_raw_data
        self.claim_data_set = claim_data_set
        
        self.path_dir_database_claims = os.path.join(self.path_dir_database, 'claims_' + str(self.claim_data_set))
        self.path_raw_claims = os.path.join(path_raw_data, str(self.claim_data_set) + '.jsonl')
        self.path_settings = os.path.join(path_dir_database_claims, 'settings.json')
        
        if not os.path.isdir(self.path_dir_database_claims):
            print('create claim database')
            os.makedirs(self.path_dir_database_claims)
            self.create_database()
        else:
            print('claim database already exists')
        
        if os.path.isfile(self.path_settings):
            self.settings = dict_load_json(self.path_settings)
            self.nr_claims = settings['nr_claims']
        else:
            raise ValueError('settings file should exist')
        
        
    def create_database(self):
        list_claim_dicts = load_jsonl(self.path_raw_claims)
        self.nr_claims = len(list_claim_dicts)
        for id in tqdm(range(self.nr_claims)):
            path_claim = os.path.join(self.path_dir_database_claims, str(id) + '.json')
            dict_claim_id = list_claim_dicts[id]
            dict_claim_id['verifiable'] = unicodedata.normalize('NFD', normalise_text(dict_claim_id['verifiable']))
            dict_claim_id['claim'] = unicodedata.normalize('NFD', normalise_text(dict_claim_id['claim']))
            for interpreter in range(len(dict_claim_id['evidence'])):
                for proof in range(len(dict_claim_id['evidence'][interpreter])):
                    if dict_claim_id['evidence'][interpreter][proof][2] != None:
                        dict_claim_id['evidence'][interpreter][proof][2] = unicodedata.normalize('NFD', normalise_text(dict_claim_id['evidence'][interpreter][proof][2]))
            
            dict_save_json(dict_claim_id, path_claim)
        
        if os.path.isfile(self.path_settings):
            settings = dict_load_json(self.path_settings)
            
        else:
            settings = {}
        
        settings['nr_claims'] = self.nr_claims
        dict_save_json(settings, self.path_settings)
    
    def get_claim_from_id(self, id):
        path_claim = os.path.join(self.path_dir_database_claims, str(id) + '.json')
        dict_claim_id = dict_load_json(path_claim)
        return dict_claim_id
    
            
            

In [18]:
from utils_db import dict_save_json
from doc_results_db_utils import get_tf_idf_name
from wiki_database import Text
from doc_results_db import get_dict_from_n_gram
from tqdm import tnrange, tqdm
from utils_doc_results import Claim, ClaimDocTokenizer

class ClaimFile:
    """A sample Employee class"""
    def __init__(self, id, path_dir_files):
        self.path_claim = os.path.join(path_dir_files, str(id) + '.json')
        if os.path.isfile(self.path_claim):
            self.claim_dict = dict_load_json(self.path_claim)
        else:
            self.claim_dict = {}
            self.claim_dict['claim'] = {}
            self.claim_dict['claim']['1_gram'] = {}
            self.claim_dict['claim']['1_gram']['nr_words'] = None
            self.claim_dict['claim']['1_gram']['nr_words_per_pos'] = get_empty_tag_dict()
            self.claim_dict['title'] = {}
            self.claim_dict['title']['1_gram'] = {}
#             self.claim_dict['title']['1_gram']['nr_words'] = None
#             self.claim_dict['title']['1_gram']['nr_words_per_pos'] = get_empty_tag_dict()
#             self.claim_dict['title']['ids'] = {}
            self.save_claim()
    
    def process_claims_selected(self, claim_dictionary):
        # add ids to selected dictionary which are the proof
        if 'ids_selected' not in self.claim_dict:
            interpreter_list = claim_dictionary.evidence
            id_list = []
            for interpreter in interpreter_list:
                for proof in interpreter:
                    title = proof[2]
                    if title is not None:
                        id = wiki_database.get_id_from_title(title)
                        id_list.append(id)
            self.claim_dict['ids_selected'] = id_list
        
        # === add from selected_ids in claim_dictionary === #    
        if 'docs_selected' in claim_dictionary:
            claim = Claim(claim_dictionary)
            self.claim_dict['ids_selected'] += claim['docs_selected']
            
        # === save === #
        self.save_claim()
        
    def process_claim(self, claim):
        self.claim_dict['claim']['text'] = claim
        self.save_claim()

    def process_tags(self, tag_list, n_gram):
#         self.claim_dict['claim'][str(n_gram) +'_gram'] = {}
        if n_gram == 1:
            self.claim_dict['claim'][str(n_gram) +'_gram']['tag_list'] = tag_list
        else:
            raise ValueError('written for n_gram == 1')
        self.save_claim()
    
    def process_tf_idf_experiment(self, tag_2_id_dict, tf_idf_db, mydict_ids, mydict_tf_idf):
        tf_idf_name = get_tf_idf_name(experiment_nr)
        if tf_idf_db.n_gram == 1:
            doc = tf_idf_db.vocab.wiki_database.nlp(self.claim_dict['claim']['text'])
            
            tag_list = [word.pos_ for word in doc]        
#             claim_doc_tokenizer = ClaimDocTokenizer(doc, tf_idf_db.vocab.delimiter_words)
#             n_grams_dict, nr_words = claim_doc_tokenizer.get_n_grams(tf_idf_db.vocab.method_tokenization, tf_idf_db.vocab.n_gram)
            # === write tf-idf values === #
            claim_text = Text(doc)
            tokenized_claim_list = claim_text.process(tf_idf_db.vocab.method_tokenization)
#             print(tag_list, tokenized_claim_list)
            idx = 0
            for i in range(len(tag_list)):
                tag = tag_list[i]
                word = tokenized_claim_list[i]
            
                pos_id = tag_2_id_dict[tag]
                
                with HiddenPrints():
                    dictionary = get_dict_from_n_gram([word], mydict_ids, mydict_tf_idf, tf_idf_db)
#                 print(len(dictionary))
                if len(dictionary) < 2000:
                    for id, tf_idf_value in dictionary.items():
                        # === create dictionary if does not exist === #
                        if str(id) not in self.claim_dict['title']['1_gram']:
                            self.claim_dict['title']['1_gram'][str(id)] = {}
                            title = tf_idf_db.vocab.wiki_database.get_title_from_id(id)

                            doc = tf_idf_db.vocab.wiki_database.nlp(title)
                            claim_doc_tokenizer = ClaimDocTokenizer(doc, tf_idf_db.vocab.delimiter_words)
                            n_grams_dict_title, nr_words_title = claim_doc_tokenizer.get_n_grams(tf_idf_db.vocab.method_tokenization, tf_idf_db.vocab.n_gram)

                            self.claim_dict['title']['1_gram'][str(id)]['nr_words'] = nr_words_title

                        if tf_idf_db.vocab.method_tokenization[0] not in self.claim_dict['title']['1_gram'][str(id)].keys():
                            self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]] = {}
                            if tf_idf_name not in self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]].keys():
                                self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]][tf_idf_name] = get_empty_tag_dict()

                        self.claim_dict['title']['1_gram'][str(id)][tf_idf_db.vocab.method_tokenization[0]][tf_idf_name][str(pos_id)] += tf_idf_value   
                idx += 1
            self.save_claim()
        else:
            raise ValueError('Adapt function for bigrams')
        
        self.save_claim()
    
    def process_nr_words_per_pos(self, tf_idf_db, tag_2_id_dict):
        if tf_idf_db.n_gram == 1:
            doc = tf_idf_db.vocab.wiki_database.nlp(self.claim_dict['claim']['text'])
            claim_doc_tokenizer = ClaimDocTokenizer(doc, tf_idf_db.vocab.delimiter_words)
            n_grams_dict, nr_words = claim_doc_tokenizer.get_n_grams(tf_idf_db.vocab.method_tokenization, tf_idf_db.vocab.n_gram)

            self.claim_dict['claim']['1_gram']['nr_words'] = sum(n_grams_dict.values())
            
            for key, count in n_grams_dict.items():
                tag, word = get_tag_word_from_wordtag(key, vocab.delimiter_tag_word)
                pos_id = tag_2_id_dict[tag]
                self.claim_dict['claim']['1_gram']['nr_words_per_pos'][str(pos_id)] += count
            self.save_claim()
        else:
            raise ValueError('Adapt function for bigrams')

    def save_claim(self):
        with HiddenPrints():
            dict_save_json(self.claim_dict, self.path_claim)

ModuleNotFoundError: No module named 'doc_results_db_utils'

Collecting spacy
  Using cached https://files.pythonhosted.org/packages/a1/5b/0fab3fa533229436533fb504bb62f4cf7ea29541a487a9d1a0749876fc23/spacy-2.1.4-cp36-cp36m-manylinux1_x86_64.whl
Requirement already up-to-date: requests<3.0.0,>=2.13.0 in /home/bmelman/C_disk/03_environment/03_fever/lib/python3.6/site-packages (from spacy)
Requirement already up-to-date: murmurhash<1.1.0,>=0.28.0 in /home/bmelman/C_disk/03_environment/03_fever/lib/python3.6/site-packages (from spacy)
Collecting wasabi<1.1.0,>=0.2.0 (from spacy)
  Using cached https://files.pythonhosted.org/packages/f4/c1/d76ccdd12c716be79162d934fe7de4ac8a318b9302864716dde940641a79/wasabi-0.2.2-py3-none-any.whl
Collecting blis<0.3.0,>=0.2.2 (from spacy)
  Using cached https://files.pythonhosted.org/packages/34/46/b1d0bb71d308e820ed30316c5f0a017cb5ef5f4324bcbc7da3cf9d3b075c/blis-0.2.4-cp36-cp36m-manylinux1_x86_64.whl
Requirement already up-to-date: jsonschema<3.1.0,>=2.6.0 in /home/bmelman/C_disk/03_environment/03_fever/lib/python3.6

# Experiment