In [11]:
from __future__ import print_function
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

import config


In [12]:
from random import shuffle
import torch
import os

from doc_results_db import ClaimTensorDatabase
from utils_db import mkdir_if_not_exist, dict_save_json, dict_load_json

import config

class DataNeuralNetwork():
    """Combine the per-class tensor folders of a ClaimTensorDatabase into one shuffled folder.

    The combined folder holds files ``variable_<k>.pt`` / ``label_<k>.pt`` where ``k`` is a
    randomly assigned id in ``range(nr_observations)``, plus a ``settings.json`` with the
    key ``nr_observations``.

    Args:
        method_database: ``'include_all'`` copies every observation of every class folder;
            ``'equal_class'`` copies a random sample of equal size (the size of the smallest
            class) from each class folder.
        setup: forwarded to ``ClaimTensorDatabase`` and used in the combined folder name.

    Raises:
        ValueError: if the combined data has to be built and ``method_database`` is not one
            of ``METHOD_OPTIONS``.
    """

    # Supported values for ``method_database``.
    METHOD_OPTIONS = ('include_all', 'equal_class')

    def __init__(self, method_database, setup):
        self.setup = setup
        self.method_database = method_database
        self.path_wiki_pages = os.path.join(config.ROOT, config.DATA_DIR, config.WIKI_PAGES_DIR, 'wiki-pages')
        self.path_wiki_database_dir = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)
        self.tensor_db = ClaimTensorDatabase(self.path_wiki_pages, self.path_wiki_database_dir, self.setup)
        self.path_combined_data_dir = os.path.join(self.tensor_db.path_results_dir, 'neural_network', 'data_setup_' + str(self.setup) + '_' + self.method_database)
        self.path_settings = os.path.join(self.path_combined_data_dir, 'settings.json')

        print('DataNeuralNetwork')
        # settings.json is written as the very last step of a build, so its presence (rather
        # than the mere existence of the folder) marks a completed build. A folder left behind
        # by an interrupted or failed build is rebuilt instead of failing on the missing file.
        if os.path.isfile(self.path_settings):
            self.settings = dict_load_json(self.path_settings)
        else:
            self.settings = {}
            self.create_folder_combined_files()

    @staticmethod
    def _progress(iterable):
        """Wrap ``iterable`` in a tqdm progress bar when tqdm is installed, else return it as is."""
        try:
            from tqdm import tqdm
        except ImportError:
            return iterable
        return tqdm(iterable)

    def _copy_observation(self, source_dir, source_idx, target_idx):
        """Copy observation ``source_idx`` of ``source_dir`` to id ``target_idx`` in the combined folder."""
        for prefix in ('variable_', 'label_'):
            tensor = torch.load(os.path.join(source_dir, prefix + str(source_idx) + '.pt'))
            torch.save(tensor, os.path.join(self.path_combined_data_dir, prefix + str(target_idx) + '.pt'))

    def create_folder_combined_files(self):
        """Build the combined folder according to ``self.method_database`` and save its settings.

        Raises:
            ValueError: if ``self.method_database`` is not supported. This is checked before
                anything is written so that no empty folder is left behind.
        """
        # Use the instance attribute: the bare name ``method_database`` only resolved when a
        # notebook global of that name happened to exist.
        method = self.method_database
        if method not in self.METHOD_OPTIONS:
            raise ValueError('method not in method options', method)

        mkdir_if_not_exist(self.path_combined_data_dir)

        dir_list = [self.tensor_db.path_label_correct_evidence_true_dir,
                    self.tensor_db.path_label_correct_evidence_false_dir,
                    self.tensor_db.path_label_refuted_evidence_true_dir,
                    self.tensor_db.path_label_refuted_evidence_false_dir]

        nr_observations_list = [self.tensor_db.settings['nr_correct_true'],
                                self.tensor_db.settings['nr_correct_false'],
                                self.tensor_db.settings['nr_refuted_true'],
                                self.tensor_db.settings['nr_refuted_false']]

        sources = list(zip(dir_list, nr_observations_list))

        if method == 'include_all':
            random_id_list = list(range(sum(nr_observations_list)))
            shuffle(random_id_list)

            # ``offset`` is the number of observations contributed by the preceding folders,
            # i.e. where this folder's slice of ``random_id_list`` starts.
            offset = 0
            for source_dir, nr_observations in self._progress(sources):
                for idx in range(nr_observations):
                    self._copy_observation(source_dir, idx, random_id_list[offset + idx])
                offset += nr_observations

        else:  # 'equal_class'
            min_nr_observations = min(nr_observations_list)
            random_id_list = list(range(min_nr_observations * len(dir_list)))
            shuffle(random_id_list)

            for i, (source_dir, nr_observations) in enumerate(self._progress(sources)):
                # Random sample (without replacement) of this folder's observations.
                source_ids = list(range(nr_observations))
                shuffle(source_ids)
                for j, idx in enumerate(source_ids[:min_nr_observations]):
                    self._copy_observation(source_dir, idx, random_id_list[j + min_nr_observations * i])

        self.settings['nr_observations'] = len(random_id_list)
        dict_save_json(self.settings, self.path_settings)

In [13]:
# https://github.com/pytorch/examples/blob/master/mnist/main.py

class Net(nn.Module):
    """Fully connected classifier with two hidden layers that returns log-probabilities.

    Args:
        nr_input_variables: number of features per observation.
        nr_hidden_neurons: width of both hidden layers.
        nr_output_variables: number of classes.
    """

    def __init__(self, nr_input_variables, nr_hidden_neurons, nr_output_variables):
        super(Net, self).__init__()
        self.fc_input = nn.Linear(nr_input_variables, nr_hidden_neurons)
        self.fc_hidden = nn.Linear(nr_hidden_neurons, nr_hidden_neurons)
        self.fc_output = nn.Linear(nr_hidden_neurons, nr_output_variables)
        self.bn1 = nn.BatchNorm1d(nr_hidden_neurons)
        self.bn2 = nn.BatchNorm1d(nr_hidden_neurons)

    def forward(self, x):
        """Map a ``(batch, nr_input_variables)`` float tensor to ``(batch, nr_output_variables)`` log-probabilities."""
        x = self.bn1(F.relu(self.fc_input(x)))
        x = self.bn2(F.relu(self.fc_hidden(x)))
        # The training and evaluation code feeds this output to F.nll_loss, which expects
        # log-probabilities. The previous sigmoid(relu(.)) head produced values in [0.5, 1),
        # which are neither log-probabilities nor able to express a confident "no".
        return F.log_softmax(self.fc_output(x), dim=1)
    
def train(log_interval, model, device, train_loader, optimizer, epoch):
    """Run one optimisation pass over ``train_loader`` and log the loss periodically.

    Args:
        log_interval: a progress line is printed for every batch index divisible by this value.
        model: module whose output is passed to ``F.nll_loss``.
        device: device the batches are moved to.
        train_loader: iterable of ``(data, target)`` batches exposing ``.dataset`` and ``len()``.
        optimizer: optimiser stepped once per batch.
        epoch: epoch number, only used in the log line.
    """
    model.train()
    for step, batch in enumerate(train_loader):
        inputs, labels = batch
        inputs = inputs.to(device)
        labels = labels.to(device, dtype=torch.int64)

        optimizer.zero_grad()
        batch_loss = F.nll_loss(model(inputs.float()), labels)
        batch_loss.backward()
        optimizer.step()

        if step % log_interval != 0:
            continue

        nr_seen = step * len(inputs)
        percentage = 100. * step / len(train_loader)
        print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, nr_seen, len(train_loader.dataset), percentage, batch_loss.item()))

def train_performance(model, device, test_loader):
    """Print the average NLL loss and the accuracy of ``model`` on the training data.

    Args:
        model: module whose output is passed to ``F.nll_loss``; switched to eval mode.
        device: device the batches are moved to.
        test_loader: iterable of ``(data, target)`` batches exposing ``.dataset``.
    """
    model.eval()
    summed_loss = 0
    nr_correct = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device, dtype=torch.int64)
            scores = model(inputs.float())
            # accumulate the un-averaged loss of this batch
            summed_loss += F.nll_loss(scores, labels, reduction='sum').item()
            # index of the highest score per observation
            predicted = scores.argmax(dim=1, keepdim=True)
            nr_correct += predicted.eq(labels.view_as(predicted)).sum().item()

    average_loss = summed_loss / len(test_loader.dataset)
    accuracy_percentage = 100. * nr_correct / len(test_loader.dataset)

    print('\nTraining set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        average_loss, nr_correct, len(test_loader.dataset), accuracy_percentage))
    
def test_performance(model, device, test_loader):
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device, dtype=torch.int64)
            output = model(data.float())
            test_loss += F.nll_loss(output, target, reduction='sum').item() # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True) # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)

    print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [65]:
# NOTE(review): `data_nn` is not defined in any visible cell (execution count 65 is out of
# order); this cell presumably relies on a DataNeuralNetwork instance left in the kernel.
settings = data_nn.tensor_db.settings
# settings

In [66]:
# Number of input features per observation, as stored in the tensor database settings.
# NOTE(review): depends on `data_nn`, which no visible cell defines.
nr_variables = data_nn.tensor_db.settings['nr_variables']
# nr_variables
# Manual alternative (disabled): sum the lengths of the three observation key lists.
# nr_variables = 0
# list_keys = ['observation_key_list_claim', 'observation_key_list_title', 'observation_key_list_text']
# for key in list_keys:
#     nr_variables += len(settings[key])
# nr_variables

In [14]:
import os

import torch
from torch.utils import data

class Dataset(data.Dataset):
    """Map-style dataset that reads one observation per pair of ``.pt`` files.

    Observation ``i`` is stored in ``path_data_set`` as ``variable_<id>.pt`` and
    ``label_<id>.pt`` where ``<id>`` is ``list_ids[i]``.
    """

    def __init__(self, path_data_set, list_ids):
        """Remember the data folder and the ids that belong to this split."""
        self.path_data_set = path_data_set
        self.list_ids = list_ids
        self.nr_observations = len(list_ids)

    def __len__(self):
        """Return the number of observations in this split."""
        return self.nr_observations

    def __getitem__(self, index):
        """Load and return ``(variables, label)`` for position ``index``."""
        observation_id = str(self.list_ids[index])
        path_variables = os.path.join(self.path_data_set, 'variable_' + observation_id + '.pt')
        path_label = os.path.join(self.path_data_set, 'label_' + observation_id + '.pt')

        X = torch.load(path_variables)
        # the label is stored as a one-element tensor; return its single entry
        y = torch.load(path_label)[0]
        return X, y
    

In [26]:
import os 

from doc_results_db import ClaimTensorDatabase

import config

class NeuralNetwork():
    """Train the classifier on the combined tensor data, or load a previously trained one.

    After construction ``self.model`` is always an ``nn.Module``.

    Args:
        claim_data_set: name of the claim data set (stored, not otherwise used here).
        method_database: forwarded to ``DataNeuralNetwork``.
        setup: forwarded to ``DataNeuralNetwork``.
        settings_model: dict with the keys ``fraction_training``, ``use_cuda``, ``seed``,
            ``lr``, ``momentum``, ``params`` (keyword arguments for the DataLoaders),
            ``nr_epochs`` and ``log_interval``. The optional key ``nr_hidden_neurons``
            (default 10) sets the width of the hidden layers.
    """

    def __init__(self, claim_data_set, method_database, setup, settings_model):
        self.claim_data_set = claim_data_set
        self.method_database = method_database
        self.setup = setup

        self.fraction_training = settings_model['fraction_training']
        self.use_cuda = settings_model['use_cuda']
        self.seed = settings_model['seed']
        self.lr = settings_model['lr']
        self.momentum = settings_model['momentum']
        self.params = settings_model['params']
        self.nr_epochs = settings_model['nr_epochs']
        self.log_interval = settings_model['log_interval']
        # Optional; 10 is the width that used to be hard-coded in train_model.
        self.nr_hidden_neurons = settings_model.get('nr_hidden_neurons', 10)

        self.data_nn = self.get_data()
        self.nr_observations = self.data_nn.settings['nr_observations']
        self.nr_variables = self.data_nn.tensor_db.settings['nr_variables']

        # NOTE(review): the model path does not include `setup` or `method_database`, so a
        # model trained for one configuration is picked up for every other one - confirm
        # that this is intended.
        self.path_model_dir = os.path.join(self.data_nn.tensor_db.path_results_dir, 'neural_network', 'model')
        self.path_model = os.path.join(self.path_model_dir ,'model.pt')
        self.path_settings = os.path.join(self.path_model_dir, 'settings.json')

        mkdir_if_not_exist(self.path_model_dir)
        print('NeuralNetwork')
        if os.path.isfile(self.path_model):
            print('- load model')
            self.model = self._load_model()
        else:
            print('- train model')
            self.partition = self.get_partition()
            self.training_data_loader, self.validation_data_loader = self.get_data_generators()
            self.model = self.train_model()
            torch.save(self.model.state_dict(), self.path_model)

    def _get_device(self):
        """Return the torch device selected by the ``use_cuda`` setting."""
        return torch.device("cuda" if self.use_cuda else "cpu")

    def _build_model(self, device):
        """Create an untrained ``Net`` with the configured layer sizes on ``device``."""
        return Net(nr_input_variables = self.nr_variables,
                   nr_hidden_neurons = self.nr_hidden_neurons,
                   nr_output_variables = 2).to(device)

    def _load_model(self):
        """Restore the model stored at ``self.path_model`` and put it in eval mode.

        The training branch stores ``model.state_dict()``, so the file normally holds a
        parameter dictionary that has to be loaded into a freshly built ``Net``. A file
        that holds a whole pickled module is accepted as well.
        """
        device = self._get_device()
        stored = torch.load(self.path_model, map_location=device)
        if isinstance(stored, nn.Module):
            model = stored.to(device)
        else:
            model = self._build_model(device)
            model.load_state_dict(stored)
        # A freshly trained model is also left in eval mode by the final evaluation pass.
        model.eval()
        return model

    def get_data(self):
        """Create the combined-data helper for this setup and database method."""
        return DataNeuralNetwork(self.method_database, self.setup)

    def get_partition(self):
        """Split the observation ids into a leading training part and a trailing validation part.

        Returns:
            dict with the keys ``'train'`` and ``'validation'``, each a list of ids.
        """
        nr_training = int(self.nr_observations*self.fraction_training)
        return {'train': list(range(0, nr_training)),
                'validation': list(range(nr_training, self.nr_observations))}

    def get_data_generators(self):
        """Return ``(training_data_loader, validation_data_loader)`` built from ``self.partition``."""
        training_set = Dataset(self.data_nn.path_combined_data_dir, self.partition['train'])
        training_data_loader = data.DataLoader(training_set, **self.params)

        validation_set = Dataset(self.data_nn.path_combined_data_dir, self.partition['validation'])
        validation_data_loader = data.DataLoader(validation_set, **self.params)
        return training_data_loader, validation_data_loader

    def train_model(self):
        """Train a new ``Net`` for ``nr_epochs`` epochs, reporting performance after each one.

        Returns:
            the trained model.
        """
        torch.manual_seed(self.seed)

        device = self._get_device()
        model = self._build_model(device)
        optimizer = optim.SGD(model.parameters(), lr=self.lr, momentum=self.momentum)

        for epoch in range(1, self.nr_epochs + 1):
            train(self.log_interval, model, device, self.training_data_loader, optimizer, epoch)
            train_performance(model, device, self.training_data_loader)
            test_performance(model, device, self.validation_data_loader)

        return model
    


In [28]:
# https://stanford.edu/~shervine/blog/pytorch-how-to-generate-data-parallel
# === constants === #

# === variables === #
# Data selection.
claim_data_set = 'dev'
method_database = 'equal_class' # include_all, equal_class
setup = 1

# Hyper-parameters and run options consumed by NeuralNetwork.
settings_model = {
    'fraction_training': 0.9,
    'use_cuda': False,
    'seed': 1,
    'lr': 0.001,
    'momentum': 0.9, # 0.5
    'params': {'batch_size': 64, 'shuffle': True},
    'nr_epochs': 5,
    'log_interval': 10,
}

# Trains a model, or loads the stored one when a model file already exists.
neural_network = NeuralNetwork(claim_data_set, method_database, setup, settings_model)

model = neural_network.model
# for every 




DataNeuralNetwork
NeuralNetwork
- load model


In [30]:
from doc_results_db import ClaimTensorDatabase

# Open the tensor database for setup 3.
# NOTE(review): the recorded output of this cell is a ValueError ("json file does not
# exist") for setup_3/settings.json, so this cell does not run to completion as saved.
# It also overwrites the global `setup` used by the training cell.
claim_data_set = 'dev'
path_wiki_pages = os.path.join(config.ROOT, config.DATA_DIR, config.WIKI_PAGES_DIR, 'wiki-pages')
path_wiki_database_dir = os.path.join(config.ROOT, config.DATA_DIR, config.DATABASE_DIR)

setup = 3

claim_tensor_db = ClaimTensorDatabase(path_wiki_pages, path_wiki_database_dir, setup)


ValueError: ('json file does not exist', '/home/bmelman/C_disk/02_university/06_thesis/01_code/fever/_04_results/01_score_combination/setup_3/settings.json')

In [None]:


# Scratch copy of a routine that writes one (variables, label) tensor pair per selected
# document of every claim into one of four class folders, and records counts in a settings dict.
# NOTE(review): this cell has never been executed (no execution count) and cannot run at
# top level: it references `self`, and `ClaimFile`, `get_list_properties`, `label_2_num`
# and `tqdm` are not defined or imported in any visible cell. It looks like a method body
# pasted from elsewhere - confirm before relying on it.
print('claim database: save results to folder with tensors')

settings_dict = {}

# Use claim 5 as a template to obtain the feature key lists.
# NOTE(review): `id` and `file` shadow Python built-ins.
id = 5
file = ClaimFile(id = id, path_dir_files = self.path_claims_dir)

id_list_title = list(file.claim_dict['title']['1_gram'].keys())
id_list_text = list(file.claim_dict['text']['1_gram'].keys())

# presumably get_list_properties returns (keys, values) - only the first element is used here
observation_key_list_claim, _ = get_list_properties(file.claim_dict['claim']['1_gram'], [], [], [])

# Title / text key lists come from the first document of the template claim, if there is one.
if len(id_list_title) > 0:
    observation_key_list_title , _ = get_list_properties(file.claim_dict['title']['1_gram'][id_list_title[0]], [], [], []) 
else:
    observation_key_list_title = []

if len(id_list_text) > 0:
    observation_key_list_text,  _ = get_list_properties(file.claim_dict['text']['1_gram'][id_list_text[0]], [], [], [])
else:
    observation_key_list_text = []

settings_dict['observation_key_list_claim'] = observation_key_list_claim
settings_dict['observation_key_list_title'] = observation_key_list_title
settings_dict['observation_key_list_text'] = observation_key_list_text

# Total number of observations written.
idx = 0

# Per-folder counters; each also serves as the next file index within its folder.
nr_correct_false = 0
nr_correct_true = 0
nr_refuted_false = 0
nr_refuted_true = 0

for id in tqdm(range(self.claim_database.nr_claims), desc = 'save_2_tensor'):
    file = ClaimFile(id = id, path_dir_files = self.path_claims_dir)

    label = file.claim_dict['claim']['label']

    # Claims labelled 'NOT ENOUGH INFO' are skipped entirely.
    if label != 'NOT ENOUGH INFO':
        label_nr = label_2_num(label)

        for id_document in file.claim_dict['ids_selected']:

            # Pick the target folder from the claim label and from whether this document
            # is listed in 'ids_correct_docs'.
            if label == 'SUPPORTS':
                if id_document in file.claim_dict['ids_correct_docs']:
                    file_name_variables = os.path.join(self.path_label_correct_evidence_true_dir, 'variable_' + str(nr_correct_true) + '.pt')
                    file_name_label = os.path.join(self.path_label_correct_evidence_true_dir, 'label_' + str(nr_correct_true) + '.pt')
                    nr_correct_true += 1
                else:
                    file_name_variables = os.path.join(self.path_label_correct_evidence_false_dir, 'variable_' + str(nr_correct_false) + '.pt')
                    file_name_label = os.path.join(self.path_label_correct_evidence_false_dir, 'label_' + str(nr_correct_false) + '.pt')
                    nr_correct_false += 1

            elif label == 'REFUTES':
                if id_document in file.claim_dict['ids_correct_docs']:
                    file_name_variables = os.path.join(self.path_label_refuted_evidence_true_dir, 'variable_' + str(nr_refuted_true) + '.pt')
                    file_name_label = os.path.join(self.path_label_refuted_evidence_true_dir, 'label_' + str(nr_refuted_true) + '.pt')
                    nr_refuted_true += 1
                else:
                    file_name_variables = os.path.join(self.path_label_refuted_evidence_false_dir, 'variable_' + str(nr_refuted_false) + '.pt')
                    file_name_label = os.path.join(self.path_label_refuted_evidence_false_dir, 'label_' + str(nr_refuted_false) + '.pt')
                    nr_refuted_false += 1
            else:
                raise ValueError('label not correct', label)

            # Feature vector = claim values, then title values, then text values.
            list_variables = []

            # NOTE(review): `id_list` is assigned but never used.
            id_list = list(file.claim_dict['title']['1_gram'].keys())

            _, values_claim = get_list_properties(file.claim_dict['claim']['1_gram'], [], [], [])
            list_variables += values_claim
            # NOTE(review): when the document id is missing from the title or text dict, its
            # values are skipped, so the saved vector is shorter than for other observations
            # - confirm whether downstream code can handle variable-length rows.
            if str(id_document) in file.claim_dict['title']['1_gram']:
                _, values_title = get_list_properties(file.claim_dict['title']['1_gram'][str(id_document)], [], [], [])
                list_variables += values_title

            if str(id_document) in file.claim_dict['text']['1_gram']:
                _, values_text  = get_list_properties(file.claim_dict['text']['1_gram'][str(id_document)], [], [], [])
                list_variables += values_text

            tensor_variable = torch.FloatTensor(list_variables)  
            tensor_label = torch.LongTensor([label_nr])

            torch.save(tensor_variable, file_name_variables)
            torch.save(tensor_label, file_name_label)

            idx += 1

# Number of features = combined length of the three key lists taken from the template claim.
nr_variables = 0
list_keys = ['observation_key_list_claim', 'observation_key_list_title', 'observation_key_list_text']
for key in list_keys:
    nr_variables += len(settings_dict[key])

settings_dict['nr_variables'] = nr_variables
settings_dict['nr_total'] = idx
settings_dict['nr_correct_false'] = nr_correct_false
settings_dict['nr_correct_true'] = nr_correct_true
settings_dict['nr_refuted_false'] = nr_refuted_false
settings_dict['nr_refuted_true'] = nr_refuted_true

settings_dict['nr_claims'] = self.claim_database.nr_claims

dict_save_json(settings_dict, self.path_settings_dict)

In [61]:
# Inspect one observation of the combined 'include_all' data folder.
# NOTE(review): the paths are hard-coded absolute paths on one machine, and the recorded
# output of this cell is a FileNotFoundError for variable_6.pt. `id` shadows the built-in.
id = 6
file_name_variables = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_04_results/01_score_combination/neural_network/data_setup_1_include_all/variable_' + str(id) + '.pt'
file_name_label = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_04_results/01_score_combination/neural_network/data_setup_1_include_all/label_' + str(id) + '.pt'

X = torch.load(file_name_variables)
y = torch.load(file_name_label)
y, X

FileNotFoundError: [Errno 2] No such file or directory: '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_04_results/01_score_combination/neural_network/data_setup_1_include_all/variable_6.pt'

In [51]:
# Folder holding the tensors of one class of setup 1; read by the inspection cell below.
# NOTE(review): hard-coded absolute path; the name `dir` shadows the built-in.
dir = '/home/bmelman/Desktop/C_disk/02_university/06_thesis/01_code/fever/_04_results/01_score_combination/setup_1/dev_correct_false_tensor'

In [60]:
# Load and display observation 1 (label first, then variables) from the folder in `dir`.
# NOTE(review): relies on the global `dir` set by a cell with a lower execution count (51).
id = 1
file_name_variables = os.path.join(dir,'variable_' + str(id) + '.pt')
file_name_label = os.path.join(dir,'label_' + str(id) + '.pt')

X = torch.load(file_name_variables)
y = torch.load(file_name_label)
y,X

(tensor([0]),
 tensor([9.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 1.0000, 0.0000,
         2.0000, 1.0000, 0.0000, 0.0000, 3.0000, 1.0000, 0.0000, 0.0000, 1.0000,
         0.0000, 0.0000, 1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 3.0780, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 3.0785, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.0780, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 3.0785]))

In [50]:
# Round-trip check: save a one-element label tensor, load it, save the loaded tensor again
# and load once more; the recorded output is tensor([0]).
# NOTE(review): writes 'tmp.pt' into the current working directory and leaves it there.
label_nr = 0
tensor_label = torch.LongTensor([label_nr])
torch.save(tensor_label, 'tmp.pt')
tensor_temp = torch.load('tmp.pt')
torch.save(tensor_temp, 'tmp.pt')
torch.load('tmp.pt')


tensor([0])