<a href="https://colab.research.google.com/github/DAlkemade/bert-for-fever/blob/master/L101_preprocess_document_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocess the data
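The end result is a `.tsv` file with one row per (claim, candidate document) pair and the columns:


*   label (1 if the document is a gold evidence document, 0 otherwise; empty for the test set)
*   claim
*   context (the sentences of the Wikipedia document, joined with spaces)
*   claim_id
*   doc_id

Note: the file is written with pandas' default settings, so it is comma-separated and has a leading index column, despite the `.tsv` extension.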

To run this notebook from Colab against a local Jupyter runtime, start Jupyter with:

```bash
jupyter notebook --NotebookApp.allow_origin='https://colab.research.google.com' --port=8888 --NotebookApp.port_retries=0
```



In [0]:
# --- Run configuration ------------------------------------------------------

# Environment switch: False mounts Google Drive and uses the Colab paths,
# True uses the local Windows paths.
LOCAL = False

# Debug switch: when True only the first 100 input instances are processed.
TEST = False

# When True no labels are produced (label is None) and claims are not
# filtered on their 'verifiable' field.
TEST_SET = True

# When True, gold evidence documents are added to the predicted pages of each
# claim so that every positive example ends up in the data.
APPEND_GOLD_DOCUMENT = False

# When True, roughly 90% of the rows whose label is not 1 are dropped at random.
SAMPLE_NEGATIVE_INSTANCES = False

# Base name (without extension) of the output file.
OUT_FILE_NAME = 'document_selection_test_n=50'

# Placeholder emitted by parse_doc() for document lines without usable text.
EMPTY_TOKEN = 'EMPTY'

In [0]:
# When not running locally, mount Google Drive at /content/drive; the data
# root used further down lives under that mount point.
if not LOCAL:
    from google.colab import drive
    drive.mount('/content/drive')

In [0]:
import argparse
import json
import sqlite3
import pandas as pd
from tqdm import tqdm
import os
import random

In [0]:
# Pick the data root and the database location (relative to that root) for
# the selected environment.
if LOCAL:
    root, fever_db = 'D:/GitHubD/fever-allennlp/data', 'fever/fever.db'
else:
    root, fever_db = '/content/drive/My Drive/Overig/', 'fever.db'

# Input (JSON lines) and output (named after OUT_FILE_NAME) both live in root.
# Alternative input path, kept for reference:
# in_file_fname = 'D:/GitHubD/fever-allennlp/data/dev_complete.sentences.p5.s5.jsonl'
in_file_fname = os.path.join(root, 'test_baseline_pages.sentences.p5.s5.jsonl')
out_file = os.path.join(root, f'{OUT_FILE_NAME}.tsv')

# Open the SQLite database that get_doc_text() queries.
db = os.path.join(root, fever_db)
conn = sqlite3.connect(db)

In [0]:
# Filled by parse_doc(): one entry per parsed document, holding the total
# number of characters of the sentences kept for that document.
chars = []

In [0]:
def get_doc_text(id):
    """Fetch the stored ``lines`` text of one document from the database.

    Uses the module-level ``conn`` SQLite connection.

    Args:
        id: Value matched against the ``id`` column of the ``documents``
            table. (The name shadows the builtin but is kept so existing
            callers keep working.)

    Returns:
        The first matching row, i.e. a 1-tuple ``(lines,)`` with the
        connection's default row factory, or ``None`` when no document
        has that id. Callers must handle the ``None`` case before indexing.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(
            "SELECT lines FROM documents WHERE id = ?",
            (id,)
        )
        return cursor.fetchone()
    finally:
        # Release the cursor even when the query raises.
        cursor.close()

def get_golden_docs(evidence):
    """Collect the document ids referenced by a claim's evidence annotations.

    Args:
        evidence: Iterable of evidence groups; each group is an iterable of
            entries that are indexed at positions 2 and 3. Position 2 is
            used as the document id; entries whose position 3 is ``None``
            are ignored (presumably position 3 is the sentence index of the
            evidence -- the selection logic comes from the baseline scorer).

    Returns:
        A list of document ids in annotation order. Duplicates are kept.
    """
    # Read from the `evidence` argument. The previous version ignored it and
    # used the notebook-global `instance`, which only worked by accident
    # inside the main loop.
    return [
        entry[2]
        for group in evidence
        for entry in group
        if entry[3] is not None
    ]

def parse_doc(doc_raw):
    """
    Split a raw document text into a list of sentences, one per line of the
    raw text, so that the position in the returned list corresponds to the
    line index used in the data entries.

    Each line is split on tabs and the second field is taken as the sentence.
    Lines without a second field, or whose second field has at most one
    character, are represented by EMPTY_TOKEN. As a side effect the summed
    length of the kept sentences is appended to the module-level ``chars``.
    """
    sentences = []
    total_chars = 0
    for raw_line in doc_raw.split("\n"):
        fields = raw_line.split("\t")
        # TODO: only the second tab-separated field is kept, so anything
        # after a further tab is dropped.
        sentence = fields[1] if len(fields) > 1 else ""
        if len(sentence) > 1:
            sentences.append(sentence)
            total_chars += len(sentence)
        else:
            sentences.append(EMPTY_TOKEN)
    chars.append(total_chars)
    return sentences

In [0]:
# TODO: what do we do with the NOT VERIFIABLE claims?

# Load every instance (one JSON object per line). The encoding is given
# explicitly so reading does not depend on the platform default.
with open(in_file_fname, "r", encoding="utf-8") as in_file:
    instances = [json.loads(line) for line in in_file]

if TEST:
    # Debug mode: only process a small prefix of the data.
    instances = instances[:100]

# One row per (claim, candidate document):
# [label, claim, context, claim_id, doc_id]
training_instances = []
for instance in tqdm(instances):
    # Outside test-set mode, claims marked NOT VERIFIABLE are skipped.
    if not TEST_SET and instance['verifiable'] == 'NOT VERIFIABLE':
        continue

    claim = instance['claim']
    claim_id = instance['id']
    # Work on a copy so that appending gold documents below does not modify
    # the loaded instance.
    docs = list(instance['predicted_pages'])

    # Gold documents are needed both for labelling (whenever labels are
    # produced) and for APPEND_GOLD_DOCUMENT. They used to be computed only
    # under APPEND_GOLD_DOCUMENT, so labelling without that flag failed with
    # a NameError.
    gold_docs = []
    if APPEND_GOLD_DOCUMENT or not TEST_SET:
        gold_docs = get_golden_docs(instance['evidence'])
    if APPEND_GOLD_DOCUMENT:
        for gold_doc in gold_docs:
            if gold_doc not in docs:
                # make sure all positive examples are added to the data
                docs.append(gold_doc)

    for doc_id in docs:
        row = get_doc_text(doc_id)
        if row is None:
            # Fail with a message that names the missing document instead of
            # an opaque "'NoneType' object is not subscriptable".
            raise KeyError(
                f"Document {doc_id!r} (claim {claim_id}) not found in the database"
            )

        # The full document text (sentences joined by spaces) is the context.
        doc_sentences = parse_doc(row[0])
        context = ' '.join(doc_sentences)

        # Test-set rows carry no label; otherwise 1 marks a gold document.
        label = None if TEST_SET else int(doc_id in gold_docs)
        training_instances.append([label, claim, context, claim_id, doc_id])
    
    
    


In [0]:
# for instance in instances:
#     if instance['id'] == 75397:
#         print(instance)

In [0]:
if SAMPLE_NEGATIVE_INSTANCES:
    # Keep every positive row (label == 1); each other row survives with
    # probability 0.1, i.e. about 90% of the negatives are thrown away.
    # The random draw only happens for non-positive rows.
    training_instances = [
        row
        for row in training_instances
        if row[0] == 1 or random.random() < 0.1
    ]


len(training_instances)

In [0]:
# Report the final number of rows and put them in a DataFrame whose column
# order matches the per-row list layout built in the main loop.
print(len(training_instances))
data = pd.DataFrame(training_instances, columns =['label', 'claim', 'context', 'claim_id', 'doc_id']) 

In [0]:
# Preview the first 51 rows as a sanity check.
data.head(51)

In [0]:
# NOTE(review): to_csv is called with its defaults, so the file is
# comma-separated and includes the DataFrame index as the first column, even
# though out_file ends in '.tsv'. Readers must use matching settings
# (pd.read_csv defaults); confirm before switching to sep='\t'.
data.to_csv(out_file)


In [0]:
# Number of rows written.
len(data.index)

# TODO: think about what to take as negative samples; all of them are saved
# for now. The BERT article uses hard negative mining (HNM); look into that.

# Plan:
#  - get all sentences that are in the training instance, loop over them and
#    label them using the instance data
#  - add every sentence as a line to a pandas dataframe
#  - save it as a .tsv

In [0]:
# first = pd.read_csv('/content/drive/My Drive/Overig/document_selection_train_dataset_first75000.tsv')
# second = pd.read_csv('/content/drive/My Drive/Overig/document_selection_train_dataset_last75000.tsv')
# combine = pd.concat([first, second])
# combine.to_csv('/content/drive/My Drive/Overig/document_selection_train_dataset_combine.tsv')