# Extract claim-evidence pairs from the training set

## Construct the index dictionaries

In [1]:
import os
import json

# Directory prefixes shared by the paths below (each keeps its trailing slash
# because file names are appended by plain string concatenation).
_json_dir = "../JSONFiles/"
_sentence_dir = "../TrainSentence/"

# Inputs: the wiki text dump directory and the two JSON claim files.
wiki_file_path = "../new-wiki-pages-text/"
train_file_path = _json_dir + "train.json"
dev_file_path = _json_dir + "devset.json"

# Outputs: one tab-separated sentence-pair file per input JSON file.
train_output_path = _sentence_dir + "training.csv"
dev_output_path = _sentence_dir + "dev.csv"

# create index for evidences
# evi_index:  (page title, sentence number) -> (wiki file name, 0-based line index)
# head_index: page title -> sentence numbers of that page, in the order encountered
# Each usable line is expected to start with "<title> <integer> ..." separated
# by single spaces; anything else is skipped.
evi_index = {}
head_index = {}
for file in os.listdir(wiki_file_path):
    print("processing file: " + file)
    file_name = wiki_file_path + file
    # Decode explicitly as UTF-8 instead of the platform default: the same
    # files are read back later through linecache, which defaults to UTF-8, so
    # both passes must agree on how titles are decoded.
    # NOTE(review): assumes the wiki dump is UTF-8 encoded - confirm.
    with open(file=file_name, encoding="utf-8") as f:
        # Stream the file instead of loading every line into memory at once.
        for i, line in enumerate(f):
            words = line.split(" ")
            try:
                head = words[0]
                sentence_number = int(words[1])
            except (IndexError, ValueError):
                # IndexError: fewer than two tokens (e.g. a blank line).
                # ValueError: the second token is not an integer.
                continue

            evi_index[(head, sentence_number)] = (file, i)
            head_index.setdefault(head, []).append(sentence_number)


processing file: wiki-009.txt
processing file: wiki-021.txt
processing file: wiki-035.txt
processing file: wiki-034.txt
processing file: wiki-020.txt
processing file: wiki-008.txt
processing file: wiki-036.txt
processing file: wiki-022.txt
processing file: wiki-023.txt
processing file: wiki-037.txt
processing file: wiki-033.txt
processing file: wiki-027.txt
processing file: wiki-026.txt
processing file: wiki-032.txt
processing file: wiki-024.txt
processing file: wiki-030.txt
processing file: wiki-018.txt
processing file: wiki-019.txt
processing file: wiki-031.txt
processing file: wiki-025.txt
processing file: wiki-042.txt
processing file: wiki-056.txt
processing file: wiki-081.txt
processing file: wiki-095.txt
processing file: wiki-094.txt
processing file: wiki-080.txt
processing file: wiki-057.txt
processing file: wiki-043.txt
processing file: wiki-069.txt
processing file: wiki-055.txt
processing file: wiki-041.txt
processing file: wiki-096.txt
processing file: wiki-082.txt
processing

In [8]:
# head_index.get("José_Ferrer")

## Build the training set from the selected input file

In [6]:
import pandas as pd
import linecache
import numpy as np
import random
from sklearn.utils import shuffle

use_file = 0    # 0 for training set , 1 for dev set

# read training data
# Resolve the input file first so that an unsupported selector fails here,
# with a clear message, rather than later as a NameError on `train`.
if use_file == 0:
    input_path = train_file_path
elif use_file == 1:
    input_path = dev_file_path
else:
    raise ValueError(
        "use_file must be 0 (training set) or 1 (dev set), got " + repr(use_file)
    )

# Read as UTF-8 explicitly rather than relying on the platform default.
# NOTE(review): assumes the JSON files are UTF-8 encoded - confirm.
with open(input_path, 'r', encoding='utf-8') as f:
    train = json.load(f)

def get_evi_text(evi) -> str:
    """Return the sentence text for one (page title, sentence number) pair.

    Args:
        evi: A two-item sequence ``[title, sentence_number]``; the sentence
            number may be anything ``int()`` accepts.

    Returns:
        The indexed wiki line with its first two space-separated tokens and
        its final character (the line terminator) removed.

    Raises:
        KeyError: If the pair is not present in ``evi_index``.
    """
    evidence = (evi[0], int(evi[1]))
    file_index_tuple = evi_index.get(evidence)
    if file_index_tuple is None:
        # Fail with the offending key instead of an opaque
        # "'NoneType' object is not subscriptable".
        raise KeyError("no wiki sentence indexed for evidence " + repr(evidence))
    file_name, line_index = file_index_tuple

    # linecache numbers lines from 1, hence the +1 on the stored line index.
    evi_text = linecache.getline(wiki_file_path + file_name, line_index + 1)

    # Split off only the two leading tokens; the remainder (if any) is the
    # sentence itself, kept verbatim including internal spacing.
    words = evi_text.split(" ", 2)[2:]
    # Drop the last character, i.e. the newline that ends the line.
    return " ".join(words)[0:-1]
    

# Parallel columns of the output table; entry k of each list describes the
# same claim/sentence pair.
claim_list = []
evi_list = []
claim_evi_list = []
label_list = []

# Seed NumPy's global RNG so the down-sampling of negatives is repeatable.
np.random.seed(3)

for item in train.values():
    evidence_list = item['evidence']
    if not evidence_list:
        # Claims without any evidence contribute no pairs.
        continue
    claim = item['claim']

    # Distinct page titles in first-seen order. A plain set of strings would
    # iterate in hash order, which changes between interpreter sessions and
    # would hand the seeded random draws below to different candidates on
    # every run.
    heads = dict.fromkeys(evi[0] for evi in evidence_list)

    for head in heads:
        sentence_numbers = head_index.get(head)
        if sentence_numbers is None:
            # Page title not present in the wiki index.
            continue
        for number in sentence_numbers:
            query_combination = [head, number]

            if query_combination in evidence_list:
                label = 1
            elif np.random.rand() > 0.2:
                # Keep only about one in five non-evidence sentences.
                continue
            else:
                label = 0

            # Fetch the sentence only for pairs that are actually kept, so
            # discarded negatives cost no file lookup.
            evidence_text = get_evi_text(query_combination)

            label_list.append(label)
            claim_list.append(claim)
            evi_list.append(evidence_text)
            claim_evi_list.append(claim + " ||| " + evidence_text)

# claim_evi_list is collected above but not included as an output column.
result_dict = {
    "claim": claim_list,
    "evidence": evi_list,
    "label": label_list
}
result_df = pd.DataFrame(result_dict)
result_df = shuffle(result_df)

# Write a tab-separated file to the output path matching the selected input.
if use_file == 0:
    result_df.to_csv(train_output_path, index=False, sep='\t')
elif use_file == 1:
    result_df.to_csv(dev_output_path, index=False, sep='\t')



In [4]:
# Show the evidence text of the row labelled 0.
# NOTE(review): this is a label-based lookup; after the shuffle above the row
# labelled 0 is presumably not the first row of the frame - confirm intent.
result_df['evidence'][0]

"Soul Food film Soul Food is a 1997 American comedy-drama film produced by Kenneth `` Babyface '' Edmonds , Tracey Edmonds and Robert Teitel and released by Fox 2000 Pictures ."

In [8]:
# Summary statistics of the 0/1 label column; the mean is the share of
# positive (label 1) pairs in the generated set.
result_df['label'].describe()

count    11982.000000
mean         0.471624
std          0.499215
min          0.000000
25%          0.000000
50%          0.000000
75%          1.000000
max          1.000000
Name: label, dtype: float64