# Merge Facts from Submitted Runs

This notebook selects the runs to include from `submissions.csv` via the `priority` column and merges the facts from all included runs into a common set of files, one per request-id (i.e., one per event-day pair). We will use these files to de-duplicate submitted facts in the next step. For each included run, we group its facts by request-id and append them to the single file that collects the facts from all included submissions for that request-id. After running this notebook, we have an `event-days` directory with one file for each request-id, and each file contains all submitted facts from all included runs for that event-day.

In [None]:
import glob
import gzip
import json
import os

import bert_score
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from itertools import combinations

In [3]:
from nltk.tokenize import TweetTokenizer

In [4]:
# Load the submission manifest; later cells read its "team", "priority",
# "filename", and "runtag" columns.
run_data_df = pd.read_csv("submissions.csv")

In [5]:
# Collect the filenames of the runs to merge: for every team, at most two runs
# whose priority value is 2 or lower.
all_runs_to_include = set()
for team, team_runs in run_data_df.groupby("team"):
    # Log the team name followed by the priorities of all of its submitted runs.
    print(team)
    priorities_text = ", ".join(team_runs["priority"].apply(str))
    print("\t", priorities_text)

    eligible_runs = team_runs.loc[team_runs["priority"] <= 2]
    # NOTE(review): because the sort is descending, a team with more than two
    # eligible runs keeps the numerically largest priorities — confirm intended.
    runs_to_include = eligible_runs.sort_values(by="priority", ascending=False).iloc[:2]
    all_runs_to_include.update(runs_to_include["filename"])

thesis.v1
	 1


In [6]:
# Look-up table from a submission's filename to its run tag, one entry per
# row of the manifest (a later row wins if a filename repeats).
filename_to_runtag = dict(zip(run_data_df["filename"], run_data_df["runtag"]))

In [7]:
# Display the filename -> run tag mapping as a sanity check.
filename_to_runtag

{'submissions\\Thesis_Retriver.gz': 'Thesis_Retriver'}

In [8]:
# Directory that receives one merged "<requestID>.json" file per request ID.
OUTPUT_DIR = "event-days"

In [9]:
# Tokenizer used to count the tokens in each fact's text during data hygiene.
tknzr = TweetTokenizer()

In [11]:
# Merge the facts of every included run into one JSON-lines file per request ID
# (OUTPUT_DIR/<requestID>.json). This cell is safe to re-run: each output file is
# recreated the first time it is touched in an execution and appended to afterwards.


def _normalize_path(path):
    """Return `path` with a single separator style so CSV and glob paths compare equal."""
    return os.path.normpath(str(path).replace("\\", "/"))


def _read_facts(path):
    """Parse a gzipped UTF-8 JSON-lines file into a list of facts, skipping blank lines."""
    with gzip.open(path, "rb") as in_file:
        decoded_lines = (raw_line.decode("utf8") for raw_line in in_file)
        return [json.loads(line) for line in decoded_lines if line.strip()]


def _count_tokens(text):
    """Return the number of tweet tokens in `text`; non-string values count as zero."""
    return len(tknzr.tokenize(text)) if isinstance(text, str) else 0


# submissions.csv may have been written on another OS (e.g. with "\" separators),
# so compare normalized paths rather than the raw strings.
runtag_by_path = {_normalize_path(name): tag for name, tag in filename_to_runtag.items()}
included_paths = {_normalize_path(name) for name in all_runs_to_include}

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Request IDs whose output file has already been (re)created in this execution.
# The first write truncates, so re-running the cell does not duplicate facts;
# later writes append, so facts from several runs still accumulate in one file.
started_request_ids = set()

# Sorted so the order of facts in the output does not depend on the filesystem.
for submission_file in sorted(glob.glob("submissions/*.gz")):
    path_key = _normalize_path(submission_file)
    if path_key in runtag_by_path:
        print(submission_file, runtag_by_path[path_key])
    else:
        # A stray archive that the manifest does not mention must not abort the merge.
        print(submission_file, "(not listed in submissions.csv)")

    if path_key not in included_paths:
        print("\t", "SKIPPING")
        continue

    runtag = runtag_by_path[path_key]

    rows = _read_facts(submission_file)
    if not rows:
        # An empty frame has no "requestID" column to group by.
        print("\t", "NO FACTS")
        continue
    this_run_df = pd.DataFrame(rows)

    for requestId, group in this_run_df.groupby("requestID"):
        request_facts_df = group.sort_values(by="unixTimestamp")

        # Data hygiene: keep only facts with more than one token. This also
        # removes empty or missing fact texts, which tokenize to zero tokens.
        request_facts_df["tokens"] = request_facts_df["factText"].apply(_count_tokens)
        request_facts_df = request_facts_df[request_facts_df["tokens"] > 1].reset_index(drop=True)

        # Create new fact IDs from the request ID, run-tag, and fact number.
        # This new fact ID helps us keep track of what submitted facts get
        # combined in the de-duplication step.
        request_facts_df["factID"] = [
            "%s-%s-%04d" % (requestId, runtag, i) for i in range(len(request_facts_df))
        ]
        request_facts_df["runtag"] = runtag

        write_mode = "a" if requestId in started_request_ids else "w"
        started_request_ids.add(requestId)
        with open("%s/%s.json" % (OUTPUT_DIR, requestId), write_mode) as out_file:
            for record in request_facts_df.to_dict(orient="records"):
                out_file.write("%s\n" % (json.dumps(record)))


submissions\Thesis_Retriver.gz Thesis_Retriver
