# De-Duplicating Run Facts

In prior years, we de-duplicated using the `streamID` from the CrisisFACTS data. This approach is problematic for abstractive runs, so we instead de-duplicate via BERTScore: two facts are considered duplicates if their BERTScore is at or above a fixed threshold (`BERT_THRESHOLD` below).

We then merge these facts to produce a collapsed meta-fact set, where the text of each meta-fact is the text of the fact most similar to its neighboring facts (in practice, the fact with the most duplicate neighbors), and its importance is the maximum importance across that fact and all of its neighboring facts.

These collapsed meta-facts and their associated raw facts from each run are stored for summary generation.

In [17]:
import glob
import gzip
import json
import bert_score
import pandas as pd

In [18]:
from collections import Counter

In [19]:
from itertools import combinations

In [20]:
import networkx as nx

In [21]:
# Directory that is globbed for per-event-day "CrisisFACTS-*.json" input files
INPUT_DIR = "event-days"
# Directory that the "Collapsed-<req_id>.json" result files are written to
OUTPUT_DIR = "collapsed-event-days"

# Number of highest-importance facts kept per runtag before de-duplication
TOP_K = 20
# Maximum number of collapsed facts (by importance) kept in the final output
TOP_K_FOR_SUMMARY = 512

# Two facts are treated as duplicates when their BERTScore is >= this value
BERT_THRESHOLD = 0.91

In [22]:
def collapse_facts(merged_df, req_id):
    """Collapse near-duplicate facts into a ranked set of meta-facts.

    Every pair of rows in ``merged_df`` is scored with BERTScore; pairs whose
    score is at or above ``BERT_THRESHOLD`` are linked in a similarity graph.
    The highest-degree node is then repeatedly removed together with its
    neighbors, and each removed group becomes one collapsed fact. Facts left
    without any duplicate become singleton collapsed facts.

    Args:
        merged_df: DataFrame with ``factText``, ``factID`` and ``importance``
            columns. Its index labels are used as graph node IDs.
        req_id: Request identifier used as the prefix of each generated
            ``collapsed_fact_id``.

    Returns:
        DataFrame with the columns ``collapsed_fact_id``, ``fact_text``,
        ``relevant_facts``, ``num_relevant_facts`` and ``importance``, sorted
        by descending importance and truncated to ``TOP_K_FOR_SUMMARY`` rows.
        The frame is empty (but keeps these columns) when ``merged_df`` has
        no rows.
    """
    # Create all pairs (the index check skips rows sharing an index label)
    potential_dups = [
        (left.Index, right.Index, left.factText, right.factText)
        for left, right in combinations(merged_df.itertuples(), 2)
        if left.Index != right.Index
    ]

    # Create a graph of nodes based on similarity (over threshold creates an edge)
    g = nx.Graph()
    g.add_nodes_from(merged_df.index)

    # With fewer than two facts there is nothing to compare, so skip scoring
    # entirely instead of handing empty lists to bert_score.
    if potential_dups:
        # Separate left and right texts
        left_texts = [t[2] for t in potential_dups]
        right_texts = [t[3] for t in potential_dups]

        # Generate the de-duplication scores
        # NOTE: the device is hard-coded, so this requires a CUDA GPU.
        outscores = bert_score.score(
            left_texts,
            right_texts,
            model_type="microsoft/deberta-xlarge-mnli",
            device="cuda:0",
            batch_size=32
        )

        # The third returned element is used as the similarity score (F1 per
        # the bert_score documentation); convert to plain floats so no tensor
        # objects leak into later comparisons.
        pair_scores = [float(s) for s in outscores[2]]

        # Filter duplicates by BERT_THRESHOLD and link them in the graph
        g.add_edges_from(
            (pair[0], pair[1])
            for pair, score in zip(potential_dups, pair_scores)
            if score >= BERT_THRESHOLD
        )

    # Copy, so we can delete top degree nodes
    mod_g = g.copy()

    # Keep track of nodes that have been collapsed
    collapsed_facts = {}
    collapsed_facts_counter = 0

    # Iteratively remove the highest-degree node (stop once the graph is empty)
    while mod_g.number_of_nodes() > 0:
        degree_view = dict(mod_g.degree())
        top_node = max(degree_view, key=degree_view.get)

        # if we have no neighbors, we can stop
        if degree_view[top_node] == 0:
            break

        # Otherwise, we want to collect all neighbors into this collapsed fact
        neighbors = list(mod_g.neighbors(top_node))

        # Create the fact ID and increment counter
        this_collapsed_fact_id = "%s-collapsed-%04d" % (req_id, collapsed_facts_counter)
        collapsed_facts_counter += 1

        # The top node goes first: its text represents the collapsed fact
        collapsed_facts[this_collapsed_fact_id] = [top_node] + neighbors

        # Remove all nodes covered by this collapsed fact
        mod_g.remove_nodes_from(neighbors)
        mod_g.remove_node(top_node)

    # Now we take all remaining singletons
    for n_id in mod_g.nodes():
        this_collapsed_fact_id = "%s-collapsed-%04d" % (req_id, collapsed_facts_counter)
        collapsed_facts_counter += 1
        collapsed_facts[this_collapsed_fact_id] = [n_id]

    # Collect all collapsed facts into one set
    collapsed_fact_set = []
    for c_fact_id, collapsed_fact_ids in collapsed_facts.items():
        rel_df = merged_df.loc[collapsed_fact_ids]
        c_fact_text = merged_df.loc[collapsed_fact_ids[0]]["factText"]
        c_fact_import = rel_df["importance"].max()

        collapsed_fact_set.append({
            "collapsed_fact_id": c_fact_id,
            "fact_text": c_fact_text,
            "relevant_facts": rel_df["factID"].values.tolist(),
            "num_relevant_facts": rel_df.shape[0],
            "importance": c_fact_import
        })
    print("Total Collapsed Facts:", len(collapsed_facts))

    # Create a dataframe, so we can sort by importance and truncate.
    # Columns are given explicitly so sorting also works on an empty result.
    collapsed_df = pd.DataFrame(
        collapsed_fact_set,
        columns=[
            "collapsed_fact_id",
            "fact_text",
            "relevant_facts",
            "num_relevant_facts",
            "importance",
        ],
    )
    final_df = collapsed_df.sort_values(by="importance", ascending=False).head(TOP_K_FOR_SUMMARY)

    return final_df

In [23]:
import os

# Make sure the output directory exists before any result is written
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Collapse the facts of every event-day file found in INPUT_DIR
for f in glob.iglob("%s/CrisisFACTS-*.json" % INPUT_DIR):
    print(f)

    # Request ID = file name without directory or extension. os.path is used
    # instead of splitting on "\\" so this also works with "/" separators.
    req_id = os.path.splitext(os.path.basename(f))[0]

    # Each input file is JSON-lines, one fact record per line
    this_day_df = pd.read_json(f, orient="records", lines=True)

    # Keep only the TOP_K most important facts of each run
    this_day_top_k_dfs = [
        group.sort_values(by="importance", ascending=False).head(TOP_K)[
            ["factText", "factID", "importance", "runtag"]
        ]
        for runtag, group in this_day_df.groupby("runtag")
    ]

    merged_df = pd.concat(this_day_top_k_dfs)

    final_df = collapse_facts(merged_df, req_id)
    final_df.to_json("%s/Collapsed-%s.json" % (OUTPUT_DIR, req_id), orient="records")

event-days\CrisisFACTS-001-r10.json
Total Collapsed Facts: 17
event-days\CrisisFACTS-001-r11.json
Total Collapsed Facts: 16
event-days\CrisisFACTS-001-r3.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-001-r4.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-001-r5.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-001-r6.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-001-r7.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-001-r8.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-001-r9.json
Total Collapsed Facts: 19
event-days\CrisisFACTS-002-r1.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-002-r2.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-002-r3.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-002-r4.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-002-r5.json
Total Collapsed Facts: 20
event-days\CrisisFACTS-002-r6.json
Total Collapsed Facts: 6
event-days\CrisisFACTS-003-r10.json
Total Collapsed Facts: 20
event-days\CrisisFACTS