# Merge Annotations into One File

The annotations are spread across several files. Here we merge them into a single file, so that each event-day pair has one combined set of annotations from all NIST assessors.

In [None]:
import collections
import glob 
import json
import pandas as pd

import matplotlib.pyplot as plt

In [None]:
# Maps each annotation title to {"text": <annotated text>, "spans": [<span>, ...]},
# pooling the spans contributed by every assessor for that title.
merged_annotation_data = {}

# glob.glob() returns paths in arbitrary order; sort them so the order of the
# merged spans is reproducible from run to run.
annotation_files = sorted(glob.glob("data/byAnnotator.*.json")) + ["data/CrisisFACTS-all-r4.json"]

for f in annotation_files:
    print(f)
    # Each file holds one JSON record per line; read it as UTF-8 explicitly
    # rather than relying on the platform's default encoding.
    with open(f, "r", encoding="utf-8") as in_file:
        for line in in_file:
            # Skip blank lines, which json.loads would reject.
            if not line.strip():
                continue

            annotations = json.loads(line)

            # The assessor ID is whatever follows the last "-" in the annotator ID
            a_uid = annotations["_annotator_id"].rpartition("-")[-1]
            a_title = annotations["title"]

            if "spans" not in annotations:
                print("\t", "No annotations in:", a_title)
                continue

            # Tag every span with the assessor who produced it
            spans = annotations["spans"]
            for s in spans:
                s["uid"] = a_uid

            # Create the entry the first time a title is seen, then pool the spans
            this_annotation = merged_annotation_data.setdefault(a_title, {
                "text": annotations["text"],
                "spans": [],
            })
            this_annotation["spans"].extend(spans)

In [None]:
def merge_assessor_spans(merged, record, assessor="dfmcurry"):
    """Fold one annotation record's spans into ``merged`` if it belongs to ``assessor``.

    Args:
        merged: Mapping of title -> {"text": ..., "spans": [...]}; updated in place.
        record: One parsed annotation record. Must contain "_annotator_id",
            "title" and "text"; "spans" is optional.
        assessor: Only records whose assessor ID (the part of "_annotator_id"
            after the last "-") equals this value are merged.

    Returns:
        True if the record's spans were merged, False if the record was skipped
        (different assessor, or no "spans" key).
    """
    uid = record["_annotator_id"].rpartition("-")[-1]
    if uid != assessor:
        return False

    title = record["title"]

    # A record without spans carries nothing to merge; skip it instead of
    # failing with a KeyError.
    if "spans" not in record:
        print("\t", "No annotations in:", title)
        return False

    # Tag every span with the assessor who produced it
    spans = record["spans"]
    for span in spans:
        span["uid"] = uid

    if title not in merged:
        print("\t", "UNIQUE ANNOTATIONS in:", title)

    # Create the entry the first time a title is seen, then pool the spans
    entry = merged.setdefault(title, {
        "text": record["text"],
        "spans": [],
    })
    entry["spans"].extend(spans)
    return True


# We have to take dfmcurry annotations separately: from these files only that
# assessor's records are merged, and every other record is skipped.
# Sorted because glob.glob() returns paths in arbitrary order.
for f in sorted(glob.glob("data/CrisisFACTS-0*.json")):
    print(f)
    # One JSON record per line; read as UTF-8 rather than the platform default.
    with open(f, "r", encoding="utf-8") as in_file:
        for line in in_file:
            # Skip blank lines, which json.loads would reject.
            if not line.strip():
                continue

            annotations = json.loads(line)
            merge_assessor_spans(merged_annotation_data, annotations)

            

In [None]:
# Persist the merged title -> {"text", "spans"} mapping as one JSON document.
with open("merged-annotations.json", "w") as merged_out:
    merged_out.write(json.dumps(merged_annotation_data))