# Process CrisisFACTS Submissions

This notebook converts the NIST-provided submission metadata (`run.metadata.tsv`) into the `submissions.csv` file, which contains the metadata associated with each submitted run. This metadata includes the run tag, team, descriptions of how the submission operates (e.g., how it calculates importance, which platforms it uses, etc.), and priority. We use this submissions file to determine which runs to include in manual evaluation, as we only include submissions with priority <= 2.

In [26]:
import gzip
import glob

import os.path

import pandas as pd

In [27]:
# Parse the colon-delimited run metadata file into one record per submitted run.
#
# Field layout of each line (1-based position: meaning):
#    1 Run-ID: (runtag)
#    2 PID: (group)
#    3 SubmitterEmail: (email)
#    4 Track:crisis
#    5 RunType:automatic
#    6 Fields: (not used for crisis)
#    7 Task: (not used for crisis)
#    8 Date-Received:8/30/2023
#    9 OptionalField1: (did you use the TREC-IS tweet categories)
#   10 OptionalField2: (how does your system calculate importance)
#   11 OptionalField3: (streams: Twitter, FB, Reddit, news; 1=yes)
#   12 JudgingPrecedence:1
#   13 Sortmerge-100: (not used)
#   14 OptionalField4: (extractive or abstractive)
#   15 OptionalField5: (not used for crisis)
#   16 OptionalField6: (description of your run)
#   17 MD5-Sum:3cacd7daaaada168dace8e6df37766ab
#   18 Notes:
#
# The code below uses 0-based indexes, i.e. position N is this_meta[N - 1].
# NOTE(review): lines are split on every ":", so a free-text field containing a
# colon would shift all later indexes -- confirm the export never contains one.
MIN_FIELDS = 16  # highest index read below is 15 (OptionalField6 / description)

runs = []

with open("metadata", "r") as in_file:
    for line_no, line in enumerate(in_file, start=1):
        # Drop the line terminator so it does not leak into the last field.
        line = line.rstrip("\r\n")
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line at EOF) instead
            # of failing with an IndexError on the first field lookup.
            continue

        this_meta = line.split(":")
        # Echo the parsed fields for a quick visual check.
        # NOTE: this includes the submitter email -- clear the cell output
        # before sharing the notebook.
        print(this_meta)

        if len(this_meta) < MIN_FIELDS:
            raise ValueError(
                f"metadata line {line_no}: expected at least {MIN_FIELDS} "
                f"':'-separated fields, got {len(this_meta)}"
            )

        runtag = this_meta[0]
        team = this_meta[1]
        email = this_meta[2]

        runtype = this_meta[4]

        rundate = this_meta[7]
        uses_trecis = this_meta[8]
        importance = this_meta[9]
        platforms = this_meta[10]
        priority = this_meta[11]

        summary_type = this_meta[13]
        description = this_meta[15]

        # Build the path with os.path.join rather than a literal backslash:
        # "\{" is an invalid escape sequence (a SyntaxWarning on recent Python
        # versions) and only names a file inside "submissions" on Windows.
        filename = os.path.join("submissions", f"{runtag}.gz")
        if not os.path.exists(filename):
            # Report, but still record, runs whose submission file is absent.
            print("MISSING:", filename)

        runs.append({
            "filename": filename,
            "runtag": runtag,
            "team": team,
            "email": email,
            "runtype": runtype,
            "rundate": rundate,
            "uses_trecis": uses_trecis,
            "importance": importance,
            "platforms": platforms,
            "summary_type": summary_type,
            "description": description,
            "priority": priority,
        })

['Thesis_Retriver', 'thesis.v1', 'polito', 'cbuntain@umd.edu', 'automatic', '', '', '9/5/2023', 'no', 'Baseline 2', '1111', '1', '', 'extractive', 'Baseline 2', 'xxx', '']


In [28]:
# Materialise the parsed run records as a table: one row per run, one column
# per key of the record dicts.
run_df = pd.DataFrame(data=runs)

In [29]:
# Persist the run table. `index` is documented as a bool, so pass False
# explicitly (rather than None) to keep the positional row index out of the CSV.
run_df.to_csv("submissions.csv", index=False)

In [30]:
# Tally how many runs were recorded under each summary type.
summary_type_counts = run_df["summary_type"].value_counts()
summary_type_counts

summary_type
extractive    1
Name: count, dtype: int64

In [31]:
# Tally how many runs were recorded under each run type.
runtype_counts = run_df["runtype"].value_counts()
runtype_counts

runtype
automatic    1
Name: count, dtype: int64

In [32]:
# Tally how many runs share each value of the platforms field.
platforms_counts = run_df["platforms"].value_counts()
platforms_counts

platforms
1111    1
Name: count, dtype: int64