# BERTScore Scoring Against NIST and Wikipedia

Replicating the CrisisFACTS 2022 assessment using BERTScore.

In [1]:
%matplotlib inline

In [2]:
import glob
import gzip
import json
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats
import torch

In [3]:
import wikipedia

In [None]:
from torchmetrics.text.rouge import ROUGEScore

## Get CrisisFACTS Event Metadata

In [5]:
# Load the event/topic metadata and key it by event ID so later cells can
# look events up (and iterate them) via the index.
topics_df = pd.read_json("CrisisFACTs-2022to2023.topics.json", lines=False)
event_df = topics_df.set_index("eventID")

### Get Summaries from Wikipedia

In [None]:
def get_wiki_summary(url):
    """Return the Wikipedia summary for the article that ``url`` points to.

    Parameters
    ----------
    url : str
        Event URL taken from the topics file.

    Returns
    -------
    str
        ``page.summary`` of the article for Wikipedia URLs; an empty string
        when ``url`` is missing / not a string (e.g. NaN) or does not point
        at wikipedia.org.

    Notes
    -----
    Any exception raised by ``wikipedia.page`` is propagated unchanged.
    """
    from urllib.parse import unquote, urlparse

    # Missing values arrive from pandas as NaN (a float); `in` on a float
    # would raise TypeError, so treat every non-string as "no article".
    if not isinstance(url, str) or "wikipedia.org" not in url:
        return ""

    # Use only the URL *path* so "?query" and "#fragment" parts are ignored,
    # tolerate a trailing slash, and percent-decode the last segment
    # (e.g. "%E2%80%93" -> en dash) to obtain the real article title.
    page_title = unquote(urlparse(url).path.rstrip("/").rpartition("/")[-1])
    print(url)
    page = wikipedia.page(title=page_title, auto_suggest=False)

    return page.summary

# Add one summary per event, looked up from the event's "url" column.
# Events whose URL is not a wikipedia.org link get an empty string.
event_df["wiki.summary"] = event_df["url"].apply(get_wiki_summary)

### Get Summaries from Assessors

In [7]:
# Assessor annotations: a JSON object keyed by request ID (see the loops below,
# which iterate `annotation_data.items()` and read each entry's "fact_list").
with open("final-annotated-facts-results.json", "r") as annotations_file:
    annotation_data = json.load(annotations_file)

In [8]:
# Map each event ID to the texts of all facts the assessors marked useful,
# accumulated across every request belonging to that event.
event_fact_lists = {}

for req_id, req_data in annotation_data.items():
    # The event ID is everything before the last "-" of the request ID.
    event_id = req_id.rpartition("-")[0]

    # Each fact_list entry is indexable: position 0 is the fact ID matched
    # against "collapsed_fact_id" below, position 2 is the assessor label.
    useful_fact_ids = set()
    for annotated_fact in req_data["fact_list"]:
        if annotated_fact[2] == "USEFUL_FACT":
            useful_fact_ids.add(annotated_fact[0])

    collapsed_path = "../00-createSummaries/collapsed-event-days/Collapsed-%s.json" % req_id
    with open(collapsed_path) as in_file:
        collapsed_facts = json.load(in_file)

    # Always register the event, even when this request adds no useful facts.
    event_fact_lists.setdefault(event_id, []).extend(
        collapsed_fact["fact_text"]
        for collapsed_fact in collapsed_facts
        if collapsed_fact["collapsed_fact_id"] in useful_fact_ids
    )

In [9]:
# Build one reference summary per event by joining its useful fact texts with
# ". "; the replace() collapses the ".." produced when a fact already ends in ".".
nist_summaries = []
for event_id in event_df.index:
    joined_facts = ". ".join(event_fact_lists[event_id])
    nist_summaries.append(joined_facts.replace("..", "."))

event_df["nist.summary"] = nist_summaries


In [10]:
# Dump the first event's NIST summary to a text file for inspection.
# Characters that cannot be represented in ASCII are silently dropped.
with open("nist_summary.txt", "w") as out_file:
    first_nist_summary = event_df["nist.summary"].iloc[0]
    ascii_bytes = first_nist_summary.encode("ascii", "ignore")
    out_file.write(ascii_bytes.decode("ascii"))


## Save the Summaries

In [11]:
# Persist the event metadata together with the "wiki.summary" and
# "nist.summary" columns added above.
event_df.to_json("CrisisFACTs-2022to2023.topics.withSummaries.json")

In [12]:
# event_df = pd.read_json("CrisisFACTs-2022to2023.topics.withSummaries_smaller.json", lines=False)

In [13]:
# # save on file and convert all to ascii on a txt
# with open("nist_summary_2.txt", "w") as out_file:
#     out_file.write(event_df["nist.summary"].iloc[0].encode("ascii", "ignore").decode("ascii"))

In [None]:
# rouge = ROUGEScore(
#     use_stemmer=True,
# )

import bert_score
# Display the installed bert_score version as the cell output.
bert_score.__version__

In [15]:
# Number of highest-"importance" facts kept per request when a run's
# summary is assembled in the scoring loop.
TOP_K = 32

In [16]:
# Request IDs that have assessor annotations; submitted facts for any other
# request are ignored by the scoring loop.
all_req_ids = list(annotation_data.keys())

In [17]:
# Run BERTScore on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [18]:
# Model identifier passed to bert_score as `model_type`.
# The commented-out line is an alternative model choice.
# bert_model = "distilbert-base-uncased"
bert_model = "microsoft/deberta-xlarge-mnli"

In [None]:
# Score every submitted run against both reference summaries with BERTScore.
#
# Result: submission_metrics[run_id] = {"wiki": DataFrame, "nist": DataFrame},
# where each frame is in long format with columns metric / value / event and
# one row per (event, metric) for metric in f1 / precision / recall.
submission_metrics = {}

# Load the scoring model once. `bert_score.score` instantiates the model on
# every call, which previously meant two model loads per event per submission.
scorer = bert_score.BERTScorer(model_type=bert_model, device=device, batch_size=1, nthreads=2)


def _score_pair(candidate_text, reference_text, event_id):
    """Score one candidate/reference pair; return a long-format metric frame."""
    # BERTScorer.score returns the (precision, recall, f1) tensors, one entry
    # per candidate; we pass a single candidate, hence `.item()` below.
    precision, recall, f1 = scorer.score([candidate_text], [reference_text], batch_size=1)
    torch.cuda.empty_cache()

    return pd.DataFrame([
        {"metric": "f1", "value": f1.item(), "event": event_id},
        {"metric": "precision", "value": precision.item(), "event": event_id},
        {"metric": "recall", "value": recall.item(), "event": event_id},
    ])


# sorted() makes the processing order reproducible across file systems.
for submission_path in sorted(glob.glob("submissions/*.gz")):

    # Derive the run ID from the file name in an OS-independent way. Splitting
    # on "\\" only works with Windows separators; elsewhere it yielded "" for
    # every file, so all runs overwrote each other under one empty key.
    this_run_id = os.path.basename(submission_path)[:-len(".gz")]
    print(submission_path, "-->", this_run_id)

    this_run_event_request_facts = {k: [] for k in all_req_ids}
    with gzip.open(submission_path, "rt", encoding="utf8") as in_file:
        for line in in_file:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            entry = json.loads(line)
            this_req_id = entry["requestID"]

            # We skip days where we have no relevant facts from assessors.
            # Dict membership is O(1); the dict has exactly the keys of all_req_ids.
            if this_req_id not in this_run_event_request_facts:
                continue

            this_run_event_request_facts[this_req_id].append(entry)

    # Per event: concatenate the TOP_K most important facts of each request.
    local_event_summaries = {e_id: [] for e_id in event_df.index}
    for event_request, this_fact_list in this_run_event_request_facts.items():
        event_id = event_request.rpartition("-")[0]

        sorted_fact_list = sorted(this_fact_list, key=lambda v: v["importance"], reverse=True)
        this_day_summary = [this_top_fact["factText"] for this_top_fact in sorted_fact_list[:TOP_K]]

        local_event_summaries[event_id] = local_event_summaries[event_id] + this_day_summary

    wiki_dfs = []
    nist_dfs = []
    for event_id, event in event_df.iterrows():

        this_submitted_summary = local_event_summaries[event_id]

        this_summary_text = ". ".join(this_submitted_summary).replace("..", ".")
        # Nothing submitted for this event: there is no candidate to score.
        if len(this_summary_text) == 0:
            continue

        nist_summary = event["nist.summary"]
        wiki_summary = event["wiki.summary"]

        print(f"event {event_id} - nist: {len(nist_summary)} - submitted: {len(this_summary_text)}")

        this_nist_df = _score_pair(this_summary_text, nist_summary, event_id)
        this_wiki_df = _score_pair(this_summary_text, wiki_summary, event_id)

        display(this_nist_df)

        wiki_dfs.append(this_wiki_df)
        nist_dfs.append(this_nist_df)

    # pd.concat raises on an empty list; a run without any scorable event is
    # reported and skipped instead of aborting the whole evaluation.
    if not nist_dfs:
        print("no scorable events for run", this_run_id, "- skipping")
        continue

    full_wiki_df = pd.concat(wiki_dfs)
    full_nist_df = pd.concat(nist_dfs)

    submission_metrics[this_run_id] = {
        "wiki": full_wiki_df,
        "nist": full_nist_df,
    }

    display(full_nist_df.groupby("metric")["value"].mean())



In [None]:
# Flatten submission_metrics into one long-format frame per run (adding the
# "target.summary" and "run" columns), write each to CSV, and combine all
# runs into all_runs_df.

# Create the output directory up front; to_csv does not create missing
# directories, so a fresh checkout would otherwise fail on the first write.
os.makedirs("evaluation.output.bertscore", exist_ok=True)

all_runs = []
for k, v in submission_metrics.items():
    print(k)

    # One frame per reference summary ("wiki" / "nist"), tagged with its name.
    # assign() returns a new frame, so the stored metric frames stay untouched.
    stackable = [
        ldf.assign(**{"target.summary": comparator})
        for comparator, ldf in v.items()
    ]

    this_run_df = pd.concat(stackable)
    this_run_df["run"] = k

    all_runs.append(this_run_df)
    this_run_df.to_csv("evaluation.output.bertscore/%s.csv" % k, index=False)

# Fail with an explicit message rather than pandas' generic
# "No objects to concatenate" when no run was scored.
if not all_runs:
    raise ValueError("submission_metrics is empty: no scored runs to export")

all_runs_df = pd.concat(all_runs)
all_runs_df.to_csv("evaluation.output.bertscore/all_runs.csv", index=False)

In [None]:
# For each reference summary type, rank the runs by their mean F1 over events
# and write the ranking to "<target>.summary.csv".
target_summaries = {}
for target in ["wiki", "nist"]:
    target_rows_df = all_runs_df[all_runs_df["target.summary"] == target]

    run_names = []
    run_means = []
    for run_name, run_group in target_rows_df.groupby("run"):
        print(run_name)
        # Wide table (events x metrics), then average each metric over events.
        per_event_df = run_group.pivot(index="event", columns="metric", values="value")
        run_means.append(per_event_df.mean())
        run_names.append(run_name)

    # Only the F1 column is kept in the summary.
    summary_df = pd.DataFrame(run_means, index=run_names)[["f1"]]

    final_df = summary_df.sort_values(by="f1", ascending=False)
    final_df.to_csv("evaluation.output.bertscore/%s.summary.csv" % target)

    target_summaries[target] = final_df

In [22]:
# Event IDs CrisisFACTS-009 through CrisisFACTS-018 (inclusive); the
# aggregation cells below restrict their analysis to these events.
rel2023_events = ["CrisisFACTS-%03d" % event_number for event_number in range(9, 19)]

In [None]:
# One record per run: the mean of every metric for each reference summary,
# restricted to the events listed in rel2023_events.
rows = []
rel2023_scores_df = all_runs_df[all_runs_df["event"].isin(rel2023_events)]

for runtag, run_group in rel2023_scores_df.groupby("run"):
    print(runtag)
    t_map = {"run": runtag}

    for target, target_group in run_group.groupby("target.summary"):
        print("\t", target)

        # Two events are left out of the Wikipedia comparison.
        if target == "wiki":
            excluded = target_group["event"].isin(["CrisisFACTS-011", "CrisisFACTS-012"])
            target_group = target_group[~excluded]

        metric_means = target_group.groupby("metric")["value"].mean()
        for metric, mean_value in metric_means.items():
            print("\t\t", metric, mean_value)
            t_map["%s.%s" % (target, metric)] = mean_value

    rows.append(t_map)

In [None]:
# Tabulate the per-run records, save them, and show the two F1 columns.
by_run_df = pd.DataFrame(rows)
by_run_df = by_run_df.set_index("run")
by_run_df.to_csv("by_run.bertscore.summary.csv")
by_run_df.loc[:, ["nist.f1", "wiki.f1"]]

In [None]:
# Correlation matrix between the per-run F1 against NIST and against Wikipedia.
by_run_df[["nist.f1", "wiki.f1"]].corr()

In [None]:
# Scatter plot with one point per run: NIST F1 on x, Wikipedia F1 on y.
by_run_df[["nist.f1", "wiki.f1"]].plot.scatter(x="nist.f1", y="wiki.f1")

In [None]:
# Horizontal bars of both F1 scores per run, ordered by the NIST F1.
fig, ax = plt.subplots(figsize=(9, 16))

f1_by_run = by_run_df[["nist.f1", "wiki.f1"]].sort_values(by="nist.f1")
f1_by_run.plot.barh(ax=ax)

## By Event

In [None]:
# One record per event: the mean of every metric (averaged over runs) for each
# reference summary, restricted to the events listed in rel2023_events.
rows = []
rel2023_scores_df = all_runs_df[all_runs_df["event"].isin(rel2023_events)]

for event, event_group in rel2023_scores_df.groupby("event"):
    print(event)
    t_map = {"event": event}

    for target, target_group in event_group.groupby("target.summary"):
        print("\t", target)

        # Two events are left out of the Wikipedia comparison; for them the
        # filtered group is empty and no "wiki.*" keys are recorded.
        if target == "wiki":
            excluded = target_group["event"].isin(["CrisisFACTS-011", "CrisisFACTS-012"])
            target_group = target_group[~excluded]

        metric_means = target_group.groupby("metric")["value"].mean()
        for metric, mean_value in metric_means.items():
            print("\t\t", metric, mean_value)
            t_map["%s.%s" % (target, metric)] = mean_value

    rows.append(t_map)

In [None]:
# Tabulate the per-event records, save them, and show the two F1 columns.
by_event_df = pd.DataFrame(rows)
by_event_df = by_event_df.set_index("event")
by_event_df.to_csv("by_event.bertscore.summary.csv")
by_event_df.loc[:, ["nist.f1", "wiki.f1"]]

In [None]:
# Horizontal bars of both F1 scores per event.
fig, ax = plt.subplots(figsize=(9, 16))

f1_by_event = by_event_df[["nist.f1", "wiki.f1"]]
f1_by_event.plot.barh(ax=ax)

In [None]:
# Reload the saved per-event table (first column becomes the index) and
# print its per-event means as a LaTeX table.
score_df = pd.read_csv("by_event.bertscore.summary.csv", index_col=0)
mean_by_event_df = score_df.groupby("event").mean()
print(mean_by_event_df.style.to_latex())

In [None]:
# NOTE(review): this statement is identical to the print in the preceding
# cell and emits the same LaTeX table again.
print(score_df.groupby("event").mean().style.to_latex())