In [1]:
from pathlib import Path

import pandas as pd
from datasets import load_dataset

In [2]:
# Fetch the BillSum dataset and keep only its "test" split as a DataFrame.
billsum_splits = load_dataset("FiscalNote/billsum")
ds = billsum_splits["test"].to_pandas()

In [3]:
# Read the per-summary score tables: `gold` holds the reference summaries and
# `gen` the generated ones. The first CSV column is used as the row index.
gold_scores_path = "../eval/gold_lftk.csv"
gen_scores_path = "../eval/gen_lftk.csv"
gold = pd.read_csv(gold_scores_path, index_col=0)
gen = pd.read_csv(gen_scores_path, index_col=0)

In [12]:
# Keep only the score columns: drop anything whose name contains "summary",
# the raw "text" column, and any column whose name contains "t_".
# NOTE(review): "t_" is a substring match anywhere in the name - confirm that
# it does not exclude a score column unintentionally.
score_cols = [
    name
    for name in gen.columns
    if not ("summary" in name or name == "text" or "t_" in name)
]
# Names the generated-score columns carry once suffixed with "_gen".
gen_cols = [name + "_gen" for name in score_cols]

In [95]:
# Absolute difference between generated and gold values for every score,
# stored as "<score>_delta". Rows are aligned on the frames' index labels.
delta = pd.DataFrame(
    {
        f"{score}_delta": (gen[score] - gold[score]).abs()
        for score in score_cols
        if "summary" not in score
    }
)

In [96]:
# Attach the generated scores, the gold scores and the original bill text to
# the delta frame.
# The chain starts from the "*_delta" columns only, so re-running this cell
# rebuilds the same frame instead of stacking duplicate "*_gen" / "*_gold" /
# "text" columns onto an already-merged `delta`.
delta_cols = [f"{c}_delta" for c in score_cols]
delta = (
    delta.loc[:, delta_cols]
    # Suffix each source's columns before joining, so the names never collide
    # and no rename-after-join step is needed.
    .join(gen.loc[:, score_cols].add_suffix("_gen"))
    .join(gold.loc[:, score_cols].add_suffix("_gold"))
    # Merge the original bill text as another column (joined on the row index).
    # NOTE(review): assumes the index of `ds` lines up with the CSV index - confirm.
    .join(ds.text)
)

In [97]:
# Display the merged frame: "*_delta", "*_gen" and "*_gold" score columns plus
# the bill text.
delta

Unnamed: 0,auto_delta,cole_delta,smog_delta,fogi_delta,fkgl_delta,fkre_delta,auto_gen,cole_gen,smog_gen,fogi_gen,fkgl_gen,fkre_gen,auto_gold,cole_gold,smog_gold,fogi_gold,fkgl_gold,fkre_gold,text
0,147.127,3.287,25.966,117.691,114.781,304.822,15.578,14.694,9.710,19.629,13.960,36.141,162.705,17.981,35.676,137.320,128.741,-268.681,SECTION 1. ENVIRONMENTAL INFRASTRUCTURE.\n\n ...
1,20.319,0.841,5.717,18.693,16.913,44.076,35.168,13.006,14.750,39.197,29.789,-0.360,14.849,13.847,9.033,20.504,12.876,43.716,That this Act may be cited as the ``Federal Fo...
2,3.950,0.236,0.674,4.031,3.116,6.489,7.327,9.435,5.713,8.969,6.720,71.107,11.277,9.671,6.387,13.000,9.836,64.618,SECTION 1. SHORT TITLE.\n\n This Act may be...
3,11.182,2.355,1.326,9.030,6.785,13.466,13.234,13.372,8.569,18.470,13.285,35.881,24.416,15.727,9.895,27.500,20.070,22.415,SECTION 1. SHORT TITLE.\n\n This Act may be...
4,6.565,0.122,2.879,6.159,5.572,15.849,17.014,14.438,9.895,20.667,15.239,33.305,23.579,14.560,12.774,26.826,20.811,17.456,SECTION 1. SHORT TITLE.\n\n This Act may be...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264,23.287,3.707,4.975,16.509,16.587,24.522,52.385,11.771,10.688,50.909,42.681,-27.485,29.098,15.478,5.713,34.400,26.094,-2.963,SECTION 1. PLACEMENT PROGRAMS FOR FEDERAL EMPL...
3265,0.875,1.092,3.761,1.142,0.506,3.626,19.351,12.418,5.713,25.429,18.457,25.073,20.226,13.510,9.474,26.571,18.963,21.447,SECTION 1. SHORT TITLE.\n\n This Act may be...
3266,4.052,0.247,3.706,3.377,3.132,8.477,30.448,13.942,14.842,30.607,25.446,11.032,26.396,13.695,11.136,27.230,22.314,19.509,SECTION 1. SHORT TITLE.\n\n This Act may be...
3267,1.408,6.968,3.305,3.779,2.106,30.685,20.431,12.715,10.688,23.135,18.088,30.836,21.839,19.683,13.993,26.914,20.194,0.151,SECTION 1. SHORT TITLE.\n\n This Act may be...


In [98]:
# Empty containers that will collect one DataFrame per score.
top_results = []
bottom_results = []

In [99]:
# For every score, rank the rows by that score's delta and keep the k rows at
# each end of the ranking.
# The result lists are (re)created here so that re-running this cell starts
# from empty lists rather than appending a second copy of every frame.
k = 2  # number of rows kept from each end of the ranking
top_results = []
bottom_results = []
for scx in score_cols:
    # Largest delta first.
    # NOTE: sort_values puts NaN last by default, so rows with a missing delta
    # would end up in the "bottom" slice.
    curr = delta.sort_values(f"{scx}_delta", ascending=False)
    top = curr.iloc[0:k, :]
    top_results.append(top)
    bottom = curr.iloc[-k:, :]
    bottom_results.append(bottom)

In [102]:
def create_output(results, result_type, out_dir="."):
    """Write one CSV per score with the selected rows and both summaries.

    Each frame in ``results`` is paired positionally with an entry of the
    notebook-level ``score_cols``. The generated and gold summaries are joined
    on the row index, the column order is reversed, and the frame is written
    to ``<out_dir>/<score>_<result_type>_delta.csv``.

    Parameters
    ----------
    results : list of pandas.DataFrame
        One frame per score, in the same order as ``score_cols``. ``zip``
        stops at the shorter sequence, so extra frames are ignored.
    result_type : str
        Label embedded in the file name (the notebook uses "top" and "bottom").
    out_dir : str or path-like, default "."
        Directory the CSV files are written to; created if it does not exist.
        The default keeps the original behavior of writing to the current
        working directory.

    Returns
    -------
    None
    """
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    for score_name, df in zip(score_cols, results):
        final = (
            # Join on summary columns
            df.join(gen.summary_generated)
            .join(gold.summary)
            # Rename columns to be consistent
            .rename(columns={
                "summary": "gold_summary",
                "summary_generated": "gen_summary",
            })
            # Reverse columns for readability (summaries and text come first)
            .loc[:, ::-1]
        )
        final.to_csv(out_dir / f"{score_name}_{result_type}_delta.csv")

In [103]:
# Export the CSV files for both ends of every score's ranking.
for result_label, result_frames in (("top", top_results), ("bottom", bottom_results)):
    create_output(result_frames, result_label)

In [104]:
# Inspect the top-k rows collected for the first entry of score_cols.
top_results[0]

Unnamed: 0,auto_delta,cole_delta,smog_delta,fogi_delta,fkgl_delta,fkre_delta,auto_gen,cole_gen,smog_gen,fogi_gen,fkgl_gen,fkre_gen,auto_gold,cole_gold,smog_gold,fogi_gold,fkgl_gold,fkre_gold,text
2785,181.095,3.156,29.162,144.162,141.669,375.743,12.971,12.795,6.514,19.101,12.492,42.372,194.066,15.951,35.676,163.263,154.161,-333.371,SECTION 1. SHORT TITLE.\n\n This Act may be...
0,147.127,3.287,25.966,117.691,114.781,304.822,15.578,14.694,9.71,19.629,13.96,36.141,162.705,17.981,35.676,137.32,128.741,-268.681,SECTION 1. ENVIRONMENTAL INFRASTRUCTURE.\n\n ...
