In [1]:
import pandas as pd
from datasets import load_dataset

In [12]:
from Levenshtein import distance

In [2]:
# Fetch the BillSum dataset and keep only its "test" split as a DataFrame;
# the `text` column of this frame is joined onto the score table further down.
billsum = load_dataset("FiscalNote/billsum")
ds = billsum["test"].to_pandas()

In [19]:
# Read the pre-computed score tables: `gold` holds the human-written
# summaries, `gen` the generated (baseline) ones. The first CSV column is
# used as the row index for both.
gold, gen = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("../eval/gold_lftk.csv", "../eval/gen_lftk.csv")
)

In [20]:
# Levenshtein distance between every generated summary and the gold summary
# at the same row position (pairs are matched positionally via zip, not by index).
result = [
    distance(gen_summary, gold_summary)
    for gold_summary, gen_summary in zip(
        gold.summary.tolist(), gen.summary_generated.tolist()
    )
]

In [4]:
# Keep only the score columns: drop anything whose name mentions "summary",
# the raw "text" column, and any column whose name contains "t_".
score_cols = [
    name
    for name in gen.columns
    if not ("summary" in name or name == "text" or "t_" in name)
]
# Same names with a "_gen" suffix
gen_cols = [f"{name}_gen" for name in score_cols]

In [83]:
# Per-score difference (generated minus gold), one "<score>_delta" column each.
# Rows are aligned on the index of the two score tables.
delta = pd.DataFrame(
    {
        f"{col}_delta": gen[col] - gold[col]
        for col in score_cols
        if "summary" not in col
    }
)

In [84]:
# Attach the raw generated/gold scores and the bill text to the delta frame.
#
# The frame is rebuilt from the "<score>_delta" columns only, so this cell can
# be re-run safely: previously a second run produced duplicate "_gen"/"_gold"
# columns and then failed on the already-present "text" column.
delta_cols = [f"{c}_delta" for c in score_cols]
delta = (
    delta.loc[:, delta_cols]
    # Generated-summary scores, renamed "<score>_gen"
    .join(gen.loc[:, score_cols].add_suffix("_gen"))
    # Gold-summary scores, renamed "<score>_gold"
    .join(gold.loc[:, score_cols].add_suffix("_gold"))
    # Merge the original bill text as another column (aligned on the index)
    .join(ds.text)
)

In [85]:
# Display the combined frame: "_delta", "_gen" and "_gold" score columns plus the bill text
delta

Unnamed: 0,fkre_delta,fkgl_delta,fogi_delta,smog_delta,cole_delta,auto_delta,fkre_gen,fkgl_gen,fogi_gen,smog_gen,cole_gen,auto_gen,fkre_gold,fkgl_gold,fogi_gold,smog_gold,cole_gold,auto_gold,text
0,304.822,-114.781,-117.691,-25.966,-3.287,-147.127,36.141,13.960,19.629,9.710,14.694,15.578,-268.681,128.741,137.320,35.676,17.981,162.705,SECTION 1. ENVIRONMENTAL INFRASTRUCTURE.\n\n ...
1,-44.076,16.913,18.693,5.717,-0.841,20.319,-0.360,29.789,39.197,14.750,13.006,35.168,43.716,12.876,20.504,9.033,13.847,14.849,That this Act may be cited as the ``Federal Fo...
2,6.489,-3.116,-4.031,-0.674,-0.236,-3.950,71.107,6.720,8.969,5.713,9.435,7.327,64.618,9.836,13.000,6.387,9.671,11.277,SECTION 1. SHORT TITLE.\n\n This Act may be...
3,13.466,-6.785,-9.030,-1.326,-2.355,-11.182,35.881,13.285,18.470,8.569,13.372,13.234,22.415,20.070,27.500,9.895,15.727,24.416,SECTION 1. SHORT TITLE.\n\n This Act may be...
4,15.849,-5.572,-6.159,-2.879,-0.122,-6.565,33.305,15.239,20.667,9.895,14.438,17.014,17.456,20.811,26.826,12.774,14.560,23.579,SECTION 1. SHORT TITLE.\n\n This Act may be...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3264,-24.522,16.587,16.509,4.975,-3.707,23.287,-27.485,42.681,50.909,10.688,11.771,52.385,-2.963,26.094,34.400,5.713,15.478,29.098,SECTION 1. PLACEMENT PROGRAMS FOR FEDERAL EMPL...
3265,3.626,-0.506,-1.142,-3.761,-1.092,-0.875,25.073,18.457,25.429,5.713,12.418,19.351,21.447,18.963,26.571,9.474,13.510,20.226,SECTION 1. SHORT TITLE.\n\n This Act may be...
3266,-8.477,3.132,3.377,3.706,0.247,4.052,11.032,25.446,30.607,14.842,13.942,30.448,19.509,22.314,27.230,11.136,13.695,26.396,SECTION 1. SHORT TITLE.\n\n This Act may be...
3267,30.685,-2.106,-3.779,-3.305,-6.968,-1.408,30.836,18.088,23.135,10.688,12.715,20.431,0.151,20.194,26.914,13.993,19.683,21.839,SECTION 1. SHORT TITLE.\n\n This Act may be...


In [86]:
# Three independent, initially empty containers; each will receive one
# DataFrame slice per score column.
top_results, bottom_results, mid_results = [], [], []

In [87]:
# For every score, rank the rows by that score's delta (largest first) and keep
# the k highest, the k lowest and the k rows in the middle of the ranking.
k = 2
# First row of the middle window. It is derived from `delta`, the frame that is
# actually sliced below, and clamped at 0 so a frame shorter than k cannot turn
# it into a negative (from-the-end) offset.
midpoint = max((len(delta) - k) // 2, 0)

# Start from empty lists every time this cell runs; otherwise a re-run appends
# a second set of slices and the lists no longer line up with `score_cols`.
top_results, bottom_results, mid_results = [], [], []

for scx in score_cols:
    curr = delta.sort_values(f"{scx}_delta", ascending=False)
    top = curr.iloc[:k, :]
    top_results.append(top)
    bottom = curr.iloc[-k:, :]
    bottom_results.append(bottom)
    mid = curr.iloc[midpoint:midpoint + k]
    mid_results.append(mid)

In [81]:
def create_output(results, result_type):
    """Write one CSV per score with the summaries attached to each slice.

    Args:
        results: Sequence of DataFrames, paired positionally with the
            module-level ``score_cols`` (pairing stops at the shorter of the two).
        result_type: Label embedded in each output file name.

    Returns:
        None. Files named ``{score}_{result_type}_delta_no_.csv`` are written
        relative to the current working directory.
    """
    # Consistent names for the two summary columns
    summary_labels = {
        "summary": "gold_summary",
        "summary_generated": "gen_summary",
    }
    for score_name, frame in zip(score_cols, results):
        # Pull in the generated and gold summaries for the rows in this slice
        with_summaries = frame.join(gen.summary_generated).join(gold.summary)
        with_summaries = with_summaries.rename(columns=summary_labels)
        # Reverse the column order for readability before writing
        reversed_cols = with_summaries.iloc[:, ::-1]
        reversed_cols.to_csv(f"{score_name}_{result_type}_delta_no_.csv")

In [82]:
# Only the mid-range slices are exported by this cell; the top/bottom exports
# are disabled (uncomment the calls below to write those CSVs as well).
# create_output(top_results, "top")
# create_output(bottom_results, "bottom")
create_output(mid_results, "mid")

In [104]:
# Inspect the first frame appended to top_results (the slice for the first
# entry of score_cols, provided the lists were filled by a single run of the loop)
top_results[0]

Unnamed: 0,auto_delta,cole_delta,smog_delta,fogi_delta,fkgl_delta,fkre_delta,auto_gen,cole_gen,smog_gen,fogi_gen,fkgl_gen,fkre_gen,auto_gold,cole_gold,smog_gold,fogi_gold,fkgl_gold,fkre_gold,text
2785,181.095,3.156,29.162,144.162,141.669,375.743,12.971,12.795,6.514,19.101,12.492,42.372,194.066,15.951,35.676,163.263,154.161,-333.371,SECTION 1. SHORT TITLE.\n\n This Act may be...
0,147.127,3.287,25.966,117.691,114.781,304.822,15.578,14.694,9.71,19.629,13.96,36.141,162.705,17.981,35.676,137.32,128.741,-268.681,SECTION 1. ENVIRONMENTAL INFRASTRUCTURE.\n\n ...
