In [1]:
import time
import spacy
import lftk
import pandas as pd
from datasets import load_dataset

In [3]:
# spaCy pipeline used to parse each text before feature extraction
# (any trained English pipeline can be substituted here)
nlp = spacy.load("en_core_web_sm")

# feature keys of LFTK's "readformula" family, returned in "list_key" format;
# further linguistic feature families can be added later
metrics = lftk.search_features(return_format="list_key", family="readformula")

In [14]:
# Generated summaries: only the `summary_generated` column is read from the baseline CSV.
data = pd.read_csv(
    '../output/baseline_test.csv',
    usecols=['summary_generated'],
)

# Human-written (gold) summaries: the "test" split of the BillSum dataset as a DataFrame.
gold = load_dataset("FiscalNote/billsum")["test"].to_pandas()

In [4]:
def readability_score_single(text):
    """Compute the LFTK features listed in ``metrics`` for a single text.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and the
    resulting document is handed to ``lftk.Extractor``.

    Parameters
    ----------
    text : str
        Text to score.

    Returns
    -------
    The value produced by ``Extractor.extract`` for ``metrics`` -- a
    dictionary of readability scores.
    """
    extractor = lftk.Extractor(docs=nlp(text))
    return extractor.extract(features=metrics)

def readability_eval(df, column):
    """Score every text in ``df[column]`` and store the results in ``df["scores"]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts; it is modified in place (a ``"scores"``
        column is added or overwritten).
    column : str
        Name of the column in ``df`` that contains the text to score.

    Returns
    -------
    None
        The elapsed time in seconds is printed as a side effect.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments the way a difference of time.time() values can.
    started = time.perf_counter()
    df["scores"] = df[column].apply(readability_score_single)
    print(f"{time.perf_counter() - started} sec to run")

In [9]:
def convert_cols(df, column):
    """Replace a column of dictionaries with one new column per dictionary key.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column in ``df`` whose values are dictionaries.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original columns (minus ``column``) followed by
        the exploded keys, carrying the same index as ``df``.
    """
    # Normalize from a plain list and re-attach df's own index: a freshly
    # numbered 0..n-1 index would misalign in the join whenever df is not
    # indexed by a default RangeIndex (e.g. after filtering or sorting).
    expanded = pd.json_normalize(df[column].tolist())
    expanded.index = df.index
    return df.join(expanded).drop(columns=column)

In [6]:
# Score the generated summaries first, then the gold summaries; each call
# adds a "scores" column to its frame and prints the elapsed time.
readability_eval(df=data, column="summary_generated")
readability_eval(df=gold, column="summary")

82.2096209526062 sec to run
103.58868312835693 sec to run


In [12]:
# Expand the per-row score dictionaries into one column per metric.
# `gen` is used by later cells, so this conversion must actually run.
# `data` is left untouched by convert_cols, so building `gen` is safe to
# repeat; `gold` is rebound, so the guard keeps this cell re-runnable after
# its "scores" column has already been expanded.
gen = convert_cols(data, "scores")
if "scores" in gold.columns:
    gold = convert_cols(gold, "scores")

In [25]:
# Remove the `text` and `title` columns from the gold frame.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run without raising once the columns are already gone.
gold = gold.drop(columns=['text', 'title'], errors='ignore')

Unnamed: 0,summary,fkre,fkgl,fogi,smog,cole,auto
0,Amends the Water Resources Development Act of ...,-268.681,128.741,137.320,35.676,17.981,162.705
1,Federal Forage Fee Act of 1993 - Subjects graz...,43.716,12.876,20.504,9.033,13.847,14.849
2,. Merchant Marine of World War II Congression...,64.618,9.836,13.000,6.387,9.671,11.277
3,Small Business Modernization Act of 2004 - Ame...,22.415,20.070,27.500,9.895,15.727,24.416
4,Fair Access to Investment Research Act of 2016...,17.456,20.811,26.826,12.774,14.560,23.579
...,...,...,...,...,...,...,...
3264,Public Servant Priority Placement Act of 1995 ...,-2.963,26.094,34.400,5.713,15.478,29.098
3265,Sportsmanship in Hunting Act of 2008 - Amends ...,21.447,18.963,26.571,9.474,13.510,20.226
3266,Helping College Students Cross the Finish Line...,19.509,22.314,27.230,11.136,13.695,26.396
3267,Makes proceeds from such conveyances available...,0.151,20.194,26.914,13.993,19.683,21.839


In [23]:
# Write both score tables to CSV without the row index, using a backslash
# as the escape character.
gen.to_csv(
    "gen_all_readability_scores.csv",
    escapechar='\\',
    index=False,
)
gold.to_csv(
    "gold_all_readability_scores.csv",
    escapechar='\\',
    index=False,
)

In [18]:
# List the bill text of every gold row whose summary mentions this act.
# NOTE(review): this needs the `text` column, which the `gold.drop(...)` cell
# earlier in the file removes -- confirm the intended execution order.
gold[
    gold["summary"].str.contains("Children's Bicycle Helmet Safety Act of 1993")
].text.tolist()

["SECTION 1. SHORT TITLE.\n\n    This Act may be cited as the ``Children's Bicycle Helmet Safety Act \nof 1993''.\n\nSEC. 2. FINDINGS.\n\n    The Congress finds that--\n            (1) 90 million Americans ride bicycles and 20 million ride \n        a bicycle more than once a week;\n            (2) between 1984 and 1988, 2,985 bicyclists in the United \n        States died from head injuries and 905,752 suffered head \n        injuries that were treated in hospital emergency rooms;\n            (3) 41 percent of bicycle-related head injury deaths and 76 \n        percent of bicycle-related head injuries occurred among \n        American children under age 15;\n            (4) deaths and injuries from bicycle accidents cost society \n        $7.6 billion annually; and a child suffering from a head \n        injury, on average, will cost society $4.5 million over the \n        child's lifetime;\n            (5) universal use of bicycle helmets in the United States \n        would have pr

In [20]:
# Put the summary statistics of the gold and generated scores side by side,
# keeping values that are equal in both frames.
# gold.describe().to_csv("gold_describe.csv")
gold.describe().compare(
    gen.describe(),
    align_axis=1,
    keep_equal=True,
    result_names=("gold", "generated"),
)

# aggr.to_csv("baseline_gold_compare.csv", index=False)

Unnamed: 0_level_0,fkre,fkre,fkgl,fkgl,fogi,fogi,smog,smog,cole,cole,auto,auto
Unnamed: 0_level_1,gold,generated,gold,generated,gold,generated,gold,generated,gold,generated,gold,generated
mean,12.950785,5.292479,22.548614,26.011575,28.701746,32.256271,11.587442,12.407292,15.162678,14.809797,26.354612,30.819341
std,28.561424,39.147077,10.030598,14.147657,10.336453,14.545976,3.764673,4.721686,2.216489,2.464433,12.738404,17.997384
min,-333.371,-197.615,6.311,6.217,10.15,8.969,0.0,0.0,7.802,6.261,5.72,6.407
25%,2.67,-8.085,16.792,17.666,22.815,23.667,9.329,9.329,13.66,13.118,19.089,20.248
50%,17.009,12.846,20.577,22.472,26.733,28.619,11.22,11.892,15.045,14.684,23.714,26.103
75%,29.321,29.003,25.451,29.742,31.892,36.0,13.398,14.75,16.556,16.32,29.975,35.103
max,80.098,81.956,154.161,97.404,163.263,105.737,35.676,36.579,25.251,24.759,194.066,122.723


In [15]:
# Summary statistics of the columns in `gen` (shown as the cell output).
# Optional export of the same table, currently disabled:
# gen.describe().to_csv("gen_describe.csv")
gen.describe()

Unnamed: 0,fkre,fkgl,fogi,smog,cole,auto
count,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0
mean,5.292479,26.011575,32.256271,12.407292,14.809797,30.819341
std,39.147077,14.147657,14.545976,4.721686,2.464433,17.997384
min,-197.615,6.217,8.969,0.0,6.261,6.407
25%,-8.085,17.666,23.667,9.329,13.118,20.248
50%,12.846,22.472,28.619,11.892,14.684,26.103
75%,29.003,29.742,36.0,14.75,16.32,35.103
max,81.956,97.404,105.737,36.579,24.759,122.723


In [2]:
# Reload the score tables saved as CSV (gold first, then generated).
gold_scores, gen_scores = (
    pd.read_csv(csv_path)
    for csv_path in (
        "gold_all_readability_scores.csv",
        "gen_all_readability_scores.csv",
    )
)

In [25]:
# Feature keys of LFTK's "worddiff" family, plus one extra key added by hand.
flist = lftk.search_features(return_format="list_key", family="worddiff")
flist.extend(["t_n_ent_law"])
flist

['t_kup', 't_bry', 't_subtlex_us_zipf', 't_n_ent_law']

In [32]:
def feature_single(text, feat):
    """Compute one LFTK feature for a single text and return its value.

    The text is parsed with the module-level spaCy pipeline ``nlp``; the
    extractor is asked for ``feat`` only and the entry stored under that key
    is returned (not the whole result mapping).

    Parameters
    ----------
    text : str
        Text to analyse.
    feat : str
        Key of the LFTK feature to extract.

    Returns
    -------
    The value stored under ``feat`` in the extractor's result.
    """
    extractor = lftk.Extractor(docs=nlp(text))
    return extractor.extract(features=[feat])[feat]

def feature_eval(df, column, features=None):
    """Add one column per LFTK feature to ``df``, timing each feature separately.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the texts; it is modified in place (one column is added
        or overwritten per feature key).
    column : str
        Name of the column in ``df`` that contains the text to analyse.
    features : iterable of str, optional
        Feature keys to compute. Defaults to the module-level ``flist``, which
        preserves the behaviour of existing two-argument calls.

    Returns
    -------
    None
        For each feature, its key and the elapsed seconds are printed.
    """
    selected = flist if features is None else features
    for name in selected:
        print(name)
        # perf_counter is monotonic, so the measured duration cannot be skewed
        # by system clock adjustments the way time.time() differences can.
        started = time.perf_counter()
        df[name] = df[column].apply(lambda text: feature_single(text, name))
        print(f"{time.perf_counter() - started} sec to run")

In [33]:
# Compute every feature in `flist` for the generated summaries, then for the
# gold summaries; each call adds the feature columns to its frame in place.
feature_eval(df=gen_scores, column="summary_generated")
feature_eval(df=gold_scores, column="summary")

t_kup
997.7767179012299 sec to run
t_bry
1341.2437300682068 sec to run
t_subtlex_us_zipf
2147.247776031494 sec to run
t_n_ent_law
103.05808711051941 sec to run
t_kup
4688.640254974365 sec to run
t_bry
3580.801120996475 sec to run
t_subtlex_us_zipf
2131.8419280052185 sec to run
t_n_ent_law
100.15771007537842 sec to run


In [36]:
# Copy the "t_"-prefixed feature columns from the score frames into the final
# frames. A prefix test is used because a plain substring test ("t_" in c)
# would also pick up any name that merely contains "t_", e.g. "root_ttr".
# NOTE(review): gen_final / gold_final are assigned in a cell further down this
# file -- confirm they exist before this cell runs in a top-to-bottom execution.
for c in gen_scores.columns:
    if c.startswith("t_"):
        gen_final[c] = gen_scores[c]
        gold_final[c] = gold_scores[c]

In [10]:
# Expand the "features" column of each score frame into one column per key.
# NOTE(review): no visible cell creates a "features" column on these frames --
# confirm where it comes from before relying on this cell.
gen_final = convert_cols(df=gen_scores, column="features")
gold_final = convert_cols(df=gold_scores, column="features")

In [12]:
# Persist both feature tables as CSV (no index=False here, so the row index
# is written as well).
gen_final.to_csv(path_or_buf="gen_readformula_wordsent_worddiff_features.csv")
gold_final.to_csv(path_or_buf="gold_readability_wordsent_features.csv")

In [39]:
# Summary statistics for the columns of gen_final whose names appear in flist.
gen_final.loc[:, gen_final.columns.isin(flist)].describe()

Unnamed: 0,t_kup,t_bry,t_subtlex_us_zipf,t_n_ent_law
count,3269.0,3269.0,3269.0,3269.0
mean,725.649061,581.66047,694.295961,0.222086
std,379.991266,307.656867,339.627031,0.54108
min,44.99,31.536,102.607,0.0
25%,378.49,302.351,384.765,0.0
50%,707.88,561.962,683.621,0.0
75%,1081.0,869.513,1035.688,0.0
max,1517.12,1274.352,1317.443,5.0


In [40]:
# Summary statistics for the columns of gold_final whose names appear in flist.
gold_final.loc[:, gold_final.columns.isin(flist)].describe()

Unnamed: 0,t_kup,t_bry,t_subtlex_us_zipf,t_n_ent_law
count,3269.0,3269.0,3269.0,3269.0
mean,958.43149,764.343065,882.93486,0.298256
std,650.269915,523.75416,577.809586,0.686511
min,25.08,29.5,48.67,0.0
25%,476.33,372.024,452.264,0.0
50%,829.15,658.951,763.686,0.0
75%,1289.52,1029.839,1188.576,0.0
max,4478.61,3642.328,3995.888,7.0


In [44]:
# Save the final LFTK feature tables as CSV (row index included).
gen_final.to_csv(path_or_buf="gen_lftk.csv")
gold_final.to_csv(path_or_buf="gold_lftk.csv")