In [1]:
import time
import spacy
import lftk
import pandas as pd
from datasets import load_dataset

In [4]:
# spaCy pipeline used to parse every summary (swap in any trained pipeline)
nlp = spacy.load("en_core_web_sm")

# Look up the feature keys of each LFTK family of interest, one family at a time
readformula, wordsent, worddiff = (
    lftk.search_features(family=family_name, return_format="list_key")
    for family_name in ("readformula", "wordsent", "worddiff")
)

# Every feature key that will be extracted, grouped by family
target_features = readformula + wordsent + worddiff

In [7]:
# gen: model-generated summaries, with the text column renamed to "summary"
gen = (
    pd.read_csv('../output/deliverable_2/pegasusbillsum_baseline.csv', usecols=['summary_generated'])
    .rename(columns={"summary_generated": "summary"})
)

# gold: human-written summaries from the billsum test split, minus the text/title columns
gold = (
    load_dataset("FiscalNote/billsum")["test"]
    .to_pandas()
    .drop(columns=['text', 'title'])
)

In [17]:
def lftk_eval(df):
    """Extract the target LFTK features for every summary in ``df``.

    Uses the notebook-level ``nlp`` pipeline and ``target_features`` list.

    Args:
        df: DataFrame with a ``summary`` column holding the texts to analyse.

    Returns:
        The result of ``lftk.Extractor.extract``: a list of dictionaries, where
        the dictionary at index i holds all feature values for the summary at
        index i.
    """
    # perf_counter is monotonic, so the elapsed time cannot be skewed by clock changes
    start_time = time.perf_counter()
    # nlp.pipe streams the texts through the pipeline in batches rather than
    # invoking the full pipeline once per summary
    docs = list(nlp.pipe(df.summary.tolist()))
    extractor = lftk.Extractor(docs=docs)
    extracted = extractor.extract(features=target_features)
    elapsed = time.perf_counter() - start_time
    # Neutral wording: this function is called for both generated and gold summaries
    print(f"Features extracted for {len(docs)} summaries in {elapsed} sec")
    return extracted

In [18]:
# Extract the linguistic features for both summary sets; lftk_eval prints how long each run took
print("generated summaries")
gen_lftk = lftk_eval(df=gen)
print("gold summaries")
gold_lftk = lftk_eval(df=gold)

generated summaries
Generated summary features extracted in 3287.916380882263 sec
gold summaries
Generated summary features extracted in 5005.549740076065 sec


In [20]:
def add_features(df, fdicts):
    """Return a copy of ``df`` with one column per extracted feature appended.

    The feature records are positional: ``fdicts[i]`` describes row ``i`` of
    ``df``. Rows are therefore paired by position, not by index label, so a
    frame with a filtered, shuffled or otherwise non-default index is handled
    correctly and keeps its original index in the result.

    Args:
        df: DataFrame of summaries.
        fdicts: Sequence of feature dictionaries, one per row of ``df``.

    Returns:
        A new DataFrame with the columns of ``df`` followed by the feature
        columns. ``df`` itself is not modified.

    Raises:
        ValueError: If the number of feature records differs from the number
            of rows in ``df`` (an index join would silently drop rows).
    """
    fdf = pd.DataFrame(fdicts)
    if len(fdf) != len(df):
        raise ValueError(
            f"expected {len(df)} feature records (one per row), got {len(fdf)}"
        )
    # Join on a fresh positional index so label mismatches cannot drop or misalign rows
    merged = df.reset_index(drop=True).merge(fdf, left_index=True, right_index=True)
    # Hand the caller's original index back
    merged.index = df.index
    return merged

In [22]:
# Attach the extracted feature columns to each summary frame
gen_final = add_features(df=gen, fdicts=gen_lftk)
gold_final = add_features(df=gold, fdicts=gold_lftk)

In [12]:
# Persist the per-summary feature tables as CSV files in the working directory
MODEL_NAME = "pegasusbillsum"
gen_final.to_csv(f"{MODEL_NAME}_baseline_lftk.csv")
gold_final.to_csv("gold_lftk.csv")

In [24]:
# Display basic descriptive statistics for the generated-summary feature table.
# The CSV exports are currently disabled (commented out); only the table is shown.
# NOTE(review): the disabled line below describes `gen`, which holds only the
# summary text column, not `gen_final` - presumably `gen_final` was intended;
# confirm before re-enabling.
# gen.describe().to_csv("report_gen_lftk.csv")
gen_final.describe()
# gold_final.describe().to_csv("report_gold_lftk.csv")
# gold_final.to_csv("gold_lftk.csv")

Unnamed: 0,fkre,fkgl,fogi,smog,cole,auto,t_word,t_stopword,t_punct,t_syll,t_syll2,t_syll3,t_uword,t_sent,t_char,t_kup,t_bry,t_subtlex_us_zipf
count,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0,3269.0
mean,5.292479,26.011575,32.256271,12.407292,14.809797,30.819341,140.939125,58.409299,19.563169,242.309575,36.700214,13.804833,81.702967,3.14041,744.964209,725.648951,581.660572,694.295961
std,39.147077,14.147657,14.545976,4.721686,2.464433,17.997384,67.205049,31.593493,11.750572,115.167318,18.685703,8.700924,29.539885,1.943082,350.606323,379.995074,307.658775,339.627031
min,-197.615,6.217,8.969,0.0,6.261,6.407,25.0,5.0,1.0,35.0,3.0,0.0,18.0,1.0,110.0,44.99,31.536,102.607
25%,-8.085,17.666,23.667,9.329,13.118,20.248,78.0,30.0,10.0,136.0,21.0,7.0,56.0,2.0,419.0,378.49,302.351,384.765
50%,12.846,22.472,28.619,11.892,14.684,26.103,140.0,56.0,18.0,242.0,35.0,12.0,84.0,3.0,745.0,707.88,561.962,683.621
75%,29.003,29.742,36.0,14.75,16.32,35.103,213.0,87.0,28.0,352.0,52.0,19.0,107.0,4.0,1094.0,1081.0,869.513,1035.688
max,81.956,97.404,105.737,36.579,24.759,122.723,251.0,135.0,85.0,509.0,98.0,64.0,148.0,22.0,1499.0,1517.12,1274.352,1317.443
