In [1]:
from rouge_score import rouge_scorer
from operator import add
import json
import pandas as pd
import nltk.data
import numpy as np

In [2]:
def load_jsonl(data_path):
    """Read a JSON Lines file into a list of decoded objects.

    Args:
        data_path: Path of the file to read. Every non-blank line must
            contain exactly one JSON document.

    Returns:
        A list holding one decoded value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(data_path, encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on an empty string.
        return [json.loads(line) for line in f if line.strip()]

In [3]:
# Load the TextRank summaries table from its CSV export.
textrank = pd.read_csv(filepath_or_buffer="textRank_summaries.csv")

In [4]:
# Remove the "Unnamed: 0" column (presumably the index saved with the CSV —
# TODO confirm). Rebinding instead of mutating in place, together with
# errors="ignore", keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError.
textrank = textrank.drop(columns="Unnamed: 0", errors="ignore")

In [5]:
# Read the candidate records (one JSON object per line) into a list.
cands = load_jsonl(data_path="reddit_data/reddit_rouge_100.jsonl")

In [6]:
# Flatten the loaded records into a DataFrame (rebinding `cands`).
cands = pd.json_normalize(data=cands)

In [7]:
# Left-join the candidate records onto the TextRank rows by source text,
# keeping every TextRank row.
df = pd.merge(textrank, cands, how="left", on="text")

In [8]:
def join_list(lst):
    """Concatenate the strings in ``lst`` into one string, separated by single spaces."""
    separator = ' '
    joined = separator.join(lst)
    return joined

In [9]:
# Build the 'bert_summary' column by space-joining each row's 'idx' value.
df['bert_summary'] = df['idx'].map(join_list)

In [10]:
# English Punkt tokenizer loaded from NLTK's data store.
# NOTE(review): newer NLTK releases may no longer ship this pickle resource —
# confirm it still loads with the installed version.
tokenizer = nltk.data.load('nltk:tokenizers/punkt/english.pickle')


def lead3(sent):
    """Return the first three tokenizer segments of ``sent`` joined by spaces.

    If the tokenizer yields fewer than three segments, all of them are used.
    """
    segments = tokenizer.tokenize(sent)
    leading = segments[:3]
    return ' '.join(leading)

In [11]:
# Build the 'lead3' column by applying lead3() to each row's 'text' value.
df['lead3'] = df['text'].map(lead3)

In [12]:
# Discard the candidate-file columns that are no longer needed. Rebinding
# `df` (rather than inplace=True) with errors="ignore" makes the cell
# idempotent: re-running it after the columns are gone is a no-op instead of
# a KeyError.
df = df.drop(columns=["summary", "scores", "idx"], errors="ignore")

In [13]:
# Read the MatchSum records (one JSON object per line) into a list.
matchsum = load_jsonl(data_path="reddit_data/matchsum_100.jsonl")

In [14]:
# Flatten the loaded records into a DataFrame (rebinding `matchsum`).
matchsum = pd.json_normalize(data=matchsum)

In [15]:
# Left-join the MatchSum records onto the running table by source text.
df = pd.merge(df, matchsum, how="left", on="text")

In [16]:
# Relabel the columns positionally; this requires `df` to have exactly six
# columns in this order.
df = df.set_axis(
    [
        'text',
        'gold_summary',
        'text_rank',
        'bert_summary',
        'lead3',
        'matchsum',
    ],
    axis="columns",
)

In [17]:
# Persist the combined table as JSON Lines: one record object per line.
df.to_json(
    path_or_buf="reddit_data/all_summaries.jsonl",
    orient='records',
    lines=True,
)

In [18]:
# Reload the combined summaries as a list of plain dicts for scoring.
data = load_jsonl(data_path="reddit_data/all_summaries.jsonl")

In [19]:
# Shared ROUGE scorer: ROUGE-1, ROUGE-2 and ROUGE-L with stemming enabled.
scorer = rouge_scorer.RougeScorer(
    rouge_types=['rouge1', 'rouge2', 'rougeL'],
    use_stemmer=True,
)

In [20]:
def calc_rouge(cand, gold):
    """Score a candidate summary against its reference with the module-level ``scorer``.

    Args:
        cand: Candidate (system-generated) summary text.
        gold: Reference summary text.

    Returns:
        A list with the F-measure of each entry in the scorer's result, in
        the order the result dict yields them.
    """
    # RougeScorer.score expects (target, prediction), i.e. the reference
    # first. F-measure is symmetric in precision and recall, so the values
    # returned here are unaffected, but precision/recall inside `scores` are
    # now oriented correctly.
    scores = scorer.score(target=gold, prediction=cand)
    return [s.fmeasure for s in scores.values()]

In [21]:
# Any 'matchsum' value that is not a plain str is replaced by its first
# element.
for record in data:
    value = record['matchsum']
    if type(value) is not str:
        record['matchsum'] = value[0]

In [22]:
def _mean_rouge_percent(entries, field):
    """Average the ROUGE F-measures of ``field`` against 'gold_summary', as percentages.

    Args:
        entries: Sequence of dicts holding ``field`` and 'gold_summary' texts.
        field: Key of the candidate summary to score in each entry.

    Returns:
        Three values (one per score returned by ``calc_rouge``), each the
        mean over ``entries`` multiplied by 100 and rounded to 3 decimals.
        All zeros when ``entries`` is empty.
    """
    totals = [0.0] * 3
    count = 0
    for entry in entries:
        scores = calc_rouge(entry[field], entry['gold_summary'])
        totals = [total + score for total, score in zip(totals, scores)]
        count += 1
    if count == 0:
        # Nothing to average; avoid dividing by zero.
        return totals
    # Normalise by the number of entries so the figures stay percentages for
    # any dataset size. With exactly 100 entries the factor is 1.0, so the
    # result equals the raw sums that were reported before.
    scale = 100 / count
    return [round(total * scale, 3) for total in totals]


# NOTE: `lead3` and `matchsum` rebind names used earlier in the notebook (the
# lead3() function and the MatchSum DataFrame). The names are kept because the
# following cell prints them; re-run the earlier cells before reusing them.
text_rank = _mean_rouge_percent(data, 'text_rank')
bert = _mean_rouge_percent(data, 'bert_summary')
lead3 = _mean_rouge_percent(data, 'lead3')
matchsum = _mean_rouge_percent(data, 'matchsum')

In [23]:
# One line per system, in the order: lead-3, TextRank, BERT, MatchSum.
print(lead3, text_rank, bert, matchsum, sep="\n")

[16.966, 2.547, 11.154]
[18.998, 3.322, 13.017]
[15.298, 3.149, 10.29]
[17.311, 2.667, 12.607]
