In [1]:
from rouge_score import rouge_scorer
from itertools import combinations
from nltk.tokenize import word_tokenize
import json
import pandas as pd

In [2]:
def load_jsonl(data_path):
    """Read a JSON Lines file into a list of decoded records.

    Args:
        data_path: Path to a file holding one JSON document per line.

    Returns:
        list: The decoded value of every non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly instead of relying on the platform's
    # locale-dependent default encoding.
    with open(data_path, encoding="utf-8") as f:
        # Blank lines (typically a trailing newline at end of file) carry no
        # record and would make json.loads raise, so they are skipped.
        return [json.loads(line) for line in f if line.strip()]

In [3]:
# Parse the candidate file into a list with one decoded element per line.
data = load_jsonl("reddit_data/reddit_cands_100.jsonl")

In [4]:
# One scorer reused for every candidate: ROUGE-1, ROUGE-2 and ROUGE-L, with stemming enabled.
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

In [5]:
# For every entry, build all 2- and 3-element combinations of the strings in
# entry['idx'], score each joined candidate against entry['summary'], and
# store the (candidate, score) pairs on the entry, best score first.
for entry in data:
    sentences = entry['idx']
    # Pairs first, then triples; combinations() keeps the input order inside
    # each tuple, so a candidate reads in the same order as entry['idx'].
    cands = [
        ' '.join(group)
        for size in (2, 3)
        for group in combinations(sentences, size)
    ]
    scores = []
    for cand in cands:
        # RougeScorer.score takes (target, prediction): the reference summary
        # is the target and the candidate is the prediction. F-measure is
        # symmetric in precision and recall, so the mean below is the same
        # either way, but precision/recall are only meaningful in this order.
        rouge_scores = scorer.score(entry['summary'], cand)
        # Mean F-measure over however many ROUGE variants the scorer reports.
        score = sum(s.fmeasure for s in rouge_scores.values()) / len(rouge_scores)
        scores.append((cand, score))
    # Stable sort: candidates with equal scores keep their generation order.
    scores.sort(key=lambda pair: pair[1], reverse=True)
    entry['scores'] = scores

In [6]:
# Turn the list of entries into a DataFrame so the columns can be padded below.
df_temp = pd.json_normalize(data)

In [7]:
# Longest text measured in tokens. `.str.len()` on the raw column counts
# characters, which is not the unit the padding below is counted in.
max_len = max((len(word_tokenize(t)) for t in df_temp['text']), default=0)


def pad_text(text, max_len=max_len):
    """Tokenize `text` and right-pad the token list with 0 up to `max_len`.

    Args:
        text: The raw string to tokenize.
        max_len: Target length in tokens. Bound when the function is defined,
            because other cells in this notebook rebind the global `max_len`.

    Returns:
        list: The tokens followed by integer 0 padding. A text that already
        has `max_len` or more tokens is returned unpadded and untruncated.
    """
    tokens = word_tokenize(text)
    # A non-positive difference multiplies to an empty list, so nothing is added.
    tokens.extend([0] * (max_len - len(tokens)))
    return tokens

In [8]:
# Replace each raw text string with its padded token list. The column is
# overwritten, so re-running this cell would feed token lists, not strings,
# back into pad_text.
df_temp['text'] = df_temp['text'].apply(pad_text)

In [9]:
# Longest summary measured in tokens. `.str.len()` on the raw column counts
# characters, which is not the unit the padding below is counted in.
max_len = max((len(word_tokenize(s)) for s in df_temp['summary']), default=0)


def pad_sum(text, max_len=max_len):
    """Tokenize a summary and right-pad the token list with 0 up to `max_len`.

    Args:
        text: The raw summary string to tokenize.
        max_len: Target length in tokens. Bound when the function is defined,
            because other cells in this notebook rebind the global `max_len`.

    Returns:
        list: The tokens followed by integer 0 padding. A summary that already
        has `max_len` or more tokens is returned unpadded and untruncated.
    """
    tokens = word_tokenize(text)
    # A non-positive difference multiplies to an empty list, so nothing is added.
    tokens.extend([0] * (max_len - len(tokens)))
    return tokens

In [10]:
# Replace each raw summary string with its padded token list. The column is
# overwritten, so re-running this cell would feed token lists, not strings,
# back into pad_sum.
df_temp['summary'] = df_temp['summary'].apply(pad_sum)

In [11]:
# Longest candidate measured in tokens, taken over every (candidate, score)
# pair in the frame. Measuring the candidates themselves avoids estimating
# the bound from character counts of the source sentences, and `default=0`
# keeps this from raising when there are no candidates at all.
# This tokenizes every candidate once here and once more in pad_scores.
max_len = max(
    (len(word_tokenize(cand)) for row in df_temp['scores'] for cand, _ in row),
    default=0,
)


def pad_scores(lst, max_len=max_len):
    """Tokenize and pad the candidate text of each (candidate, score) pair.

    Args:
        lst: Iterable of two-element (candidate string, score) pairs.
        max_len: Target length in tokens. Bound when the function is defined,
            because other cells in this notebook rebind the global `max_len`.

    Returns:
        list: (tokens, score) tuples in the same order as `lst`, where tokens
        is the tokenized candidate right-padded with integer 0 to `max_len`.
        The score is passed through unchanged.
    """
    result = []
    for cand, score in lst:
        tokens = word_tokenize(cand)
        # A non-positive difference multiplies to an empty list, so nothing is added.
        tokens.extend([0] * (max_len - len(tokens)))
        result.append((tokens, score))
    return result

In [12]:
# Replace each candidate string inside the score pairs with its padded token
# list. The column is overwritten, so re-running this cell would feed token
# lists, not strings, back into pad_scores.
df_temp['scores'] = df_temp['scores'].apply(pad_scores)

In [13]:
# Show the frame after all three columns have been tokenized and padded.
df_temp

Unnamed: 0,text,summary,idx,scores
0,"[this, actually, happened, a, couple, of, year...","[confuse, a, 5th, grade, girl, for, a, boy, in...",[i grew up in germany where i went to a german...,"[([i, grew, up, in, germany, where, i, went, t..."
1,"[it, was, last, october, ,, but, i, 'm, feelin...","[i, found, my, estranged, dad, ,, thought, i, ...","[it was last october, but i'm feeling the fall...","[([during, the, most, acute, months, of, griev..."
2,"[so, i, had, the, brilliant, idea, to, use, ve...","[had, my, balls, burned, by, sauron, and, was,...",[so i had the brilliant idea to use veet hair ...,"[([the, slight, peroxide, kinda, smell, ensued..."
3,"[today, i, was, going, to, have, a, bath, afte...","[peppermint, +, bath, =, burning, cold, ladybi...",[today i was going to have a bath after a long...,"[([today, i, was, going, to, have, a, bath, af..."
4,"[i, have, n't, had, a, bath, in, practically, ...","[got, too, high, and, too, hot, in, the, bath,...","[i haven't had a bath in practically years so,...","[([picture, this, ;, a, very, cramped, bathroo..."
...,...,...,...,...
95,"[a, bit, of, background, :, i, 'm, almost, cer...","[pooped, and, peed, all, over, a, porta-potty,...",[a bit of background: i'm almost certain that ...,"[([as, i, begin, ,, as, my, friend, once, put,..."
96,"[average, day, at, work, ,, not, too, busy, ,,...","[turn, around, for, a, split, second, ,, littl...","[average day at work, not too busy, not too sl...","[([i, 'm, getting, ready, to, go, on, my, brea..."
97,"[so, my, day, pretty, much, sucked, ., let, 's...","[i, ate, a, wendy, 's, frosty, ,, got, the, sh...",[let's start off with the shitty morning of me...,"[([we, drove, to, wendy, 's, and, ordered, fou..."
98,"[yesterday, evening, my, mother, and, i, were,...","[did, n't, sit, down, while, taking, off, my, ...",[yesterday evening my mother and i were walkin...,"[([for, some, odd, reason, ,, i, did, n't, sit..."


In [14]:
# Write the frame as JSON Lines: one JSON object per row, one row per line.
df_temp.to_json("reddit_data/reddit_rouge_100.jsonl", orient='records', lines=True)