# Extractive Baseline

Data: News Summary (`news_summary_more.csv`)

In [1]:
import json
from pathlib import Path

import gdown
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm.auto import tqdm
# Register tqdm's pandas integration; the notebook later calls
# `DataFrame.progress_apply`, which this call makes available.
tqdm.pandas()
from datasets import load_metric
# ROUGE scorer used in the final cell to compare predictions with references.
metric = load_metric("rouge")

In [2]:
# Load the news-summary CSV (decoded as latin-1) into `df`; the rest of the
# notebook reads its `headlines` and `text` columns.
df = pd.read_csv('../csci-544-project/data/news_summary_more.csv', encoding='latin-1')
# Preview the first rows.
df.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [3]:
def _english_stopwords():
    """Return the NLTK English stop-word set, loading the corpus only once.

    `extractive_apply` runs once per DataFrame row, so the word list is cached
    on the function object instead of being re-read for every row.
    """
    cached = getattr(_english_stopwords, '_cache', None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def extractive_apply(row):
    """Build a frequency-based extractive summary for one row.

    Every sentence of ``row['text']`` is scored by summing the in-article
    frequency of each non-stop-word token that occurs in the sentence. The
    highest-scoring sentences are then returned in document order, each
    wrapped as ``'<S> sentence </S>'``. The number of sentences selected
    equals the number of sentences in ``row['headlines']``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide string values under the keys ``'text'`` and
        ``'headlines'``.

    Returns
    -------
    list of str or float
        The tagged summary sentences, or ``np.nan`` when no summary can be
        built: empty text, fewer text sentences than headline sentences, or
        fewer scored sentences than requested.
    """
    sentences = sent_tokenize(row['text'])
    target_len = len(sent_tokenize(row['headlines']))

    if not sentences or not sentences[0]:
        return np.nan
    if len(sentences) < target_len:
        return np.nan

    # Token frequencies over the whole article, ignoring stop words.
    stop_words = _english_stopwords()
    freq_table = {}
    for token in word_tokenize(' '.join(sentences)):
        token = token.lower()
        if token in stop_words:
            continue
        freq_table[token] = freq_table.get(token, 0) + 1

    # Score each distinct sentence exactly once; a sentence repeated in the
    # article must not have its score added again for every occurrence.
    # NOTE(review): matching is by substring (e.g. 'art' also matches
    # 'party', and punctuation tokens match most sentences). Kept unchanged
    # so scores stay comparable with earlier runs.
    sentence_value = {}
    for sentence in sentences:
        if sentence in sentence_value:
            continue
        lowered = sentence.lower()  # lower-case once per sentence, not per word
        hits = [freq for word, freq in freq_table.items() if word in lowered]
        if hits:
            # Sentences without any matching token are left unscored.
            sentence_value[sentence] = sum(hits)

    try:
        # Score of the `target_len`-th best sentence = selection threshold.
        min_val = sorted(sentence_value.values(), reverse=True)[target_len - 1]
    except IndexError:
        # Fewer scored sentences than requested: no summary for this row.
        return np.nan

    # Collect qualifying sentences in document order, without duplicates.
    summary = []
    for sentence in sentences:
        if len(summary) == target_len:
            break
        if sentence in sentence_value and sentence_value[sentence] >= min_val:
            tagged = '<S> ' + sentence + ' </S>'
            if tagged not in summary:
                summary.append(tagged)

    assert len(summary) == target_len, f"summary_len={len(summary)}, target_len={target_len}"

    return summary

In [4]:
# Inspect a single example row (positional index 2) of the loaded frame.
df.iloc[2]

headlines    New Zealand end Rohit Sharma-led India's 12-ma...
text         New Zealand defeated India by 8 wickets in the...
Name: 2, dtype: object

In [5]:
# Run `extractive_apply` on every row (axis=1) with a tqdm progress bar.
# Each element is either a list of '<S> ... </S>' sentences or NaN.
extractive_summaries = df.progress_apply(extractive_apply, axis=1)

  0%|          | 0/98401 [00:00<?, ?it/s]

In [6]:
# Index labels of the rows for which no summary could be produced (NaN).
na_idx = extractive_summaries.index[extractive_summaries.isna().to_numpy()]

In [7]:
# Comparison frame: generated summaries next to their reference headlines,
# restricted to the rows that actually received a summary.
df_comp = pd.DataFrame({
    'predictions': extractive_summaries[~extractive_summaries.index.isin(na_idx)],
    'references': df['headlines'][~df.index.isin(na_idx)],
})

In [8]:
# Display the predictions/references comparison frame.
df_comp

Unnamed: 0,predictions,references
0,"[<S> Saurav Kant, an alumnus of upGrad and III...",upGrad learner switches to career in ML & Al w...
1,[<S> Users get one CRED coin per rupee of bill...,Delhi techie wins free food from Swiggy for on...
2,[<S> New Zealand defeated India by 8 wickets i...,New Zealand end Rohit Sharma-led India's 12-ma...
3,"[<S> Also, customers have options to insure ag...",Aegon life iTerm insurance plan helps customer...
4,[<S> Speaking about the sexual harassment alle...,"Have known Hirani for yrs, what if MeToo claim..."
...,...,...
98396,"[<S> As per preliminary information, Maoists a...",CRPF jawan axed to death by Maoists in Chhatti...
98397,"[<S> 'Uff Yeh', the first song from the Sonaks...",First song from Sonakshi Sinha's 'Noor' titled...
98398,"[<S> According to reports, a new version of th...",'The Matrix' film to get a reboot: Reports
98399,[<S> The video also shows a TV airing a news c...,Snoop Dogg aims gun at clown dressed as Trump ...


In [9]:
def join_sent_apply(row):
    """Turn a row's tagged summary sentences into one plain-text string.

    Each element of ``row['predictions']`` is expected to look like
    ``'<S> sentence </S>'``. The opening tag ``'<S> '`` and the closing tag
    ``' </S>'`` are removed by their actual lengths, and the sentences are
    joined with a single space. Strings that do not carry a tag are passed
    through unchanged instead of being truncated.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide ``'predictions'`` (iterable of str) and ``'references'``.

    Returns
    -------
    tuple
        ``(joined_prediction, row['references'])``.
    """
    open_tag, close_tag = '<S> ', ' </S>'
    pred_sents = []
    for tagged in row['predictions']:
        if tagged.startswith(open_tag):
            tagged = tagged[len(open_tag):]
        if tagged.endswith(close_tag):
            # The closing tag is 5 characters long including its leading space.
            tagged = tagged[:-len(close_tag)]
        pred_sents.append(tagged)
    return ' '.join(pred_sents), row['references']

In [10]:
# `join_sent_apply` returns a (prediction, reference) tuple per row;
# `zip(*...)` transposes those tuples into two sequences, one per new column.
df_comp['predictions_joined'], df_comp['references_joined'] = zip(*df_comp.progress_apply(join_sent_apply, axis=1))

  0%|          | 0/98397 [00:00<?, ?it/s]

In [13]:
# Persist the comparison frame. The output directory is created first so the
# cell also succeeds on a fresh checkout where `output/` does not exist yet.
output_path = Path('output/extraction-news.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_comp.to_pickle(output_path)

In [12]:
# Score the joined extractive summaries against the reference headlines with ROUGE.
metric.compute(predictions=df_comp['predictions_joined'].to_list(), references=df_comp['references_joined'].to_list())

{'rouge1': AggregateScore(low=Score(precision=0.21776343610935106, recall=0.5862797360228751, fmeasure=0.314407219386756), mid=Score(precision=0.21844842987437374, recall=0.5879668743116022, fmeasure=0.3153243359918708), high=Score(precision=0.2191383002443954, recall=0.5895324458572432, fmeasure=0.31624524484732414)),
 'rouge2': AggregateScore(low=Score(precision=0.0865993869630295, recall=0.24549666133952514, fmeasure=0.12663054994807632), mid=Score(precision=0.08716613853182378, recall=0.246945718144775, fmeasure=0.12742833686344712), high=Score(precision=0.08760164938922263, recall=0.24817676803616742, fmeasure=0.12804766635390682)),
 'rougeL': AggregateScore(low=Score(precision=0.1828095969111098, recall=0.4906118779088201, fmeasure=0.2636874269534128), mid=Score(precision=0.18342449487414403, recall=0.4921718048698779, fmeasure=0.2645352723115651), high=Score(precision=0.18398709458881735, recall=0.4935784267154839, fmeasure=0.26532149016034295)),
 'rougeLsum': AggregateScore(low