In [1]:
import json
from pathlib import Path

import gdown
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm.auto import tqdm
# Enable tqdm's pandas integration; `df.progress_apply` is called later in this notebook.
tqdm.pandas()
from datasets import load_metric
# ROUGE metric object; `metric.compute(predictions=..., references=...)` is called on it below.
# NOTE(review): `load_metric` is reportedly deprecated in newer `datasets` releases in favour
# of the separate `evaluate` package — confirm against the pinned `datasets` version.
metric = load_metric("rouge")

In [2]:
# Location of the headline/article CSV, relative to the notebook's working directory.
news_csv_path = '../csci-544-project/data/news_summary_more.csv'

# NOTE(review): a sampled article printed further down in this notebook contains garbled
# characters (e.g. "200 ...C hotter" with stray accented letters before the "C"), which
# looks like mojibake from decoding with latin-1 — confirm the file's real encoding.
df = pd.read_csv(news_csv_path, encoding='latin-1')

# Preview the first few rows (columns: headlines, text).
df.head()

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...


In [3]:
import spacy
# NOTE(review): `pytextrank` is never referenced by name in this notebook; presumably the
# import is required for its side effect of registering the "textrank" pipeline factory
# used by `nlp.add_pipe("textrank")` below — confirm before removing it as "unused".
import pytextrank

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
# (documents processed by `nlp` are then read through `doc._.textrank` further down)
nlp.add_pipe("textrank")

def text_rank_apply(row, limit_phrases=15, limit_sentences=5, pipeline=None):
    """Build an extractive TextRank summary for a single row.

    Parameters
    ----------
    row : mapping-like
        Object supporting ``row['text']`` (for example the row handed over by
        ``DataFrame.apply(..., axis=1)``); that entry is the text to summarise.
    limit_phrases : int, default 15
        Passed through to ``doc._.textrank.summary`` as ``limit_phrases``.
    limit_sentences : int, default 5
        Passed through to ``doc._.textrank.summary`` as ``limit_sentences``.
    pipeline : callable, optional
        Callable that turns a text into a document exposing ``doc._.textrank``.
        When omitted, the notebook-level ``nlp`` pipeline is used.

    Returns
    -------
    str
        The sentences yielded by the TextRank summary, joined by single
        spaces; an empty string when no sentence is yielded.
    """
    if pipeline is None:
        # Resolved at call time so the function picks up the notebook's pipeline.
        pipeline = nlp

    doc = pipeline(row['text'])
    sentences = doc._.textrank.summary(
        limit_phrases=limit_phrases,
        limit_sentences=limit_sentences,
    )

    # The previous `summary += str(sent)` loop glued sentences together with no
    # separator at all. Strip each piece and join with one space so sentence
    # boundaries survive regardless of how each sentence object stringifies.
    return ' '.join(str(sent).strip() for sent in sentences)

In [4]:
# Summarise every article: `axis=1` hands each row to `text_rank_apply`, and
# `progress_apply` (available after `tqdm.pandas()`) shows a progress bar while it runs.
extractive_summaries = df.progress_apply(text_rank_apply, axis=1)

  0%|          | 0/98401 [00:00<?, ?it/s]

In [5]:
# Pair each generated summary with its reference headline; both Series come from
# `df`, so they line up on the same row index.
df_comp = pd.DataFrame({
    'predictions': extractive_summaries,
    'references': df['headlines'],
})

In [7]:
# Save the prediction/reference frame to disk.
# `to_pickle` does not create missing directories, so make sure the target
# folder exists first; otherwise a fresh checkout fails on this cell.
output_path = Path('output/textrank-news.pkl')
output_path.parent.mkdir(parents=True, exist_ok=True)
df_comp.to_pickle(output_path)

In [8]:
# Score the generated summaries against the reference headlines with ROUGE.
rouge_scores = metric.compute(
    predictions=df_comp['predictions'].tolist(),
    references=df_comp['references'].tolist(),
)
rouge_scores

{'rouge1': AggregateScore(low=Score(precision=0.12526050274659903, recall=0.7538163904220454, fmeasure=0.21420688613138872), mid=Score(precision=0.12545410170809312, recall=0.7547991588201832, fmeasure=0.21453000953169782), high=Score(precision=0.12564711647443483, recall=0.7557934787187115, fmeasure=0.21484591820515053)),
 'rouge2': AggregateScore(low=Score(precision=0.05193017388096755, recall=0.3399436995510668, fmeasure=0.08979896701872636), mid=Score(precision=0.052144962448686286, recall=0.34124996662383167, fmeasure=0.09016632387033698), high=Score(precision=0.05234694569506965, recall=0.34252881444874606, fmeasure=0.0905158021766391)),
 'rougeL': AggregateScore(low=Score(precision=0.10508546030412694, recall=0.6341546047846929, fmeasure=0.17977954978592156), mid=Score(precision=0.10527401312792445, recall=0.6352261097992662, fmeasure=0.180095099153321), high=Score(precision=0.10547898229433407, recall=0.6364313275078537, fmeasure=0.18043958223838585)),
 'rougeLsum': AggregateSc

In [15]:
# Show the article text of one randomly drawn row (no seed is set, so the row
# differs from run to run).
df.sample()['text'].item()

'An MIT study has found Earth harboured a mantle which was 200 Ã\x82ÂºC hotter 3 billion years ago, while the crust was composed of much denser stuff. The combination of a hotter mantle and denser rocks likely caused tectonic plates to sink to the mantle\'s bottom, 2,800 km below the surface, forming a "graveyard" of slabs atop the Earth\'s core.'

In [22]:
# Pick one random article, run it through the pipeline and plot its key phrases.
# `doc` and `tr` are reused by later cells.
sample_text = df.sample()['text'].item()
doc = nlp(sample_text)
tr = doc._.textrank
tr.plot_keyphrases()

In [37]:
# Tabulate the phrases attached to `doc`: one row per phrase built from its
# attribute dict, without the "chunks" column, previewing the first rows.
phrase_records = [vars(phrase) for phrase in doc._.phrases]
phrase_table = pd.DataFrame(phrase_records).drop(columns="chunks").reset_index()
phrase_table.head()

Unnamed: 0,index,text,count,rank
0,0,US President Donald Trump,1,0.149883
1,1,next year,1,0.131913
2,2,Donald Trump,1,0.120094
3,3,Republic Day celebrations,1,0.119003
4,4,South African President Cyril Ramaphosa,1,0.115761


In [36]:
# Display the document currently bound to `doc` so its full text can be read.
doc

South African President Cyril Ramaphosa will reportedly be the chief guest at Republic Day celebrations next year, weeks after the White House confirmed US President Donald Trump will not be able to attend due to "scheduling constraints". Ramaphosa, a follower of Mahatma Gandhi and Nelson Mandela, was invited as the year will also mark Gandhi's 150th birth anniversary.

In [44]:
# One-sentence summary of the document behind `tr`, driven by a single ranked phrase.
top_sentences = tr.summary(limit_phrases=1, limit_sentences=1)
summary = ''.join(str(sent) for sent in top_sentences)
summary

'South African President Cyril Ramaphosa will reportedly be the chief guest at Republic Day celebrations next year, weeks after the White House confirmed US President Donald Trump will not be able to attend due to "scheduling constraints".'

In [41]:
# Display the predictions/references frame (the notebook renders a truncated view).
df_comp

Unnamed: 0,predictions,references
0,"Saurav Kant, an alumnus of upGrad and IIIT-B's...",upGrad learner switches to career in ML & Al w...
1,Users get one CRED coin per rupee of bill paid...,Delhi techie wins free food from Swiggy for on...
2,New Zealand defeated India by 8 wickets in the...,New Zealand end Rohit Sharma-led India's 12-ma...
3,"Also, customers have options to insure against...",Aegon life iTerm insurance plan helps customer...
4,Speaking about the sexual harassment allegatio...,"Have known Hirani for yrs, what if MeToo claim..."
...,...,...
98396,A CRPF jawan was on Tuesday axed to death with...,CRPF jawan axed to death by Maoists in Chhatti...
98397,The song has been composed by Amaal Mallik wit...,First song from Sonakshi Sinha's 'Noor' titled...
98398,"According to reports, a new version of the 199...",'The Matrix' film to get a reboot: Reports
98399,A new music video shows rapper Snoop Dogg aimi...,Snoop Dogg aims gun at clown dressed as Trump ...


In [42]:
# Display the source frame (headlines, text) for comparison with `df_comp`.
df

Unnamed: 0,headlines,text
0,upGrad learner switches to career in ML & Al w...,"Saurav Kant, an alumnus of upGrad and IIIT-B's..."
1,Delhi techie wins free food from Swiggy for on...,Kunal Shah's credit card bill payment platform...
2,New Zealand end Rohit Sharma-led India's 12-ma...,New Zealand defeated India by 8 wickets in the...
3,Aegon life iTerm insurance plan helps customer...,"With Aegon Life iTerm Insurance plan, customer..."
4,"Have known Hirani for yrs, what if MeToo claim...",Speaking about the sexual harassment allegatio...
...,...,...
98396,CRPF jawan axed to death by Maoists in Chhatti...,A CRPF jawan was on Tuesday axed to death with...
98397,First song from Sonakshi Sinha's 'Noor' titled...,"'Uff Yeh', the first song from the Sonakshi Si..."
98398,'The Matrix' film to get a reboot: Reports,"According to reports, a new version of the 199..."
98399,Snoop Dogg aims gun at clown dressed as Trump ...,A new music video shows rapper Snoop Dogg aimi...
