In [1]:
import gdown
import json
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm.auto import tqdm
tqdm.pandas()
from datasets import load_metric
metric = load_metric("rouge")
import spacy
import pytextrank

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7f4add7a73d0>

In [2]:
def read_arxiv(path: str) -> pd.DataFrame:
    with open(path, 'r') as json_file:
        json_list = list(json_file)

    # read jsonl
    result = [json.loads(json_str) for json_str in json_list]

    return pd.DataFrame(result)

In [3]:
df_train = read_arxiv('../csci-544-project/data/arxiv-dataset/train.txt')
df_test = read_arxiv('../csci-544-project/data/arxiv-dataset/test.txt')
df_val = read_arxiv('../csci-544-project/data/arxiv-dataset/val.txt')

In [4]:
df_train

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,1405.3379,[additive models @xcite provide an important f...,[<S> additive models play an important role in...,,"[introduction, main results on learning rates,...",[[additive models @xcite provide an important ...
1,0901.1147,[the leptonic decays of a charged pseudoscalar...,[<S> we have studied the leptonic decay @xmath...,,"[[sec:introduction]introduction, [sec:detector...",[[the leptonic decays of a charged pseudoscala...
2,nlin0608019,[the transport properties of nonlinear non - e...,"[<S> in 84 , 258 ( 2000 ) , mateos conjectured...",,"[introduction, regularity and chaos in single-...",[[the transport properties of nonlinear non - ...
3,0903.5449,[studies of laser beams propagating through tu...,[<S> the effect of a random phase diffuser on ...,,"[introduction, the method of photon distributi...",[[studies of laser beams propagating through t...
4,hep-ph0605279,[the so - called `` nucleon spin crisis '' rai...,[<S> with a special intention of clarifying th...,,"[introduction, model lagrangian with pion mass...",[[the so - called `` nucleon spin crisis '' ra...
...,...,...,...,...,...,...
203032,quant-ph0402038,"[e. rasmusen , _ games and information : an in...",[<S> effects of a corrupt source on the dynami...,,[references],"[[e. rasmusen , _ games and information : an i..."
203033,0907.3736,"[the magnetocaloric effect , _, i.e. _ , a tem...",[<S> we compute the entropy of antiferromagnet...,,"[introduction, methods, spin @xmath1 heisenber...","[[the magnetocaloric effect , _, i.e. _ , a te..."
203034,1506.04688,"[as expected , the most interesting combinator...",[<S> as a generalization of orbit - polynomial...,,"[introduction and preliminaries, partitions ar...","[[as expected , the most interesting combinato..."
203035,cond-mat0304118,[by numerical study we find that the branch cu...,[<S> within the lowest - order born approximat...,,[scaling form for branch cut integrals],[[by numerical study we find that the branch c...


In [5]:
def text_rank_apply(row):
    sent_li = row['article_text']
    text = ''.join(sent_li)
    
    doc = nlp(text)
    
    tr = doc._.textrank
    summary = ''
    for sent in tr.summary(limit_phrases=15, limit_sentences=5):
        summary += str(sent)
    return summary

In [7]:
extractive_summaries = df_train.iloc[:30000].progress_apply(text_rank_apply, axis=1)

  0%|          | 0/30000 [00:00<?, ?it/s]

In [8]:
df_comp = pd.DataFrame()
df_comp['predictions'] = extractive_summaries
df_comp['references'] = df_train['abstract_text'].iloc[:30000]
df_comp['references_joined'] = df_comp['references'].apply(lambda x: ''.join(list(map(lambda y: y[4:-4], x))))

In [9]:
df_comp.to_pickle('output/textrank-arxiv.pkl')

In [10]:
metric.compute(predictions=df_comp['predictions'].to_list(), references=df_comp['references_joined'].to_list())

{'rouge1': AggregateScore(low=Score(precision=0.23600631757902965, recall=0.5316809091375873, fmeasure=0.2933634373795379), mid=Score(precision=0.23750327733455195, recall=0.5334781840444933, fmeasure=0.2944833864902685), high=Score(precision=0.23893499120004164, recall=0.5353431763999145, fmeasure=0.2956113370871592)),
 'rouge2': AggregateScore(low=Score(precision=0.07570996577804624, recall=0.17874657996304621, fmeasure=0.0951806730796716), mid=Score(precision=0.07635596044740152, recall=0.1799875018527766, fmeasure=0.09578286291231464), high=Score(precision=0.07703250847515908, recall=0.1813800111759692, fmeasure=0.09636960529006236)),
 'rougeL': AggregateScore(low=Score(precision=0.11936211975474546, recall=0.2779645211035484, fmeasure=0.14797977423485398), mid=Score(precision=0.12009637482150218, recall=0.2792157046212337, fmeasure=0.1485622609250979), high=Score(precision=0.12078296548758274, recall=0.2804645827077607, fmeasure=0.14911540913004656)),
 'rougeLsum': AggregateScore(