In [1]:
import gdown
import json
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm.auto import tqdm
tqdm.pandas()
from datasets import load_metric
metric = load_metric("rouge")
import spacy
import pytextrank

# load a spaCy model, depending on language, scale, etc.
nlp = spacy.load("en_core_web_sm")

# add PyTextRank to the spaCy pipeline
nlp.add_pipe("textrank")

<pytextrank.base.BaseTextRankFactory at 0x7feb946c2d90>

In [2]:
def read_pubmed(path: str) -> pd.DataFrame:
    with open(path, 'r') as json_file:
        json_list = list(json_file)

    # read jsonl
    result = [json.loads(json_str) for json_str in json_list]

    return pd.DataFrame(result)

In [3]:
df_train = read_pubmed('../csci-544-project/data/pubmed-dataset/train.txt')
df_test = read_pubmed('../csci-544-project/data/pubmed-dataset/test.txt')
df_val = read_pubmed('../csci-544-project/data/pubmed-dataset/val.txt')

In [4]:
df_train

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,PMC3872579,[a recent systematic analysis showed that in 2...,[<S> background : the present study was carrie...,,"[INTRODUCTION, MATERIALS AND METHODS, Particip...",[[a recent systematic analysis showed that in ...
1,PMC3770628,[it occurs in more than 50% of patients and ma...,[<S> backgroundanemia in patients with cancer ...,,"[Introduction, Patients and methods, Study des...",[[it occurs in more than 50% of patients and m...
2,PMC5330001,"[tardive dystonia ( td ) , a rarer side effect...",[<S> tardive dystonia ( td ) is a serious side...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec..."
3,PMC4386667,"[lepidoptera include agricultural pests that ,...",[<S> many lepidopteran insects are agricultura...,,"[1. Introduction, 2. Insect Immunity, 3. Signa...",[[lepidoptera include agricultural pests that ...
4,PMC4307954,[syncope is caused by transient diffuse cerebr...,[<S> we present an unusual case of recurrent c...,,"[Introduction, Case report, Discussion, Confli...",[[syncope is caused by transient diffuse cereb...
...,...,...,...,...,...,...
119919,PMC3502213,[eukaryotic cells depend on vesicle - mediated...,[<S> long - distance trafficking of membranous...,,"[Introduction, Motor-Dependent Transport of Ra...",[[eukaryotic cells depend on vesicle - mediate...
119920,PMC3198562,[as regards the selection criteria of the post...,[<S> aims and objectives : to study the stress...,,"[INTRODUCTION, MATERIALS AND METHODS, Modeling...",[[fiber post systems are routinely used in res...
119921,PMC4436536,[in most of the peer review publications in th...,[<S> abstractbackgroundthe objective of this s...,,"[Introduction, Methods, Results, Discussion, L...",[[in most of the peer review publications in t...
119922,PMC4251613,[the reveal registry is a longitudinal registr...,[<S> background : patients with pulmonary arte...,,"[TRIAL REGISTRY:, Materials and Methods, REVEA...","[[], [the reveal registry is a longitudinal re..."


In [5]:
def text_rank_apply(row):
    sent_li = row['article_text']
    text = ''.join(sent_li)
    
    doc = nlp(text)
    
    tr = doc._.textrank
    summary = ''
    for sent in tr.summary(limit_phrases=15, limit_sentences=5):
        summary += str(sent)
    return summary

In [6]:
extractive_summaries = df_train.progress_apply(text_rank_apply, axis=1)

  0%|          | 0/119924 [00:00<?, ?it/s]

In [7]:
df_comp = pd.DataFrame()
df_comp['predictions'] = extractive_summaries
df_comp['references'] = df_train['abstract_text']
df_comp['references_joined'] = df_comp['references'].apply(lambda x: ''.join(list(map(lambda y: y[4:-4], x))))

In [8]:
df_comp.to_pickle('output/textrank-pubmed.pkl')

In [9]:
metric.compute(predictions=df_comp['predictions'].to_list(), references=df_comp['references_joined'].to_list())

{'rouge1': AggregateScore(low=Score(precision=0.22899993630847423, recall=0.5897126376123218, fmeasure=0.3126848687476407), mid=Score(precision=0.2296850795900143, recall=0.5905953020166259, fmeasure=0.3133698526810192), high=Score(precision=0.23035554735008296, recall=0.591453034291116, fmeasure=0.3140585326662149)),
 'rouge2': AggregateScore(low=Score(precision=0.08963071968656476, recall=0.230680301280307, fmeasure=0.12202534265406723), mid=Score(precision=0.09007281091353685, recall=0.2314169457246204, fmeasure=0.12250471185516137), high=Score(precision=0.09057634314911109, recall=0.2322041034738661, fmeasure=0.12302412075093766)),
 'rougeL': AggregateScore(low=Score(precision=0.11684912763682775, recall=0.30846657113438436, fmeasure=0.15976391065782292), mid=Score(precision=0.11722616932052067, recall=0.3091409957481722, fmeasure=0.1601315035653781), high=Score(precision=0.11760946490101702, recall=0.30982900763170795, fmeasure=0.1605233369243449)),
 'rougeLsum': AggregateScore(lo