# Extractive Baseline

Data: PubMed

In [1]:
import gdown
import json
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm.auto import tqdm
tqdm.pandas()
from datasets import load_metric
metric = load_metric("rouge")

In [2]:
url = 'https://drive.google.com/u/0/uc?export=download&confirm=OaU2&id=1lvsqvsFi3W-pE1SqNZI0s8NR9rC1tsja'
output = '../csci-544-project/data/pubmed-dataset.zip'
gdown.cached_download(url, output, quiet=False)

File exists: ../csci-544-project/data/pubmed-dataset.zip


'../csci-544-project/data/pubmed-dataset.zip'

In [3]:
gdown.extractall(output)

['../csci-544-project/data/pubmed-dataset/',
 '../csci-544-project/data/__MACOSX/._pubmed-dataset',
 '../csci-544-project/data/pubmed-dataset/train.txt',
 '../csci-544-project/data/__MACOSX/pubmed-dataset/._train.txt',
 '../csci-544-project/data/pubmed-dataset/vocab',
 '../csci-544-project/data/__MACOSX/pubmed-dataset/._vocab',
 '../csci-544-project/data/pubmed-dataset/test.txt',
 '../csci-544-project/data/__MACOSX/pubmed-dataset/._test.txt',
 '../csci-544-project/data/pubmed-dataset/val.txt',
 '../csci-544-project/data/__MACOSX/pubmed-dataset/._val.txt']

In [4]:
def read_pubmed(path: str) -> pd.DataFrame:
    with open(path, 'r') as json_file:
        json_list = list(json_file)

    # read jsonl
    result = [json.loads(json_str) for json_str in json_list]

    return pd.DataFrame(result)

In [5]:
df_train = read_pubmed('../csci-544-project/data/pubmed-dataset/train.txt')
df_test = read_pubmed('../csci-544-project/data/pubmed-dataset/test.txt')
df_val = read_pubmed('../csci-544-project/data/pubmed-dataset/val.txt')

In [6]:
df_train

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,PMC3872579,[a recent systematic analysis showed that in 2...,[<S> background : the present study was carrie...,,"[INTRODUCTION, MATERIALS AND METHODS, Particip...",[[a recent systematic analysis showed that in ...
1,PMC3770628,[it occurs in more than 50% of patients and ma...,[<S> backgroundanemia in patients with cancer ...,,"[Introduction, Patients and methods, Study des...",[[it occurs in more than 50% of patients and m...
2,PMC5330001,"[tardive dystonia ( td ) , a rarer side effect...",[<S> tardive dystonia ( td ) is a serious side...,,"[INTRODUCTION, CASE REPORT, DISCUSSION, Declar...","[[tardive dystonia ( td ) , a rarer side effec..."
3,PMC4386667,"[lepidoptera include agricultural pests that ,...",[<S> many lepidopteran insects are agricultura...,,"[1. Introduction, 2. Insect Immunity, 3. Signa...",[[lepidoptera include agricultural pests that ...
4,PMC4307954,[syncope is caused by transient diffuse cerebr...,[<S> we present an unusual case of recurrent c...,,"[Introduction, Case report, Discussion, Confli...",[[syncope is caused by transient diffuse cereb...
...,...,...,...,...,...,...
119919,PMC3502213,[eukaryotic cells depend on vesicle - mediated...,[<S> long - distance trafficking of membranous...,,"[Introduction, Motor-Dependent Transport of Ra...",[[eukaryotic cells depend on vesicle - mediate...
119920,PMC3198562,[as regards the selection criteria of the post...,[<S> aims and objectives : to study the stress...,,"[INTRODUCTION, MATERIALS AND METHODS, Modeling...",[[fiber post systems are routinely used in res...
119921,PMC4436536,[in most of the peer review publications in th...,[<S> abstractbackgroundthe objective of this s...,,"[Introduction, Methods, Results, Discussion, L...",[[in most of the peer review publications in t...
119922,PMC4251613,[the reveal registry is a longitudinal registr...,[<S> background : patients with pulmonary arte...,,"[TRIAL REGISTRY:, Materials and Methods, REVEA...","[[], [the reveal registry is a longitudinal re..."


In [7]:
def extractive_apply(row):
    # print(row['article_id'])
    sentences = row['article_text']
    if not sentences or not sentences[0]:
        return np.nan
    elif len(sentences) < len(row['abstract_text']):
        return np.nan
    
    
    text = ' '.join(sentences)
    stopWords = set(stopwords.words('english'))
    words = word_tokenize(text)
    target_len = len(row['abstract_text'])
    
    freqTable = dict()
    for word in words:
        word = word.lower()
        if word in stopWords:
            continue
        if word in freqTable:
            freqTable[word] += 1
        else:
            freqTable[word] = 1
            
    sentenceValue = dict()
    for sentence in sentences:
        for word, freq in freqTable.items():
            if word in sentence.lower():
                if sentence in sentenceValue:
                    sentenceValue[sentence] += freq
                else:
                    sentenceValue[sentence] = freq
    # print(len(sentences))
    # print(sentenceValue.values())
    try:
        min_val = sorted(sentenceValue.values(), reverse=True)[target_len-1]
    except IndexError: # this case # of sentenceValue < # of sentences, so just leave it 0
        return np.nan
    
    summary = []
    for sentence in sentences:
        if len(summary) == target_len:
            break
        if (sentence in sentenceValue) and (sentenceValue[sentence] >= min_val):
            if '<S> '+ sentence + ' </S>' not in summary:
                summary += ['<S> '+ sentence + ' </S>']
            
    assert len(summary) == target_len, f"summary_len={len(summary)}, target_len={target_len}"
    
    return summary

In [8]:
df_train.iloc[30263]

article_id                                              PMC5238359
article_text                                                    []
abstract_text    [<S> highlightsinflammatory myofibroblastic tu...
labels                                                        None
section_names    [Conflicts of interest, Funding, Ethical appro...
sections                                  [[], [], [], [], [], []]
Name: 30263, dtype: object

In [9]:
extractive_summaries = df_train.progress_apply(extractive_apply, axis=1)

  0%|          | 0/119924 [00:00<?, ?it/s]

In [21]:
na_idx = extractive_summaries[extractive_summaries.isnull()].index

In [37]:
df_comp = pd.DataFrame()
df_comp['predictions'] = extractive_summaries[~extractive_summaries.index.isin(na_idx)]
df_comp['references'] = df_train['abstract_text'][~df_train.index.isin(na_idx)]

In [41]:
df_comp.progress_apply(lambda row: metric.compute(predictions=row['predictions'], references=row['references']), axis=1)

  0%|          | 0/116155 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [42]:
metric.compute(predictions=df_comp.iloc[:2,1], references=df_comp.iloc[:2,0])

{'rouge1': AggregateScore(low=Score(precision=0.2886178861788618, recall=0.3358024691358025, fmeasure=0.3219954648526077), mid=Score(precision=0.34084073499694534, recall=0.34995251661918325, fmeasure=0.342089609922975), high=Score(precision=0.3930635838150289, recall=0.3641025641025641, fmeasure=0.3621837549933422)),
 'rouge2': AggregateScore(low=Score(precision=0.05714285714285714, recall=0.06435643564356436, fmeasure=0.06378132118451024), mid=Score(precision=0.06625258799171843, recall=0.0682606920485863, fmeasure=0.06660361119305619), high=Score(precision=0.07536231884057971, recall=0.07216494845360824, fmeasure=0.06942590120160214)),
 'rougeL': AggregateScore(low=Score(precision=0.15447154471544716, recall=0.14320987654320988, fmeasure=0.15446071904127828), mid=Score(precision=0.1610508012594577, recall=0.1690408357075024, fmeasure=0.16339815997415386), high=Score(precision=0.1676300578034682, recall=0.19487179487179487, fmeasure=0.17233560090702946)),
 'rougeLsum': AggregateScore