# Extractive Baseline

Data: arXiv

In [1]:
import gdown
import json
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from tqdm.auto import tqdm
tqdm.pandas()
from datasets import load_metric
metric = load_metric("rouge")

In [2]:
# Google Drive download link for the zipped dataset, and the local path the
# archive is stored at.
url = 'https://drive.google.com/u/0/uc?export=download&confirm=OaU2&id=1b3rmCSIoh6VhD4HKWjI4HOW-cSwcwbeC'
output = '../csci-544-project/data/arxiv-dataset.zip'
# Fetch the archive to `output`; the recorded run reports "File exists",
# i.e. an already-present file is reused rather than downloaded again.
gdown.cached_download(url, output, quiet=False)

File exists: ../csci-544-project/data/arxiv-dataset.zip


'../csci-544-project/data/arxiv-dataset.zip'

In [3]:
# Unpack the downloaded archive; the call returns the list of extracted paths
# (train.txt, val.txt, test.txt and vocab, plus macOS metadata entries).
gdown.extractall(output)

['../csci-544-project/data/arxiv-dataset/',
 '../csci-544-project/data/__MACOSX/._arxiv-dataset',
 '../csci-544-project/data/arxiv-dataset/train.txt',
 '../csci-544-project/data/__MACOSX/arxiv-dataset/._train.txt',
 '../csci-544-project/data/arxiv-dataset/vocab',
 '../csci-544-project/data/__MACOSX/arxiv-dataset/._vocab',
 '../csci-544-project/data/arxiv-dataset/test.txt',
 '../csci-544-project/data/__MACOSX/arxiv-dataset/._test.txt',
 '../csci-544-project/data/arxiv-dataset/val.txt',
 '../csci-544-project/data/__MACOSX/arxiv-dataset/._val.txt']

In [4]:
def read_arxiv(path: str) -> pd.DataFrame:
    """Load a JSON Lines file into a DataFrame, one row per record.

    Parameters
    ----------
    path : str
        Path to a text file holding one JSON object per line.

    Returns
    -------
    pd.DataFrame
        One row per parsed line; the columns come from the JSON keys.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as json_file:
        # Parse while streaming instead of first materialising every raw line,
        # and skip blank lines (e.g. stray empty lines at the end of the file)
        # which would otherwise make json.loads raise.
        records = [json.loads(line) for line in json_file if line.strip()]

    return pd.DataFrame(records)

In [5]:
# Load the three dataset splits (one JSON object per line) into DataFrames.
df_train = read_arxiv('../csci-544-project/data/arxiv-dataset/train.txt')
df_test = read_arxiv('../csci-544-project/data/arxiv-dataset/test.txt')
df_val = read_arxiv('../csci-544-project/data/arxiv-dataset/val.txt')

In [6]:
# Display the training split (pandas truncates the rendering of long frames).
df_train

Unnamed: 0,article_id,article_text,abstract_text,labels,section_names,sections
0,1405.3379,[additive models @xcite provide an important f...,[<S> additive models play an important role in...,,"[introduction, main results on learning rates,...",[[additive models @xcite provide an important ...
1,0901.1147,[the leptonic decays of a charged pseudoscalar...,[<S> we have studied the leptonic decay @xmath...,,"[[sec:introduction]introduction, [sec:detector...",[[the leptonic decays of a charged pseudoscala...
2,nlin0608019,[the transport properties of nonlinear non - e...,"[<S> in 84 , 258 ( 2000 ) , mateos conjectured...",,"[introduction, regularity and chaos in single-...",[[the transport properties of nonlinear non - ...
3,0903.5449,[studies of laser beams propagating through tu...,[<S> the effect of a random phase diffuser on ...,,"[introduction, the method of photon distributi...",[[studies of laser beams propagating through t...
4,hep-ph0605279,[the so - called `` nucleon spin crisis '' rai...,[<S> with a special intention of clarifying th...,,"[introduction, model lagrangian with pion mass...",[[the so - called `` nucleon spin crisis '' ra...
...,...,...,...,...,...,...
203032,quant-ph0402038,"[e. rasmusen , _ games and information : an in...",[<S> effects of a corrupt source on the dynami...,,[references],"[[e. rasmusen , _ games and information : an i..."
203033,0907.3736,"[the magnetocaloric effect , _, i.e. _ , a tem...",[<S> we compute the entropy of antiferromagnet...,,"[introduction, methods, spin @xmath1 heisenber...","[[the magnetocaloric effect , _, i.e. _ , a te..."
203034,1506.04688,"[as expected , the most interesting combinator...",[<S> as a generalization of orbit - polynomial...,,"[introduction and preliminaries, partitions ar...","[[as expected , the most interesting combinato..."
203035,cond-mat0304118,[by numerical study we find that the branch cu...,[<S> within the lowest - order born approximat...,,[scaling form for branch cut integrals],[[by numerical study we find that the branch c...


In [7]:
# Cache for the stop-word set so it is loaded from the NLTK corpus only once
# instead of once per DataFrame row.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loading them only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def extractive_apply(row):
    """Build a frequency-based extractive summary for one article row.

    The summary has as many sentences as the reference abstract. Each article
    sentence is scored by summing the article-wide frequency of every
    non-stop-word that occurs in it; the top-scoring sentences are returned in
    their original document order.

    Parameters
    ----------
    row : mapping
        Must provide ``row['article_text']`` (list of sentence strings) and
        ``row['abstract_text']`` (list of abstract sentences; only its length
        is used).

    Returns
    -------
    list of str or float
        The selected sentences, each wrapped as ``'<S> ... </S>'``, or
        ``np.nan`` when no summary can be built: the article is empty, its
        first sentence is empty, it has fewer sentences than the abstract, or
        fewer distinct sentences received a score than the abstract has
        sentences.
    """
    sentences = row['article_text']
    if not sentences or not sentences[0]:
        return np.nan

    target_len = len(row['abstract_text'])
    if len(sentences) < target_len:
        return np.nan

    # Article-wide frequency of every lowercased, non-stop-word token.
    stop_words = _english_stopwords()
    freq_table = {}
    for token in word_tokenize(' '.join(sentences)):
        token = token.lower()
        if token not in stop_words:
            freq_table[token] = freq_table.get(token, 0) + 1

    # Score per distinct sentence. A sentence that occurs several times in the
    # article accumulates its score once per occurrence.
    # NOTE(review): `word in lowered` is a *substring* test (e.g. 'at' also
    # matches inside 'cat'), not a token match. Kept as-is so results stay
    # comparable with earlier runs of this baseline.
    sentence_value = {}
    for sentence in sentences:
        lowered = sentence.lower()  # hoisted: lowercase once per sentence
        hits = [freq for word, freq in freq_table.items() if word in lowered]
        if hits:
            sentence_value[sentence] = sentence_value.get(sentence, 0) + sum(hits)

    try:
        # Score of the target_len-th best sentence = admission threshold.
        min_val = sorted(sentence_value.values(), reverse=True)[target_len - 1]
    except IndexError:
        # Fewer scored sentences than the abstract has sentences: no summary.
        return np.nan

    # Walk the article in order so the summary keeps the document ordering.
    summary = []
    chosen = set()  # sentences already emitted (constant-time duplicate check)
    for sentence in sentences:
        if len(summary) == target_len:
            break
        if sentence in chosen:
            continue
        if (sentence in sentence_value) and (sentence_value[sentence] >= min_val):
            chosen.add(sentence)
            summary.append('<S> ' + sentence + ' </S>')

    assert len(summary) == target_len, f"summary_len={len(summary)}, target_len={target_len}"

    return summary

In [8]:
# Spot-check a single training row by position.
# NOTE(review): 30263 looks like a row picked while debugging the summariser;
# the reason is not recorded here — confirm or remove.
df_train.iloc[30263]

article_id                                               0912.2266
article_text     [is a bright , radio loud quasar that shares m...
abstract_text    [<S> we have analysed the first 15 months of _...
labels                                                        None
section_names    [introduction, from radio to x-rays, the gamma...
sections         [[is a bright , radio loud quasar that shares ...
Name: 30263, dtype: object

In [9]:
# Build an extractive summary for every training row (axis=1 passes each row
# to the function); rows it cannot summarise come back as NaN.
# progress_apply is the tqdm-instrumented variant of DataFrame.apply.
extractive_summaries = df_train.progress_apply(extractive_apply, axis=1)

  0%|          | 0/203037 [00:00<?, ?it/s]

In [10]:
# Index labels of the rows for which no summary was produced (NaN results).
na_idx = extractive_summaries[extractive_summaries.isnull()].index

In [11]:
# Pair every successfully generated summary with its reference abstract,
# leaving out the rows whose summary is missing (those listed in na_idx).
df_comp = pd.DataFrame({
    'predictions': extractive_summaries[~extractive_summaries.index.isin(na_idx)],
    'references': df_train['abstract_text'][~df_train.index.isin(na_idx)],
})

In [12]:
# Display the prediction/reference pairs.
df_comp

Unnamed: 0,predictions,references
0,[<S> [ approxerrorthm ] under assumption [ ass...,[<S> additive models play an important role in...
1,[<S> .[table : data - single ] summary of sing...,[<S> we have studied the leptonic decay @xmath...
2,[<S> two chaotic attractors emerge with @xmath...,"[<S> in 84 , 258 ( 2000 ) , mateos conjectured..."
3,[<S> [ sixteen ] can be replaced by @xmath90 w...,[<S> the effect of a random phase diffuser on ...
4,[<S> the soliton equation of motion is obtaine...,[<S> with a special intention of clarifying th...
...,...,...
203030,[<S> @l@|@c@@c@ source & @xmath45 & @xmath28 +...,[<S> we report the first measurement of the @x...
203031,"[<S> the most important innovation of dft , wh...",[<S> a density functional theory for many - bo...
203033,"[<S> finally , the adiabatic cooling rate has ...",[<S> we compute the entropy of antiferromagnet...
203034,"[<S> also , we show that every quotient - poly...",[<S> as a generalization of orbit - polynomial...


In [13]:
def _strip_sentence_markers(sentence):
    """Remove a leading '<S>' and a trailing '</S>' marker, if present."""
    text = sentence.strip()
    if text.startswith('<S>'):
        text = text[len('<S>'):]
    if text.endswith('</S>'):
        text = text[:-len('</S>')]
    return text.strip()


def join_sent_apply(row):
    """Flatten the marked-up sentence lists of one row into plain strings.

    Parameters
    ----------
    row : mapping
        Must provide ``row['predictions']`` and ``row['references']``, each a
        list of sentences of the form ``'<S> ... </S>'``.

    Returns
    -------
    tuple of (str, str)
        ``(prediction_text, reference_text)``: the sentences of each list with
        their markers removed, joined by a single space.
    """
    # The markers are removed by matching them explicitly rather than by a
    # fixed-width slice, so text without markers is never truncated, and the
    # sentences are joined with an explicit space instead of relying on a
    # leftover trailing blank to separate the words of adjacent sentences.
    pred_sents = [_strip_sentence_markers(sent) for sent in row['predictions']]
    ref_sents = [_strip_sentence_markers(sent) for sent in row['references']]
    return ' '.join(pred_sents), ' '.join(ref_sents)

In [14]:
# Flatten each row's sentence lists into plain strings. join_sent_apply returns
# a (prediction, reference) tuple per row; zip(*...) regroups those tuples into
# one sequence per new column.
df_comp['predictions_joined'], df_comp['references_joined'] = zip(*df_comp.progress_apply(join_sent_apply, axis=1))

  0%|          | 0/197570 [00:00<?, ?it/s]

In [15]:
# Persist the comparison frame (path is relative to the kernel's working directory).
# NOTE(review): to_pickle does not create missing directories, so 'output/'
# must already exist — confirm it does on a fresh checkout.
df_comp.to_pickle('output/extraction-arxiv.pkl')

In [16]:
# Score the joined extractive summaries against the reference abstracts with
# the ROUGE metric loaded in the import cell.
metric.compute(predictions=df_comp['predictions_joined'].to_list(), references=df_comp['references_joined'].to_list())

{'rouge1': AggregateScore(low=Score(precision=0.19845984227455268, recall=0.49305410444157993, fmeasure=0.2699811906313422), mid=Score(precision=0.1988730690206154, recall=0.49364789827918343, fmeasure=0.2704602352972015), high=Score(precision=0.19928852972149902, recall=0.4942926747461064, fmeasure=0.2709065143147144)),
 'rouge2': AggregateScore(low=Score(precision=0.05778413217118419, recall=0.1440189797585834, fmeasure=0.07912415627189733), mid=Score(precision=0.057966521925853914, recall=0.14439902038541141, fmeasure=0.0793573711131341), high=Score(precision=0.058153588645902865, recall=0.14477867332283653, fmeasure=0.07958679485696006)),
 'rougeL': AggregateScore(low=Score(precision=0.1019790576602871, recall=0.25529959493410515, fmeasure=0.13848515872459735), mid=Score(precision=0.10221435086429984, recall=0.2556211482203058, fmeasure=0.1387108074869934), high=Score(precision=0.10243490365381765, recall=0.25595296816405216, fmeasure=0.1389294682053559)),
 'rougeLsum': AggregateSc