In [46]:
import pymupdf
import re
import nltk.data
import os
from functools import reduce
import pandas as pd
from tqdm import tqdm
import gc
from pdfrw import PdfReader

# Sentence tokenizer loaded from NLTK's 'tokenizers/punkt/english.pickle' resource.
# filter_pdfblocks calls tokenizer.tokenize(...) to split paragraphs into sentences.
# NOTE(review): presumably requires the NLTK "punkt" data to be available locally
# before this cell runs — confirm for the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [47]:
# Text blocks shorter than this many characters are not kept as paragraphs
# (they are only scanned for the "references" marker in filter_pdfblocks).
MIN_PAR_LEN = 200
# Sentences shorter than this many characters are dropped from the corpus.
MIN_SENTENCE_LEN = 100
# Directory holding the source documents. The SCI_PDFS_DIR environment variable
# overrides the default so the notebook can run on machines other than the
# author's; when it is unset the original location is used unchanged.
PDFS_DIR = os.environ.get(
    'SCI_PDFS_DIR',
    '/home/dzigen/Desktop/ITMO/DialogueSystem/DialogueSearchSystem/data/sci_pdfs',
)

In [54]:
def get_pdfpages(pdf_path):
    """Extract per-page text blocks and the metadata title from a document.

    Before text is extracted from a page, every detected table area is
    redacted and every image is deleted. These edits only affect the opened
    document object; this function never saves it.

    Args:
        pdf_path: Path of the document to open with PyMuPDF.

    Returns:
        A ``(text, title)`` tuple. ``text`` is a list with one entry per page,
        each entry being the result of ``page.get_text("blocks", sort=True)``.
        ``title`` is the ``'title'`` value of the document metadata, or ``''``
        when the document exposes no metadata / no title key.
    """
    text = []
    with pymupdf.open(pdf_path) as doc:
        # PyMuPDF documents that ``metadata`` can be None, so guard the lookup
        # instead of indexing it directly.
        title = (doc.metadata or {}).get('title', '')
        for page in doc:
            for tab in page.find_tables():
                # Mark the table's bounding box for redaction.
                page.add_redact_annot(tab.bbox)

            for img in page.get_images():
                # Remove the image; the first item of each entry is passed as
                # the image reference (its xref per the PyMuPDF docs).
                page.delete_image(img[0])

            # Erase everything inside the areas marked above (the table text).
            page.apply_redactions()
            text.append(page.get_text("blocks", sort=True))

    return text, title

def filter_pdfblocks(pages):
    """Reduce raw page blocks to lists of sufficiently long sentences.

    The fifth element of every block tuple (``block[4]``) is treated as its
    text. Blocks with at least ``MIN_PAR_LEN`` characters are de-hyphenated,
    flattened to one line and split into sentences; sentences shorter than
    ``MIN_SENTENCE_LEN`` are discarded. Blocks whose flattened text starts with
    "Figure" or "Table" are skipped. The first *short* block containing the
    word "references" (case-insensitive) ends processing for the whole
    document; the page collected so far is still included.

    Args:
        pages: Iterable of pages, each an iterable of block tuples.

    Returns:
        A list of pages; each page is a list of blocks; each block is a list
        of sentence strings (possibly empty).
    """
    kept_pages = []
    for page_blocks in pages:
        page_sentences = []
        reached_references = False

        for block in page_blocks:
            raw = block[4]

            if len(raw) < MIN_PAR_LEN:
                # Short blocks are never kept; they only serve to detect the
                # start of the reference list.
                if 'references' in raw.lower():
                    #print("find references!")
                    reached_references = True
                    break
                continue

            # Join words split across lines, then collapse the block to one line.
            flat = raw.replace('-\n', '').replace('\n', ' ')
            if flat.startswith(("Figure", "Table")):
                continue

            page_sentences.append(
                [sent for sent in tokenizer.tokenize(flat) if len(sent) >= MIN_SENTENCE_LEN]
            )

        kept_pages.append(page_sentences)

        if reached_references:
            break

    return kept_pages

def prepare_sci_corpus(pdfs_dir):
    """Build a sentence-level corpus from every file in ``pdfs_dir``.

    Each file is passed through ``get_pdfpages`` and ``filter_pdfblocks``;
    every sentence that survives filtering becomes one row.

    Args:
        pdfs_dir: Directory containing the documents to process.

    Returns:
        A pandas DataFrame with columns ``title``, ``text`` and ``metadata``.
        ``metadata`` is a dict holding the source file ``path`` and the
        integer ``id`` of that file within the sorted directory listing.
    """
    # os.listdir returns entries in arbitrary order, so sort to keep the
    # per-file ids stable across runs. Sub-directories (for example a
    # notebook checkpoint folder) are skipped because they cannot be opened
    # as documents.
    pdfs = sorted(
        name for name in os.listdir(pdfs_dir)
        if os.path.isfile(os.path.join(pdfs_dir, name))
    )

    rows = []
    # Wrap the list itself rather than the enumerate() generator so tqdm
    # knows the total and can show real progress.
    for doc_id, file in enumerate(tqdm(pdfs)):
        path = os.path.join(pdfs_dir, file)

        print("extracting text...")
        gc.collect()
        pages_info, title = get_pdfpages(path)
        print(title)
        gc.collect()
        print("filtering text...")
        filtered_info = filter_pdfblocks(pages_info)

        # Flatten pages -> blocks -> sentences in a single linear pass.
        sentences = [
            sent
            for page in filtered_info
            for block in page
            for sent in block
        ]

        rows.extend((title, sent, {'path': path, 'id': doc_id}) for sent in sentences)

    df = pd.DataFrame(rows, columns=['title', 'text', 'metadata'])
    return df

In [None]:
# Build the sentence-level corpus from the documents found in PDFS_DIR.
df = prepare_sci_corpus(PDFS_DIR)

In [56]:
# Display the corpus frame.
# NOTE(review): the saved output shows only 'text' and 'metadata' columns with the
# title nested inside metadata, whereas prepare_sci_corpus builds the columns
# 'title', 'text', 'metadata' — the output looks stale; re-run to refresh.
df

Unnamed: 0,text,metadata
0,Generative models for open domain question ans...,"{'title': '', 'path': '/home/dzigen/Desktop/IT..."
1,"While promising, this approach requires to use...","{'title': '', 'path': '/home/dzigen/Desktop/IT..."
2,"In this paper, we investigate how much these m...","{'title': '', 'path': '/home/dzigen/Desktop/IT..."
3,"Interestingly, we observe that the performance...","{'title': '', 'path': '/home/dzigen/Desktop/IT..."
4,This is evidence that sequence-to-sequence mod...,"{'title': '', 'path': '/home/dzigen/Desktop/IT..."
...,...,...
4263,"Expansion allows to enrich documents, either b...",{'title': 'SPLADE: Sparse Lexical and Expansio...
4264,"Recently, dense retrieval based on BERT has de...",{'title': 'SPLADE: Sparse Lexical and Expansio...
4265,"Our approach relies on in-batch negatives, log...",{'title': 'SPLADE: Sparse Lexical and Expansio...
4266,SPLADE is an appealing candidate for initial r...,{'title': 'SPLADE: Sparse Lexical and Expansio...
