In [31]:
import fitz
import re
import nltk.data
import os
from functools import reduce
import pandas as pd
from tqdm import tqdm
import gc
import pickle

from langchain_community.retrievers import BM25Retriever
from langchain_core.documents import Document

# Sentence tokenizer used by filter_pdfblocks to split paragraph text into sentences.
# NOTE(review): loading this resource presumably requires the NLTK 'punkt' data to be
# available locally (e.g. via nltk.download('punkt')) — confirm for the installed NLTK version.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [7]:
# Minimum length (in characters) a text block must have to be treated as a
# paragraph; compared against len(txt) in filter_pdfblocks.
MIN_PAR_LEN = 200
# Minimum length (in characters) a tokenized sentence must have to be kept
# by filter_pdfblocks.
MIN_SENTENCE_LEN = 100
# Directory whose entries are processed by prepare_sci_corpus.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
PDFS_DIR = '/home/dzigen/Desktop/ITMO/ВКР/КМУ2024/data/pdfs'

In [24]:
def get_pdfpages(pdf_path):
    """Extract per-page text blocks from a PDF after stripping tables and images.

    For every page, each table region reported by ``page.find_tables()`` is
    covered with a redaction annotation, every image listed by
    ``page.get_images()`` is deleted, the redactions are applied, and the
    remaining page content is read with ``page.get_text("blocks")``.

    Args:
        pdf_path: Path of the document to open with ``fitz.open``.

    Returns:
        A list with one entry per page; each entry is whatever
        ``page.get_text("blocks")`` returned for that page.
    """
    pages_blocks = []
    with fitz.open(pdf_path) as pdf:
        for page in pdf:
            # Mark the bounding box of every detected table for redaction so
            # that tabular text does not end up in the extracted blocks.
            for table in page.find_tables():
                page.add_redact_annot(table.bbox)

            # Drop embedded images; the first field of each image entry is
            # the reference handed to delete_image.
            for image_info in page.get_images():
                image_ref = image_info[0]
                page.delete_image(image_ref)

            # Redaction annotations only take effect once applied.
            page.apply_redactions()
            pages_blocks.append(page.get_text("blocks"))

    return pages_blocks

def filter_pdfblocks(pages):
    """Turn raw page blocks into lists of sufficiently long sentences.

    Each block's text is read from index 4 of the block. Blocks with at least
    ``MIN_PAR_LEN`` characters are de-hyphenated, flattened to a single line
    and split into sentences with the module-level ``tokenizer``; only
    sentences of at least ``MIN_SENTENCE_LEN`` characters are kept. Long
    blocks starting with "Figure" or "Table" are skipped. Processing of the
    whole document stops at the first short block whose lower-cased text
    contains "references".

    Args:
        pages: Iterable of pages, each an iterable of indexable blocks.

    Returns:
        A list with one entry per processed page (including the page on which
        processing stopped); each entry is a list of sentence lists, one per
        kept block.
    """
    kept_pages = []
    for page_blocks in pages:
        page_paragraphs = []
        reached_references = False

        for block in page_blocks:
            raw_text = block[4]

            if len(raw_text) < MIN_PAR_LEN:
                # Short blocks are never kept; they only serve to detect the
                # bibliography. Note this is a plain substring check, so any
                # short block mentioning "references" ends the document.
                if 'references' in raw_text.lower():
                    reached_references = True
                    break
                continue

            # Re-join words hyphenated across line breaks, then unwrap lines.
            paragraph = raw_text.replace('-\n', '').replace('\n', ' ')

            # Captions are not part of the running text.
            if paragraph.startswith(("Figure", "Table")):
                continue

            page_paragraphs.append([
                sentence
                for sentence in tokenizer.tokenize(paragraph)
                if len(sentence) >= MIN_SENTENCE_LEN
            ])

        kept_pages.append(page_paragraphs)

        if reached_references:
            break

    return kept_pages

def prepare_sci_corpus(pdfs_dir):
    """Build a sentence-level corpus from the PDF files in a directory.

    Every regular ``*.pdf`` file (extension matched case-insensitively) in
    ``pdfs_dir`` is run through ``get_pdfpages`` and ``filter_pdfblocks``;
    each surviving sentence becomes one row of the result.

    Args:
        pdfs_dir: Directory containing the PDF files.

    Returns:
        A ``pandas.DataFrame`` with columns ``'title'`` (file name without
        its extension) and ``'text'`` (one sentence). The frame is empty, but
        still has both columns, when no sentences were extracted.
    """
    # Only regular *.pdf files are processed: sub-directories and stray files
    # in the folder are not papers. Sorting makes the row order (and therefore
    # any positional index built on top of it) reproducible, because the
    # order returned by os.listdir is arbitrary.
    pdfs = sorted(
        name for name in os.listdir(pdfs_dir)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(pdfs_dir, name))
    )
    print(pdfs)

    tmp_data = []
    for file in tqdm(pdfs):
        print(file)
        # splitext strips exactly the extension instead of assuming that it
        # is four characters long.
        title = os.path.splitext(file)[0]
        path = os.path.join(pdfs_dir, file)

        print("extracting text...")
        gc.collect()
        pages_info = get_pdfpages(path)
        gc.collect()
        print("filtering text...")
        filtered_info = filter_pdfblocks(pages_info)

        # filtered_info is nested as pages -> blocks -> sentences; flatten it
        # in one linear pass instead of repeated list concatenation.
        tmp_data.extend(
            (title, sentence)
            for page in filtered_info
            for block in page
            for sentence in block
        )

    return pd.DataFrame(tmp_data, columns=['title', 'text'])

In [None]:
# Build the sentence-level corpus (columns 'title' and 'text') from the files in PDFS_DIR.
df = prepare_sci_corpus(PDFS_DIR)

In [27]:
# Persist the corpus as a tab-separated file without the index column.
# NOTE(review): machine-specific absolute output path — adjust before running elsewhere.
df.to_csv("/home/dzigen/Desktop/ITMO/ВКР/КМУ2024/data/scipdf_corpus.tsv", sep='\t', index=False)

In [30]:
# Wrap every corpus row in a Document whose content is
# "passage: <title>\n<text>" and whose metadata records the row position.
# The columns are iterated positionally, so 'in_base_index' is always the
# 0-based row position regardless of the DataFrame's index labels, and no
# per-row Series lookup is needed.
documents = []
for i, (title, text) in enumerate(tqdm(zip(df['title'], df['text']), total=len(df))):
    base_ctx = title + "\n" + text
    formatd_ctx = "passage: " + base_ctx

    tmp_m = {
        'in_base_index': i
        }

    documents.append(Document(page_content=formatd_ctx, metadata=tmp_m))

100%|██████████| 4318/4318 [00:00<00:00, 67561.74it/s]


In [32]:
# Destination file for the pickled BM25 retriever.
# NOTE(review): machine-specific absolute path; open(..., 'wb') does not create
# missing parent directories, so the 'bases' folder must already exist.
pdfsci_base_path = '/home/dzigen/Desktop/ITMO/ВКР/КМУ2024/data/bases/scipdf_bm25_base.pkl'

# Build a BM25 retriever over the prepared documents; k=4 is forwarded to the
# retriever (presumably the number of documents returned per query — confirm
# against the langchain BM25Retriever documentation).
bm25_model = BM25Retriever.from_documents(documents, k=4)

# Serialize the retriever object to disk.
# NOTE(review): pickle files must only ever be loaded from trusted sources.
with open(pdfsci_base_path, 'wb') as bm25result_file:
    pickle.dump(bm25_model, bm25result_file)