In [2]:
import os
import re
import pandas as pd
from PyPDF2 import PdfFileReader
from haystack.document_store.memory import InMemoryDocumentStore
from haystack.reader.farm import FARMReader
from haystack.utils import print_answers
from haystack.pipeline import ExtractiveQAPipeline

# define the path of the folder containing PDF files
pdf_folder = "/Users/amin/Desktop/33/test"

# create an in-memory document store
document_store = InMemoryDocumentStore()

# loop over all PDF files in the folder and index them in the document store
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        filepath = os.path.join(pdf_folder, filename)
        # read the PDF file and create a Haystack document
        with open(filepath, 'rb') as f:
            pdf = PdfFileReader(f)
            text = ""
            for i in range(pdf.getNumPages()):
                text += pdf.getPage(i).extractText()
            document = {"text": text, "meta": {"name": os.path.splitext(filename)[0]}}
            document_store.write_documents([document])

# create an ExtractiveQAPipeline with a FARMReader as the document reader
reader = FARMReader(model_name_or_path="deepset/bert-large-uncased-whole-word-masking-squad2", use_gpu=False)
pipeline = ExtractiveQAPipeline(reader=reader)

# define the function to extract disease-related keywords from a text chunk
def extract_keywords(text):
    n = 3  # number of keywords to extract
    # use the ExtractiveQAPipeline to find the answer to the question "what are the main disease-related keywords?"
    question = "What are the 3 main disease-related keywords?"
    result = pipeline.run(query=question, documents=[{"text": text}], top_k_retriever=1, top_k_reader=1)
    answers = result['answers']
    # extract the top n keywords from the answer
    keywords = re.findall(r'\b\w+\b', answers[0]['answer'])
    keywords = keywords[:n]
    return keywords

# loop over all PDF files in the folder
data = []
for filename in os.listdir(pdf_folder):
    if filename.endswith(".pdf"):
        filepath = os.path.join(pdf_folder, filename)
        # read the PDF file
        with open(filepath, 'rb') as f:
            pdf = PdfFileReader(f)
            text = ""
            for i in range(pdf.getNumPages()):
                text += pdf.getPage(i).extractText()
            # divide the text into chunks of 512 tokens
            chunk_size = 512
            chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
            # extract disease-related keywords from each chunk
            keywords = []
            for chunk in chunks:
                keywords += extract_keywords(chunk)
            # save the file name and keywords in a data frame
            data.append([os.path.splitext(filename)[0], keywords])

# create a data frame from the collected data
df = pd.DataFrame(data, columns=["File", "Issue_KW"])

# save the data frame to a CSV file
df.to_csv("Issue_KW.csv", index=False)


ModuleNotFoundError: No module named 'haystack.document_store'

In [2]:
df

Unnamed: 0,File,Issue_KW
0,JPM_SRZN_NOTE_11.28.22,"[Key, Changes, EPS, 2021A, 2022E, liver, trans..."
1,CT one-pager Dec 22,"[circadian, rhythm, disorders, US, EU, disease..."
