In [2]:
import os
import pandas as pd
from transformers import pipeline
from PyPDF2 import PdfReader

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path):
    with open(pdf_path, 'rb') as file:
        reader = PdfReader(file)
        text = ""
        for page in reader.pages:
            text += page.extract_text()
    return text

# Function to find the top 3 disease-related keywords in the text
def find_top_3_diseases(text, max_tokens=384):
    question = "What are the 3 main disease-related keywords in the text?"
    tokenized_text = nlp.tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    num_chunks = (tokenized_text.input_ids.shape[1] + max_tokens - 1) // max_tokens

    for chunk_idx in range(num_chunks):
        start = chunk_idx * max_tokens
        end = start + max_tokens
        input_chunk = tokenized_text.input_ids[:, start:end]
        attention_mask_chunk = tokenized_text.attention_mask[:, start:end]
        answer = nlp(model_inputs={"input_ids": input_chunk, "attention_mask": attention_mask_chunk}, question=question)
        if answer["score"] > 0.5:
            return answer["answer"]
    return "Keywords not found"

# Load the question-answering model
nlp = pipeline("question-answering")

# Define the folder containing PDF files
pdf_folder = "/Users/amin/Desktop/33/test"

# Read PDF files and find the top 3 disease-related keywords
data = []
for file in os.listdir(pdf_folder):
    if file.endswith(".pdf"):
        pdf_path = os.path.join(pdf_folder, file)
        text = extract_text_from_pdf(pdf_path)
        top_3_diseases = find_top_3_diseases(text)
        data.append({"File": file[:-4], "Issue_KW": top_3_diseases})

# Create a DataFrame and save it as a CSV file
df = pd.DataFrame(data)
df.to_csv("Issue_KW.csv", index=False)


No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


ValueError: Unknown arguments {'model_inputs': {'input_ids': tensor([[  101, 18732, 20336,  9565, 13821,  2036, 18581,  1658, 18135,  1203,
         23994,  3309,  1403,  1306,  1107,  1103, 19165,  1104, 20394, 26003,
          9847,  4695, 20012,  2249, 18122, 10615,   117, 17881,  1477, 24654,
          2101,  9637, 17444, 15517, 10069,   149, 12420, 20427,  8900,  3048,
         11410,   123,  1658, 11787,  3848,  8989,  1658,  3031, 11470,  4183,
          4124,  5466, 26123, 11553,  2101,  1324, 15556,  3031, 11470,  8787,
          4124,  1823, 25702,  9291,  1116,  2101,  1324, 15556, 15391,  1776,
          1616,   117, 17023,   111,  6342, 13809, 13199,  5838,  9126,  4342,
         22534,   204, 10288,  1107, 10351,   132,   170,  6898,   118,  1149,
          1121,  1103,  5364, 18601,   113, 21362,  2271,   114,   204,   109,
           127,   119,   123,  2107,  6478,  1121, 21362,  2271,  1105, 18235,
          1116,  2346,  3031,  1186,   204, 21902, 10522,  2042,  1104, 14056,
          2036,  1353, 10799,  1115,  1107, 23034,   157,  8271,   120,   157,
          8271,  2346,   115,  1707,   117,   170,  2281,  3445,  1104,   140,
          2428,  2137,   204,  1203,  2747, 19560,  4010,   113, 15411,  1658,
         25928,  2217,   114,   117,  1884,  4934,  2002,   113,  1664,   118,
         27410,   114,  3469,   204,   157,  8271,  2346, 25128,  8519,  1200,
           117,  1769,  1884, 13252,  1204,  2233,  1884, 18337,  3798,   157,
          8271,  2346,   113,   135,   126, 28349,  1306,   114,  1114,  2869,
          3653, 13950,   124,  1708,  1818,  7317,  1183,  2162, 12393, 18555,
         11410,  1708, 15297,   138,   204, 24930, 18380,  1158,  1126,  8362,
          7877,  2913,  3445,  1104, 13306, 16042,  3653,   113,   140,  2428,
          2137,   114,  1114,  1887,  3254,  1766, 14598,  4233,   204, 18959,
          5208, 16764,  1107,   140, 23935,  1105, 17355, 12725,  4863,   204,
         15969,  2137,   118,  2407,  1107, 17881,  1527,   204, 11661, 14274,
         12256,  8547,  2510,  3850,  4765,   117, 22661,  1105,  1329,   204,
         24689,  1181, 14274,  7097,  1121, 21362,  2271,   204,  1249, 27948,
          2888,  1113, 11169,  2538,   204, 20089,  4253,   170,   109,  1476,
          1550,  2768,   138, 13080,   204, 15969,  2137,   118, 12619,   117,
         22661,  1718,  1105,   140, 10044,   117, 12278,   122,  2025,   113,
           116,  7642, 24275,  2528, 24662, 18959,  2101,   114,   204, 12798,
          2509,  3850,  4765,   204, 16752,  6105,  4175,  1459,  1264,  1104,
          2380,  8724,   115,   157,  8271,   113,   152,   114,   134, 18491,
         11006, 18873, 19577,   151,   118,   152,  8745,  2007,  9272, 21564,
         11780,   142,  3190, 10147, 12412, 26140,   153, 11410, 21678, 11607,
          2036,   125,  5301,  1942,  1775, 10424,  1475,  6844, 12556,  1513,
         21025,  5301,  1942,  1775, 10424,  1477,  6844, 12556,  1513, 21025,
          5301,  1942,  1775, 10424,  1495,  6844, 12556,  1513, 21025,   135,
          1476,  4252,  1643, 24171]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}, 'question': 'What are the 3 main disease-related keywords in the text?'}