In [1]:
import os
import pdfplumber
import pandas as pd
from transformers import pipeline

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        text = ' '.join(page.extract_text() for page in pdf.pages)
    return text

def divide_text_into_chunks(text, chunk_size=300):
    words = text.split()
    return [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]

def is_human_related(chunk, nlp):
    response = nlp(question="Is this text about a human disease?", context=chunk)
    return response['answer'].lower() == 'yes'

def process_pdf_file(file_path, nlp):
    text = extract_text_from_pdf(file_path)
    chunks = divide_text_into_chunks(text)

    for chunk in chunks:
        if is_human_related(chunk, nlp):
            return 'yes'
    return 'no'

# Initialize the question-answering pipeline
nlp = pipeline("question-answering", model="bert-large-uncased-whole-word-masking-finetuned-squad", tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad")

# Set the folder containing PDF files
pdf_folder = '/Users/amin/Desktop/33/test'

# Process all PDF files in the folder
data = []
for file_name in os.listdir(pdf_folder):
    if file_name.endswith('.pdf'):
        file_path = os.path.join(pdf_folder, file_name)
        human_related = process_pdf_file(file_path, nlp)
        data.append({'File': file_name[:-4], 'Human': human_related})

# Create a pandas DataFrame and save it as a CSV file
df = pd.DataFrame(data)
df.to_csv('output.csv', index=False)


In [2]:
df

Unnamed: 0,File,Human
0,Zehna Corporate Deck SymBiosis,no
1,PS_TCRR_NOTE_1.5.23,no
2,Grace Sciece Platform (9.2.20),no
3,AcuamarkDx Series A_Corp Deck,no
4,coding.bio_deck,no
5,Centurion_BioPharma-Non-Confidential_2021_2,no
6,JPM_SRZN_NOTE_11.28.22,no
7,G_ACET_IOC_4.8.21,no
8,C_ARQT_NOTE_11.15.22,no
9,B_RXRX_IOC_9.21.21,no
