In [1]:
import os
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import pipeline

def read_pdf(file):
    text = extract_text(file)
    return text

def is_human_related(text, nlp):
    question = "Does this company primarily focus on solving human-related problems?"
    answer = nlp(question=question, context=text)
    return answer['answer']

def main():
    pdf_folder = "/Users/amin/Desktop/33/test"
    data = []

    nlp = pipeline(
        "question-answering",
        model="bert-large-uncased-whole-word-masking-finetuned-squad",
        tokenizer="bert-large-uncased-whole-word-masking-finetuned-squad",
    )

    for file in os.listdir(pdf_folder):
        if file.endswith(".pdf"):
            file_path = os.path.join(pdf_folder, file)
            try:
                text = read_pdf(file_path)
                human_related = is_human_related(text, nlp)
                if human_related.lower() in ["yes", "true"]:
                    human_related = "yes"
                else:
                    human_related = "no"
                data.append([file[:-4], human_related])
            except Exception as e:
                print(f"Error processing {file}: {e}")

    df = pd.DataFrame(data, columns=["File", "Human"])
    print(df)

if __name__ == "__main__":
    main()


                                                 File Human
0                      Zehna Corporate Deck SymBiosis    no
1                                 PS_TCRR_NOTE_1.5.23    no
2                      Grace Sciece Platform (9.2.20)    no
3                       AcuamarkDx Series A_Corp Deck    no
4                                     coding.bio_deck    no
5         Centurion_BioPharma-Non-Confidential_2021_2    no
6                              JPM_SRZN_NOTE_11.28.22    no
7                                   G_ACET_IOC_4.8.21    no
8                                C_ARQT_NOTE_11.15.22    no
9                                  B_RXRX_IOC_9.21.21    no
10               Dianomi Non-Confidential Slides - v3    no
11                           KBC_MAAT.FR_IOC_12.14.21    no
12                               Engrail Therapeutics    no
13                                 WB_VOR_IOC_1.25.22    no
14                                 E_FNCH_IOC_8.10.21    no
15                                CG_GRN