In [1]:
import os
import pandas as pd
from pdfminer.high_level import extract_text
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

# Function to extract text from PDF file
def extract_text_from_pdf(file_path):
    return extract_text(file_path)

# Initialize the tokenizer and model for question-answering
model_name = 'dmis-lab/biobert-base-cased-v1.1-squad'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Initialize the question-answering pipeline
question_answering = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Set the directory containing the PDF files
pdf_directory = '/Users/amin/Desktop/33/test'

# Initialize an empty dataframe
df = pd.DataFrame(columns=['File', 'Diag_Treat'])

# Iterate over the files in the directory
for file_name in os.listdir(pdf_directory):
    if file_name.endswith('.pdf'):
        # Extract text from the PDF file
        file_path = os.path.join(pdf_directory, file_name)
        pdf_text = extract_text_from_pdf(file_path)

        # Ask the question-answering model
        question = 'Is the primary focus of the bio-related company diagnosing diseases or treating diseases?'
        answer = question_answering(question=question, context=pdf_text)['answer']

        # Standardize the answer to either 'Diagnose' or 'Treatment'
        diag_treat = 'Diagnose' if 'diagnos' in answer.lower() else 'Treatment'

        # Append the result to the dataframe
        df = df.append({'File': file_name[:-4], 'Diag_Treat': diag_treat}, ignore_index=True)

# Display the dataframe
print(df)


Downloading (…)lve/main/config.json:   0%|          | 0.00/477 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

                                                 File Diag_Treat
0                      Zehna Corporate Deck SymBiosis  Treatment
1                                 PS_TCRR_NOTE_1.5.23  Treatment
2                      Grace Sciece Platform (9.2.20)  Treatment
3                       AcuamarkDx Series A_Corp Deck  Treatment
4                                     coding.bio_deck  Treatment
5         Centurion_BioPharma-Non-Confidential_2021_2  Treatment
6                              JPM_SRZN_NOTE_11.28.22  Treatment
7                                   G_ACET_IOC_4.8.21  Treatment
8                                C_ARQT_NOTE_11.15.22  Treatment
9                                  B_RXRX_IOC_9.21.21  Treatment
10               Dianomi Non-Confidential Slides - v3  Treatment
11                           KBC_MAAT.FR_IOC_12.14.21  Treatment
12                               Engrail Therapeutics  Treatment
13                                 WB_VOR_IOC_1.25.22  Treatment
14                       