In [3]:
import os
import PyPDF2
import pandas as pd
from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering

# Function to extract text from PDF file
def extract_text_from_pdf(file_path):
    pdf_file = open(file_path, 'rb')
    pdf_reader = PyPDF2.PdfFileReader(pdf_file)
    text = ''
    for page_num in range(pdf_reader.numPages):
        text += pdf_reader.getPage(page_num).extractText()
    pdf_file.close()
    return text

# Initialize the tokenizer and model for question-answering
model_name = 'distilbert-base-cased-distilled-squad'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Initialize the question-answering pipeline
question_answering = pipeline('question-answering', model=model, tokenizer=tokenizer)

# Set the directory containing the PDF files
pdf_directory = '/Users/amin/Desktop/33/test'

# Initialize an empty dataframe
df = pd.DataFrame(columns=['File', 'Diag_Treat'])

# Iterate over the files in the directory
for file_name in os.listdir(pdf_directory):
    if file_name.endswith('.pdf'):
        # Extract text from the PDF file
        file_path = os.path.join(pdf_directory, file_name)
        pdf_text = extract_text_from_pdf(file_path)

        # Ask the question-answering model
        question = 'Is the primary focus of the bio-related company diagnosing diseases or treating diseases?'
        answer = question_answering(question=question, context=pdf_text)['answer']

        # Standardize the answer to either 'Diagnose' or 'Treatment'
        diag_treat = 'Diagnose' if 'diagnos' in answer.lower() else 'Treatment'

        # Append the result to the dataframe
        df = df.append({'File': file_name[:-4], 'Diag_Treat': diag_treat}, ignore_index=True)

# Display the dataframe
print(df)


Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
FloatObject (b'0.00-5677566') invalid; use 0.0 instead
Superfluous whitespace found in object header b'1' b'0'
Superfluous whitespace found in object header b'2' b'0'
Superfluous whitespace found in object header b'3' b'0'
Superfluous whitespace found in object header b'18' b'0'
Superfluous whitespace found in object header b'41' b'0'
Superfluous whitespace found in object header b'61' b'0'
Superfluous whitespace found in object header b'206' b'0'
Superfluous whitespace found in object header b'214' b'0'
Superfluous whitespace found in object header b'231' b'0'
Superfluous whitespace found in object header b'234' b'0'
Superfluous whitespace found in object header b'241' b'0'
Superfluous whitespace found in object header b'254' b'0'
Superfluous whitespace found in object header b'257' b'0'
Superfluous whitespace found in object header b'267' b'0'


Superfluous whitespace found in object header b'135' b'0'
Superfluous whitespace found in object header b'136' b'0'
Superfluous whitespace found in object header b'137' b'0'
Superfluous whitespace found in object header b'138' b'0'
Superfluous whitespace found in object header b'139' b'0'
Superfluous whitespace found in object header b'140' b'0'
Superfluous whitespace found in object header b'141' b'0'
Superfluous whitespace found in object header b'142' b'0'
Superfluous whitespace found in object header b'143' b'0'
Superfluous whitespace found in object header b'144' b'0'
Superfluous whitespace found in object header b'145' b'0'
Superfluous whitespace found in object header b'146' b'0'
Superfluous whitespace found in object header b'147' b'0'
Superfluous whitespace found in object header b'148' b'0'
Superfluous whitespace found in object header b'149' b'0'
Superfluous whitespace found in object header b'150' b'0'
Superfluous whitespace found in object header b'151' b'0'
Superfluous wh

Superfluous whitespace found in object header b'298' b'0'
Superfluous whitespace found in object header b'297' b'0'
Superfluous whitespace found in object header b'296' b'0'
Superfluous whitespace found in object header b'302' b'0'
Superfluous whitespace found in object header b'301' b'0'
Superfluous whitespace found in object header b'300' b'0'
Superfluous whitespace found in object header b'349' b'0'
Superfluous whitespace found in object header b'312' b'0'
Superfluous whitespace found in object header b'311' b'0'
Superfluous whitespace found in object header b'310' b'0'
Superfluous whitespace found in object header b'348' b'0'
Superfluous whitespace found in object header b'304' b'0'
Superfluous whitespace found in object header b'314' b'0'
Superfluous whitespace found in object header b'316' b'0'
Superfluous whitespace found in object header b'318' b'0'
Superfluous whitespace found in object header b'320' b'0'
Superfluous whitespace found in object header b'322' b'0'
Superfluous wh

Superfluous whitespace found in object header b'522' b'0'
Superfluous whitespace found in object header b'523' b'0'
Superfluous whitespace found in object header b'524' b'0'
Superfluous whitespace found in object header b'525' b'0'
Superfluous whitespace found in object header b'526' b'0'
Superfluous whitespace found in object header b'527' b'0'
Superfluous whitespace found in object header b'528' b'0'
Superfluous whitespace found in object header b'529' b'0'
Superfluous whitespace found in object header b'530' b'0'
Superfluous whitespace found in object header b'531' b'0'
Superfluous whitespace found in object header b'532' b'0'
Superfluous whitespace found in object header b'533' b'0'
Superfluous whitespace found in object header b'534' b'0'
Superfluous whitespace found in object header b'535' b'0'
Superfluous whitespace found in object header b'536' b'0'
Superfluous whitespace found in object header b'537' b'0'
Superfluous whitespace found in object header b'538' b'0'
Superfluous wh

Superfluous whitespace found in object header b'658' b'0'
Superfluous whitespace found in object header b'659' b'0'
Superfluous whitespace found in object header b'660' b'0'
Superfluous whitespace found in object header b'661' b'0'
Superfluous whitespace found in object header b'662' b'0'
Superfluous whitespace found in object header b'663' b'0'
Superfluous whitespace found in object header b'664' b'0'
Superfluous whitespace found in object header b'665' b'0'
Superfluous whitespace found in object header b'666' b'0'
Superfluous whitespace found in object header b'667' b'0'
Superfluous whitespace found in object header b'668' b'0'
Superfluous whitespace found in object header b'669' b'0'
Superfluous whitespace found in object header b'670' b'0'
Superfluous whitespace found in object header b'671' b'0'
Superfluous whitespace found in object header b'672' b'0'
Superfluous whitespace found in object header b'673' b'0'
Superfluous whitespace found in object header b'674' b'0'
Superfluous wh

                                                 File Diag_Treat
0                      Zehna Corporate Deck SymBiosis  Treatment
1                                 PS_TCRR_NOTE_1.5.23  Treatment
2                      Grace Sciece Platform (9.2.20)  Treatment
3                       AcuamarkDx Series A_Corp Deck  Treatment
4                                     coding.bio_deck  Treatment
5         Centurion_BioPharma-Non-Confidential_2021_2  Treatment
6                              JPM_SRZN_NOTE_11.28.22  Treatment
7                                   G_ACET_IOC_4.8.21  Treatment
8                                C_ARQT_NOTE_11.15.22  Treatment
9                                  B_RXRX_IOC_9.21.21  Treatment
10               Dianomi Non-Confidential Slides - v3  Treatment
11                           KBC_MAAT.FR_IOC_12.14.21  Treatment
12                               Engrail Therapeutics  Treatment
13                                 WB_VOR_IOC_1.25.22  Treatment
14                       