In [None]:
import os
import PyPDF2
from transformers import pipeline
import pandas as pd

# Function to read PDF files and convert them to text
def pdf_to_text(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file to read.

    Returns:
        A single string made of each page's extracted text, joined with no
        separator (same as the previous behaviour).
    """
    # PyPDF2 3.x removed PdfFileReader / numPages / getPage / extractText.
    # Prefer the current names and fall back to the legacy ones so the
    # function works on both old and new installations.
    reader_cls = getattr(PyPDF2, 'PdfReader', None) or PyPDF2.PdfFileReader

    with open(file_path, 'rb') as file:
        reader = reader_cls(file)
        page_texts = []
        for page in reader.pages:
            extract = getattr(page, 'extract_text', None) or page.extractText
            # Guard against an extractor handing back None for a page.
            page_texts.append(extract() or '')
    return ''.join(page_texts)

# Function to split text into chunks of `chunk_size` whitespace-separated words (default 300)
def chunk_text(text, chunk_size=300):
    """Split *text* into consecutive pieces of at most *chunk_size* words.

    Words are whatever ``str.split()`` yields (whitespace-delimited); each
    piece is rebuilt by joining its words with single spaces. The final piece
    may be shorter than *chunk_size*.
    """
    tokens = text.split()
    pieces = []
    for start in range(0, len(tokens), chunk_size):
        window = tokens[start:start + chunk_size]
        pieces.append(' '.join(window))
    return pieces

# Function to classify each chunk as Diagnosis or Treatment
def classify_chunks(chunks, nlp, diagnosis_labels, treatment_labels):
    """Decide whether a document is about 'Diagnosis' or 'Treatment'.

    Each chunk is passed to *nlp* together with the combined label list; the
    scores returned for diagnosis labels and for treatment labels are summed
    over all chunks and the two totals are compared.

    Args:
        chunks: Iterable of text chunks to classify.
        nlp: Callable invoked as ``nlp(chunk, candidate_labels=...)`` whose
            result is indexable by ``'labels'`` and ``'scores'`` (parallel
            sequences). Presumably a Hugging Face zero-shot pipeline --
            confirm against the caller.
        diagnosis_labels: List of label strings counted towards 'Diagnosis'.
        treatment_labels: List of label strings counted towards 'Treatment'.

    Returns:
        ``'Diagnosis'`` if the diagnosis total is strictly greater than the
        treatment total, otherwise ``'Treatment'``. Note that a tie, or an
        empty *chunks*, therefore yields ``'Treatment'``.
    """
    # Loop-invariant work, done once instead of once per chunk.
    candidate_labels = diagnosis_labels + treatment_labels
    diagnosis_set = frozenset(diagnosis_labels)
    treatment_set = frozenset(treatment_labels)

    diagnosis_score = 0
    treatment_score = 0

    for chunk in chunks:
        response = nlp(chunk, candidate_labels=candidate_labels)

        for label, score in zip(response['labels'], response['scores']):
            # Diagnosis is checked first, so a label present in both lists
            # counts as diagnosis only.
            if label in diagnosis_set:
                diagnosis_score += score
            elif label in treatment_set:
                treatment_score += score

    return 'Diagnosis' if diagnosis_score > treatment_score else 'Treatment'

# Initialize the zero-shot classification model with the specified model name and revision
nlp = pipeline('zero-shot-classification', model='facebook/bart-large-mnli', revision='c626438')

# Set the folder path containing the PDF files
folder_path = '/Users/amin/Desktop/33/test'

# Define the related words for each category
diagnosis_labels = [
    'medical diagnosis', 'condition identification', 'illness detection', 'disorder diagnosis',
    'health problem identification', 'pathology detection'
]

treatment_labels = [
    'medical treatment', 'therapeutic intervention', 'medication', 'therapy', 'care plan',
    'healing regimen', 'cure', 'rehabilitation', 'management'
]

# Collect one row per PDF in a plain list and build the DataFrame once at the
# end: DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and
# calling it per row copies the whole frame each time.
rows = []

# os.listdir returns entries in arbitrary order; sort so the CSV is reproducible.
for file in sorted(os.listdir(folder_path)):
    if not file.endswith('.pdf'):
        continue

    file_path = os.path.join(folder_path, file)

    # Convert the PDF to text and split it into chunks
    text = pdf_to_text(file_path)
    chunks = chunk_text(text)

    # Classify the chunks and record the result; file[:-4] drops the '.pdf' suffix
    classification = classify_chunks(chunks, nlp, diagnosis_labels, treatment_labels)
    rows.append({'File': file[:-4], 'Diag_Treat': classification})

# Passing the column list keeps the header present even when no PDF was found
results = pd.DataFrame(rows, columns=['File', 'Diag_Treat'])

# Save the results to a CSV file
results.to_csv('classification_results.csv', index=False)


Xref table not zero-indexed. ID numbers for objects will be corrected.
Xref table not zero-indexed. ID numbers for objects will be corrected.
