## 📚 Prerequisites

Ensure that your Azure Services are properly set up, your Conda environment is created, and your environment variables are configured as per the instructions in the [README.md](README.md) file.

In [58]:
import os

# Repository root; the relative paths and project imports used later in the notebook are resolved from here
target_directory = r"C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth"  # change your directory here

# Only switch the working directory when the path is present; otherwise just report it
if not os.path.exists(target_directory):
    print(f"Directory {target_directory} does not exist.")
else:
    os.chdir(target_directory)
    print(f"Directory changed to {os.getcwd()}")

Directory changed to C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth


### PDF to Images

In [59]:
from src.aoai.azure_openai import AzureOpenAIManager
from utils.ml_logging import get_logger

# Create the notebook's logger through the project's get_logger helper (called with its defaults)
logger = get_logger()

In [60]:
from src.extractors.pdf_data_extractor import OCRHelper
# Helper used below to render PDF pages as images; constructed for the "pre-auth-policies" container.
# NOTE(review): presumably an Azure Blob Storage container name — confirm against OCRHelper.
ocr_data_extractor_helper = OCRHelper(container_name="pre-auth-policies")

In [62]:
# Source PDFs: a local directory that is searched together with its subdirectories
INPUT_PATH = "utils/data/cases/001/b/"

# Destination directory for the per-page PNG images
OUTPUT_PATH = "utils/data/pdfs/001_b_rejected/"

ocr_data_extractor_helper.extract_images_from_pdf(input_path=INPUT_PATH, output_path=OUTPUT_PATH)

2024-10-21 22:25:30,235 - micro - MainProcess - INFO     Input path is a local file or directory: utils/data/cases/001/b/ (pdf_data_extractor.py:extract_images_from_pdf:148)
2024-10-21 22:25:30,241 - micro - MainProcess - INFO     Found 5 PDF files in utils/data/cases/001/b/ and its subdirectories (pdf_data_extractor.py:_process_pdf_directory:173)
2024-10-21 22:25:30,242 - micro - MainProcess - INFO     Processing file: utils/data/cases/001/b/doctor_notes\01_b_notes.pdf (pdf_data_extractor.py:_process_pdf_directory:175)
2024-10-21 22:25:30,248 - micro - MainProcess - INFO     Opening file: utils/data/cases/001/b/doctor_notes\01_b_notes.pdf (pdf_data_extractor.py:_process_single_pdf:204)
2024-10-21 22:25:30,318 - micro - MainProcess - INFO     Processing page 1 of utils/data/cases/001/b/doctor_notes\01_b_notes.pdf (pdf_data_extractor.py:_process_single_pdf:209)
2024-10-21 22:25:30,479 - micro - MainProcess - INFO     Saved image: utils/data/pdfs/001_b_rejected/01_b_notes-page-1.png (pdf

In [65]:
import os
from pathlib import Path
from typing import List, Union

def find_all_files(root_folder: str, extensions: Union[List[str], str]) -> List[str]:
    """
    Recursively find all files with the given extensions under a root folder.

    Matching is case-insensitive and based on the file-name suffix. Extensions may be
    written with or without a leading dot, so ``"png"`` and ``".png"`` are equivalent.

    Args:
        root_folder (str): The root folder to search, including all of its subfolders.
        extensions (Union[List[str], str]): One extension or a list of extensions
            (e.g., ``['jpeg', 'jpg', 'png', 'pdf']`` or ``'.pdf'``).

    Returns:
        List[str]: Resolved (absolute) paths of the matching files, sorted so the
        order is the same on every run and platform. Empty when nothing matches or
        when ``root_folder`` does not exist.
    """
    if isinstance(extensions, str):
        extensions = [extensions]

    # Normalise once: lower-case and drop any leading dot, then build the ".ext" suffixes.
    # A tuple lets str.endswith test all suffixes in a single call.
    suffixes = tuple(f".{ext.lower().lstrip('.')}" for ext in extensions)

    root_folder_path = Path(root_folder).resolve()

    # os.walk yields nothing for a missing directory, so that case returns [] naturally.
    files_list = [
        str((Path(root) / file).resolve())
        for root, _, files in os.walk(root_folder_path)
        for file in files
        if file.lower().endswith(suffixes)
    ]

    # os.walk order depends on the file system; sort so callers get a stable order.
    files_list.sort()
    return files_list


In [66]:
# Collect every PNG found under the folder the PDF pages were rendered into
root_folder = "utils/data/pdfs/001_b_rejected"
extensions = ["png"]
pa_files_images = find_all_files(root_folder=root_folder, extensions=extensions)

## Process Images

In [68]:
# Client used for the image + prompt chat request further down.
# NOTE(review): completion_model_name receives the literal string 'AZURE_OPENAI_CHAT_DEPLOYMENT_ID',
# not the value of an environment variable with that name — confirm AzureOpenAIManager expects this.
azure_openai_client_05_13 = AzureOpenAIManager(completion_model_name='AZURE_OPENAI_CHAT_DEPLOYMENT_ID')

In [70]:
# System message for the extraction request: sets the Prior Authorization specialist role
# and lists the clinical fields the model must return as JSON key-value pairs.
SYSTEM_PROMPT = """
## Role:
You are an expert Prior Authorization (PA) specialist with extensive experience in analyzing medical documents and extracting critical information.

## Task:
Your task is to review and interpret medical documents provided as images, such as prior authorization forms, medical imaging results, lab reports, and doctor notes. Your goal is to extract essential information to make informed decisions regarding Prior Authorization (PA) workflows. You are proficient in handling images from PDFs and ensuring the accuracy and completeness of the extracted data.

## Instructions:
Carefully analyze the provided images and extract the following information, presenting it in **JSON format** as key-value pairs:

1. **Diagnosis and Medical Justification** (including ICD-10 code)
2. **Detailed History of Alternative Treatments and Results**
3. **Relevant Lab Results or Diagnostic Imaging**
4. **Documented Symptom Severity and Impact on Daily Life**
5. **Prognosis and Risk if Treatment Is Not Approved**
6. **Clinical Rationale for Urgency** (if applicable)
7. **Plan for Treatment or Request for Prior Authorization**
    - **Name of the Medication or Procedure Being Requested**
    - **Code of the Medication or Procedure** (e.g., CPT code, NDC code, or any other relevant medical code). If not available, do your best to provide the code; if unsure, mention "Not provided."
    - **Dosage or plan for the medication or procedure**
    - **Duration of Doses or Days of Treatment**
    - **Rationale for the Medication or Procedure**

**Note:** This task involves critical clinical information extraction. Ensure all details are correctly interpreted and accurately transcribed. Pay close attention to medical terminology, codes, and any indications of urgency or severity.
"""


In [71]:
# User message for the extraction request: names the patient, physician and clinical fields
# to pull from the page images and embeds the JSON schema the reply must follow.
# The embedded schema is kept valid JSON (it sits between "Schema:" and instruction 4)
# so the model is not shown a malformed example.
USER_PROMPT_1 = '''
Given the following images from medical documents .pdf (including prior authorization forms, medical imaging results, lab results, and doctor notes):

Please carefully analyze the provided images to extract the following information and present it in JSON format as key-value pairs:

1. **Patient Information**:
    - Patient Name
    - Patient Date of Birth
    - Patient ID (e.g., Cigna ID)
    - Patient Address
    - Patient Phone Number

2. **Physician Information**:
    - Physician Name
    - Specialty
    - Physician Contact (Office Phone, Fax, Address)

3. **Clinical Information**:
    - Diagnosis and medical justification (including ICD-10 code)
    - Detailed history of alternative treatments and results
    - Relevant lab results or diagnostic imaging
    - Documented symptom severity and impact on daily life
    - Prognosis and risk if treatment is not approved
    - Clinical rationale for urgency (if applicable)
    - Plan for Treatment or Request for Prior Authorization
      - Name of the medication or procedure being requested
      - Code of the medication or procedure (if available)
      - Dosage or plan for the medication or procedure
      - Duration of Doses or Days of Treatment
      - Rationale for the medication or procedure

Instructions:

1. **Accuracy is Paramount**: Ensure all extracted information is accurate and directly supported by the provided text. Pay special attention to correcting any OCR errors or misinterpretations.

2. **OCR Error Correction**: Be vigilant for common OCR mistakes, such as misread characters or numbers, and correct them based on context.

3. **Formatting the JSON Output**:
   - Use the exact field names as provided.
   - If certain information is not available in the text, indicate it as "Not provided" in the JSON output.

Generate a JSON output based on the following schema and instructions:

Schema:
{
  "Patient Information": {
    "Patient Name": "Value here",
    "Patient Date of Birth": "Value here",
    "Patient ID": "Value here",
    "Patient Address": "Value here",
    "Patient Phone Number": "Value here"
  },
  "Physician Information": {
    "Physician Name": "Value here",
    "Specialty": "Value here",
    "Physician Contact": {
      "Office Phone": "Value here",
      "Fax": "Value here",
      "Office Address": "Value here"
    }
  },
  "Clinical Information": {
    "Diagnosis and medical justification (including ICD-10 code)": "Value here",
    "Detailed history of alternative treatments and results": "Value here",
    "Relevant lab results or diagnostic imaging": "Value here",
    "Documented symptom severity and impact on daily life": "Value here",
    "Prognosis and risk if treatment is not approved": "Value here",
    "Clinical rationale for urgency (if applicable)": "Value here",
    "Plan for Treatment or Request for Prior Authorization": {
      "Medication or Procedure": "Value here",
      "Code": "Value here",
      "Dosage": "Value here",
      "Duration": "Value here",
      "Rationale": "Value here"
      }
  }
}

4. **Clarity and Professionalism**:
    - Use clear and concise language appropriate for medical documentation.
    - Maintain professional tone and terminology.

5. **Multiple Entries Handling**:
    - If multiple diagnoses, treatments, or lab results are present, list each entry separated by semicolons within the same field.

6. **ICD-10 Codes**:
    - Ensure that any ICD-10 codes are accurately extracted and correspond correctly to the diagnosis.
    - If the ICD-10 code is missing but the diagnosis is present, you may look up the standard ICD-10 code that matches the diagnosis, if appropriate.

7. **Lab Results and Imaging**:
    - Include key findings, values, and any notable abnormalities.
    - Mention the type of test and the date if available.

8. **Symptom Severity and Impact**:
    - Provide details on how the symptoms affect the patient's daily life, including any limitations or impairments.

9. **Prognosis and Risks**:
    - Clearly state the potential outcomes and risks if the treatment is not approved, as documented in the text.

10. **Clinical Rationale for Urgency**:
    - If applicable, explain why the treatment is urgent based on the clinical information provided.

11. **Plan for Treatment or Request for Prior Authorization**:
    - Clearly state the name of the medication or procedure being requested for the patient.
    - Include the code of the medication or procedure if available.
    - State the dosage or plan for the medication or procedure.
    - Specify the duration of doses or days of treatment.
    - Provide the rationale for the medication or procedure based on the clinical information provided.

**Note**: This task involves critical clinical information extraction. Take your time to ensure all details are correctly interpreted and accurately transcribed.
'''

In [72]:
pa_files_images

['C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_form-page-1.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_form-page-2.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_imaging-page-1.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_labs-page-1.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_labs-page-2.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_labs-page-3.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_notes-page-1.png',
 'C:\\Users\\pablosal\\Desktop\\gbb-ai-hls-factory-prior-auth\\utils\\data\\pdfs\\001_b_rejected\\01_b_notes-page-2.png']

In [73]:
api_response_gpt4o = await azure_openai_client_05_13.generate_chat_response(query=USER_PROMPT_1, 
                                                           system_message_content=SYSTEM_PROMPT, 
                                                           image_paths=pa_files_images,
                                                           conversation_history=[],
                                                           stream=False,
                                                           response_format='json_object',
                                                           max_tokens=2000)

2024-10-21 22:27:44,000 - micro - MainProcess - INFO     Function generate_chat_response started at 2024-10-21 22:27:44 (azure_openai.py:generate_chat_response:345)
2024-10-21 22:27:44,043 - micro - MainProcess - INFO     Image C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth\utils\data\pdfs\001_b_rejected\01_b_form-page-1.png type: image/png (azure_openai.py:generate_chat_response:384)
2024-10-21 22:27:44,074 - micro - MainProcess - INFO     Image C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth\utils\data\pdfs\001_b_rejected\01_b_form-page-2.png type: image/png (azure_openai.py:generate_chat_response:384)
2024-10-21 22:27:44,106 - micro - MainProcess - INFO     Image C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth\utils\data\pdfs\001_b_rejected\01_b_imaging-page-1.png type: image/png (azure_openai.py:generate_chat_response:384)
2024-10-21 22:27:44,134 - micro - MainProcess - INFO     Image C:\Users\pablosal\Desktop\gbb-ai-hls-factory-prior-auth\utils\data\pdfs\0

In [74]:
api_response_gpt4o['response']

{'Patient Information': {'Patient Name': 'Sarah Sample',
  'Patient Date of Birth': '10-19-2014',
  'Patient ID': '4567890',
  'Patient Address': '28 W Comet Ave, Chicago, IL 60607',
  'Patient Phone Number': '555-123-4567'},
 'Physician Information': {'Physician Name': 'Shiva Pedram, MD',
  'Specialty': 'Pediatrics',
  'Physician Contact': {'Office Phone': '555-991-2750',
   'Fax': '555-786-5643',
   'Office Address': '5721 S Maryland Ave, Chicago, IL 60637'}},
 'Clinical Information': {'Diagnosis and medical justification (including ICD-10 code)': 'Crohn’s Disease, ICD-10: K50.90',
  'Detailed history of alternative treatments and results': 'Not provided',
  'Relevant lab results or diagnostic imaging': 'EGD: Stomach - Gastritis, erythema present; Duodenum - Mild to moderate duodenitis with edema. Colonoscopy: Ileum - Patchy inflammation with areas of erythema and ulceration; Colon - Diffuse inflammation, granularity, and friability present; Rectum - Mild inflammation, no significant