# Get all Resident surgeries and case log data

This code does the following: 

1) Updates the [resident file](https://storage.cloud.google.com/shared-aif-bucket-3444/resident_cases/resident_data.json) if there are new ENT residents,

2) Uploads the resident file,

3) Uploads the [consultant file](https://storage.cloud.google.com/shared-aif-bucket-3444/resident_cases/ENT_consultants.xlsx),

4) Uploads the [CPT code conversion file](https://storage.cloud.google.com/shared-aif-bucket-3444/resident_cases/cpt_database.xlsx),

5) Launches a Jupyter widget that lets the user select the residents and the date range of interest,

6) Performs a SQL query based on the user's selection extracting all cases for that resident over the defined period of time, using billing CPT codes to determine what CPT codes were performed during the case,

7) Converts the billing CPT codes into codes appropriate for logging cases, based on the CPT code conversion file,

8) Predicts special equipment used (robot, KTP, CO2 laser, sialendoscopy) based on the description of the case,

9) Creates an excel spreadsheet with the modified CPT codes and all of the relevant details, so that it can be pasted into the ACGME caselog template.




## Things that will require updating over time:

1) When consultants are added to the ACGME website dropdown list, they need to be added to the [consultant file](https://storage.cloud.google.com/shared-aif-bucket-3444/resident_cases/ENT_consultants.xlsx) located in google cloud storage

2) If errors are found or modifications are needed to adjust the CPT conversion file, it should be edited [here](https://storage.cloud.google.com/shared-aif-bucket-3444/resident_cases/cpt_database.xlsx) located in google cloud storage

In [1]:
import sys
import shutil
from pathlib import Path

# Reuse the virtual environment if one already exists; otherwise create a fresh .venv.
venv_path = Path(".venv")
if venv_path.exists():
    print("Using the existing virtual environment...")
    # Disabled: uncomment to delete the environment and force a clean rebuild.
    # shutil.rmtree(venv_path)

else:
    print("Creating fresh virtual environment...")
    !python -m venv .venv --clear

# Use absolute paths to avoid any ambiguity
venv_abs = venv_path.absolute()
# The interpreter and pip live in different sub-folders on Windows vs. other platforms.
if sys.platform == "win32":
    python_exe = venv_abs / "Scripts" / "python.exe"
    pip_exe = venv_abs / "Scripts" / "pip.exe"
else:
    python_exe = venv_abs / "bin" / "python"
    pip_exe = venv_abs / "bin" / "pip"

# Install packages using explicit commands
# NOTE(review): `%pip` installs into the environment of the running kernel, which is
# presumably not `.venv`; `pip_exe` is computed above but never used - confirm intent.
print("\nInstalling packages...")
%pip install --upgrade pip
%pip install requests

# Test
# Runs the .venv interpreter (not the kernel) and prints where `requests` was imported from.
print("\nTesting installation:")
!"{python_exe}" -c "import requests; print(f'Success! Requests from: {{requests.__file__}}')"
# Install the dependencies listed in requirements.txt.
%pip install -r requirements.txt
# Show what's installed
print("\nInstalled packages:")
%pip list


Using the existing virtual environment...

Installing packages...
Collecting pip
  Using cached pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-25.1.1-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.3.1
    Uninstalling pip-24.3.1:
      Successfully uninstalled pip-24.3.1
Successfully installed pip-25.1.1
Note: you may need to restart the kernel to use updated packages.
Collecting requests
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting charset-normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl.metadata (35 kB)
Collecting idna<4,>=2.5 (from requests)
  Using cached idna-3.10-py3-none-any.whl.metadata (10 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.4.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.4.26-py3-none-a

In [None]:
# Log in with Google Cloud Application Default Credentials (interactive shell command).
# Presumably these are the credentials the BigQuery/Storage clients below rely on - confirm.
!gcloud auth application-default login

"""
ENT Resident Case Processing System with Interactive UI

This notebook combines:
1. Interactive widgets for parameter selection and execution
2. DataLoader class for retrieving data from various sources
3. ResidentCaseProcessor class for data enrichment and transformation

The result is a complete end-to-end system with a user-friendly interface.
"""

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from google.cloud import bigquery, storage
import pandas as pd
import json
import openpyxl
from datetime import datetime, timedelta
from IPython.display import display, Markdown, HTML
from tqdm.notebook import tqdm
from google.cloud.storage.blob import Blob
from IPython.display import FileLink, display
from io import BytesIO
import traceback


class DataLoader:
    """Class to load all source data from Google Cloud Storage and BigQuery."""

    def __init__(self, project_id="aif-usr-p-ent-ai-misc-3444", bucket_name="shared-aif-bucket-3444"):
        """Store configuration, default dates and blob paths, and create cloud clients.

        Args:
            project_id: Google Cloud project ID passed to the BigQuery client.
            bucket_name: Name of the Cloud Storage bucket holding the reference files.
        """
        iso_day = '%Y-%m-%d'
        now = datetime.now()

        self.project_id = project_id
        self.bucket_name = bucket_name

        # Default reporting window: the seven days ending today.
        self.today = now
        self.start_date = (now - timedelta(days=7)).strftime(iso_day)
        self.end_date = now.strftime(iso_day)
        self.today_date = self.end_date

        # Object paths inside the bucket.
        self.cpt_blob_name = "resident_cases/cpt_database.xlsx"
        self.consultant_blob_name = "resident_cases/ENT_consultants.xlsx"
        self.resident_blob_name = "resident_cases/resident_data.json"
        # Write target for the merged resident file (currently the same object as the read path).
        self.resident_blob_name2 = "resident_cases/resident_data.json"

        # Cloud clients and the bucket handle used by every load/save method.
        self.bq_client = bigquery.Client(project=self.project_id)
        self.storage_client = storage.Client()
        self.bucket = self.storage_client.bucket(self.bucket_name)

        # No progress listener until set_progress_callback() is called.
        self.progress_callback = None

    def set_progress_callback(self, callback_function):
        """Set a callback function for progress updates.

        Args:
            callback_function: Function that takes (progress_value, status_text)
        """
        self.progress_callback = callback_function

    def update_progress(self, value, status_text=""):
        """Update progress through callback if available.

        Args:
            value: Progress value (0-100)
            status_text: Status message
        """
        if self.progress_callback:
            self.progress_callback(value, status_text)

    def debug_print(self, message, level="INFO"):
        """Print debug messages with timestamp and level.

        Args:
            message: The message to print
            level: The log level (INFO, WARNING, ERROR)
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        if level == "ERROR":
            print(f"üî¥ {timestamp} - ERROR: {message}")
        elif level == "WARNING":
            print(f"üü° {timestamp} - WARNING: {message}")
        else:
            print(f"üîµ {timestamp} - {message}")


    def load_consultant_data(self):
        """Download the consultant workbook from the bucket and parse it.

        Returns:
            DataFrame: Consultant reference data read from the Excel file.

        Raises:
            Exception: Any download or parsing error is logged, its traceback
                printed, and then re-raised unchanged.
        """
        try:
            self.update_progress(30, "Loading consultant database...")
            workbook_bytes = self.bucket.blob(self.consultant_blob_name).download_as_bytes()
            consultants = pd.read_excel(BytesIO(workbook_bytes))
            row_count = len(consultants)
            print(f"‚úÖ Loaded consultant database with {row_count} rows.")

            self.update_progress(35, "Consultant database loaded successfully")
            return consultants

        except Exception as err:
            self.debug_print(f"Error loading consultant data: {err!s}", "ERROR")
            traceback.print_exc()
            raise

    def load_cpt_data(self):
        """Download the CPT conversion workbook from the bucket and parse it.

        Returns:
            DataFrame: CPT reference data read from the Excel file.

        Raises:
            Exception: Any download or parsing error is logged, its traceback
                printed, and then re-raised unchanged.
        """
        try:
            self.update_progress(20, "Loading CPT database...")
            workbook_bytes = self.bucket.blob(self.cpt_blob_name).download_as_bytes()
            cpt_table = pd.read_excel(BytesIO(workbook_bytes))
            row_count = len(cpt_table)
            print(f"‚úÖ Loaded CPT database with {row_count} rows.")

            self.update_progress(25, "CPT database loaded successfully")
            return cpt_table

        except Exception as err:
            self.debug_print(f"Error loading CPT data: {err!s}", "ERROR")
            traceback.print_exc()
            raise

    def load_resident_data(self):
        """Load resident data from Google Cloud Storage.

        Returns:
            list: List of resident dictionaries
        """
        try:
            resident_blob = self.bucket.blob(self.resident_blob_name)
            json_bytes = resident_blob.download_as_bytes()
            dict_residents = json.loads(json_bytes.decode('utf-8'))
            print(f"‚úÖ Loaded resident data with {len(dict_residents)} entries.")

            # Show sample data
            sample_keys = list(dict_residents[0].keys())[:5]  # Show first 5 keys
            return dict_residents

        except Exception as e:
            self.debug_print(f"Error loading resident data: {str(e)}", "ERROR")
            traceback.print_exc()
            raise

    def create_residents_from_bigquery(self):
        """Create resident data from BigQuery.

        Returns:
            list: List of resident dictionaries with updated information
        """
        try:
            print("üìä Querying resident data from BigQuery...")

            # The SQL query to get resident information
            query = """
            SELECT
            PR.practitioner_display,
            PR.telecom3_value AS email,
            PR.telecom1_value AS phone_number,
            DHP.PROVIDER_LAN_ID AS LAN_ID,
            DHP.PROVIDER_PERSON_ID AS Person_ID,
            DHP.PROVIDER_FIRST_NAME AS First_name,
            DHP.PROVIDER_LAST_NAME AS Last_name

            FROM `ml-mps-adl-intfhr-phi-p-3b6e.phi_current_fhir_us_p.PractitionerRole` PR

            RIGHT JOIN
            `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_HEALTHCARE_PROVIDER` DHP
            ON (PR.telecom3_value = DHP.PROVIDER_EMAIL_ADDRESS)

            WHERE specialty_coding_display = ("Otorhinolaryngology")
            AND code_text IN ("Resident","Fellow", "Resident Unlicensed DEA")
            AND telecom1_value LIKE ("%507%")
            AND telecom2_value LIKE ("%507%")
            AND DHP.PROVIDER_NPI IS NOT NULL
            AND DHP.PROVIDER_NPI != ''
            AND DHP.PROVIDER_PERSON_ID IS NOT NULL
            AND DHP.PROVIDER_PERSON_ID != ''
            AND PR.active IS TRUE
            """

            # Execute the query
            query_job = self.bq_client.query(query)
            results = query_job.result()

            # Convert to pandas DataFrame for easier processing
            df = results.to_dataframe()
            print(f"‚úÖ Retrieved {len(df)} resident records from BigQuery.")

            # Create list of dictionaries
            resident_dict2 = []
            for _, row in df.iterrows():
                resident = {
                    "First name": row["First_name"],
                    "Last name": row["Last_name"],
                    "LAN_ID": row["LAN_ID"],
                    "Employee_ID": row["Person_ID"],
                    "Email": row["email"],
                    "Phone_number": row["phone_number"]
                }
                resident_dict2.append(resident)

            print(f"‚úÖ Created resident_dict2 with {len(resident_dict2)} entries.")

            # Show sample data
            if resident_dict2:
                sample_keys = list(resident_dict2[0].keys())
            return resident_dict2

        except Exception as e:
            print(f"üî¥ Error creating resident data from BigQuery: {str(e)}")
            traceback.print_exc()
            raise

    def combine_and_save_resident_data(self):
        """Combine resident data from both sources and save to Google Cloud Storage."""
        try:
            # Load existing resident data
            dict_residents = self.load_resident_data()

            # Create new resident data from BigQuery
            resident_dict2 = self.create_residents_from_bigquery()

            # Create an email-to-record mapping for faster lookups
            email_to_resident = {r.get("Email", "").lower(): r for r in dict_residents if r.get("Email")}

            # Track statistics for reporting
            updated_count = 0
            added_count = 0

            # Process each resident from BigQuery data
            for new_resident in resident_dict2:
                email = new_resident.get("Email", "").lower()
                if not email:
                    continue

                if email in email_to_resident:
                    # Update existing resident record with new fields
                    existing_resident = email_to_resident[email]
                    record_modified = False

                    for key, value in new_resident.items():
                        # Only update if the key doesn't exist or has an empty value
                        if key not in existing_resident or not existing_resident[key]:
                            if value:  # Only update if the new value is not empty
                                existing_resident[key] = value
                                record_modified = True

                    # Only increment counter if at least one field was updated
                    if record_modified:
                        updated_count += 1
                else:
                    # Add new resident to the list
                    dict_residents.append(new_resident)
                    email_to_resident[email] = new_resident
                    added_count += 1

            print(f"‚úÖ Updated {updated_count} existing resident records")
            print(f"‚úÖ Added {added_count} new resident records")
            print(f"‚úÖ Total resident count: {len(dict_residents)}")

            # Convert to JSON
            json_data = json.dumps(dict_residents, indent=2)

            # Upload to GCS
            resident_blob = self.bucket.blob(self.resident_blob_name2)
            resident_blob.upload_from_string(json_data, content_type="application/json")

            print(f"‚úÖ Combined resident data saved to gs://{self.bucket_name}/{self.resident_blob_name2}")

            return dict_residents

        except Exception as e:
            print(f"üî¥ Error combining and saving resident data: {str(e)}")
            traceback.print_exc()
            raise


    def generate_query(self, start_date, end_date, selected_emails, include_op_notes=False, include_path_reports=False):
        """Generate BigQuery SQL query based on parameters.

        Args:
            start_date: Start date for query (YYYY-MM-DD)
            end_date: End date for query (YYYY-MM-DD)
            selected_emails: List of emails to filter by
            include_op_notes: Whether to include operation notes
            include_path_reports: Whether to include pathology reports

        Returns:
            str: SQL query
        """
        # Format the emails for SQL
        email_list = ",\n        ".join([f"'{email.lower()}'" for email in selected_emails])

        # Conditionally include Operation Notes
        op_note_field = "FACT_CLINICAL_DOCUMENTS.CLINICAL_DOCUMENT_TEXT AS Op_note," if include_op_notes else ""

        # Conditionally include Pathology Reports
        path_report_field = "fp.SPECIMEN_NOTE AS Path_report," if include_path_reports else ""

        return f"""
    -- Define provider emails at the top
    WITH Provider_Emails AS (
        SELECT email FROM UNNEST([
            {email_list}
        ]) AS email
    ),

    -- Define valid CPT codes at the top
    Valid_CPT_Code_List AS (
        SELECT code FROM UNNEST(['10005','10021','11420','11421','11422','11423','11424','11426','11440','11441','11442','11443','11444','11446','11620','11621','11622','11623','11624','11626','11640','11641','11642','11643','11644','11646','13120','13121','13122','13131','13132','13133','13151','13152','13153','14000','14001','14020','14021','14040','14041','14060','14061','14301','14302','15002','15003','15004','15005','15100','15101','15110','15111','15115','15116','15120','15121','15130','15135','15220','15221','15240','15241','15260','15261','15572','15574','15576','15610','15620','15630','15731','15734','15740','15756','15757','15758','15760','15770','15775','15777','15780','15781','15782','15783','15786','15788','15789','15792','15793','15819','15820','15821','15822','15823','15824','15825','15826','15828','15829','15840','15841','15842','15845','15876','20900','20902','20910','20912','20922','20955','20969','20970','21110','21210','21215','21230','21235','21244','21245','21280','21282','21315','21320','21325','21330','21335','21336','21337','21338','21343','21344','21345','21346','21347','21348','21355','21356','21360','21365','21366','21385','21386','21387','21390','21395','21401','21406','21407','21408','21421','21422','21423','21431','21432','21433','21435','21436','21440','21445','21451','21452','21453','21454','21461','21462','21465','21470','21550','21552','21554','21555','21556','21557','21558','21685','21720','30000','30020','30100','30110','30115','30117','30118','30124','30125','30130','30140','30150','30160','30200','30220','30300','30310','30400','30410','30430','30435','30450','30460','30462','30465','30468','30520','30540','30545','30560','30580','30600','30620','30630','30801','30802','30901','30903','30905','30906','30915','30920','31000','31020','31030','31032','31040','31070','31075','31080','31081','31084','31085','31086','31087','31205','31225','31230','31237','31238','31239','31240','31254','31255','31256','31267','31276','31287','31288'
,'31290','31291','31292','31293','31294','31295','31296','31297','31300','31360','31367','31370','31375','31380','31382','31390','31395','31420','31520','31525','31526','31527','31528','31530','31531','31535','31536','31540','31541','31545','31551','31552','31553','31554','31560','31561','31570','31571','31572','31573','31574','31575','31576','31577','31578','31579','31580','31584','31587','31590','31591','31592','31600','31601','31605','31610','31611','31613','31614','31615','31622','31623','31624','31625','31630','31631','31635','31636','31638','31640','31641','31750','31780','31800','35180','35188','35201','35701','35800','35875','35876','38380','38500','38510','38520','38542','38550','38555','38700','38720','38724','40490','40500','40510','40520','40525','40527','40530','40650','40700','40701','40702','40720','40761','40800','40801','40804','40805','40808','40810','40812','40814','40840','40844','41000','41005','41006','41007','41008','41009','41010','41100','41108','41110','41112','41113','41116','41120','41130','41140','41150','41153','41510','41512','41520','41530','41823','41825','41874','42000','42100','42104','42106','42120','42140','42145','42160','42200','42205','42210','42215','42220','42225','42226','42227','42235','42260','42281','42310','42330','42335','42340','42405','42408','42409','42410','42415','42420','42425','42440','42450','42500','42505','42507','42550','42600','42650','42660','42665','42700','42720','42800','42808','42809','42810','42815','42825','42826','42830','42831','42835','42836','42842','42870','42890','42892','42894','42900','42950','42953','42955','42960','42962','42975','43020','43030','43100','43116','43124','43130','43180','43191','43192','43193','43194','43195','43196','43197','43198','43200','43201','43202','43215','43220','43226','43410','43496','60220','60225','60240','60252','60260','60270','60271','60280','60281','60500','60502','60512','60600','60605','61580','61581','61584','61585','61586','61590','61596','61598','61605'
,'61607','61611','61615','61618','64568','64582','64584','64868','64885','67900','67901','67911','67912','67914','67916','67917','67950','68720','68815','69000','69005','69020','69100','69105','69110','69120','69140','69145','69150','69200','69205','69222','69300','69310','69320','69420','69421','69424','69433','69436','69440','69501','69502','69505','69511','69530','69535','69540','69550','69552','69554','69601','69602','69603','69610','69620','69631','69632','69633','69650','69660','69661','69662','69666','69667','69670','69676','69700','69705','69710','69711','69714','69716','69717','69720','69725','69726','69727','69740','69745','69801','69805','69806','69905','69915','69930','69950','69955','69960','69970','95805','95806','95807','95808','95810','95811','95992']) AS code
    ),

    -- Break down the procedures with their bilateral codes
    Procedure_Details_With_Bilateral AS (
        SELECT
            fscp.SURGICAL_CASE_FPK,
            fscp.SURGICAL_PROCEDURE_DK,
            fscp.SURGICAL_PROCEDURE_ORDERED_DESCRIPTION,
            dsp.SURGICAL_PROCEDURE_DESCRIPTION,
            fscp.SURGICAL_PROCEDURE_BILATERAL_CODE
        FROM `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_SURGICAL_CASE_PROCEDURE` AS fscp
        JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_SURGICAL_PROCEDURE` AS dsp
            ON fscp.SURGICAL_PROCEDURE_DK = dsp.SURGICAL_PROCEDURE_DK
    ),

    Procedure_Aggregates AS (
        SELECT
            fscp.SURGICAL_CASE_FPK,
            STRING_AGG(DISTINCT fscp.SURGICAL_PROCEDURE_ORDERED_DESCRIPTION, '; ') AS all_written_procedures,
            STRING_AGG(DISTINCT CONCAT(dsp.SURGICAL_PROCEDURE_DESCRIPTION, ' (', COALESCE(fscp.SURGICAL_PROCEDURE_BILATERAL_CODE, 'N/A'), ')'), '; ') AS written_procedures_summarized,
            STRING_AGG(DISTINCT fscp.SURGICAL_PROCEDURE_BILATERAL_CODE, '; ') AS all_bilateral_codes
        FROM `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_SURGICAL_CASE_PROCEDURE` AS fscp
        JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_SURGICAL_PROCEDURE` AS dsp
            ON fscp.SURGICAL_PROCEDURE_DK = dsp.SURGICAL_PROCEDURE_DK
        GROUP BY fscp.SURGICAL_CASE_FPK
    ),

    -- Extract procedures and filter by both FACT_PROCEDURES and DIM_PROCEDURE_CODE criteria
    Matching_Procedures AS (
        SELECT
            fsc.SURGICAL_CASE_FPK,
            fp.procedure_code_DK,
            fp.procedure_code,
            fp.procedure_method_code,
            dpc.PROCEDURE_NAME,
            dpc.PROCEDURE_DESCRIPTION,
            dpc.PROCEDURE_CODE AS dim_procedure_code,
            dpc.PROCEDURE_CATEGORY
        FROM `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_SURGICAL_CASE` AS fsc
        JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_PROCEDURES` AS fp
            ON fsc.SURGICAL_CASE_DTM = fp.Procedure_dtm
            AND fsc.PATIENT_DK = fp.patient_dk
            AND fp.procedure_method_code = 'CPT(R)'
        JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_PROCEDURE_CODE` AS dpc
            ON fp.procedure_code_DK = dpc.PROCEDURE_CODE_DK
            AND dpc.PROCEDURE_METHOD_CODE IN ('CPT(R)')
            AND dpc.PROCEDURE_CATEGORY IN (
                'PR DIGESTIVE SYSTEM SERVICES',
                'PR ENDOCRINE SYSTEM SERVICES',
                'PR HEMIC AND LYMPHATIC SYSTEM SERVICES',
                'PR INTEGUMENTARY SYSTEM SERVICES',
                'PR MEDIASTINUM AND DIAPHRAGM SERVICES',
                'PR MUSCULOSKELETAL SERVICES',
                'PR NERVOUS SYSTEM SERVICES',
                'PR OPHTHALMOLOGY SERVICES',
                'PR OTORHINOLARYNGOLOGIC',
                'PR RESPIRATORY SYSTEM SERVICES',
                'PR AUDITORY SYSTEM SERVICES'
            )
    ),

    -- Aggregate the filtered procedures by SURGICAL_CASE_FPK
    Procedure_Details AS (
        SELECT
            SURGICAL_CASE_FPK,
            STRING_AGG(DISTINCT CAST(procedure_code_DK AS STRING), '; ') AS procedure_code_DKs,
            STRING_AGG(DISTINCT procedure_code, ', ') AS procedure_codes,
            STRING_AGG(DISTINCT procedure_method_code, '; ') AS procedure_method_codes,
            STRING_AGG(DISTINCT PROCEDURE_NAME, '; ') AS procedure_names,
            STRING_AGG(DISTINCT PROCEDURE_DESCRIPTION, '; ') AS procedure_descriptions,
            STRING_AGG(DISTINCT dim_procedure_code, ', ') AS dim_procedure_codes,
            STRING_AGG(DISTINCT PROCEDURE_CATEGORY, '; ') AS procedure_categories
        FROM Matching_Procedures
        GROUP BY SURGICAL_CASE_FPK
    ),

    -- Modified CTE to use the Valid_CPT_Code_List
    Valid_CPT_Codes AS (
        SELECT
            mp.SURGICAL_CASE_FPK,
            STRING_AGG(DISTINCT mp.procedure_code, ', ') AS valid_CPT_codes
        FROM Matching_Procedures mp
        JOIN Valid_CPT_Code_List vcl
            ON mp.procedure_code = vcl.code
        GROUP BY mp.SURGICAL_CASE_FPK
    ),

    -- New CTE to aggregate all providers by SURGICAL_CASE_FPK
    All_Providers AS (
        SELECT
            dsprb.SURGICAL_CASE_FPK,
            STRING_AGG(DISTINCT dhp.PROVIDER_LAST_NAME, '; ') AS all_providers
        FROM `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_SURGICAL_PROVIDER_ROLE_SERVICE_BRIDGE` AS dsprb
        JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_HEALTHCARE_PROVIDER` AS dhp
            ON dhp.PROVIDER_DK = dsprb.PROVIDER_DK
        WHERE LOWER(dhp.PROVIDER_SPECIALTY_DESCRIPTION) IN ('otolaryngology', 'otorhinolaryngology')
        GROUP BY dsprb.SURGICAL_CASE_FPK
    )

    SELECT DISTINCT
        fsc.SURGICAL_CASE_FPK AS SURGICAL_CASE_FPK,
        PAT_DIM_PATIENT.PATIENT_CLINIC_NUMBER AS MRN,
        dhp.PROVIDER_FIRST_NAME AS Resident_first_name,
        dhp.PROVIDER_LAST_NAME AS Resident_last_name,
        dhp.PROVIDER_EMAIL_ADDRESS AS Prov_email,
        fsc.SURGICAL_CASE_DTM AS Surgery_date,
        dhp_primary.PROVIDER_LAST_NAME AS Primary_surgeon,
        ap.all_providers AS All_providers,
        dspr.SURGICAL_PROVIDER_ROLE_DESCRIPTION AS Role,
        DATE_DIFF(EXTRACT(DATE FROM fsc.SURGICAL_CASE_DTM), PAT_DIM_PATIENT.PATIENT_BIRTH_DATE, DAY) AS Pt_Age_In_Days,
        proc_agg.all_written_procedures,
        proc_agg.written_procedures_summarized,
        proc_agg.all_bilateral_codes,
        pd.procedure_descriptions AS Procedure_descriptions,
        pd.procedure_codes AS CPT_codes,
        vcpt.valid_CPT_codes,
        {op_note_field}
        {path_report_field}

    FROM `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_SURGICAL_CASE` AS fsc
    JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_SURGICAL_PROVIDER_ROLE_SERVICE_BRIDGE` AS dsprb
        ON fsc.SURGICAL_CASE_FPK = dsprb.SURGICAL_CASE_FPK
    JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_SURGICAL_PROVIDER_ROLE` AS dspr
        ON dspr.SURGICAL_PROVIDER_ROLE_DK = dsprb.SURGICAL_PROVIDER_ROLE_DK
    JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_HEALTHCARE_PROVIDER` AS dhp
        ON dhp.PROVIDER_DK = dsprb.PROVIDER_DK
    JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_HEALTHCARE_PROVIDER` AS dhp_primary
        ON dhp_primary.PROVIDER_DK = fsc.PRIMARY_PROVIDER_DK
    JOIN `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_PATIENT` AS PAT_DIM_PATIENT
        ON fsc.PATIENT_DK = PAT_DIM_PATIENT.PATIENT_DK
    JOIN Procedure_Aggregates AS proc_agg
        ON fsc.SURGICAL_CASE_FPK = proc_agg.SURGICAL_CASE_FPK
    LEFT JOIN Procedure_Details AS pd
        ON fsc.SURGICAL_CASE_FPK = pd.SURGICAL_CASE_FPK
    LEFT JOIN Valid_CPT_Codes AS vcpt
        ON fsc.SURGICAL_CASE_FPK = vcpt.SURGICAL_CASE_FPK
    LEFT JOIN All_Providers AS ap
        ON fsc.SURGICAL_CASE_FPK = ap.SURGICAL_CASE_FPK
    LEFT JOIN
    `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.DIM_SURGICAL_CASE_NOTE_BRIDGE` DSC_note_bridge
    ON (fsc.SURGICAL_CASE_FPK = DSC_note_bridge.SURGICAL_CASE_FPK)

    LEFT JOIN
    `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_CLINICAL_DOCUMENTS` FACT_CLINICAL_DOCUMENTS
    ON (DSC_note_bridge.CLINICAL_DOCUMENT_ID = FACT_CLINICAL_DOCUMENTS.CLINICAL_DOCUMENT_ID)


    LEFT JOIN
    `ml-mps-adl-intudp-phi-p-d5cb.phi_udpwh_etl_us_p.FACT_PATHOLOGY` fp
    ON (fsc.PATIENT_DK = fp.PATIENT_DK AND fp.SPECIMEN_COLLECTION_DATE_DK = fsc.SURGICAL_CASE_DATE_DK)


    WHERE LOWER(dhp.PROVIDER_EMAIL_ADDRESS) IN (SELECT email FROM Provider_Emails)
    -- AND LOWER(dhp.PROVIDER_LOCATION_SITE_NAME) = 'rochester, minnesota'

    AND (fsc.SURGICAL_CASE_DTM BETWEEN '{start_date}' AND '{end_date}')
        """

    def query_surgical_cases(self, start_date, end_date, selected_emails, include_op_notes=False, include_path_reports=False):
        """Query BigQuery for surgical case data based on parameters.

        Args:
            start_date: Start date for query (YYYY-MM-DD)
            end_date: End date for query (YYYY-MM-DD)
            selected_emails: List of emails to filter by
            include_op_notes: Whether to include operation notes
            include_path_reports: Whether to include pathology reports

        Returns:
            DataFrame: Surgical case data
        """
        try:
            self.update_progress(40, "Generating SQL query...")

            # Generate query based on parameters
            query = self.generate_query(
                start_date,
                end_date,
                selected_emails,
                include_op_notes,
                include_path_reports
            )

            self.update_progress(50, "Executing BigQuery query...")
            print("Running resident case log query...")

            # Execute query
            query_job = self.bq_client.query(query)

            # Wait for the query to complete
            self.update_progress(60, "Query running... Please wait...")
            results = query_job.result()  # This blocks until the query is complete

            # Convert results to DataFrame
            self.update_progress(70, "Query complete, fetching results...")
            df_surgical = results.to_dataframe()

            print(f"‚úÖ Retrieved {len(df_surgical)} surgical cases.")
            self.update_progress(75, f"Retrieved {len(df_surgical)} surgical cases")

            # Rename columns to match expected format if needed
            if 'Resident_email' in df_surgical.columns:
                df_surgical = df_surgical.rename(columns={'Resident_email': 'Prov_email'})

            return df_surgical

        except Exception as e:
            self.debug_print(f"Error querying surgical cases: {str(e)}", "ERROR")
            traceback.print_exc()
            raise

    def create_combined_dataframe(self, start_date, end_date, selected_emails, include_op_notes=False, include_path_reports=False):
        """Create the combined dataframe that merges all necessary source data.

        Args:
            start_date: Start date for query (YYYY-MM-DD)
            end_date: End date for query (YYYY-MM-DD)
            selected_emails: List of emails to filter by
            include_op_notes: Whether to include operation notes
            include_path_reports: Whether to include pathology reports

        Returns:
            tuple: (combined_df, df_cpt, df_consultants) - All dataframes needed for processing
        """
        try:
            # Load all reference data
            df_cpt = self.load_cpt_data()
            df_consultants = self.load_consultant_data()

            # Query surgical cases
            df_surgical = self.query_surgical_cases(
                start_date,
                end_date,
                selected_emails,
                include_op_notes,
                include_path_reports
            )

            # For this implementation, we're assuming df_surgical already has all the needed
            # resident information from the BigQuery join. If not, we could load residents
            # and merge them here.
            combined_df = df_surgical

            print(f"‚úÖ Created combined dataframe with {len(combined_df)} rows.")

            return combined_df, df_cpt, df_consultants

        except Exception as e:
            self.debug_print(f"Error creating combined dataframe: {str(e)}", "ERROR")
            traceback.print_exc()
            raise

    def save_dataframe(self, df, filename, sheet_name="Data"):
        """Write a dataframe to a local Excel file and upload it to the bucket.

        Args:
            df: DataFrame to save
            filename: Name of the file (without path); used both as the local
                path and, under ``resident_cases/``, as the object name.
            sheet_name: Name of the Excel sheet

        Returns:
            str: Local file path where the data was saved

        Raises:
            Exception: Any write or upload error is logged, its traceback
                printed, and then re-raised unchanged.
        """
        try:
            self.update_progress(95, "Saving Excel file...")

            # The workbook is written next to the notebook under the given name.
            local_path = filename
            with pd.ExcelWriter(local_path, engine="openpyxl") as excel_out:
                df.to_excel(excel_out, index=False, sheet_name=sheet_name)

            self.update_progress(98, "Uploading to Google Cloud Storage...")
            blob_name = f"resident_cases/{filename}"
            self.bucket.blob(blob_name).upload_from_filename(local_path)

            print(f"üíæ DataFrame saved locally at: {local_path}")
            print(f"üìÅ DataFrame uploaded to gs://{self.bucket_name}/{blob_name}")

            self.update_progress(100, "File saved and uploaded successfully!")
            return local_path

        except Exception as err:
            self.debug_print(f"Error saving dataframe: {err!s}", "ERROR")
            traceback.print_exc()
            raise


class ResidentCaseProcessor:
    """Class to process resident case data with additional enrichment."""

    def __init__(self):
        """Initialize processor with tracking information."""
        # Initialize tracking dictionaries
        self.cpt_tracking = {
            "total_codes_processed": 0,
            "codes_with_matches": 0,
            "codes_without_matches": 0,
            "rows_with_cpt_codes": 0,
            "errors": 0
        }

        # Set up time info for filename generation
        self.today = datetime.now()
        self.today_date = self.today.strftime('%Y-%m-%d')

        # Progress update callback
        self.progress_callback = None

    def set_progress_callback(self, callback_function):
        """Set a callback function for progress updates.

        Args:
            callback_function: Function that takes (progress_value, status_text)
        """
        self.progress_callback = callback_function

    def update_progress(self, value, status_text=""):
        """Update progress through callback if available.

        Args:
            value: Progress value (0-100)
            status_text: Status message
        """
        if self.progress_callback:
            self.progress_callback(value, status_text)

    def debug_print(self, message, level="INFO"):
        """Print debug messages with timestamp and level.

        Args:
            message: The message to print
            level: The log level (INFO, WARNING, ERROR)
        """
        timestamp = datetime.now().strftime("%H:%M:%S")
        if level == "ERROR":
            print(f"üî¥ {timestamp} - ERROR: {message}")
        elif level == "WARNING":
            print(f"üü° {timestamp} - WARNING: {message}")
        else:
            print(f"üîµ {timestamp} - {message}")

    def prepare_cpt_dataframe(self, df_cpt):
        """Prepare CPT dataframe for matching.

        Args:
            df_cpt: The CPT reference dataframe

        Returns:
            DataFrame: Prepared CPT dataframe
        """
        # Check if df_cpt has the expected columns
        if 'CPT_CODE' not in df_cpt.columns:
            self.debug_print("'CPT_CODE' column not found in df_cpt.", "WARNING")
            print("Available columns in df_cpt:", df_cpt.columns.tolist())
            # Try to find similar column names that might be the CPT code column
            potential_cpt_columns = [col for col in df_cpt.columns if 'CPT' in col or 'cpt' in col.lower()]
            if potential_cpt_columns:
                print(f"Potential CPT code columns found: {potential_cpt_columns}")
                # Use the first potential match
                print(f"Using '{potential_cpt_columns[0]}' as the CPT code column")
                df_cpt.rename(columns={potential_cpt_columns[0]: 'CPT_CODE'}, inplace=True)
            else:
                self.debug_print("No CPT_CODE column found and no suitable alternative identified", "ERROR")
                raise ValueError("CPT_CODE column not found in df_cpt")

        if 'Modified_CPT' not in df_cpt.columns:
            self.debug_print("'Modified_CPT' column not found in df_cpt.", "WARNING")
            print("Available columns in df_cpt:", df_cpt.columns.tolist())
            # Try to find similar column names that might be the Modified CPT column
            potential_mod_columns = [col for col in df_cpt.columns if 'MOD' in col.upper() or 'MODIFIED' in col.upper()]
            if potential_mod_columns:
                print(f"Potential Modified CPT columns found: {potential_mod_columns}")
                # Use the first potential match
                print(f"Using '{potential_mod_columns[0]}' as the Modified CPT column")
                df_cpt.rename(columns={potential_mod_columns[0]: 'Modified_CPT'}, inplace=True)
            else:
                # If no modified column is found, create one with the same values as CPT_CODE
                print("Creating 'Modified_CPT' column as a copy of 'CPT_CODE'")
                df_cpt['Modified_CPT'] = df_cpt['CPT_CODE']

        # Convert CPT_CODE to string type in the CPT dataframe for easier matching
        # Also clean delimiters from the reference data
        df_cpt['CPT_CODE'] = df_cpt['CPT_CODE'].astype(str).str.strip()
        df_cpt['CPT_CODE_clean'] = df_cpt['CPT_CODE'].str.replace(',', '').str.replace(';', '').str.replace('.', '').str.strip()

        # Display the first few rows of the CPT dataframe after any modifications
       # print("\nüìä CPT DataFrame after column adjustments:")
       # display(df_cpt.head(3))

        return df_cpt

    def process_cpt_codes(self, processed_df, df_cpt, idx):
        """Process CPT codes for a single row.

        Args:
            processed_df: The dataframe being processed
            df_cpt: The CPT reference dataframe
            idx: The row index to process

        Returns:
            tuple: (updated_cpt, modified_cpt) - Processed CPT code lists
        """
        # Get the original CPT codes, handling NaN values
        cpt_codes = str(processed_df.loc[idx, "CPT_codes"]) if pd.notna(processed_df.loc[idx, "CPT_codes"]) else ""

        # Skip processing if "nan" is the value (converted from NaN)
        if cpt_codes == "nan":
            cpt_codes = ""

        # Debug: Print CPT codes for some rows to verify
      #  if idx < 3 or idx % 100 == 0:
       #     if cpt_codes:
       #         print(f"\nRow {idx} - Original CPT codes: {cpt_codes}")

        # Split the CPT codes string into individual codes and clean whitespace
        individual_codes = [code.strip() for code in cpt_codes.split(',') if code.strip()]

        # Track rows with CPT codes
        if individual_codes:
            self.cpt_tracking["rows_with_cpt_codes"] += 1

        # Initialize lists to store updated and modified CPT codes
        updated_cpt = []
        modified_cpt = []

        # Process each individual CPT code
        for code in individual_codes:
            self.cpt_tracking["total_codes_processed"] += 1

            # Debug: Print code being processed for the first few rows
           # if idx < 3:
            #    print(f"  - Processing CPT code: {code}")

            # First, add the original code to updated_cpt (any custom transformations would go here)
            updated_cpt.append(code)

            try:
                # Ensure type consistency by converting code to string and cleaning it
                code_str = str(code).strip()

                # Remove any commas, semicolons, or other delimiters that might affect matching
                code_clean = code_str.replace(',', '').replace(';', '').replace('.', '').strip()

                # Try both string and numeric matching to handle type differences
                # First try exact string match with clean codes
                matches = df_cpt[df_cpt['CPT_CODE_clean'] == code_clean]

                # If no match, try converting to numeric if possible
                if matches.empty and code_clean.isdigit():
                    # Try numeric match if code is a number
                    try:
                        code_int = int(code_clean)
                        matches = df_cpt[df_cpt['CPT_CODE_clean'].astype(float) == code_int]
                  #      if idx < 3 and not matches.empty:
                   #         print(f"    ‚úì Found match for {code} using numeric comparison")
                    except (ValueError, TypeError):
                        # If conversion fails, continue with empty matches
                        pass

                if not matches.empty:
                    self.cpt_tracking["codes_with_matches"] += 1
                    # If a match is found, get the Modified_CPT value
                    modified_value = matches.iloc[0]['Modified_CPT']
                    if pd.notna(modified_value):
                        modified_cpt.append(str(modified_value))
                     #   if idx < 3:
                     #       print(f"    ‚úì Found match for {code}: {modified_value}")
                  #  else:
                        # If Modified_CPT is NaN, don't include this code
                      #  if idx < 3:
                       #     print(f"    ‚ö†Ô∏è Found match for {code} but Modified_CPT is NaN, excluding from modified codes")
                else:
                    self.cpt_tracking["codes_without_matches"] += 1
                    # If no match is found, exclude this code from modified_cpt list
                #    if idx < 3:
                 #      print(f"    ‚ùå No match found for {code}, excluding from modified codes")
            except Exception as e:
                self.cpt_tracking["errors"] += 1
                self.debug_print(f"Error processing CPT code {code} in row {idx}: {str(e)}", "ERROR")
                # If an error occurs, use the original code
                modified_cpt.append(code)

        return updated_cpt, modified_cpt

    def determine_equipment(self, processed_df, idx):
        """Determine equipment used based on procedure descriptions.

        Args:
            processed_df: The dataframe being processed
            idx: The row index to process

        Returns:
            list: Equipment identified in the procedure
        """
        equipment = []

        # Check for various equipment in the procedure descriptions
        procedure_text = str(processed_df.loc[idx, "all_written_procedures"]).lower() if pd.notna(processed_df.loc[idx, "all_written_procedures"]) else ""

        # Skip processing if "nan" is the value (converted from NaN)
        if procedure_text == "nan":
            procedure_text = ""

        # Check for robotic procedures
        if "robotic" in procedure_text:
            equipment.append("Robotic: 20221")

        # Check for CO2 laser
        if "co2" in procedure_text or "carbon dioxide" in procedure_text:
            equipment.append("Laser-CO2: 20215")

        # Check for sialendoscopy
        if "sialendoscopy" in procedure_text:
            equipment.append("Sialendoscopy: 20885")

        # Check for KTP laser
        if "ktp" in procedure_text:
            equipment.append("Laser-KTP: 20216")

        return equipment

    def determine_age_category(self, age_days):
        """Determine patient age category based on age in days.

        Args:
            age_days: Patient age in days

        Returns:
            str: Age category string
        """
        if pd.notna(age_days):
            if age_days < 28:
                return "Neonate (<28 days): 114"
            elif age_days >= 28 and age_days < (3 * 365):
                return "Infant/Toddler(>=28-<3yr): 113"
            elif age_days >= (3 * 365) and age_days < (13 * 365):
                return "Child(>;= 3 - < 13 yrs): 112"
            elif age_days >= (13 * 365) and age_days < (18 * 365):
                return "Adolescent(>= 13 -<18 yr): 110"
            else:
                return "Adult (>= 18 yrs): 111"
        else:
            return "Unknown"

    def process_consultant_data(self, processed_df, df_consultants):
        """Fill ``corrected_consultant`` by matching surgeon names to the consultant list.

        A case-insensitive mapping is built from the first column of
        ``df_consultants`` (name) to its second column (the value written to
        the output). For every row the ``Primary_surgeon`` is tried first:
        the whole name, then only its last word. If that fails, each
        ``;``-separated entry of ``All_providers`` is tried the same way. Rows
        without any match, and rows that raise while being processed, receive
        the default ``"Program, Other: 280213"``.

        Rows are addressed by position, so the method works for any index on
        ``processed_df``.

        Args:
            processed_df: The dataframe being processed; expected to contain
                ``Primary_surgeon`` and ``All_providers`` columns. It is
                modified in place.
            df_consultants: Consultant reference dataframe; only its first two
                columns are used. Rows with a blank name are ignored.

        Returns:
            DataFrame: ``processed_df`` with the ``corrected_consultant``
            column set for every row.
        """
        print("\nüîÑ Processing consultant data...")

        default_consultant = "Program, Other: 280213"
        # Sentinel distinguishing "no match" from any value stored in the sheet.
        no_match = object()

        # Blank spreadsheet rows come through as NaN (a float), which has no
        # .upper(); skip them instead of aborting the whole run.
        consultant_dict = {}
        for name, index in zip(df_consultants.iloc[:, 0], df_consultants.iloc[:, 1]):
            if pd.isna(name):
                continue
            key = str(name).upper().strip()
            if key:
                consultant_dict[key] = index

        def lookup(name):
            """Return the mapped value for the full name, else its last word, else ``no_match``."""
            key = name.strip().upper()
            if not key:
                return no_match
            if key in consultant_dict:
                # Full-name hit; this is what lets compound surnames match.
                return consultant_dict[key]
            return consultant_dict.get(key.split()[-1], no_match)

        def cell_text(column, position):
            """Return the cell at ``position`` of ``column`` as text ("" when missing)."""
            value = processed_df[column].iloc[position]
            return str(value) if pd.notna(value) else ""

        # Results are collected positionally and assigned in one go. Writing
        # through .loc with a positional integer would append phantom rows
        # whenever the frame's index is not 0..n-1.
        corrected = []
        for position in tqdm(range(len(processed_df)), desc="Processing consultant data"):
            try:
                primary_consultant = cell_text("Primary_surgeon", position).strip()
                # A literal "nan" is a stringified missing value.
                if primary_consultant.lower() == "nan":
                    primary_consultant = ""

                match = lookup(primary_consultant)

                # Only consult the provider list when the primary surgeon
                # could not be resolved.
                if match is no_match:
                    all_providers = cell_text("All_providers", position)
                    if all_providers.lower() == "nan":
                        all_providers = ""
                    for provider in all_providers.split(';'):
                        match = lookup(provider)
                        if match is not no_match:
                            break

                corrected.append(default_consultant if match is no_match else match)

            except Exception as e:
                self.debug_print(f"Error processing consultant in row {position}: {str(e)}", "ERROR")
                # Use the default so the row still gets a value.
                corrected.append(default_consultant)

        processed_df['corrected_consultant'] = corrected

        return processed_df

    def process_key_indicators(self, processed_df, df_cpt):
        """Derive each row's key indicators from its modified CPT codes.

        The key-indicator column of ``df_cpt`` is the first column whose
        header contains ``indicator_type_abbr`` (case-insensitive). Each code
        in a row's ``modified CPT codes`` cell is cleaned of ``;`` and ``.``
        and looked up in ``df_cpt['CPT_CODE_clean']``; the indicator of the
        first reference row with that code is collected. Indicators are
        de-duplicated in first-seen order and joined with ``"; "``.

        Rows are addressed by position, so the method works for any index on
        ``processed_df``.

        Args:
            processed_df: The dataframe being processed; expected to contain a
                ``modified CPT codes`` column. It is modified in place.
            df_cpt: The CPT reference dataframe (see ``prepare_cpt_dataframe``).

        Returns:
            DataFrame: ``processed_df`` with a ``key_indicators`` column; the
            value is ``""`` for rows without codes, without indicators, or
            that failed to process.
        """
        print("\nüîÑ Processing key indicators data...")

        # str(col) keeps non-string headers from crashing the search.
        potential_key_cols = [col for col in df_cpt.columns if 'indicator_type_abbr' in str(col).lower()]
        key_indicator_col = potential_key_cols[0] if potential_key_cols else None
        if key_indicator_col is not None:
            print(f"Using '{key_indicator_col}' as the Key Indicator column")
        else:
            self.debug_print("No Key Indicator column found in df_cpt. Creating empty key indicators column.", "WARNING")

        # Build the code -> indicator lookup once instead of filtering the
        # whole reference frame for every code. setdefault keeps the FIRST
        # reference row per code, the same row a filtered frame's .iloc[0]
        # would return.
        indicator_by_code = {}
        if key_indicator_col is not None:
            if 'CPT_CODE_clean' in df_cpt.columns:
                for ref_code, indicator in zip(df_cpt['CPT_CODE_clean'], df_cpt[key_indicator_col]):
                    indicator_by_code.setdefault(ref_code, indicator)
            else:
                self.debug_print("'CPT_CODE_clean' column not found in df_cpt; no key indicators can be matched.", "ERROR")

        joined_indicators = []
        for position in tqdm(range(len(processed_df)), desc="Processing key indicators"):
            row_value = ""
            try:
                raw_value = processed_df["modified CPT codes"].iloc[position]
                cpt_codes = str(raw_value) if pd.notna(raw_value) else ""

                # Empty cells and stringified missing values have no codes.
                if cpt_codes and cpt_codes.lower() != "nan":
                    key_indicators = []
                    for code in cpt_codes.split(','):
                        if not code.strip():
                            continue
                        code_clean = code.replace(';', '').replace('.', '').strip()
                        indicator = indicator_by_code.get(code_clean)
                        if pd.notna(indicator) and str(indicator).strip():
                            key_indicators.append(str(indicator).strip())

                    # dict.fromkeys de-duplicates while keeping first-seen
                    # order; a set would give a different order from run to
                    # run because string hashing is randomised.
                    row_value = '; '.join(dict.fromkeys(key_indicators))

            except Exception as e:
                self.debug_print(f"Error processing key indicators in row {position}: {str(e)}", "ERROR")

            joined_indicators.append(row_value)

        processed_df['key_indicators'] = joined_indicators

        return processed_df

    def process_data(self, combined_df, df_cpt, df_consultants):
        """Run every enrichment step over the resident case data.

        Works on a copy of ``combined_df`` and adds these columns:
        ``Patient_Age_Category``, ``Updated_CPT_codes``,
        ``modified CPT codes``, ``equipment``, ``corrected_consultant`` and
        ``key_indicators``. A row that fails during the per-row step is logged
        and skipped; the remaining rows are still processed. CPT matching
        statistics are printed at the end.

        Args:
            combined_df: The input dataframe with resident case data. It is
                not modified.
            df_cpt: The CPT reference dataframe. It is normalised in place by
                ``prepare_cpt_dataframe``.
            df_consultants: The consultant reference dataframe.

        Returns:
            DataFrame: Processed and enriched copy of ``combined_df`` with a
            fresh 0..n-1 index.

        Raises:
            Exception: Re-raised if ``combined_df`` cannot be copied.
            ValueError: From ``prepare_cpt_dataframe`` when ``df_cpt`` has no
                usable CPT code column.
        """
        print("\nüîÑ Processing surgical data...")
        self.update_progress(80, "Processing resident case data...")

        try:
            # The row loop below addresses rows as 0..n-1 through .loc, which
            # is label based. Resetting the index makes those labels valid even
            # when the input was filtered or concatenated; otherwise every row
            # would fail and be skipped.
            processed_df = combined_df.copy().reset_index(drop=True)
        except Exception as e:
            self.debug_print(f"Error copying dataframe: {str(e)}", "ERROR")
            raise

        # Initialize new columns for CPT code tracking
        processed_df['Updated_CPT_codes'] = ""
        processed_df['modified CPT codes'] = ""

        # Prepare the CPT dataframe for matching
        df_cpt = self.prepare_cpt_dataframe(df_cpt)

        for idx in tqdm(range(len(processed_df)), desc="Processing patient data"):
            try:
                # 1. Patient age category from the age in days
                age_days = processed_df.loc[idx, "Pt_Age_In_Days"]
                processed_df.loc[idx, "Patient_Age_Category"] = self.determine_age_category(age_days)

                # 2. Original and converted CPT codes
                updated_cpt, modified_cpt = self.process_cpt_codes(processed_df, df_cpt, idx)
                processed_df.loc[idx, "Updated_CPT_codes"] = ', '.join(updated_cpt) if updated_cpt else ""
                processed_df.loc[idx, "modified CPT codes"] = ', '.join(modified_cpt) if modified_cpt else ""

                # Show the first few converted rows as a sanity check
                if idx < 3 and (updated_cpt or modified_cpt):
                    print(f"  ‚Üí Updated CPT codes: {processed_df.loc[idx, 'Updated_CPT_codes']}")
                    print(f"  ‚Üí Modified CPT codes: {processed_df.loc[idx, 'modified CPT codes']}")

                # 3. Special equipment, with "N/A: 20214" when none is found
                equipment = self.determine_equipment(processed_df, idx)
                processed_df.loc[idx, "equipment"] = '; '.join(equipment) if equipment else "N/A: 20214"

            except Exception as e:
                # Log and move on to the next row instead of failing completely
                self.debug_print(f"Error processing row {idx}: {str(e)}", "ERROR")

        # Process consultant data to create corrected_consultant column
        self.update_progress(85, "Processing consultant data...")
        processed_df = self.process_consultant_data(processed_df, df_consultants)

        # Process key indicators
        self.update_progress(90, "Processing key indicators...")
        processed_df = self.process_key_indicators(processed_df, df_cpt)

        # Display CPT code processing statistics
        print("\nüìä CPT Code Processing Statistics:")
        print(f"Total rows with CPT codes: {self.cpt_tracking['rows_with_cpt_codes']} out of {len(processed_df)}")
        print(f"Total individual CPT codes processed: {self.cpt_tracking['total_codes_processed']}")
        print(f"CPT codes with matches in reference data: {self.cpt_tracking['codes_with_matches']}")
        print(f"CPT codes without matches: {self.cpt_tracking['codes_without_matches']}")
        print(f"Errors during processing: {self.cpt_tracking['errors']}")

        # Verify that our new columns were properly populated
        print("\n‚úÖ Verification of CPT code processing:")
        non_empty_updated = processed_df['Updated_CPT_codes'].str.strip().str.len() > 0
        non_empty_modified = processed_df['modified CPT codes'].str.strip().str.len() > 0
        print(f"Rows with non-empty Updated_CPT_codes: {non_empty_updated.sum()} out of {len(processed_df)}")
        print(f"Rows with non-empty modified CPT codes: {non_empty_modified.sum()} out of {len(processed_df)}")

        return processed_df

    def create_summary_dataframe(self, processed_df):
        """Build the condensed case log from the enriched dataframe.

        Selects and renames the columns of interest, drops rows whose post-op
        description mentions "EVACUATION HEMATOMA" (case-insensitive), drops
        rows with no converted CPT codes, sorts by resident and case date, and
        displays the first ten rows.

        Args:
            processed_df: The processed dataframe produced by ``process_data``.

        Returns:
            DataFrame: Summary dataframe with a fresh 0..n-1 index.

        Raises:
            Exception: Any failure is logged via ``debug_print``, its traceback
                is printed, and it is re-raised.
        """
        try:
            self.update_progress(92, "Creating summary dataframe...")

            first_names = processed_df["Resident_first_name"].str.strip()
            last_names = processed_df["Resident_last_name"].str.strip()

            # Output column -> source data, in the order the columns appear.
            summary_columns = {
                "Resident Name": first_names + " " + last_names,
                "Case_ID": processed_df["SURGICAL_CASE_FPK"],
                "Case_Date": pd.to_datetime(processed_df["Surgery_date"]).dt.date,
                "Special_equipment": processed_df["equipment"],
                "Role": processed_df["Role"],
                "Site": "Mayo Clinic (Rochester): 12793",
                "Attending": processed_df["corrected_consultant"],
                "CPT codes": processed_df["modified CPT codes"],
                "Patient_age": processed_df["Patient_Age_Category"],
                "Post-op procedure description": processed_df["written_procedures_summarized"],
                "Procedure name": processed_df["Procedure_descriptions"],
                "Pre-op preocedure info": processed_df["all_written_procedures"],
                "Key_indicators": processed_df["key_indicators"],
                "All surgeons": processed_df["All_providers"],
                "Pt age (days)": processed_df["Pt_Age_In_Days"],
                "Pt MRN": processed_df["MRN"],
                "Resident Email": processed_df["Prov_email"],
                "Primary_surgeon_listed": processed_df["Primary_surgeon"],
                "Original CPT codes": processed_df["CPT_codes"],
                "Updated_CPT codes": processed_df["Updated_CPT_codes"],
            }
            df_summary = pd.DataFrame(summary_columns)

            # Row count before any filtering, for the report below.
            original_row_count = len(df_summary)

            # Filter 1: drop hematoma evacuations.
            is_hematoma = df_summary["Post-op procedure description"].str.contains(
                "EVACUATION HEMATOMA", case=False, na=False
            )
            df_summary = df_summary[~is_hematoma]

            after_hematoma_count = len(df_summary)
            hematoma_rows_removed = original_row_count - after_hematoma_count
            print(f"\n‚úÖ Removed {hematoma_rows_removed} rows containing 'EVACUATION HEMATOMA'")

            # Filter 2: drop rows whose converted CPT codes are missing or blank.
            has_value = df_summary["CPT codes"].notna()
            not_blank = df_summary["CPT codes"].str.strip() != ""
            df_summary = df_summary[has_value & not_blank]

            final_row_count = len(df_summary)
            empty_cpt_rows_removed = after_hematoma_count - final_row_count
            total_removed_rows = original_row_count - final_row_count

            print(f"‚úÖ Removed {empty_cpt_rows_removed} rows with empty CPT codes")
            print(f"‚úÖ Total removed: {total_removed_rows} rows ({hematoma_rows_removed} hematoma + {empty_cpt_rows_removed} empty CPT)")
            print(f"‚úÖ Final dataset contains {final_row_count} rows out of original {original_row_count}")

            # Order by resident, then date, and renumber the rows.
            df_summary = df_summary.sort_values(by=["Resident Name", "Case_Date"])
            df_summary = df_summary.reset_index(drop=True)

            # Preview the summarized data
            display(Markdown("### ü©∫ Condensed Surgical Case Log"))
            display(df_summary.head(10))

        except Exception as e:
            self.debug_print(f"Error creating summary dataframe: {str(e)}", "ERROR")
            # Print the error traceback for debugging
            traceback.print_exc()
            raise

        return df_summary


# Initialize the interactive UI
def initialize_ui(dict_residents):
    """Build and display the query form, returning its widgets.

    The form consists of a start/end date picker, a multi-select of residents,
    a free-text box for extra e-mail addresses, three checkboxes, an execute
    button, a progress bar with status label, and an output area. Ticking
    "Select All Residents" selects every listed resident; unticking it clears
    the selection.

    Args:
        dict_residents: List of resident dictionaries with the keys
            ``"First name"``, ``"Last name"`` and ``"Email"``. Entries whose
            first name or e-mail is the string ``"None"`` are left out.

    Returns:
        dict: The widgets keyed by role, plus ``'valid_residents'`` - the
        filtered resident list that the multi-select's values index into.
    """
    # Display custom CSS for better styling
    display(HTML("""
    <style>
        .widget-label {
            font-family: 'Arial', sans-serif !important;
            font-size: 14px !important;
            font-weight: 600 !important;
            color: #2c3e50 !important;
        }
        .widget-select-multiple {
            font-family: 'Arial', sans-serif !important;
            font-size: 13px !important;
        }
        .widget-text {
            font-family: 'Arial', sans-serif !important;
            font-size: 13px !important;
        }
        .widget-checkbox {
            font-size: 14px !important;
        }
        .widget-button {
            font-family: 'Arial', sans-serif !important;
            font-size: 14px !important;
            font-weight: 600 !important;
        }
        .jupyter-widgets {
            margin: 5px 0 !important;
        }
    </style>
    """))

    # Widths shared by several widgets.
    label_width = '120px'
    wide = '600px'

    def make_checkbox(label):
        """Create an unticked, non-indented checkbox with the given label."""
        return widgets.Checkbox(
            value=False,
            description=label,
            disabled=False,
            indent=False,
            style={'description_width': 'initial'}
        )

    # --- Date range: defaults to 1 January 2020 through today ---
    start_date_widget = widgets.DatePicker(
        description='Start Date:',
        value=datetime(2020, 1, 1).date(),
        disabled=False,
        style={'description_width': label_width},
    )
    end_date_widget = widgets.DatePicker(
        description='End Date:',
        value=datetime.now().date(),
        disabled=False,
        style={'description_width': label_width},
    )
    date_box = widgets.HBox(
        [start_date_widget, end_date_widget],
        layout=widgets.Layout(margin='10px 0'),
    )

    # --- Resident picker: option values are positions in valid_residents ---
    valid_residents = [
        resident for resident in dict_residents
        if not (resident["First name"] == "None" or resident["Email"] == "None")
    ]
    resident_options = [
        (f"{resident['First name']} {resident['Last name']} ({resident['Email']})", position)
        for position, resident in enumerate(valid_residents)
    ]
    resident_select = widgets.SelectMultiple(
        options=resident_options,
        value=[],
        description='Residents:',
        rows=10,
        style={'description_width': label_width},
        layout=widgets.Layout(width=wide)
    )

    # --- Extra e-mail addresses ---
    additional_emails_widget = widgets.Textarea(
        value='',
        placeholder='Enter additional emails, separated by commas',
        description='Additional Emails:',
        style={'description_width': label_width},
        layout=widgets.Layout(width=wide, height='60px')
    )

    # --- Checkboxes ---
    select_all_checkbox = make_checkbox('Select All Residents')
    include_op_notes_widget = make_checkbox('Include op notes')
    include_path_reports_widget = make_checkbox('Include Pathology Reports')
    checkbox_box = widgets.HBox(
        [select_all_checkbox, include_op_notes_widget, include_path_reports_widget],
        layout=widgets.Layout(margin='10px 0', justify_content='flex-start'),
    )

    # --- Execute button ---
    execute_button = widgets.Button(
        description='Execute Query',
        disabled=False,
        button_style='primary',
        tooltip='Click to execute the query',
        icon='check',
        layout=widgets.Layout(width='200px', margin='10px 0')
    )

    # --- Progress bar, status line and output area ---
    progress_bar = widgets.IntProgress(
        value=0,
        min=0,
        max=100,
        description='Progress:',
        bar_style='info',
        style={'bar_color': '#00cc88', 'description_width': 'initial'},
        layout=widgets.Layout(width=wide, margin='10px 0')
    )
    status_label = widgets.Label(value="Ready to execute query")
    output_widget = widgets.Output()
    progress_box = widgets.VBox([progress_bar, status_label])

    def on_select_all_change(change):
        """Select every resident when the box is ticked, none when unticked."""
        everyone = tuple(range(len(valid_residents)))
        resident_select.value = everyone if change['new'] else []

    select_all_checkbox.observe(on_select_all_change, names='value')

    # Render the form top to bottom.
    display(HTML("<h3 style='font-family: Arial, sans-serif; color: #2c3e50;'>Resident Case Query Tool</h3>"))
    for element in (
        date_box,
        resident_select,
        additional_emails_widget,
        checkbox_box,
        execute_button,
        progress_box,
        output_widget,
    ):
        display(element)

    ui = {
        'start_date': start_date_widget,
        'end_date': end_date_widget,
        'resident_select': resident_select,
        'additional_emails': additional_emails_widget,
        'select_all': select_all_checkbox,
        'include_op_notes': include_op_notes_widget,
        'include_path_reports': include_path_reports_widget,
        'execute_button': execute_button,
        'progress_bar': progress_bar,
        'status_label': status_label,
        'output': output_widget,
        'valid_residents': valid_residents
    }
    return ui


# Function to update progress bar
def update_progress(ui, value, status_text=""):
    """Push a new progress value, and optionally a message, to the UI.

    Args:
        ui: Dictionary of UI widgets; ``ui['progress_bar']`` is always
            written, ``ui['status_label']`` only when a message is given.
        value: Progress value (0-100)
        status_text: Status message. A falsy value (the default empty
            string) leaves the current label text untouched.
    """
    bar = ui['progress_bar']
    bar.value = value

    # No message supplied: keep whatever the label currently shows.
    if not status_text:
        return

    ui['status_label'].value = status_text


# Function to handle button click
def setup_execute_handler(ui, dict_residents):
    """Set up the handler for the execute button.

    The registered click handler reads the current widget values, validates
    them, builds the combined dataframe through ``DataLoader``, post-processes
    it with ``ResidentCaseProcessor``, saves the summary spreadsheet and
    displays a link to the saved file. All of this runs inside the
    ``ui['output']`` context.

    Args:
        ui: Dictionary of UI widgets. Keys read here: ``'output'``,
            ``'resident_select'``, ``'valid_residents'``,
            ``'additional_emails'``, ``'start_date'``, ``'end_date'``,
            ``'include_op_notes'``, ``'include_path_reports'`` and
            ``'execute_button'`` (``update_progress`` additionally uses
            ``'progress_bar'`` and ``'status_label'``).
        dict_residents: List of resident dictionaries. Not read by the
            handler (selectable residents come from
            ``ui['valid_residents']``); kept so existing callers keep working.
    """

    def report_progress(value, text):
        """Forward loader/processor progress to the shared progress widgets."""
        update_progress(ui, value, text)

    def on_execute_button_clicked(b):
        with ui['output']:
            ui['output'].clear_output()

            # Reset progress bar
            update_progress(ui, 0, "Starting query execution...")

            # The selection widget's value holds indices into valid_residents.
            residents = ui['valid_residents']
            selected_emails = [residents[i]["Email"] for i in ui['resident_select'].value]

            # Free-text emails are comma separated; blank entries are dropped.
            additional_emails = [
                e.strip() for e in ui['additional_emails'].value.split(',') if e.strip()
            ]
            all_emails = selected_emails + additional_emails

            if not all_emails:
                print("❌ Please select at least one resident or add an email address.")
                return

            # A date widget may hold no value (presumably when the picker is
            # cleared -- widget construction is not visible here). Validate
            # before formatting so the user gets a message, not an
            # AttributeError from None.strftime.
            start_date = ui['start_date'].value
            end_date = ui['end_date'].value
            if start_date is None or end_date is None:
                print("❌ Please select both a start date and an end date.")
                return

            start = start_date.strftime('%Y-%m-%d')
            end = end_date.strftime('%Y-%m-%d')

            # Get checkbox values
            include_op_notes = ui['include_op_notes'].value
            include_path_reports = ui['include_path_reports'].value

            print(f"🔍 Searching for cases from {start} to {end}")
            print(f"📧 Selected emails: {', '.join(all_emails)}")
            print(f"📄 Include Operation Notes: {'Yes' if include_op_notes else 'No'}")
            print(f"🔬 Include Pathology Reports: {'Yes' if include_path_reports else 'No'}")

            # Any failure from here on is reported in the output area rather
            # than propagated, so the UI stays usable for another attempt.
            try:
                loader = DataLoader()
                processor = ResidentCaseProcessor()

                # Both stages report through the same progress widgets.
                loader.set_progress_callback(report_progress)
                processor.set_progress_callback(report_progress)

                # Create the combined dataframe
                update_progress(ui, 10, "Initializing connections...")
                combined_df, df_cpt, df_consultants = loader.create_combined_dataframe(
                    start,
                    end,
                    all_emails,
                    include_op_notes,
                    include_path_reports,
                )

                # Process the data
                processed_df = processor.process_data(combined_df, df_cpt, df_consultants)

                # Create the summary dataframe
                df_summary = processor.create_summary_dataframe(processed_df)

                # Save the results under a filename stamped with today's date
                today_date = datetime.now().strftime('%Y-%m-%d')
                summary_filename = f"Condensed_resident_case_data_{today_date}.xlsx"
                local_path = loader.save_dataframe(df_summary, summary_filename, "Resident Cases")

                # Display download link
                display(FileLink(local_path))

            except Exception as e:
                print(f"❌ Error: {e}")
                update_progress(ui, 0, "Error occurred during processing")
                traceback.print_exc()

    # Connect button to function
    ui['execute_button'].on_click(on_execute_button_clicked)


# Main function to run the application
def run_app(dict_residents):
    """Run the complete application with UI.

    Builds the widgets with ``initialize_ui``, attaches the execute-button
    handler with ``setup_execute_handler`` and prints a readiness message.

    Args:
        dict_residents: List of resident dictionaries, forwarded unchanged to
            both ``initialize_ui`` and ``setup_execute_handler``.
    """
    # Initialize UI
    ui = initialize_ui(dict_residents)

    # Setup execute handler
    setup_execute_handler(ui, dict_residents)

    # The prefix is a real check-mark emoji; it previously rendered as
    # mis-decoded bytes.
    print("✅ Application initialized. Use the UI above to query and process resident case data.")

# Execute the resident data update process
if __name__ == "__main__" or 'get_ipython' in globals():
    # Runs when executed as a script, or whenever `get_ipython` is present in
    # the module globals (as it is inside an IPython/Jupyter session).
    try:
        # Initialize DataLoader
        loader = DataLoader()

        # Combine and save resident data
        print("📊 Starting resident data update process...")
        combined_residents = loader.combine_and_save_resident_data()
        print(f"✅ Successfully updated resident data with {len(combined_residents)} entries")

        # Now that we have the combined data, run the application
        run_app(combined_residents)
    except Exception as e:
        # Top-level boundary: report the failure and its traceback instead of
        # letting it escape the cell.
        print(f"❌ Error updating resident data: {e}")
        traceback.print_exc()