In [38]:
import os
import google.generativeai as genai
import pandas as pd
from typing import Dict, Any, Union, Optional


pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 50)
pd.set_option('display.width', 150)

print("Libraries imported and display options set.")

Libraries imported and display options set.


In [40]:
# Using absolute paths is safer for notebooks. Ensure this path is correct for your system.
PROCESSED_DATA_PATH = '/Users/ashishrathore/Aarogya-AI/data/processed/master_health_data_v2.csv'
CREDENTIALS_FILE = '/Users/ashishrathore/Aarogya-AI/crack-decorator-468911-s1-5ab46e3aea4b.json'

# --- AI Model Parameters ---
GEMINI_MODEL_NAME = "gemini-1.5-flash"

# --- System Prompt Engineering ---
# This prompt is the "brain" of our summarizer. It's carefully engineered to guide the AI's behavior, tone, and output format.
SUMMARY_SYSTEM_PROMPT = """
You are 'Aarogya-AI', a compassionate and intelligent medical report summarizer.
Your role is to translate complex medical data into a simple, easy-to-understand summary for a patient who has no medical knowledge.

**Instructions:**
1.  **Tone:** Be reassuring, positive, and clear. Avoid alarming language.
2.  **Structure:**
    *   Start with a general overview of the report.
    *   Highlight the key findings, starting with the most important one.
    *   For each finding, explain what the test measures in very simple terms (e.g., "Cholesterol is a type of fat in your blood").
    *   Do NOT provide any medical advice, diagnosis, or treatment recommendations. This is critical.
3.  **Content:**
    *   Focus only on the data provided. Do not invent any information.
    *   Mention both normal and out-of-range results to give a balanced view.
4.  **Format:** Use clear paragraphs. Use bullet points for listing the test results to make it easy to read.
5.  **Disclaimer:** ALWAYS end the summary with the following disclaimer:
    "**Disclaimer:** This is an AI-generated summary and not a substitute for professional medical advice. Please consult with your doctor to discuss your results in detail."
"""

print("Configuration loaded.")

Configuration loaded.


## 2. Core Logic: The Toolbox

Here, we define our reusable functions. This modular approach is key to writing clean, maintainable, and testable code. Each function has a specific, well-defined purpose and clear documentation.

In [43]:
def setup_ai_voice(credentials_path: str, model_name: str) -> Optional[genai.GenerativeModel]:
    """
    Configures and initializes the Gemini Generative Model.

    Args:
        credentials_path (str): Path to the Google Cloud credentials JSON file.
        model_name (str): The name of the Gemini model to use.

    Returns:
        Optional[genai.GenerativeModel]: An initialized model object, or None if setup fails.
    """
    try:
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        genai.configure(transport='rest')
        model = genai.GenerativeModel(model_name)
        print("✅ AI Voice (Gemini Model) initialized successfully.")
        return model
    except Exception as e:
        print(f"❌ Error initializing Gemini Model: {e}")
        return None

def format_patient_data_for_prompt(patient_df: pd.DataFrame) -> str:
    """
    Formats a patient's test results into a simple string for the AI prompt.
    """
    if patient_df.empty:
        return "No data available for this patient."
    
    # Use .iloc[0] safely to get the first patient's name; handle potential missing names.
    patient_name = patient_df['patient_name'].iloc[0] if not pd.isna(patient_df['patient_name'].iloc[0]) else "Valued Patient"
    report_text = f"Patient Name: {patient_name}\n\nTest Results:\n"
    
    for _, row in patient_df.iterrows():
        # Use .get() for robust access to potentially missing columns.
        unit = row.get('unit', '')
        ref_range = row.get('reference_range', 'N/A')
        # Ensure we don't print 'nan' if values are missing.
        report_text += f"- {row['test_name']}: {row['result']} {unit if not pd.isna(unit) else ''} (Normal Range: {ref_range if not pd.isna(ref_range) else 'N/A'})\n"
        
    return report_text

def generate_summary(model: genai.GenerativeModel, patient_data_text: str, system_prompt: str) -> str:
    """
    Sends the patient data to the Gemini model and returns the summary.
    """
    if not model:
        return "Error: The AI model is not available."
        
    try:
        print("\n⏳ Sending data to the AI Voice for summarization...")
        # Combining the system prompt and user data is the standard way.
        full_prompt = [system_prompt, patient_data_text]
        response = model.generate_content(full_prompt)
        return response.text
    except Exception as e:
        return f"❌ An error occurred during summary generation: {e}"

print("Core logic functions defined.")

Core logic functions defined.


## 3. Main Execution: The Factory Floor

This is where we run the pipeline. We call our core functions in a logical sequence to produce the final result. The code here is simple and easy to follow because the complex logic is encapsulated within the functions above.

In [48]:
# 1. Initialize the AI Voice
print("--- [Step 1/4] Initializing the AI Voice... ---")
ai_voice = setup_ai_voice(credentials_path=CREDENTIALS_FILE, model_name=GEMINI_MODEL_NAME)

if ai_voice:
    # 2. Load the data
    print(f"\n--- [Step 2/4] Loading data from {PROCESSED_DATA_PATH}... ---")
    try:
        df = pd.read_csv(PROCESSED_DATA_PATH)
        print(f"✅ Loaded {len(df)} records.")
        
        # 3. Select a sample patient to summarize
        if not df.empty and 'source_file' in df.columns:
            print("\n--- [Step 3/4] Selecting a sample patient... ---")
            # Select all records from the first file found in the dataset
            sample_file_name = df['source_file'].iloc[0]
            patient_df = df[df['source_file'] == sample_file_name]
            
            print(f"Sample patient data selected from file: {sample_file_name}")

            # 4. Format data and generate summary
            print("\n--- [Step 4/4] Generating summary... ---")
            patient_prompt_data = format_patient_data_for_prompt(patient_df)
            
            print("\n--- [Input Data for AI] ---")
            print(patient_prompt_data)
            
            summary = generate_summary(ai_voice, patient_prompt_data, SUMMARY_SYSTEM_PROMPT)
            
            # Display the final, formatted summary
            print("\n" + "="*54)
            print(" " * 17 + "Aarogya-AI Summary" + " " * 17)
            print("="*54)
            print(summary)
            print("="*54)
        else:
            print("❌ The data file is empty or missing the required 'source_file' column.")

    except FileNotFoundError:
        print(f"❌ Error: Could not find the data file at {PROCESSED_DATA_PATH}")

--- [Step 1/4] Initializing the AI Voice... ---
✅ AI Voice (Gemini Model) initialized successfully.

--- [Step 2/4] Loading data from /Users/ashishrathore/Aarogya-AI/data/processed/master_health_data_v2.csv... ---
✅ Loaded 1581 records.

--- [Step 3/4] Selecting a sample patient... ---
Sample patient data selected from file: ee9d0cc9a6ff0ce1775ac233da86d3f2.jpg

--- [Step 4/4] Generating summary... ---

--- [Input Data for AI] ---
Patient Name: Yashvi M. Patel

Test Results:
- Hemoglobin (Hb): 13.0 g/dL (Normal Range: 13.00-17.00)
- Total RBC count: 5.0 mill/cumm (Normal Range: 4.50-5.50)
- Packed Cell Volume (PCV): 45.0 % (Normal Range: 40-50)
- Mean Corpuscular Volume (MCV): 100.0 fL (Normal Range: 83-101)
- MCH: 30.0 pg (Normal Range: 27-32)
- MCHC: nan g/dL (Normal Range: 32.50-34.50)
- RDW: nan  (Normal Range: N/A)
- Total WBC count: 12.0 cumm (Normal Range: 11.60-14.00)
- Neutrophils: 60.0 % (Normal Range: 50-62)
- Lymphocytes: 30.0 % (Normal Range: 20-40)
- Eosinophils: 2.0 % (N