In [91]:
import os
from typing import Dict, List, Any

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns



In [93]:
# Notebook-wide pandas rendering limits: maximum rows and columns shown for a
# frame, and the character width available to text output.
for option_name, option_value in (
    ('display.max_rows', 100),
    ('display.max_columns', 20),
    ('display.width', 120),
):
    pd.set_option(option_name, option_value)

print("Libraries imported and display options set.")

Libraries imported and display options set.


## 1. Configuration: The Control Panel

In a production system, these parameters would be loaded from an external file (e.g., `params.yaml`). This allows for easy updates without changing the code. For this proof-of-concept, we define them in a dedicated cell.

In [114]:

# Plausible biological range for each supported test.
# Keys must match the `test_name` values in the data exactly (the lookup is a
# plain, case-sensitive dictionary match). A result is considered plausible
# when min <= result <= max; the unit each range is expressed in is noted
# next to the entry.
# NOTE(review): results recorded in a different unit (e.g. a platelet count
# reported in thousands) will be flagged — confirm units upstream.
PLAUSIBILITY_RULES: Dict[str, Dict[str, float]] = {
    # Hematology
    "Hemoglobin":           {"min": 5.0, "max": 25.0},      # g/dL
    "Platelet Count":       {"min": 10000, "max": 1000000}, # /cumm
    "RBC Count":            {"min": 1.0, "max": 10.0},      # M/uL
    "WBC Count":            {"min": 500, "max": 50000},     # cells/cumm
    
    # Lipid Profile
    "Total Cholesterol":    {"min": 50, "max": 500},        # mg/dL
    "Triglycerides":        {"min": 30, "max": 1000},       # mg/dL
    "HDL Cholesterol":      {"min": 10, "max": 150},        # mg/dL
    "LDL Cholesterol":      {"min": 20, "max": 400},        # mg/dL
    
    # Liver Function
    "AST (SGOT)":           {"min": 5, "max": 500},         # U/L
    "ALT (SGPT)":           {"min": 5, "max": 500},         # U/L
    
    # Kidney Function
    "Creatinine":           {"min": 0.2, "max": 15.0},      # mg/dL
    "Urea":                 {"min": 5, "max": 200},         # mg/dL
}

# File paths
# Location of the processed master CSV. The default is the path this notebook
# was developed against; set the AAROGYA_PROCESSED_DATA_PATH environment
# variable to point at a different copy without editing this cell.
PROCESSED_DATA_PATH: str = os.environ.get(
    'AAROGYA_PROCESSED_DATA_PATH',
    '/Users/ashishrathore/Aarogya-AI/data/processed/master_health_data_v2.csv',
)

print("Configuration loaded.")

Configuration loaded.


## 2. Core Logic: The Toolbox

Here, we define our reusable functions. This modular approach is key to writing clean, maintainable, and testable code. Each function has a specific, well-defined purpose.

In [117]:
def load_and_clean_data(filepath: str) -> pd.DataFrame:
    """
    Read the processed CSV and keep only the rows whose result is numeric.

    A ``result_numeric`` column is derived from ``result`` with
    ``pd.to_numeric(..., errors='coerce')``; rows for which that conversion
    yields NaN are removed. The number of loaded and dropped rows is printed.

    Args:
        filepath: Path of the CSV file to read.

    Returns:
        The loaded rows (original index preserved) with the additional
        ``result_numeric`` column, or an empty DataFrame when the file does
        not exist.

    Raises:
        ValueError: If the CSV has no ``result`` column.
    """
    try:
        loaded = pd.read_csv(filepath)
    except FileNotFoundError:
        print(f"Error: The file was not found at {filepath}")
        return pd.DataFrame()

    print(f"Successfully loaded {len(loaded)} records from {filepath}")

    if 'result' not in loaded.columns:
        raise ValueError("The 'result' column is missing from the CSV file.")

    # Values that cannot be parsed as numbers become NaN and are then removed.
    with_numeric = loaded.assign(
        result_numeric=pd.to_numeric(loaded['result'], errors='coerce')
    )
    numeric_only = with_numeric.dropna(subset=['result_numeric'])

    print(f"Data Cleaning Report: Dropped {len(with_numeric) - len(numeric_only)} rows with non-numeric results.")

    return numeric_only

def find_plausibility_anomalies(df: pd.DataFrame, rules: Dict[str, Dict[str, float]]) -> pd.DataFrame:
    """
    Report the records whose result lies outside the plausible range of its test.

    Only rows whose ``test_name`` has an entry in ``rules`` are checked. A row
    is reported when its ``result_numeric`` is not within the inclusive
    ``[min, max]`` range of that entry (a NaN result is therefore reported
    too). The check is done with vectorised column comparisons rather than a
    per-row Python call.

    Args:
        df: Input records; must provide the ``source_file``, ``test_name`` and
            ``result_numeric`` columns.
        rules: Mapping of test name to a dict with ``min`` and ``max`` bounds.
            Entries with an empty dict are ignored.

    Returns:
        A report with the columns ``Source File``, ``Test Name``,
        ``Anomalous Result``, ``Plausible Range`` and ``Reason`` (original
        index preserved), or an empty DataFrame when the input is empty, no
        row has a rule, or nothing is out of range.
    """
    if df.empty:
        return pd.DataFrame()

    # Skip falsy rule entries, which carry no usable bounds.
    active_rules = {name: bounds for name, bounds in rules.items() if bounds}

    # Only tests that have a rule can be judged.
    candidates = df[df['test_name'].isin(active_rules.keys())]
    if candidates.empty:
        return pd.DataFrame()

    # Look up each row's bounds once; plain arrays avoid index alignment
    # surprises when the input index contains duplicate labels.
    test_names = candidates['test_name']
    lower_bounds = test_names.map({name: b['min'] for name, b in active_rules.items()}).to_numpy()
    upper_bounds = test_names.map({name: b['max'] for name, b in active_rules.items()}).to_numpy()
    values = candidates['result_numeric'].to_numpy()

    # NaN compares False against both bounds, so it ends up flagged.
    within_range = (values >= lower_bounds) & (values <= upper_bounds)
    anomalies_df = candidates[~within_range].copy()
    if anomalies_df.empty:
        return pd.DataFrame()

    # Format the range from the rule values themselves so ints stay ints
    # ("50 - 500") and floats stay floats ("5.0 - 25.0").
    range_labels = {name: f"{b['min']} - {b['max']}" for name, b in active_rules.items()}
    anomalies_df['plausible_range'] = anomalies_df['test_name'].map(range_labels)
    anomalies_df['reason'] = 'Result is outside the plausible range.'

    # Select and rename columns for a clean report
    report_columns = {
        'source_file': 'Source File',
        'test_name': 'Test Name',
        'result_numeric': 'Anomalous Result',
        'plausible_range': 'Plausible Range',
        'reason': 'Reason'
    }
    return anomalies_df[list(report_columns.keys())].rename(columns=report_columns)

# Status marker: printed once this cell has run and the functions above are defined.
print("Core logic functions defined.")

Core logic functions defined.


## 3. Main Execution: The Factory Floor

This is where we run the pipeline. We call our core functions in a logical sequence to produce the final result. The code here is simple and easy to follow because the complex logic is encapsulated within the functions above.

In [120]:
# Stage 1: read the processed CSV and keep only numeric results.
print("--- [Step 1/3] Loading and cleaning data... ---")
master_df = load_and_clean_data(PROCESSED_DATA_PATH)

if master_df.empty:
    # The loader returned no rows, so there is nothing to analyze.
    print("\n--- Pipeline stopped as no data could be loaded. ---")
else:
    print(f"--- [Step 2/3] Analyzing {len(master_df)} clean records... ---")

    # Stage 2: check every record against the configured plausibility ranges.
    print("Deploying The Sentry to find plausibility anomalies...")
    suspicious_reports_df = find_plausibility_anomalies(master_df, PLAUSIBILITY_RULES)

    # Stage 3: show the flagged records, or confirm that none were found.
    print("--- [Step 3/3] Generating Sentry Report... ---")
    if suspicious_reports_df.empty:
        print("\n>>> ✅ Sentry reports: All clear! No plausibility anomalies found. <<<\n")
    else:
        print(f"\n>>> 🚨 Sentry has flagged {len(suspicious_reports_df)} records! <<<\n")
        display(suspicious_reports_df)

--- [Step 1/3] Loading and cleaning data... ---
Successfully loaded 1581 records from /Users/ashishrathore/Aarogya-AI/data/processed/master_health_data_v2.csv
Data Cleaning Report: Dropped 152 rows with non-numeric results.
--- [Step 2/3] Analyzing 1429 clean records... ---
Deploying The Sentry to find plausibility anomalies...
--- [Step 3/3] Generating Sentry Report... ---

>>> 🚨 Sentry has flagged 48 records! <<<



Unnamed: 0,Source File,Test Name,Anomalous Result,Plausible Range,Reason
14,070a0bf186108bfd43470afab9738e32.jpg,Total Cholesterol,2.0,50 - 500,Result is outside the plausible range.
96,ilide.info-blood-report-pdf-pr_9a268b5250cb855...,Platelet Count,245.0,10000 - 1000000,Result is outside the plausible range.
143,ilide.info-blood-report-yatendra-08112022-pr_f...,Platelet Count,211.0,10000 - 1000000,Result is outside the plausible range.
204,WM115.pdf,Total Cholesterol,2.0,50 - 500,Result is outside the plausible range.
210,WM115.pdf,RBC Count,25.0,1.0 - 10.0,Result is outside the plausible range.
212,TEST-REPORT-02.12.2022-.pdf,Triglycerides,1153.0,30 - 1000,Result is outside the plausible range.
264,ilide.info-father-blood-test-report-sgrh-pr_c5...,Hemoglobin,1.0,5.0 - 25.0,Result is outside the plausible range.
338,ilide.info-max-lab-report-pr_37cb37b89df9e8ada...,Platelet Count,166.0,10000 - 1000000,Result is outside the plausible range.
351,ilide.info-lft-lalpath-pr_954f6d86d0a1283e1391...,AST (SGOT),1357.9,5 - 500,Result is outside the plausible range.
352,ilide.info-lft-lalpath-pr_954f6d86d0a1283e1391...,ALT (SGPT),913.5,5 - 500,Result is outside the plausible range.


## 4. Anomaly Detection: The AI Detective (Level 2)

The Sentry is good at catching values that are biologically impossible. Now, we will build an "AI Detective" to catch values that are **statistically improbable**. We will use an unsupervised machine learning algorithm called **Isolation Forest**. This algorithm is excellent at finding outliers in data.

**Methodology:**
1.  **Feature Engineering:** Prepare the data for the model.
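2.  **Model Training:** Fit an Isolation Forest separately on the numeric results of each test type. The algorithm is unsupervised, so it is fitted on all available results — no pre-labelled "normal" data is required.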
3.  **Prediction:** Use the trained model to predict which data points are anomalies.
4.  **Reporting:** Display the records that the AI has flagged as suspicious.

In [125]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler

def find_statistical_anomalies(
    df: pd.DataFrame,
    min_samples: int = 2,
    contamination='auto',
    random_state: int = 42,
) -> pd.DataFrame:
    """
    Flag statistical outliers with an Isolation Forest fitted per test type.

    The rows of each distinct ``test_name`` are modelled independently, using
    ``result_numeric`` as the only feature. Rows for which the fitted model
    predicts -1 are reported.

    Args:
        df: The cleaned input DataFrame; must provide the ``source_file``,
            ``test_name`` and ``result_numeric`` columns unless it is empty.
        min_samples: Smallest number of rows a test type needs before a model
            is fitted for it. Values below 2 are treated as 2.
        contamination: Passed through to ``IsolationForest``.
        random_state: Seed passed through to ``IsolationForest`` so repeated
            runs give the same result.

    Returns:
        A report with the columns ``Source File``, ``Test Name``,
        ``Anomalous Result`` and ``Reason`` (original index preserved), or an
        empty DataFrame when the input is empty or nothing was flagged.
    """
    # An empty frame (e.g. after a failed load) has no 'test_name' column to
    # group on, so bail out before touching it.
    if df.empty:
        return pd.DataFrame()

    required_rows = max(min_samples, 2)
    flagged_groups = []

    # One pass over the data; sort=False keeps test types in order of first
    # appearance so the report order is stable.
    for _, group in df.groupby('test_name', sort=False):
        if len(group) < required_rows:
            continue

        # The model expects a 2D array: one row per record, one feature column.
        features = group[['result_numeric']].to_numpy()

        model = IsolationForest(contamination=contamination, random_state=random_state)
        model.fit(features)

        # predict() returns labels, not scores: -1 for anomalies, 1 for normal data.
        labels = model.predict(features)
        outliers = group[labels == -1]

        if not outliers.empty:
            flagged_groups.append(outliers)

    if not flagged_groups:
        return pd.DataFrame()

    # Combine all found anomalies into a single DataFrame
    final_anomalies_df = pd.concat(flagged_groups)

    # Prepare a clean report
    final_anomalies_df['reason'] = 'Result is a statistical outlier compared to other reports.'
    report_columns = {
        'source_file': 'Source File',
        'test_name': 'Test Name',
        'result_numeric': 'Anomalous Result',
        'reason': 'Reason'
    }
    return final_anomalies_df[list(report_columns.keys())].rename(columns=report_columns)

# --- EXECUTION ---
print("\n--- 🤖 Deploying The AI Detective... ---")
# Reuse the already-cleaned 'master_df' produced by the loading step.
statistical_anomalies_df = find_statistical_anomalies(master_df)

if statistical_anomalies_df.empty:
    # Nothing was flagged for any test type.
    print("\n--- ✅ AI Detective reports: All clear! No statistical anomalies found. ---")
else:
    print(f"\n--- 🕵️ AI Detective has flagged {len(statistical_anomalies_df)} records! ---")
    display(statistical_anomalies_df)


--- 🤖 Deploying The AI Detective... ---

--- 🕵️ AI Detective has flagged 236 records! ---


Unnamed: 0,Source File,Test Name,Anomalous Result,Reason
126,ilide.info-blood-report-yatendra-08112022-pr_f...,Packed Cell Volume (PCV),52.1,Result is a statistical outlier compared to ot...
390,HDL.pdf,Packed Cell Volume (PCV),33.0,Result is a statistical outlier compared to ot...
336,ilide.info-max-lab-report-pr_37cb37b89df9e8ada...,MCH,32.9,Result is a statistical outlier compared to ot...
568,ilide.info-lab-report-sample-blood-pr_abc0d047...,MCH,24.0,Result is a statistical outlier compared to ot...
619,ilide.info-blood-report-pr_0f79618c92430ffd441...,MCH,20.0,Result is a statistical outlier compared to ot...
...,...,...,...,...
1125,ilide.info-11-discharge-summary-pr_a8fd819c803...,UREA,27.0,Result is a statistical outlier compared to ot...
1109,ilide.info-11-discharge-summary-pr_a8fd819c803...,SERUM CREATININE,0.6,Result is a statistical outlier compared to ot...
1110,ilide.info-11-discharge-summary-pr_a8fd819c803...,eGFR,122.0,Result is a statistical outlier compared to ot...
1123,ilide.info-11-discharge-summary-pr_a8fd819c803...,D DIMER,0.3,Result is a statistical outlier compared to ot...
