In [None]:

import numpy as np
import pandas as pd

# One entry per embedding axis:
# (column name, negative-end icon, negative-end name, positive-end icon, positive-end name)
_AXIS_EXTREME_LABELS = (
    ('x', '⬅️', 'LEFT', '➡️', 'RIGHT'),
    ('y', '⬇️', 'FRONT', '⬆️', 'BACK'),
    ('z', '⬇️', 'BOTTOM', '⬆️', 'TOP'),
)


def _print_extreme_summary(heading, papers):
    """Print the study-type counts and up to three sample titles for one extreme."""
    print(heading)
    print("Study Types:", papers['study_type'].value_counts().to_dict())
    print("Sample Titles:")
    for title in papers['title'].head(3):
        print(f"   • {title}")


def analyze_axis_meanings(df, coords_3d, n_extreme=10):
    """
    Analyze what each axis represents by looking at extreme papers.

    For each of the x, y and z axes, the `n_extreme` rows with the smallest
    and the largest coordinate are selected, and their study-type counts and
    first three titles are printed.

    Args:
        df: DataFrame with at least 'study_type' and 'title' columns, one row
            per point in `coords_3d`.
        coords_3d: 2-D array-like whose first three columns are the x, y, z
            coordinates. Anything `np.asarray` accepts works (e.g. a list of
            lists), not only a NumPy array.
        n_extreme: Number of rows taken from each end of each axis.

    Returns:
        A copy of `df` with 'x', 'y' and 'z' columns added. `df` itself is
        not modified.
    """
    # Convert once so plain Python sequences support the column slicing below.
    coords = np.asarray(coords_3d)

    df_with_coords = df.copy()
    for column_index, (axis, *_labels) in enumerate(_AXIS_EXTREME_LABELS):
        df_with_coords[axis] = coords[:, column_index]

    print("🔍 AXIS INTERPRETATION ANALYSIS")
    print("=" * 60)

    for axis, low_icon, low_name, high_icon, high_name in _AXIS_EXTREME_LABELS:
        axis_label = axis.upper()
        print(f"\n📍 {axis_label}-AXIS ANALYSIS:")
        print(f"{'='*40}")

        # Most negative end: the threshold shown is the largest value in the group.
        lowest = df_with_coords.nsmallest(n_extreme, axis)
        _print_extreme_summary(
            f"\n{low_icon}  {low_name} EXTREME ({axis_label} < {lowest[axis].max():.2f}):",
            lowest,
        )

        # Most positive end: the threshold shown is the smallest value in the group.
        highest = df_with_coords.nlargest(n_extreme, axis)
        _print_extreme_summary(
            f"\n{high_icon}  {high_name} EXTREME ({axis_label} > {highest[axis].min():.2f}):",
            highest,
        )

    return df_with_coords

def create_axis_labeled_plot(df, coords_3d):
    """
    Build the study-type coloured 3D plot and add a text label at both ends
    of every axis.

    The extreme-paper report from `analyze_axis_meanings` is printed first.
    The base figure comes from `create_simple_3d_plot` (defined elsewhere).

    NOTE(review): the label texts below are fixed strings; they are not
    derived from the printed analysis, so confirm they match the data.

    Args:
        df: DataFrame passed through to the analysis and the base plot.
        coords_3d: Array of 3D coordinates; must support `abs(...)` and `.max()`.

    Returns:
        The figure with six text traces added and an updated title.
    """
    import plotly.graph_objects as go

    # Called for its printed report only; the returned frame is not needed here.
    analyze_axis_meanings(df, coords_3d)

    fig = create_simple_3d_plot(df, coords_3d, color_by='study_type')

    # Label positions scale with the data, but never use a range below 12.
    axis_range = max(abs(coords_3d).max() * 1.3, 12)
    far = axis_range * 0.9
    lift = axis_range * 0.2

    # (position, caption, font colour) in the order the traces are added:
    # X negative/positive, Y negative/positive, Z negative/positive.
    annotations = (
        ((-far, 0, lift), 'Observational<br>Studies', 'darkblue'),
        ((far, 0, lift), 'Clinical<br>Trials', 'darkblue'),
        ((0, -far, lift), 'Clinical<br>Practice', 'darkgreen'),
        ((0, far, lift), 'Basic<br>Research', 'darkgreen'),
        ((lift, 0, -far), 'Prevention', 'darkorange'),
        ((lift, 0, far), 'Complications', 'darkorange'),
    )

    for (pos_x, pos_y, pos_z), caption, colour in annotations:
        label_trace = go.Scatter3d(
            x=[pos_x], y=[pos_y], z=[pos_z],
            mode='text',
            text=[caption],
            textfont=dict(size=12, color=colour),
            showlegend=False,
            hoverinfo='none'
        )
        fig.add_trace(label_trace)

    fig.update_layout(
        title='Medical Research Vector Space<br><sub>X: Study Type | Y: Research Setting | Z: Disease Complexity</sub>'
    )

    return fig

# Driver for the research-paper cell. `df` and `coords_3d` are not defined in
# this cell and must already exist in the notebook namespace.

# Print the extreme-paper report (15 rows per end) and keep the frame with
# x/y/z columns for the correlation analysis further down.
df_with_coords = analyze_axis_meanings(df, coords_3d, n_extreme=15)

# Build and display the labelled figure. This call runs analyze_axis_meanings
# again internally (with its default n_extreme), so the report prints twice.
fig_labeled = create_axis_labeled_plot(df, coords_3d)
fig_labeled.show()

# Export the figure as an HTML file.
fig_labeled.write_html("medical_research_3d_labeled.html")
print("💾 Saved labeled version: medical_research_3d_labeled.html")


def analyze_axis_correlations(df_with_coords):
    """
    Check which features correlate with each axis.

    Prints the correlation of the 'x', 'y' and 'z' columns with 'year' and
    with 'citations', followed by the mean axis position of every study type.

    A correlation is undefined when either column is constant or fewer than
    two complete pairs exist. In that case `nan` is reported with a short
    explanation, and the correlation is not computed at all, which avoids
    the "invalid value encountered in divide" runtime warning. A feature
    column that is absent is reported as skipped instead of raising.

    Args:
        df_with_coords: DataFrame holding 'x', 'y', 'z' columns plus,
            optionally, 'year', 'citations' and 'study_type'.

    Returns:
        None. The report is written to stdout.
    """
    axes = ('x', 'y', 'z')

    def safe_corr(axis, column):
        # Series.corr already ignores incomplete pairs; dropping them up front
        # lets us detect the degenerate cases before dividing by a zero variance.
        pair = df_with_coords[[axis, column]].dropna()
        if len(pair) < 2 or pair[axis].nunique() < 2 or pair[column].nunique() < 2:
            return float('nan')
        return pair[axis].corr(pair[column])

    def report(heading, column):
        print(heading)
        if column not in df_with_coords.columns:
            print(f"   (column '{column}' not found - skipped)")
            return
        for axis in axes:
            corr = safe_corr(axis, column)
            # NaN is the only value that is not equal to itself.
            note = "" if corr == corr else " (undefined: constant or insufficient data)"
            print(f"   {axis.upper()}-axis: {corr:.3f}{note}")

    print(f"\n📊 AXIS CORRELATION ANALYSIS:")
    print(f"{'='*50}")

    report(f"\n📅 Publication Year Correlations:", 'year')
    report(f"\n📈 Citation Count Correlations:", 'citations')

    # Study type distribution along each axis
    print(f"\n📋 Study Type Distribution by Axis:")
    if 'study_type' not in df_with_coords.columns:
        print("   (column 'study_type' not found - skipped)")
        return
    for axis in axes:
        print(f"\n{axis.upper()}-Axis:")
        # Sorted ascending so study types read from the negative to the positive end.
        axis_means = df_with_coords.groupby('study_type')[axis].mean().sort_values()
        for study_type, mean_pos in axis_means.items():
            print(f"   {study_type}: {mean_pos:.2f}")

# Run the correlation report on the frame returned by analyze_axis_meanings
# above (it carries the x/y/z columns the report reads).
analyze_axis_correlations(df_with_coords)

🔍 AXIS INTERPRETATION ANALYSIS

📍 X-AXIS ANALYSIS:

⬅️  LEFT EXTREME (X < -1.98):
Study Types: {'Observational Study': 14, 'Systematic Review': 1}
Sample Titles:
   • 3. Prevention or Delay of Diabetes and Associated Comorbidit...
   • 4. Comprehensive Medical Evaluation and Assessment of Comorb...
   • 4. Comprehensive Medical Evaluation and Assessment of Comorb...

➡️  RIGHT EXTREME (X > 3.07):
Study Types: {'Observational Study': 8, 'Case Report': 5, 'Cohort Study': 1, 'Systematic Review': 1}
Sample Titles:
   • Sequential Presentation of Obsessive-Compulsive Disorder and...
   • A Wolfram-like syndrome family: Case report....
   • [Wolfram-like syndrome: a case report]....

📍 Y-AXIS ANALYSIS:

⬇️  FRONT EXTREME (Y < -2.29):
Study Types: {'Observational Study': 9, 'Cohort Study': 3, 'Meta-Analysis': 1, 'Clinical Trial': 1, 'Case Report': 1}
Sample Titles:
   • Arginine-stimulated copeptin in children and adolescents....
   • Arginine-Stimulated Copeptin-Based Diagnosis of Central Di

💾 Saved labeled version: medical_research_3d_labeled.html

📊 AXIS CORRELATION ANALYSIS:

📅 Publication Year Correlations:
   X-axis: -0.170
   Y-axis: 0.216
   Z-axis: -0.190

📈 Citation Count Correlations:
   X-axis: nan
   Y-axis: nan
   Z-axis: nan

📋 Study Type Distribution by Axis:

X-Axis:
   Cross-Sectional Study: -1.18
   Clinical Trial: -1.11
   Systematic Review: -0.40
   Meta-Analysis: -0.23
   Observational Study: -0.15
   Cohort Study: 0.60
   Case Report: 1.98

Y-Axis:
   Meta-Analysis: -1.07
   Cohort Study: -0.72
   Case Report: -0.53
   Clinical Trial: -0.14
   Cross-Sectional Study: -0.04
   Observational Study: 0.13
   Systematic Review: 0.18

Z-Axis:
   Clinical Trial: -0.26
   Observational Study: -0.16
   Case Report: 0.04
   Systematic Review: 0.61
   Cohort Study: 0.92
   Meta-Analysis: 1.14
   Cross-Sectional Study: 1.26



invalid value encountered in divide


invalid value encountered in divide



In [None]:


import numpy as np
import pandas as pd
import plotly.graph_objects as go

def analyze_patient_axes_detailed(df, coords_3d, n_extreme=20):
    """
    Print a clinical profile of the patients at both ends of each axis.

    For x, y and z the `n_extreme` patients with the smallest and largest
    coordinate are selected. The X section reports HbA1c, treatment and
    stability figures, the Y section reports timeline and disease-state
    figures, and the Z section reports utilization, burden and social figures.

    Args:
        df: Patient DataFrame, one row per row of `coords_3d`, containing the
            clinical columns read below (e.g. 'hba1c_current', 'adherence').
        coords_3d: 2-D array whose first three columns are x, y, z.
        n_extreme: Number of patients taken from each end of each axis.

    Returns:
        A copy of `df` with 'x', 'y' and 'z' columns added.
    """
    df_with_coords = df.copy()
    for position, axis_name in enumerate(('x', 'y', 'z')):
        df_with_coords[axis_name] = coords_3d[:, position]

    section_rule = '=' * 50

    def both_ends(axis_name):
        # (most negative group, most positive group) along one axis
        lowest = df_with_coords.nsmallest(n_extreme, axis_name)
        highest = df_with_coords.nlargest(n_extreme, axis_name)
        return lowest, highest

    def average(group, column):
        return group[column].mean()

    def tally(group, mask):
        # "matching/total" for a boolean mask over the group
        return f"{mask.sum()}/{len(group)}"

    def describe_control(group, band_label, in_band, response_label, response_value):
        print(f"   🎯 HbA1c Control:")
        print(f"      • Average HbA1c: {average(group, 'hba1c_current'):.1f}%")
        flagged = in_band(group['hba1c_current'])
        print(f"      • {band_label}: {tally(group, flagged)} ({flagged.mean()*100:.1f}%)")
        print(f"   💊 Treatment:")
        print(f"      • Average adherence: {average(group, 'adherence'):.1f}%")
        print(f"      • {response_label}: {tally(group, group['treatment_response'] == response_value)}")
        print(f"   📊 Stability:")
        print(f"      • Average hospitalizations: {average(group, 'hospitalizations'):.1f}")
        print(f"      • Average complications: {average(group, 'complications_count'):.1f}")

    def describe_progression(group, duration_label, in_duration, type_label, type_value,
                             complexity_label, complexity_value):
        print(f"   ⏰ Disease Timeline:")
        print(f"      • Average age: {average(group, 'age'):.1f} years")
        print(f"      • Average years with diabetes: {average(group, 'years_since_diagnosis'):.1f}")
        print(f"      • {duration_label}: {tally(group, in_duration(group['years_since_diagnosis']))}")
        print(f"   🔬 Disease State:")
        print(f"      • {type_label}: {tally(group, group['diabetes_type'] == type_value)}")
        print(f"      • Average complications: {average(group, 'complications_count'):.1f}")
        print(f"      • {complexity_label}: {tally(group, group['complexity'] == complexity_value)}")

    def describe_burden(group, stay_label, in_stay_band, risk_label, risk_values,
                        insurance_label, insurance_values):
        print(f"   🏥 Healthcare Utilization:")
        print(f"      • Average hospitalizations: {average(group, 'hospitalizations'):.1f}")
        print(f"      • {stay_label}: {tally(group, in_stay_band(group['hospitalizations']))}")
        print(f"   🔗 Disease Burden:")
        print(f"      • Average comorbidities: {average(group, 'comorbidities_count'):.1f}")
        print(f"      • Average complications: {average(group, 'complications_count'):.1f}")
        print(f"      • {risk_label}: {tally(group, group['risk_category'].isin(risk_values))}")
        print(f"   💰 Social Factors:")
        print(f"      • Average SES score: {average(group, 'ses_score'):.1f}/10")
        print(f"      • {insurance_label}: {tally(group, group['insurance'].isin(insurance_values))}")

    print("🔍 DETAILED PATIENT AXIS INTERPRETATION")
    print("=" * 70)

    # ---- X axis: disease control & management --------------------------------
    print(f"\n📍 X-AXIS: Disease Control & Management")
    print(section_rule)

    left_patients, right_patients = both_ends('x')

    print(f"\n⬅️  LEFT EXTREME (X < {left_patients['x'].max():.2f}):")
    describe_control(left_patients, 'Well controlled (<7%)', lambda hba1c: hba1c < 7,
                     'Excellent response', 'Excellent')

    print(f"\n➡️  RIGHT EXTREME (X > {right_patients['x'].min():.2f}):")
    describe_control(right_patients, 'Poorly controlled (>9%)', lambda hba1c: hba1c > 9,
                     'Poor response', 'Poor')

    # ---- Y axis: disease duration & progression ------------------------------
    print(f"\n📍 Y-AXIS: Disease Duration & Progression")
    print(section_rule)

    front_patients, back_patients = both_ends('y')

    print(f"\n⬇️  FRONT EXTREME (Y < {front_patients['y'].max():.2f}):")
    describe_progression(front_patients, 'Newly diagnosed (<2 years)', lambda years: years < 2,
                         'Type 1 diabetes', 'Type 1', 'Simple complexity', 'Simple')

    print(f"\n⬆️  BACK EXTREME (Y > {back_patients['y'].min():.2f}):")
    describe_progression(back_patients, 'Long-standing (>10 years)', lambda years: years > 10,
                         'Type 2 diabetes', 'Type 2', 'Complex cases', 'Complex')

    # ---- Z axis: clinical complexity & comorbidity burden --------------------
    print(f"\n📍 Z-AXIS: Clinical Complexity & Comorbidity Burden")
    print(section_rule)

    bottom_patients, top_patients = both_ends('z')

    print(f"\n⬇️  BOTTOM EXTREME (Z < {bottom_patients['z'].max():.2f}):")
    describe_burden(bottom_patients, 'Zero hospitalizations', lambda stays: stays == 0,
                    'Low risk category', ['Low Risk'],
                    'Private insurance', ['Private'])

    print(f"\n⬆️  TOP EXTREME (Z > {top_patients['z'].min():.2f}):")
    describe_burden(top_patients, 'Multiple hospitalizations (>2)', lambda stays: stays > 2,
                    'High/Very High risk', ['High Risk', 'Very High Risk'],
                    'Medicaid/Uninsured', ['Medicaid', 'Uninsured'])

    return df_with_coords

def create_axis_correlation_analysis(df_with_coords):
    """
    Analyze correlations between axes and clinical variables.

    Prints a table with the correlation of every available clinical variable
    against the 'x', 'y' and 'z' columns, then the three strongest absolute
    correlations per axis.

    Undefined correlations (a constant column, or fewer than two complete
    pairs) are shown as `+nan` in the table and are left out of the ranking,
    because NaN does not order consistently and would make the sort result
    arbitrary. Each correlation is computed once and reused for both parts.

    Args:
        df_with_coords: DataFrame with 'x', 'y', 'z' columns and any subset
            of the clinical columns listed in `clinical_vars`.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n📊 AXIS CORRELATION ANALYSIS")
    print(f"{'='*50}")

    # Key clinical variables to analyze (column name -> display name)
    clinical_vars = {
        'hba1c_current': 'Current HbA1c',
        'hba1c_change': 'HbA1c Change',
        'years_since_diagnosis': 'Years with Diabetes',
        'age': 'Age',
        'complications_count': 'Complications',
        'comorbidities_count': 'Comorbidities',
        'hospitalizations': 'Hospitalizations',
        'adherence': 'Medication Adherence',
        'cv_risk': 'CV Risk Score',
        'kidney_risk': 'Kidney Risk',
        'bmi': 'BMI',
        'ses_score': 'Socioeconomic Score'
    }

    axes = ['x', 'y', 'z']
    axis_names = ['X (Control)', 'Y (Progression)', 'Z (Complexity)']

    def axis_corr(axis, var):
        # Return NaN up front for degenerate inputs instead of letting the
        # correlation divide by a zero variance.
        pair = df_with_coords[[axis, var]].dropna()
        if len(pair) < 2 or pair[axis].nunique() < 2 or pair[var].nunique() < 2:
            return float('nan')
        return pair[axis].corr(pair[var])

    # Only variables actually present in the frame are reported.
    available = {
        var: name for var, name in clinical_vars.items()
        if var in df_with_coords.columns
    }
    corr_by_var = {
        var: [axis_corr(axis, var) for axis in axes]
        for var in available
    }

    print(f"\n{'Variable':<20} {'X-Axis':<12} {'Y-Axis':<12} {'Z-Axis':<12}")
    print(f"{'-'*20} {'-'*12} {'-'*12} {'-'*12}")

    for var, name in available.items():
        cells = [f"{corr:>+.3f}" for corr in corr_by_var[var]]
        print(f"{name:<20} {cells[0]:<12} {cells[1]:<12} {cells[2]:<12}")

    # Find strongest correlations for each axis
    print(f"\n🎯 STRONGEST CORRELATIONS:")
    for axis_index, axis_name in enumerate(axis_names):
        print(f"\n{axis_name}:")
        strengths = [
            (name, abs(corr_by_var[var][axis_index]))
            for var, name in available.items()
            if pd.notna(corr_by_var[var][axis_index])
        ]

        # Sort by correlation strength and show top 3
        strengths.sort(key=lambda item: item[1], reverse=True)
        for rank, (name, strength) in enumerate(strengths[:3], start=1):
            print(f"   {rank}. {name}: {strength:.3f}")

def create_labeled_patient_plot(df, coords_3d):
    """
    Create patient plot with axis interpretation labels.

    The base figure comes from `create_patient_3d_plot` (defined elsewhere).
    Two text labels are added per axis. Because the sign of an embedding axis
    is arbitrary, the end that receives each label is chosen from the data:
    the "high" label goes to the end towards which the driving column
    increases. If the driving column is missing or the correlation is
    undefined, the original fixed placement is used (low label at the
    negative end, high label at the positive end).

    Driving columns (NOTE(review): chosen to match the per-axis reports in
    analyze_patient_axes_detailed; confirm they fit your embedding):
        X -> 'hba1c_current', Y -> 'years_since_diagnosis',
        Z -> 'complications_count'.

    Args:
        df: Patient DataFrame, one row per row of `coords_3d`.
        coords_3d: 2-D NumPy array whose first three columns are x, y, z.

    Returns:
        The figure with six text traces added and an updated title.
    """
    # Create base plot
    fig = create_patient_3d_plot(df, coords_3d, color_by='control_status')

    # Label positions scale with the data, but never use a range below 12.
    axis_range = max(abs(coords_3d).max() * 1.3, 12)
    far = axis_range * 0.9
    lift = axis_range * 0.2

    def increases_along(axis_index, column):
        """Return False only when the data show `column` falling along the axis."""
        if column not in df.columns:
            return True
        feature = pd.to_numeric(df[column], errors='coerce').reset_index(drop=True)
        axis_values = pd.Series(np.asarray(coords_3d)[:, axis_index])
        corr = axis_values.corr(feature)
        # A NaN correlation compares False, which keeps the default placement.
        return not (corr < 0)

    # (axis index, driving column, low-value label, high-value label, placement)
    # Each label is (text, font colour); placement maps an offset along the
    # axis to the (x, y, z) coordinate lists for the text trace.
    axis_specs = (
        (0, 'hba1c_current',
         ('Well<br>Controlled', 'darkblue'), ('Poorly<br>Controlled', 'darkred'),
         lambda offset: ([offset], [0], [lift])),
        (1, 'years_since_diagnosis',
         ('Recently<br>Diagnosed', 'darkgreen'), ('Long-standing<br>Diabetes', 'darkgreen'),
         lambda offset: ([0], [offset], [lift])),
        (2, 'complications_count',
         ('Simple<br>Cases', 'darkorange'), ('Complex<br>Cases', 'darkorange'),
         lambda offset: ([lift], [0], [offset])),
    )

    for axis_index, column, low_label, high_label, place in axis_specs:
        if increases_along(axis_index, column):
            negative_end, positive_end = low_label, high_label
        else:
            negative_end, positive_end = high_label, low_label

        for offset, (caption, colour) in ((-far, negative_end), (far, positive_end)):
            xs, ys, zs = place(offset)
            fig.add_trace(go.Scatter3d(
                x=xs, y=ys, z=zs,
                mode='text',
                text=[caption],
                textfont=dict(size=12, color=colour),
                showlegend=False,
                hoverinfo='none'
            ))

    # Update title with axis interpretations
    fig.update_layout(
        title='Diabetes Patient Clinical Space<br><sub>X: Disease Control | Y: Disease Progression | Z: Clinical Complexity</sub>'
    )

    return fig

# ============================================
# USAGE EXAMPLE
# ============================================

# Prerequisite: `df` (patient DataFrame) and `coords_3d` (3D coordinates, one
# row per patient) are not created in this cell and must already exist in the
# notebook namespace.

# 1. Detailed axis analysis: prints the per-axis profiles for the 25 patients
#    at each end and returns a copy of df with x/y/z columns.
print("🔍 Running detailed axis interpretation...")
df_with_coords = analyze_patient_axes_detailed(df, coords_3d, n_extreme=25)

# 2. Correlation analysis: needs the x/y/z columns added in step 1.
create_axis_correlation_analysis(df_with_coords)

# 3. Create labeled plot and display it.
print("\n🎨 Creating labeled patient visualization...")
fig_labeled = create_labeled_patient_plot(df, coords_3d)
fig_labeled.show()

# 4. Save labeled version as an HTML file.
fig_labeled.write_html("diabetes_patients_labeled_axes.html")
print("💾 Saved: diabetes_patients_labeled_axes.html")

🔍 Running detailed axis interpretation...
🔍 DETAILED PATIENT AXIS INTERPRETATION

📍 X-AXIS: Disease Control & Management

⬅️  LEFT EXTREME (X < -1.19):
   🎯 HbA1c Control:
      • Average HbA1c: 10.5%
      • Well controlled (<7%): 0/25 (0.0%)
   💊 Treatment:
      • Average adherence: 77.1%
      • Excellent response: 11/25
   📊 Stability:
      • Average hospitalizations: 0.5
      • Average complications: 2.1

➡️  RIGHT EXTREME (X > 0.96):
   🎯 HbA1c Control:
      • Average HbA1c: 6.2%
      • Poorly controlled (>9%): 0/25 (0.0%)
   💊 Treatment:
      • Average adherence: 80.3%
      • Poor response: 1/25
   📊 Stability:
      • Average hospitalizations: 0.2
      • Average complications: 0.3

📍 Y-AXIS: Disease Duration & Progression

⬇️  FRONT EXTREME (Y < -1.15):
   ⏰ Disease Timeline:
      • Average age: 50.6 years
      • Average years with diabetes: 5.0
      • Newly diagnosed (<2 years): 11/25
   🔬 Disease State:
      • Type 1 diabetes: 23/25
      • Average complications: 

💾 Saved: diabetes_patients_labeled_axes.html


In [None]:

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import requests
import json
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
# Base URL of the Weaviate instance; fetch_complete_patient_data posts its
# GraphQL query to f"{WEAVIATE_URL}/v1/graphql".
WEAVIATE_URL = "http://localhost:8080"

def _as_list(value):
    """Normalise a list-valued property: None or '' -> [], 'a' -> ['a'], lists unchanged."""
    if value is None:
        return []
    if isinstance(value, str):
        return [value] if value else []
    return value if isinstance(value, list) else list(value)


def _patient_to_row(patient):
    """Flatten one patient object from the GraphQL response into a DataFrame row dict."""
    # A property that is present but null arrives as None, so .get(key, [])
    # alone is not enough; _as_list covers both the absent and the null case.
    symptoms = _as_list(patient.get('symptoms'))
    complications = _as_list(patient.get('complications'))
    comorbidities = _as_list(patient.get('comorbidities'))
    medications = _as_list(patient.get('medications'))

    # NOTE(review): the defaults below apply only when a key is absent; a key
    # that is present with a null value stays None (NaN in the frame).
    return {
        'patient_id': patient.get('patient_id', ''),
        'age': patient.get('age', 0),
        'gender': patient.get('gender', 'Unknown'),
        'diabetes_type': patient.get('diabetes_type', 'Unknown'),
        'years_since_diagnosis': patient.get('years_since_diagnosis', 0),
        'hba1c_current': patient.get('hba1c_current', 0),
        'hba1c_baseline': patient.get('hba1c_baseline', 0),
        'hba1c_change': patient.get('hba1c_change', 0),
        'glucose_fasting': patient.get('glucose_fasting', 0),
        'glucose_random': patient.get('glucose_random', 0),
        'bmi': patient.get('bmi', 0),
        'bp_systolic': patient.get('blood_pressure_systolic', 0),
        'bp_diastolic': patient.get('blood_pressure_diastolic', 0),
        'creatinine': patient.get('creatinine', 0),
        'egfr': patient.get('egfr', 0),
        'ldl': patient.get('ldl_cholesterol', 0),
        'hdl': patient.get('hdl_cholesterol', 0),
        'triglycerides': patient.get('triglycerides', 0),
        'acr': patient.get('albumin_creatinine_ratio', 0),
        'symptoms_count': len(symptoms),
        'complications_count': len(complications),
        'comorbidities_count': len(comorbidities),
        'medications_count': len(medications),
        'symptoms_list': symptoms,
        'complications_list': complications,
        'comorbidities_list': comorbidities,
        'medications_list': medications,
        'adherence': patient.get('medication_adherence', 0),
        'smoking': patient.get('smoking_status', 'Unknown'),
        'alcohol': patient.get('alcohol_use', 'Unknown'),
        'exercise': patient.get('exercise_weekly_hours', 0),
        'diet_score': patient.get('diet_quality_score', 0),
        'cv_risk': patient.get('cardiovascular_risk_score', 0),
        'kidney_risk': patient.get('kidney_disease_risk', 0),
        'retinopathy_risk': patient.get('retinopathy_risk', 0),
        'neuropathy_risk': patient.get('neuropathy_risk', 0),
        'treatment_response': patient.get('treatment_response', 'Unknown'),
        'weight_change': patient.get('weight_change', 0),
        'hospitalizations': patient.get('hospitalizations_past_year', 0),
        'er_visits': patient.get('emergency_visits_past_year', 0),
        'insurance': patient.get('insurance_type', 'Unknown'),
        'ses_score': patient.get('socioeconomic_score', 0),
        'data_quality': patient.get('data_quality_score', 0)
    }


def fetch_complete_patient_data(limit=1000):
    """
    Fetch complete diabetes patient data from Weaviate for analysis.

    Posts a GraphQL `Get { DiabetesPatients(...) }` query to
    `{WEAVIATE_URL}/v1/graphql` and flattens the returned objects into a
    DataFrame. List-valued properties (symptoms, complications,
    comorbidities, medications) are kept as `*_list` columns and also
    counted in `*_count` columns.

    Failures are reported on stdout and signalled by returning None: an
    invalid `limit`, a network error, a non-success HTTP status, a body that
    is not JSON, or a response without patient objects. GraphQL `errors`
    included in the response are printed as well.

    Args:
        limit: Maximum number of patients to request. Converted with `int()`
            before being placed in the query text.

    Returns:
        A DataFrame with one row per patient, or None on failure or when no
        patients were returned.
    """
    # Coerce before interpolating so only an integer can reach the query text.
    try:
        limit = int(limit)
    except (TypeError, ValueError):
        print(f"❌ Error: limit must be an integer, got {limit!r}")
        return None

    print(f"🔍 Fetching complete patient dataset from Weaviate...")

    query = {
        "query": f"""
        {{
            Get {{
                DiabetesPatients(limit: {limit}) {{
                    patient_id
                    age
                    gender
                    diabetes_type
                    years_since_diagnosis
                    hba1c_current
                    hba1c_baseline
                    hba1c_change
                    glucose_fasting
                    glucose_random
                    bmi
                    blood_pressure_systolic
                    blood_pressure_diastolic
                    creatinine
                    egfr
                    ldl_cholesterol
                    hdl_cholesterol
                    triglycerides
                    albumin_creatinine_ratio
                    symptoms
                    complications
                    comorbidities
                    medications
                    medication_adherence
                    smoking_status
                    alcohol_use
                    exercise_weekly_hours
                    diet_quality_score
                    cardiovascular_risk_score
                    kidney_disease_risk
                    retinopathy_risk
                    neuropathy_risk
                    treatment_response
                    weight_change
                    hospitalizations_past_year
                    emergency_visits_past_year
                    insurance_type
                    socioeconomic_score
                    last_updated
                    data_quality_score
                }}
            }}
        }}
        """
    }

    # Only the network round-trip and JSON decoding are guarded, so a
    # programming error in the row processing is not silently swallowed.
    try:
        response = requests.post(
            f"{WEAVIATE_URL}/v1/graphql",
            json=query,
            headers={"Content-Type": "application/json"},
            timeout=30
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers a body that is not valid JSON.
        print(f"❌ Error: {e}")
        return None

    if not isinstance(result, dict):
        print(f"❌ Error: unexpected response payload of type {type(result).__name__}")
        return None

    # Report query-level errors instead of hiding them behind "no data".
    if result.get('errors'):
        print(f"❌ Error: {result['errors']}")

    # Any level may be null in the response, hence the `or {}` fallbacks.
    data = result.get('data') or {}
    patients = (data.get('Get') or {}).get('DiabetesPatients') or []

    # Process and clean data, ignoring entries that are not objects.
    processed_data = [
        _patient_to_row(patient)
        for patient in patients
        if isinstance(patient, dict)
    ]

    if not processed_data:
        print("⚠️ No patient data found")
        return None

    df = pd.DataFrame(processed_data)
    print(f"✅ Loaded {len(df)} patients for analysis")
    return df

def generate_dataset_overview(df):
    """
    Print a sectioned summary of the patient DataFrame.

    Sections, in order: dataset size, demographics, clinical characteristics,
    glycemic control bands, disease burden, healthcare utilization, risk
    profiles, treatment response, social determinants and lifestyle factors.

    Args:
        df: Patient DataFrame containing the columns read below
            (e.g. 'data_quality', 'age', 'gender', 'hba1c_current').

    Returns:
        None. The report is written to stdout.
    """
    n_patients = len(df)
    banner = "=" * 80

    def share(count):
        # Percentage of all patients
        return count / n_patients * 100

    def spread(column, digits=1):
        # "mean ± std" with the requested number of decimals
        values = df[column]
        return f"{values.mean():.{digits}f} ± {values.std():.{digits}f}"

    def breakdown(column, prefix, suffix=""):
        # One line per distinct value, most frequent first
        for label, count in df[column].value_counts().items():
            print(f"{prefix}{label}{suffix}: {count:,} ({share(count):.1f}%)")

    print("\n" + banner)
    print("📊 DIABETES PATIENT DATASET OVERVIEW")
    print(banner)

    # Dataset size
    missing_values = df.isnull().sum().sum()
    missing_share = missing_values / (n_patients * len(df.columns)) * 100
    print(f"\n📈 DATASET SIZE & STRUCTURE:")
    print(f"   • Total Patients: {n_patients:,}")
    print(f"   • Total Features: {len(df.columns)}")
    print(f"   • Data Quality Score: {df['data_quality'].mean():.3f}")
    print(f"   • Missing Data: {missing_values} values ({missing_share:.2f}%)")

    # Demographics
    ages = df['age']
    print(f"\n👥 DEMOGRAPHICS:")
    print(f"   • Age Range: {ages.min()}-{ages.max()} years (μ={ages.mean():.1f}, σ={ages.std():.1f})")
    print(f"   • Gender Distribution:")
    breakdown('gender', "     - ")

    print(f"   • Diabetes Type Distribution:")
    breakdown('diabetes_type', "     - ")

    # Clinical characteristics
    print(f"\n🔬 CLINICAL CHARACTERISTICS:")
    print(f"   • Disease Duration: {spread('years_since_diagnosis')} years")
    print(f"   • HbA1c Current: {spread('hba1c_current')}%")
    print(f"   • HbA1c Change: {spread('hba1c_change')}%")
    print(f"   • BMI: {spread('bmi')} kg/m²")
    print(f"   • Blood Pressure: {df['bp_systolic'].mean():.0f}/{df['bp_diastolic'].mean():.0f} mmHg")

    # Glycemic control bands on current HbA1c
    hba1c = df['hba1c_current']
    control_bands = [
        ("Well Controlled (<7%)", (hba1c < 7).sum()),
        ("Moderately Controlled (7-8%)", ((hba1c >= 7) & (hba1c < 8)).sum()),
        ("Poorly Controlled (8-9%)", ((hba1c >= 8) & (hba1c < 9)).sum()),
        ("Very Poor Control (≥9%)", (hba1c >= 9).sum()),
    ]

    print(f"\n🎯 GLYCEMIC CONTROL STATUS:")
    for band_label, band_count in control_bands:
        print(f"   • {band_label}: {band_count:,} ({share(band_count):.1f}%)")

    # Complications and comorbidities
    has_complications = df['complications_count'] > 0
    print(f"\n⚕️ DISEASE BURDEN:")
    print(f"   • Average Complications: {df['complications_count'].mean():.1f}")
    print(f"   • Patients with Complications: {has_complications.sum():,} ({has_complications.mean()*100:.1f}%)")
    print(f"   • Average Comorbidities: {df['comorbidities_count'].mean():.1f}")
    print(f"   • Average Medications: {df['medications_count'].mean():.1f}")

    # Healthcare utilization
    was_hospitalized = df['hospitalizations'] > 0
    print(f"\n🏥 HEALTHCARE UTILIZATION:")
    print(f"   • Average Hospitalizations/Year: {df['hospitalizations'].mean():.1f}")
    print(f"   • Patients with Hospitalizations: {was_hospitalized.sum():,} ({was_hospitalized.mean()*100:.1f}%)")
    print(f"   • Average ER Visits/Year: {df['er_visits'].mean():.1f}")

    # Risk profiles
    print(f"\n⚠️ RISK PROFILES:")
    risk_columns = (
        ("Cardiovascular Risk", 'cv_risk'),
        ("Kidney Disease Risk", 'kidney_risk'),
        ("Retinopathy Risk", 'retinopathy_risk'),
        ("Neuropathy Risk", 'neuropathy_risk'),
    )
    for risk_label, risk_column in risk_columns:
        print(f"   • {risk_label}: {spread(risk_column, digits=3)}")

    # Treatment response
    print(f"\n💊 TREATMENT RESPONSE:")
    breakdown('treatment_response', "   • ")

    print(f"   • Average Medication Adherence: {df['adherence'].mean():.1f}%")

    # Social determinants
    print(f"\n💰 SOCIAL DETERMINANTS:")
    breakdown('insurance', "   • ")
    print(f"   • Average SES Score: {df['ses_score'].mean():.1f}/10")

    # Lifestyle factors
    print(f"\n🏃 LIFESTYLE FACTORS:")
    breakdown('smoking', "   • ", suffix=" Smoker")
    print(f"   • Average Exercise: {df['exercise'].mean():.1f} hours/week")
    print(f"   • Average Diet Score: {df['diet_score'].mean():.1f}/10")

def analyze_distributions(df):
    """
    Print a summary table for the key continuous variables.

    For every known variable that exists as a column of ``df``, one row is
    printed with the mean, standard deviation, minimum, quartiles and maximum
    reported by ``Series.describe()``. Variables whose column is absent from
    ``df`` are skipped without any message.

    Args:
        df: DataFrame of patient records.

    Returns:
        None. The table is written to stdout.
    """
    print(f"\n📊 DISTRIBUTION ANALYSIS:")
    print(f"{'='*50}")

    # Column name -> human-readable row label
    continuous_vars = {
        'age': 'Age (years)',
        'years_since_diagnosis': 'Years with Diabetes',
        'hba1c_current': 'Current HbA1c (%)',
        'bmi': 'BMI (kg/m²)',
        'bp_systolic': 'Systolic BP (mmHg)',
        'creatinine': 'Creatinine (mg/dL)',
        'egfr': 'eGFR (mL/min/1.73m²)',
        'ldl': 'LDL Cholesterol (mg/dL)',
        'hdl': 'HDL Cholesterol (mg/dL)',
        'cv_risk': 'Cardiovascular Risk',
        'adherence': 'Medication Adherence (%)'
    }

    # (header title, key in the describe() result), in display order
    stat_columns = [
        ('Mean', 'mean'),
        ('Std', 'std'),
        ('Min', 'min'),
        ('25%', '25%'),
        ('50%', '50%'),
        ('75%', '75%'),
        ('Max', 'max'),
    ]

    # Header cells use the same width and right alignment as the numeric cells
    # below, so every title sits directly above its own column.
    header = f"{'Variable':<25} " + " ".join(f"{title:>7}" for title, _ in stat_columns)

    print(f"\n📈 CONTINUOUS VARIABLES:")
    print(header)
    # Size the rule from the header so the two can never disagree.
    print("-" * len(header))

    for var, name in continuous_vars.items():
        if var not in df.columns:
            continue
        stats_data = df[var].describe()
        cells = " ".join(f"{stats_data[key]:>7.1f}" for _, key in stat_columns)
        print(f"{name:<25} {cells}")

def analyze_most_common_items(df):
    """
    Report the ten most frequent entries of each list-valued clinical column.

    The columns ``complications_list``, ``comorbidities_list``,
    ``medications_list`` and ``symptoms_list`` are processed in that order.
    Each cell that is a ``list`` contributes its entries to a flat pool; cells
    of any other type are ignored. A section is printed only when its pool is
    non-empty, and every percentage is relative to ``len(df)``.

    Args:
        df: DataFrame containing the four list-valued columns.

    Returns:
        None. The report is written to stdout.
    """
    print(f"\n📋 MOST COMMON CLINICAL ITEMS:")
    print(f"{'='*50}")

    # (source column, section heading, width of the item-name field)
    sections = (
        ('complications_list', "\n🔴 TOP COMPLICATIONS:", 30),
        ('comorbidities_list', "\n🟡 TOP COMORBIDITIES:", 30),
        ('medications_list', "\n💊 TOP MEDICATIONS:", 35),
        ('symptoms_list', "\n🔵 TOP SYMPTOMS:", 30),
    )

    for column, heading, name_width in sections:
        # Flatten the per-patient lists into one pool of entries.
        pooled = [
            entry
            for cell in df[column]
            if isinstance(cell, list)
            for entry in cell
        ]
        if not pooled:
            continue

        ranking = pd.Series(pooled).value_counts().head(10)
        print(heading)
        for rank, (entry, occurrences) in enumerate(ranking.items(), 1):
            share = occurrences / len(df) * 100
            print(f"   {rank:2d}. {entry:<{name_width}}: {occurrences:3d} patients ({share:4.1f}%)")

def create_comprehensive_dashboard(df):
    """
    Build the 3x3 Plotly overview figure for the patient dataset.

    Panels, row by row:
        1. Histograms of age, current HbA1c and BMI (20 bins each).
        2. Pie charts of diabetes type and treatment response, then a bar
           chart of the number of patients per complications count.
        3. Scatter of current HbA1c against years since diagnosis, box plots
           of the four risk scores, and a bar chart of the number of patients
           per hospitalization count.

    Args:
        df: DataFrame of patient records providing the columns read below.

    Returns:
        The assembled figure. It is neither displayed nor saved here.
    """
    print(f"\n🎨 Creating comprehensive data visualization dashboard...")

    panel_titles = [
        'Age Distribution',
        'HbA1c Distribution', 
        'BMI Distribution',
        'Diabetes Type',
        'Treatment Response',
        'Complications Count',
        'HbA1c vs Years with Diabetes',
        'Risk Scores Distribution',
        'Healthcare Utilization',
    ]
    panel_kinds = (
        ("histogram", "histogram", "histogram"),
        ("pie", "pie", "bar"),
        ("scatter", "box", "bar"),
    )
    fig = make_subplots(
        rows=3,
        cols=3,
        subplot_titles=panel_titles,
        # One fresh dict per cell, exactly as in a hand-written specs grid.
        specs=[[{"type": kind} for kind in row_kinds] for row_kinds in panel_kinds],
    )

    # Row 1: one histogram per column
    histogram_panels = (('age', 'Age'), ('hba1c_current', 'HbA1c'), ('bmi', 'BMI'))
    for position, (column, label) in enumerate(histogram_panels, start=1):
        histogram = go.Histogram(x=df[column], name=label, nbinsx=20)
        fig.add_trace(histogram, row=1, col=position)

    # Row 2, columns 1-2: category shares as pie charts
    pie_panels = (
        ('diabetes_type', "Diabetes Type"),
        ('treatment_response', "Treatment Response"),
    )
    for position, (column, label) in enumerate(pie_panels, start=1):
        tallies = df[column].value_counts()
        pie = go.Pie(
            labels=tallies.index,
            values=tallies.values,
            name=label,
            textinfo='label+percent',
            textposition='inside',
            textfont_size=12,
            showlegend=True,
        )
        fig.add_trace(pie, row=2, col=position)

    # Row 2, column 3: patients per complications count, ordered by count value
    complication_tally = df['complications_count'].value_counts().sort_index()
    complications_bar = go.Bar(
        x=complication_tally.index,
        y=complication_tally.values,
        name="Complications",
    )
    fig.add_trace(complications_bar, row=2, col=3)

    # Row 3, column 1: HbA1c against disease duration
    hba1c_scatter = go.Scatter(
        x=df['years_since_diagnosis'],
        y=df['hba1c_current'],
        mode='markers',
        marker=dict(size=4, opacity=0.6),
        name="HbA1c vs Years",
    )
    fig.add_trace(hba1c_scatter, row=3, col=1)

    # Row 3, column 2: one box per risk score, all sharing the same panel
    risk_series = {
        'CV Risk': df['cv_risk'],
        'Kidney Risk': df['kidney_risk'],
        'Retinopathy Risk': df['retinopathy_risk'],
        'Neuropathy Risk': df['neuropathy_risk'],
    }
    for label, scores in risk_series.items():
        fig.add_trace(go.Box(y=scores, name=label, boxpoints='outliers'), row=3, col=2)

    # Row 3, column 3: patients per hospitalization count
    hospitalization_tally = df['hospitalizations'].value_counts().sort_index()
    hospitalizations_bar = go.Bar(
        x=hospitalization_tally.index,
        y=hospitalization_tally.values,
        name="Hospitalizations",
    )
    fig.add_trace(hospitalizations_bar, row=3, col=3)

    # Legend is anchored just outside the right edge of the plotting area.
    legend_placement = dict(
        orientation="v",
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=1.01,
    )
    fig.update_layout(
        title_text="Diabetes Patient Dataset Comprehensive Analysis",
        showlegend=True,
        height=1000,
        paper_bgcolor='white',
        legend=legend_placement,
    )

    return fig

def generate_summary_statistics(df):
    """
    Print summary statistics by diabetes type and by glycemic-control band.

    Three sections are written to stdout:

    1. Per diabetes type: mean ± std of age, HbA1c and BMI, mean complications
       count, mean CV risk, and the share of patients with HbA1c below 7.
       Rows whose ``diabetes_type`` is missing are left out of this section.
    2. Per HbA1c band (<7, 7-8, 8-9, >=9): mean complications,
       hospitalizations, CV risk and adherence. Empty bands are not printed.
    3. Pairwise correlations (``Series.corr``) between selected columns.
       Undefined (NaN) correlations are not printed.

    Args:
        df: DataFrame of patient records providing the columns read below.

    Returns:
        None.
    """
    print(f"\n📊 COMPREHENSIVE SUMMARY STATISTICS:")
    print(f"{'='*60}")

    # Clinical outcomes by diabetes type
    print(f"\n🔬 CLINICAL OUTCOMES BY DIABETES TYPE:")
    # Missing types are dropped first: a missing value never compares equal to
    # itself, so it would select an empty subset and print only "nan" values.
    for dtype in df['diabetes_type'].dropna().unique():
        subset = df[df['diabetes_type'] == dtype]
        print(f"\n{dtype} (n={len(subset)}):")
        print(f"   • Age: {subset['age'].mean():.1f} ± {subset['age'].std():.1f} years")
        print(f"   • HbA1c: {subset['hba1c_current'].mean():.1f} ± {subset['hba1c_current'].std():.1f}%")
        print(f"   • BMI: {subset['bmi'].mean():.1f} ± {subset['bmi'].std():.1f}")
        print(f"   • Complications: {subset['complications_count'].mean():.1f}")
        print(f"   • CV Risk: {subset['cv_risk'].mean():.3f}")
        print(f"   • Well controlled (<7%): {(subset['hba1c_current'] < 7).mean()*100:.1f}%")

    # Outcomes by control status (bands are half-open: lower bound inclusive)
    print(f"\n📈 OUTCOMES BY GLYCEMIC CONTROL:")
    hba1c = df['hba1c_current']
    control_groups = {
        'Well Controlled': df[hba1c < 7],
        'Moderately Controlled': df[(hba1c >= 7) & (hba1c < 8)],
        'Poorly Controlled': df[(hba1c >= 8) & (hba1c < 9)],
        'Very Poor Control': df[hba1c >= 9]
    }

    for group_name, group_df in control_groups.items():
        if len(group_df) > 0:
            print(f"\n{group_name} (n={len(group_df)}):")
            print(f"   • Complications: {group_df['complications_count'].mean():.1f}")
            print(f"   • Hospitalizations: {group_df['hospitalizations'].mean():.1f}")
            print(f"   • CV Risk: {group_df['cv_risk'].mean():.3f}")
            print(f"   • Adherence: {group_df['adherence'].mean():.1f}%")

    # Correlation analysis
    print(f"\n🔗 KEY CORRELATIONS:")
    correlations = [
        ('HbA1c vs Complications', df['hba1c_current'].corr(df['complications_count'])),
        ('HbA1c vs Hospitalizations', df['hba1c_current'].corr(df['hospitalizations'])),
        ('HbA1c vs CV Risk', df['hba1c_current'].corr(df['cv_risk'])),
        ('Adherence vs HbA1c', df['adherence'].corr(df['hba1c_current'])),
        ('Age vs Complications', df['age'].corr(df['complications_count'])),
        ('BMI vs HbA1c', df['bmi'].corr(df['hba1c_current'])),
        ('Years DM vs Complications', df['years_since_diagnosis'].corr(df['complications_count']))
    ]

    for desc, corr in correlations:
        # corr is NaN when a correlation is undefined (e.g. a constant column).
        if not pd.isna(corr):
            print(f"   • {desc}: {corr:+.3f}")

# ============================================
# MAIN ANALYSIS EXECUTION
# ============================================

def run_complete_analysis():
    """
    Run the whole patient-data pipeline, from fetch to saved dashboard.

    Steps, in order: fetch the dataset (``limit=1000``), print the overview,
    distribution, common-item and summary reports, build and show the
    dashboard figure, write it to ``diabetes_patient_analysis_dashboard.html``
    and print a closing list of key findings.

    Returns:
        The dataset returned by ``fetch_complete_patient_data``, or ``None``
        when that call returned ``None`` (nothing else is run in that case).
    """
    rule = "=" * 80
    print("🚀 Starting Comprehensive Diabetes Patient Data Analysis")
    print(rule)

    # 1. Fetch data; without it there is nothing to analyze
    patients = fetch_complete_patient_data(limit=1000)
    if patients is None:
        print("❌ Could not fetch data. Exiting analysis.")
        return None

    # 2-5. Text reports, each printing its own section
    text_reports = (
        generate_dataset_overview,
        analyze_distributions,
        analyze_most_common_items,
        generate_summary_statistics,
    )
    for report in text_reports:
        report(patients)

    # 6. Build and display the dashboard
    dashboard = create_comprehensive_dashboard(patients)
    dashboard.show()

    # 7. Persist the dashboard as a standalone HTML file
    output_file = "diabetes_patient_analysis_dashboard.html"
    dashboard.write_html(output_file)
    print(f"\n💾 Saved analysis dashboard: {output_file}")

    # 8. Closing summary
    print("\n" + rule)
    print("✅ ANALYSIS COMPLETE")
    print(rule)
    print(f"📊 Dataset: {len(patients):,} diabetes patients analyzed")
    print(f"🎯 Key Findings:")
    print(f"   • Average HbA1c: {patients['hba1c_current'].mean():.1f}%")
    print(f"   • Well controlled patients: {(patients['hba1c_current'] < 7).mean()*100:.1f}%")
    print(f"   • Patients with complications: {(patients['complications_count'] > 0).mean()*100:.1f}%")
    print(f"   • Average CV risk: {patients['cv_risk'].mean():.3f}")
    print(f"   • Type 2 diabetes prevalence: {(patients['diabetes_type'] == 'Type 2').mean()*100:.1f}%")

    return patients

if __name__ == "__main__":
    # Script entry point: run the full pipeline and keep the returned dataset
    # in a module-level name (None when the data could not be fetched).
    patient_df = run_complete_analysis()

🚀 Starting Comprehensive Diabetes Patient Data Analysis
🔍 Fetching complete patient dataset from Weaviate...
✅ Loaded 500 patients for analysis

📊 DIABETES PATIENT DATASET OVERVIEW

📈 DATASET SIZE & STRUCTURE:
   • Total Patients: 500
   • Total Features: 43
   • Data Quality Score: 0.950
   • Missing Data: 0 values (0.00%)

👥 DEMOGRAPHICS:
   • Age Range: 18-95 years (μ=54.1, σ=15.3)
   • Gender Distribution:
     - Female: 264 (52.8%)
     - Male: 236 (47.2%)
   • Diabetes Type Distribution:
     - Type 2: 440 (88.0%)
     - Type 1: 60 (12.0%)

🔬 CLINICAL CHARACTERISTICS:
   • Disease Duration: 5.1 ± 4.8 years
   • HbA1c Current: 8.1 ± 1.9%
   • HbA1c Change: -1.4 ± 1.0%
   • BMI: 29.6 ± 5.1 kg/m²
   • Blood Pressure: 131/80 mmHg

🎯 GLYCEMIC CONTROL STATUS:
   • Well Controlled (<7%): 154 (30.8%)
   • Moderately Controlled (7-8%): 88 (17.6%)
   • Poorly Controlled (8-9%): 91 (18.2%)
   • Very Poor Control (≥9%): 167 (33.4%)

⚕️ DISEASE BURDEN:
   • Average Complications: 1.2
   • Pat


💾 Saved analysis dashboard: diabetes_patient_analysis_dashboard.html

✅ ANALYSIS COMPLETE
📊 Dataset: 500 diabetes patients analyzed
🎯 Key Findings:
   • Average HbA1c: 8.1%
   • Well controlled patients: 30.8%
   • Patients with complications: 64.0%
   • Average CV risk: 0.375
   • Type 2 diabetes prevalence: 88.0%
