# Exploratory Data Analysis for Patient Readmission (with Interactive Widgets)

This notebook performs EDA for patient readmission and includes an interactive seaborn correlation explorer using ipywidgets. It handles missing data (replaces `?` with NaN), produces summary statistics and visualizations, and provides a simple baseline logistic regression AUC estimate. Use the widget UI to interactively pick columns and generate correlation visualizations.

Notes:
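- Update DATA_PATH in the configuration cell to point to your CSV file. The default is an absolute path to `diabetic_data.csv` on the original author's machine, so you will almost certainly need to change it.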
- Install required packages if necessary: pandas, numpy, matplotlib, seaborn, scipy, scikit-learn, ipywidgets.
- Plots and CSVs are saved to the directory configured as `PLOT_DIR` (default `../eda_plots`, i.e. one level above the working directory) if you enable saving in the UI.


In [15]:
# Uncomment to install dependencies in the notebook environment (run once if needed)
# !pip install pandas numpy matplotlib seaborn scikit-learn scipy ipywidgets

import os
import sys
import textwrap
from typing import Tuple, List, Optional

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import mutual_info_classif

import ipywidgets as widgets
from IPython.display import display, clear_output

sns.set(style="whitegrid")


In [16]:
# Configuration - update DATA_PATH to your CSV file path if needed
# NOTE(review): the default below is an absolute path on one specific Windows
# machine; it must be changed before the notebook is run anywhere else.
DATA_PATH = "C:/Users/reinacherc/Downloads/diabetes+130-us+hospitals+for+years+1999-2008 (1)/diabetic_data.csv"  # change to your path
MAPPING_PATH = None  # optional mapping file
# Output directory for saved plots and CSVs. The path is relative, so it is
# resolved against the current working directory (one level above it).
PLOT_DIR = "../eda_plots"
# Seed passed as random_state to the sampling, CV and model calls below
RANDOM_STATE = 42
# Create the output directory up front so later save calls do not hit a missing folder
os.makedirs(PLOT_DIR, exist_ok=True)


In [17]:
def load_data(path: str) -> pd.DataFrame:
    """Read the CSV at ``path`` into a DataFrame, treating '?' as missing.

    Column names are stripped of surrounding whitespace and a one-line load
    summary is printed.

    Raises:
        FileNotFoundError: if ``path`` does not exist.
    """
    # Fail fast with a readable message rather than a pandas traceback
    if not os.path.exists(path):
        raise FileNotFoundError(f"Data file not found: {path}")
    # '?' cells become NaN at parse time; the replace() below is a second pass
    # for any exact '?' value that is still present afterwards
    frame = pd.read_csv(path, low_memory=False, na_values=['?'])
    frame.columns = [name.strip() for name in frame.columns]
    frame = frame.replace('?', np.nan)
    n_rows, n_cols = len(frame), len(frame.columns)
    print(f"Loaded {n_rows} rows and {n_cols} columns. Replaced all '?' values with NaN.")
    return frame

def brief_info(df: pd.DataFrame):
    """Print a quick overview: shape, dtype tally, transposed head and memory use."""
    dtype_tally = df.dtypes.value_counts(dropna=False)
    print("Data shape:", df.shape)
    print("\nColumns and dtypes:")
    print(dtype_tally.to_string())
    print("\nFirst 5 rows:")
    # Transposed so that wide frames list their columns vertically
    display(df.head().T)
    total_bytes = df.memory_usage(deep=True).sum()
    print("\nMemory usage (MB): {:.2f}".format(total_bytes / 1024**2))

def missing_summary(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column table of missing count/percent, unique count and dtype.

    Rows are ordered from the most to the least missing column. The unique
    count treats NaN as a value of its own.
    """
    null_counts = df.isnull().sum()
    table = pd.DataFrame({
        "missing_count": null_counts,
        "missing_pct": null_counts / len(df) * 100,
        "unique_count": df.nunique(dropna=False),
        "dtype": df.dtypes.astype(str),
    })
    return table.sort_values("missing_pct", ascending=False)

def plot_missingness_matrix(df: pd.DataFrame, fname: str = None):
    """Draw a columns-by-rows heatmap of missing cells, optionally saving it.

    Frames with more than 5000 rows are down-sampled to 10% of their rows
    (seeded with RANDOM_STATE); smaller frames are plotted in full. The
    figure is written to ``fname`` when one is given.
    """
    if len(df) > 5000:
        fraction = 0.1
    else:
        fraction = 1.0
    subset = df.sample(frac=fraction, random_state=RANDOM_STATE)
    # Transpose so each dataset column becomes one heatmap row
    null_mask = subset.isnull().T
    fig_height = max(4, len(subset.columns) / 3)
    plt.figure(figsize=(14, fig_height))
    sns.heatmap(null_mask, cbar=False, cmap=["#2ecc71", "#e74c3c"])
    plt.xlabel("samples (possibly sampled)")
    plt.ylabel("columns")
    plt.title("Missingness matrix (red = missing)")
    if fname:
        plt.tight_layout()
        plt.savefig(fname, dpi=150)
    plt.show()
    plt.close()

def plot_missingness_by_target(df: pd.DataFrame, target_col: str, fname: str = None):
    """Plot the fraction of missing values per column, split by target class.

    Only the 20 columns with the highest overall missing fraction are shown.
    The function does nothing when ``target_col`` is not a column of ``df``,
    when there are no other columns, or when no row has a target value.

    Args:
        df: data to inspect.
        target_col: column whose distinct values define the groups.
        fname: optional path; when given, the figure is saved there.
    """
    if target_col not in df.columns:
        return
    col_miss = df.isnull().mean().sort_values(ascending=False)
    # Leave the target out of the plotted columns: it is the grouping key, and
    # selecting it a second time would create duplicate column labels.
    top_cols = [c for c in col_miss.index if c != target_col][:20]
    if not top_cols:
        return
    # Mean of the boolean null mask per target class == fraction missing
    miss_by_target = df[top_cols].isnull().groupby(df[target_col]).mean().T
    if miss_by_target.empty:
        return
    # Draw on an explicit Axes: DataFrame.plot opens its own figure when no
    # ``ax`` is passed, which would leave the sized figure blank.
    fig, ax = plt.subplots(figsize=(12, max(4, len(top_cols) * 0.4)))
    miss_by_target.plot.barh(stacked=False, ax=ax)
    ax.set_xlabel("Fraction missing")
    ax.set_title("Missingness fraction by target class")
    ax.legend(title=target_col)
    if fname:
        fig.tight_layout()
        fig.savefig(fname, dpi=150)
    plt.show()
    plt.close(fig)

def summarize_numeric(df: pd.DataFrame, numeric_cols: List[str], fname: str = None):
    """Display a percentile summary and a grid of histograms for ``numeric_cols``.

    Returns immediately when ``numeric_cols`` is empty. The histogram grid has
    at most four panels per row and is saved to ``fname`` when one is given.
    """
    if not numeric_cols:
        return
    wanted = ["count", "mean", "std", "min", "1%", "5%", "25%", "50%", "75%", "95%", "99%", "max"]
    stats_table = df[numeric_cols].describe(percentiles=[.01, .05, .25, .5, .75, .95, .99]).T
    print("\nNumeric features summary (selected percentiles):")
    # Show only the statistics describe() actually produced, in a fixed order
    display(stats_table[[key for key in wanted if key in stats_table.columns]])

    n_features = len(numeric_cols)
    grid_cols = min(4, n_features)
    grid_rows = int(np.ceil(n_features / grid_cols))
    plt.figure(figsize=(4 * grid_cols, 3 * grid_rows))
    for position, column in enumerate(numeric_cols, start=1):
        plt.subplot(grid_rows, grid_cols, position)
        sns.histplot(df[column].dropna(), kde=False, bins=30)
        plt.title(column)
    plt.tight_layout()
    if fname:
        plt.savefig(fname, dpi=150)
    plt.show()
    plt.close()

def summarize_categorical(df: pd.DataFrame, cat_cols: List[str], top_k: int = 10, fname: str = None):
    """Print the most frequent values per categorical column and plot count bars.

    For every column in ``cat_cols`` the ``top_k`` most frequent values (NaN
    included) are printed. Count plots are then drawn for the six columns
    with the most distinct values, each limited to its 20 most frequent
    categories. Returns immediately when ``cat_cols`` is empty.
    """
    if not cat_cols:
        return
    print("\nTop categories per categorical feature (showing up to top_k):")
    for column in cat_cols:
        series = df[column]
        top_counts = series.value_counts(dropna=False).head(top_k)
        print(f"\n{column} ({series.dtype}) - unique: {series.nunique(dropna=True)}")
        print(top_counts.to_string())

    cardinality = df[cat_cols].nunique().sort_values(ascending=False)
    widest = cardinality.head(6).index.tolist()
    n_panels = len(widest)
    plt.figure(figsize=(14, 3 * n_panels))
    for panel, column in enumerate(widest, start=1):
        plt.subplot(n_panels, 1, panel)
        frequent_first = df[column].value_counts().index[:20]
        sns.countplot(y=column, data=df, order=frequent_first)
        plt.title(column)
    plt.tight_layout()
    if fname:
        plt.savefig(fname, dpi=150)
    plt.show()
    plt.close()

def target_engineering(df: pd.DataFrame, target_col: str = "readmitted") -> pd.DataFrame:
    """Return a copy of ``df`` with a binary ``readmit_30`` target added.

    ``readmit_30`` is 1 where the whitespace-stripped text of ``target_col``
    equals '<30' and 0 for every other value (missing values included).
    The untouched source values are also stored in ``readmit_original``.
    When ``target_col`` is absent a warning is printed and ``df`` itself is
    returned unchanged.
    """
    if target_col not in df.columns:
        print(f"Warning: target column {target_col} not in DataFrame. Skipping target engineering.")
        return df

    # Copy first so the caller's frame never gains the new columns
    result = df.copy()
    normalized = result[target_col].astype(str).str.strip()
    result["readmit_30"] = np.where(normalized == "<30", 1, 0)
    result["readmit_original"] = result[target_col]

    # Show both distributions so class balance is visible at a glance
    for heading, column in (
        ("\nTarget value counts (original):", target_col),
        ("\nBinary target value counts (readmit_30):", "readmit_30"),
    ):
        print(heading)
        print(result[column].value_counts(dropna=False))
    return result

def quick_target_plots(df: pd.DataFrame, target: str = "readmit_30", fname: str = None):
    """Draw a count plot of the ``target`` column; no-op if the column is absent.

    The figure is saved to ``fname`` when one is given.
    """
    if target in df.columns:
        plt.figure(figsize=(6, 4))
        sns.countplot(x=target, data=df)
        plt.title("Target distribution (binary)")
        if fname:
            plt.tight_layout()
            plt.savefig(fname, dpi=150)
        plt.show()
        plt.close()

def correlation_with_target(df: pd.DataFrame, numeric_cols: List[str], target: str) -> pd.Series:
    """Compute the Pearson correlation of each numeric feature with the target.

    Args:
        df: data containing the features and the target.
        numeric_cols: feature columns to correlate with ``target``.
        target: name of the target column, interpreted numerically.

    Returns:
        A float Series indexed by feature name, ordered by absolute
        correlation (largest first, NaN last). A feature gets NaN when it has
        fewer than 10 non-null values or when its correlation cannot be
        computed. The Series is empty when ``target`` is not in ``df`` or
        ``numeric_cols`` is empty.
    """
    if target not in df.columns:
        return pd.Series(dtype=float)
    corrs = {}
    for c in numeric_cols:
        try:
            if df[c].dropna().shape[0] < 10:
                corrs[c] = np.nan
                continue
            # Off-diagonal entry of the 2x2 matrix, over rows complete in both columns
            corrs[c] = df[[c, target]].dropna().corr().iloc[0, 1]
        except Exception:
            # Best effort: one unusable column must not abort the whole ranking
            corrs[c] = np.nan
    # Explicit float dtype keeps the empty result consistent with the
    # missing-target branch instead of falling back to object dtype.
    ranked = pd.Series(corrs, dtype=float)
    return ranked.sort_values(key=lambda s: s.abs(), ascending=False)

def mutual_information_rank(df: pd.DataFrame, features: List[str], target: str, discrete_threshold: int = 20) -> pd.Series:
    """Estimate mutual information between features and target.

    Non-numeric columns, and numeric columns with at most
    ``discrete_threshold`` distinct values, are encoded as integer category
    codes and flagged as discrete for the estimator. Missing feature values
    are replaced by the sentinel -999 and missing target values by 0.

    Args:
        df: data containing the features and the target.
        features: feature columns to score.
        target: name of the target column.
        discrete_threshold: maximum distinct-value count at which a numeric
            column is still treated as discrete.

    Returns:
        A Series of MI estimates indexed by feature, largest first. It is
        empty when ``target`` is not in ``df`` or the estimation fails.
    """
    if target not in df.columns:
        return pd.Series(dtype=float)

    X = df[features].copy()
    y = df[target].fillna(0)
    discrete_mask = []
    for col in X.columns:
        is_discrete = (
            not pd.api.types.is_numeric_dtype(X[col])
            or X[col].nunique(dropna=True) <= discrete_threshold
        )
        if is_discrete:
            # Code -1 marks a missing category; route it through NaN so it
            # receives the same sentinel as every other missing value below.
            X[col] = X[col].astype("category").cat.codes.replace({-1: np.nan})
        discrete_mask.append(is_discrete)
    X = X.fillna(-999)
    try:
        # discrete_features="auto" means "all continuous" for a dense X, so
        # the category codes above would be scored as continuous values.
        # An explicit boolean mask marks them as discrete.
        mi = mutual_info_classif(
            X,
            y,
            discrete_features=np.array(discrete_mask, dtype=bool),
            random_state=RANDOM_STATE,
        )
        return pd.Series(mi, index=features).sort_values(ascending=False)
    except Exception as e:
        print("Mutual information calculation failed:", e)
        return pd.Series(dtype=float)

def baseline_logistic_cv(df: pd.DataFrame, target: str, max_features: int = 50) -> Tuple[float, pd.DataFrame]:
    """Fit a simple baseline logistic regression and report cross-validated AUC.

    Candidate features are all columns except the target and its source
    columns, restricted to columns with more than one distinct value and
    fewer distinct values than 90% of the row count. The top ``max_features``
    by mutual information feed a pipeline of median imputation + scaling
    (numeric), constant imputation + one-hot encoding (categorical) and
    logistic regression, scored with 5-fold stratified ROC AUC.

    Args:
        df: data containing the features and the target.
        target: name of the binary target column.
        max_features: number of top-MI features to keep.

    Returns:
        ``(mean_auc, mi)``. ``mi`` is the mutual-information ranking as
        returned by ``mutual_information_rank``. When the target is missing
        the result is ``(0.0, empty DataFrame)``; when training fails the
        mean AUC is NaN.
    """
    if target not in df.columns:
        print("Target not present; skipping baseline model.")
        return 0.0, pd.DataFrame()

    # "readmitted" is the raw label the binary target is derived from; leaving
    # it (or any identical copy of it) among the features leaks the answer.
    exclude = {target, "readmit_original", "readmitted"}
    if "readmit_original" in df.columns:
        original = df["readmit_original"]
        exclude.update(c for c in df.columns if df[c].equals(original))

    n_rows = len(df)
    candidates = []
    for c in df.columns:
        if c in exclude:
            continue
        n_unique = df[c].nunique(dropna=True)
        # Drop constants and near-unique (ID-like) columns
        if 1 < n_unique < n_rows * 0.9:
            candidates.append(c)

    mi = mutual_information_rank(df, candidates, target)
    top_features = mi.dropna().head(max_features).index.tolist()
    print(f"\nTop {len(top_features)} features by mutual information:")
    print(mi.head(20))

    X = df[top_features]
    y = df[target]

    numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_cols = [c for c in X.columns if c not in numeric_cols]

    numeric_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline(steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="__MISSING__")),
        # The `sparse` keyword was renamed to `sparse_output` and later
        # removed, so passing it breaks on current scikit-learn. The default
        # sparse output works with LogisticRegression on every version.
        ("ohe", OneHotEncoder(handle_unknown="ignore"))
    ])
    preprocessor = ColumnTransformer(transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols)
    ], remainder="drop")

    clf = Pipeline(steps=[
        ("pre", preprocessor),
        ("logreg", LogisticRegression(max_iter=1000, random_state=RANDOM_STATE))
    ])

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    try:
        scores = cross_val_score(clf, X, y, cv=cv, scoring="roc_auc", n_jobs=-1)
        print("\nBaseline logistic regression AUC scores (5-fold):", scores)
        print("Mean AUC:", np.nanmean(scores))
    except Exception as e:
        # Keep the EDA flow alive; the failure shows up as a NaN score
        print("Baseline model training failed:", e)
        scores = np.array([np.nan])
    return np.nanmean(scores), mi

# ---------- Seaborn correlation helper functions (for interactive exploration) ----------
# Colormap name used as the default `cmap` of plot_corr_heatmap and plot_clustermap
DEFAULT_CMAP = "vlag"

def compute_numeric_columns(df: pd.DataFrame) -> List[str]:
    """Return the names of all numeric-dtype columns, in frame order."""
    numeric_view = df.select_dtypes(include=[np.number])
    return list(numeric_view.columns)

def compute_categorical_columns(df: pd.DataFrame) -> List[str]:
    """Return the names of all object- or category-dtype columns, in frame order."""
    categorical_view = df.select_dtypes(include=["object", "category"])
    return list(categorical_view.columns)

def corr_matrix(df: pd.DataFrame, cols: Optional[List[str]] = None, method: str = "pearson") -> pd.DataFrame:
    """Return the pairwise correlation matrix of ``cols`` using ``method``.

    When ``cols`` is None every numeric column of ``df`` is used.
    """
    selected = compute_numeric_columns(df) if cols is None else cols
    return df[selected].corr(method=method)

def plot_corr_heatmap(
    corr: pd.DataFrame,
    title: str = "Correlation heatmap",
    annot: bool = True,
    fmt: str = ".2f",
    vmax: float = 1.0,
    vmin: float = -1.0,
    cmap: str = DEFAULT_CMAP,
    fname: Optional[str] = None,
    figsize: Tuple[int, int] = (10, 8),
    mask_upper: bool = True,
):
    """Render ``corr`` as a seaborn heatmap centred on zero.

    With ``mask_upper`` the upper triangle, diagonal included, is hidden.
    The figure is saved to ``fname`` when one is given.
    """
    upper_mask = None
    if mask_upper:
        # np.triu keeps the diagonal, so the self-correlations are hidden too
        upper_mask = np.triu(np.ones_like(corr, dtype=bool))
    heatmap_options = dict(
        mask=upper_mask,
        cmap=cmap,
        vmax=vmax,
        vmin=vmin,
        center=0,
        square=False,
        linewidths=0.5,
        annot=annot,
        fmt=fmt,
        cbar_kws={"shrink": 0.6},
    )
    plt.figure(figsize=figsize)
    sns.heatmap(corr, **heatmap_options)
    plt.title(title)
    plt.tight_layout()
    if fname:
        plt.savefig(fname, dpi=150)
    plt.show()
    plt.close()

def plot_clustermap(
    corr: pd.DataFrame,
    title: str = "Correlation clustermap (hierarchical)",
    cmap: str = DEFAULT_CMAP,
    fname: Optional[str] = None,
    figsize: Tuple[int, int] = (10, 10),
):
    """Draw a seaborn clustermap of ``corr``.

    Any failure while building, saving or showing the plot is reported with
    a printed message rather than raised. The figure is saved to ``fname``
    when one is given.
    """
    clustermap_options = dict(
        cmap=cmap,
        center=0,
        figsize=figsize,
        linewidths=0.5,
        cbar_kws={"shrink": 0.6},
    )
    try:
        grid = sns.clustermap(corr, **clustermap_options)
        # y slightly above 1 places the title over the clustermap grid
        grid.fig.suptitle(title, y=1.02)
        if fname:
            grid.savefig(fname, dpi=150)
        plt.show()
    except Exception as e:
        print("Clustermap failed:", e)

def pairplot_sample(
    df: pd.DataFrame,
    cols: List[str],
    hue: Optional[str] = None,
    sample_frac: float = 0.2,
    sample_n: Optional[int] = None,
    diag_kind: str = "hist",
    kind: str = "scatter",
    fname: Optional[str] = None,
    plot_kws: dict = None,
):
    """Draw a seaborn pairplot of ``cols`` on a random sample of complete rows.

    Args:
        df: source data.
        cols: columns to plot against each other.
        hue: optional colouring column; ignored when it is not in ``df``.
        sample_frac: fraction of complete rows to sample (clamped to [0, 1]);
            used only when ``sample_n`` is None.
        sample_n: number of rows to sample, capped at the number of complete
            rows. Takes precedence over ``sample_frac``.
        diag_kind: forwarded to ``sns.pairplot``.
        kind: forwarded to ``sns.pairplot``.
        fname: optional path; when given, the figure is saved there.
        plot_kws: forwarded to ``sns.pairplot``; when empty or None, small
            semi-transparent markers are used for ``kind="scatter"``.

    Plotting failures are reported with a printed message, not raised.
    """
    # A hue that is not a column cannot be used for colouring
    if not hue or hue not in df.columns:
        hue = None
    # Select the hue column once, even when it is also one of the plotted columns
    selected = list(cols) + ([hue] if hue and hue not in cols else [])
    complete = df[selected].dropna()
    if sample_n is not None:
        # Cap by the rows left after dropna(): the full frame can be longer,
        # and sampling more rows than exist raises.
        data = complete.sample(n=min(sample_n, len(complete)), random_state=RANDOM_STATE)
    else:
        frac = min(max(sample_frac, 0.0), 1.0)
        data = complete.sample(frac=frac, random_state=RANDOM_STATE)
    if not plot_kws:
        # "s" is a scatter-only keyword; other kinds get no defaults
        plot_kws = {"s": 15, "alpha": 0.6} if kind == "scatter" else {}
    try:
        pp = sns.pairplot(data, vars=cols, hue=hue, diag_kind=diag_kind, kind=kind, plot_kws=plot_kws)
        if fname:
            pp.fig.savefig(fname, dpi=150)

        plt.show()
    except Exception as e:
        print("Pairplot failed (maybe too many features or memory constraints):", e)
def jointplot_pair(df: pd.DataFrame, x: str, y: str, hue: Optional[str] = None, kind: str = "reg", fname: Optional[str] = None):
    """Draw a seaborn jointplot of ``x`` against ``y`` on rows complete in both.

    If the jointplot fails (for example because the ``hue``/``kind``
    combination is rejected), a plain scatter plot is drawn instead.

    Args:
        df: source data.
        x: column for the horizontal axis.
        y: column for the vertical axis.
        hue: optional colouring column; ignored when it is not in ``df``.
        kind: forwarded to ``sns.jointplot``.
        fname: optional path; when given, the figure is saved there.
    """
    # Drop a hue that is not a column so that neither the jointplot nor the
    # fallback scatter is asked to colour by a non-existent field.
    if not hue or hue not in df.columns:
        hue = None
    columns = [x, y] + ([hue] if hue and hue not in (x, y) else [])
    data = df[columns].dropna()
    try:
        jp = sns.jointplot(data=data, x=x, y=y, hue=hue, kind=kind, height=7, marginal_kws=dict(bins=30))
        if fname:
            jp.fig.savefig(fname, dpi=150)
        plt.show()
    except Exception as e:
        print("jointplot failed with hue or kind; falling back to scatter:", e)
        sns.scatterplot(data=data, x=x, y=y, hue=hue, alpha=0.6)
        plt.title(f"{x} vs {y}")
        if fname:
            plt.savefig(fname, dpi=150)
        plt.show()
        plt.close()

def categorical_correlation_heatmap(df: pd.DataFrame, cat_cols: List[str], target: Optional[str] = None, fname: Optional[str] = None, top_n: int = 30, normalize: bool = True):
    """Plot one figure per categorical column, against the target when available.

    With a ``target`` that exists in ``df``, each column is cross-tabulated
    against it and drawn as a heatmap. Otherwise a bar chart of the column's
    ``top_n`` most frequent values is drawn.

    Args:
        df: source data.
        cat_cols: categorical columns to plot.
        target: optional target column for the cross-tabulation.
        fname: optional base path; each figure is saved as
            ``<base>_<column><ext>`` ('.png' when ``fname`` has no extension).
        top_n: number of categories shown in the bar-chart branch.
        normalize: when True (default) heatmap rows show per-category
            fractions; when False they show raw counts.
    """
    has_target = bool(target) and target in df.columns
    base, ext = os.path.splitext(fname) if fname else ("", "")
    for c in cat_cols:
        if has_target:
            if normalize:
                ct = pd.crosstab(df[c], df[target], normalize="index").fillna(0)
                title = f"Normalized distribution of {c} by {target}"
                fmt, cbar_label = ".2f", "fraction"
            else:
                ct = pd.crosstab(df[c], df[target])
                title = f"Counts of {c} by {target}"
                fmt, cbar_label = "d", "count"
            # Height grows with the number of categories so rows stay readable
            plt.figure(figsize=(8, max(3, ct.shape[0] * 0.25)))
            sns.heatmap(ct, cmap="Blues", annot=True, fmt=fmt, cbar_kws={"label": cbar_label})
            plt.title(title)
            plt.ylabel(c)
            plt.xlabel(target)
        else:
            counts = df[c].value_counts().head(top_n)
            plt.figure(figsize=(8, 6))
            sns.barplot(y=counts.index, x=counts.values, palette="muted")
            plt.title(f"Top categories for {c}")
        if fname:
            plt.tight_layout()
            plt.savefig(f"{base}_{c}{ext or '.png'}", dpi=150)
        plt.show()
        plt.close()

def corr_with_target_table(df: pd.DataFrame, target: str = "readmit_30", methods: Optional[List[str]] = None) -> pd.DataFrame:
    """Tabulate the correlation of every numeric feature with ``target``.

    Args:
        df: source data.
        target: name of the target column.
        methods: correlation methods to compute, one output column each;
            defaults to ``["pearson", "spearman"]``.

    Returns:
        A DataFrame indexed by feature with one column per method, ordered by
        absolute Pearson correlation (or by the first method when Pearson was
        not requested), largest first.

    Raises:
        KeyError: if ``target`` is not a column of ``df``.
    """
    if target not in df.columns:
        raise KeyError(f"Target {target} not in dataframe columns.")
    if methods is None:
        methods = ["pearson", "spearman"]
    # A numeric target is already among the numeric columns. Appending it a
    # second time would duplicate the label and break the selection below.
    features = [c for c in compute_numeric_columns(df) if c != target]
    results = {}
    for m in methods:
        mat = corr_matrix(df, cols=features + [target], method=m)
        results[m] = mat[target].drop(index=target)
    result_df = pd.DataFrame(results)
    if result_df.shape[1] == 0:
        # No methods requested: there is nothing to sort by
        return result_df
    sort_column = "pearson" if "pearson" in result_df.columns else result_df.columns[0]
    ordering = result_df[sort_column].abs().sort_values(ascending=False).index
    return result_df.reindex(ordering)

# ---------- run_eda function to run the full EDA flow (non-interactive) ----------
def run_eda(data_path: str = DATA_PATH, mapping_path: str = MAPPING_PATH):
    """Run the non-interactive EDA flow and return the loaded DataFrame.

    The visible steps are: load the CSV, add the binary ``readmit_30``
    target, save numeric-vs-target correlations, then try to compute and save
    feature/target associations. CSV outputs are written under ``PLOT_DIR``.

    NOTE(review): parts of this function are elided ("...existing code...").
    ``mapping_path`` is not used in the visible code, and ``numeric_cols`` is
    read below without being assigned in the visible code. Confirm that the
    elided sections define it, otherwise that line raises NameError.
    """
    # Top-level coordinator: load data, summarize, run visualizations and baseline modeling
    print("Loading data from:", data_path)
    df = load_data(data_path)

    # ...existing code...

    # Create binary target used across EDA and modeling
    df = target_engineering(df, target_col="readmitted")

    # ...existing code...

    # Compute numeric correlations with the binary target (if present)
    # NOTE(review): `numeric_cols` is presumably set in an elided section above - confirm.
    if "readmit_30" in df.columns and numeric_cols:
        corrs = correlation_with_target(df, numeric_cols, "readmit_30")
        print("\nNumeric correlations with target (top 20 by absolute):")
        display(corrs.dropna().head(20))
        corrs.to_csv(os.path.join(PLOT_DIR, "numeric_target_correlations.csv"))

    # ...existing code...

    # Compute and save associations with the target using our helper (if available)
    if "readmit_30" in df.columns:
        try:
            # compute_feature_target_associations returns numeric (point-biserial) and categorical (Cramér's V) associations
            # NOTE(review): this helper is not defined in the visible source. If it is
            # missing at runtime, the NameError is caught and printed by the except below.
            assoc_df = compute_feature_target_associations(df, target="readmit_30")
            out_csv = os.path.join(PLOT_DIR, f"associations_with_readmit_30.csv")
            assoc_df.to_csv(out_csv)
            print(f"\nTop associations with readmit_30:")
            display(assoc_df.head(30))
            print(f"Saved associations CSV to: {out_csv}")
        except Exception as e:
            # Keep the notebook robust: surface the error but continue
            print("Failed to compute associations with target:", e)

    print(f"\nSaved plots & CSVs to: {os.path.abspath(PLOT_DIR)}")
    return df


In [18]:
# ---------- Interactive Seaborn Correlation Explorer UI ----------
# Output area used by all button callbacks so results appear in one place
out = widgets.Output()

# UI controls
# Path entry (pre-filled with DATA_PATH) and the button that triggers loading
path_text = widgets.Text(value=DATA_PATH, description='CSV Path:', layout=widgets.Layout(width='70%'))
load_button = widgets.Button(description='Load Data', button_style='primary')

# Column pickers; they start empty and are filled by refresh_column_selects
numeric_select = widgets.SelectMultiple(options=[], description='Numeric:', rows=8, layout=widgets.Layout(width='45%'))
cat_select = widgets.SelectMultiple(options=[], description='Categorical:', rows=8, layout=widgets.Layout(width='45%'))

# Correlation method and heatmap display options
method_dropdown = widgets.Dropdown(options=['pearson', 'spearman', 'kendall'], value='pearson', description='Method:')
mask_toggle = widgets.Checkbox(value=True, description='Mask upper triangle')
annot_toggle = widgets.Checkbox(value=True, description='Annotate')

# One button per plot type; each is bound to its on_*_clicked callback further down
heatmap_button = widgets.Button(description='Show Heatmap', button_style='success')
clustermap_button = widgets.Button(description='Show Clustermap', button_style='success')
pairplot_button = widgets.Button(description='Show Pairplot', button_style='warning')
jointplot_button = widgets.Button(description='Show Jointplot', button_style='warning')

# Sampling controls read by the pairplot callback
sample_frac_slider = widgets.FloatSlider(value=0.2, min=0.01, max=1.0, step=0.01, description='Sample frac:')
sample_n_int = widgets.IntText(value=500, description='Sample n (pair):')

# When checked, the plot callbacks write their figure under PLOT_DIR
save_plots_toggle = widgets.Checkbox(value=False, description='Save plots to eda_plots')

# layout
top_row = widgets.HBox([path_text, load_button])
selectors = widgets.HBox([numeric_select, cat_select])
controls = widgets.HBox([method_dropdown, mask_toggle, annot_toggle, save_plots_toggle])
plot_buttons = widgets.HBox([heatmap_button, clustermap_button, pairplot_button, jointplot_button])
sampling = widgets.HBox([sample_frac_slider, sample_n_int])

# Rows stacked top to bottom, with the shared output area last
ui = widgets.VBox([top_row, selectors, controls, sampling, plot_buttons, out])

# Global df holder
# A dict, so callbacks can replace the loaded frame without `global` statements
global_df = {'df': None}

def refresh_column_selects(df: pd.DataFrame):
    """Fill the numeric and categorical selectors with the sorted column names of ``df``."""
    numeric_names = compute_numeric_columns(df)
    categorical_names = compute_categorical_columns(df)
    for selector, names in ((numeric_select, numeric_names), (cat_select, categorical_names)):
        selector.options = sorted(names)

def on_load_clicked(b):
    """Handle a click on "Load Data".

    Loads the CSV named in the path box, stores it in ``global_df``, shows a
    three-row preview and refreshes the column selectors. If a
    ``create_categorical_correlation_widget`` function exists in the
    notebook namespace it is launched as well. All output, including load
    errors, goes to the shared ``out`` area.

    Args:
        b: the clicked button (unused).
    """
    with out:
        clear_output()
        try:
            # Attempt to load the CSV provided in the widget's text box
            df = load_data(path_text.value)
        except Exception as e:
            # Provide a clear failure message inside the notebook UI
            print('Failed to load:', e)
            return
        # Save loaded DataFrame in the global holder and show a preview
        global_df['df'] = df
        print('\nFirst 3 rows:')
        display(df.head(3))
        # Refresh column selectors so the user can pick fields for plotting
        refresh_column_selects(df)
        # The categorical explorer is optional. Look it up instead of calling
        # it unconditionally, so a notebook that does not define it does not
        # print a spurious failure on every load.
        launcher = globals().get('create_categorical_correlation_widget')
        if callable(launcher):
            try:
                launcher(df)
            except Exception as e:
                print("Failed to launch categorical widget:", e)

def on_heatmap_clicked(b):
    """Handle a click on "Show Heatmap".

    Draws a correlation heatmap of the selected numeric columns using the
    chosen method and the mask/annotate toggles. Prints a hint and stops when
    no data is loaded or no column is selected.
    """
    with out:
        clear_output()
        frame = global_df.get('df')
        if frame is None:
            print('Load data first')
            return
        chosen = list(numeric_select.value)
        if not chosen:
            print('Select numeric columns for heatmap')
            return
        method = method_dropdown.value
        # A target path is built only when saving is enabled
        target_path = None
        if save_plots_toggle.value:
            target_path = os.path.join(PLOT_DIR, f'heatmap_{method}.png')
        correlations = corr_matrix(frame, cols=chosen, method=method)
        plot_corr_heatmap(
            correlations,
            title=f'{method.title()} correlation heatmap',
            annot=annot_toggle.value,
            fname=target_path,
            mask_upper=mask_toggle.value,
        )
        if target_path:
            print('Saved heatmap to', target_path)

def on_clustermap_clicked(b):
    """Handle a click on "Show Clustermap".

    Draws a clustermap of the correlations between the selected numeric
    columns. Prints a hint and stops when no data is loaded or no column is
    selected.
    """
    with out:
        clear_output()
        frame = global_df.get('df')
        if frame is None:
            print('Load data first')
            return
        chosen = list(numeric_select.value)
        if not chosen:
            print('Select numeric columns for clustermap')
            return
        method = method_dropdown.value
        # A target path is built only when saving is enabled
        target_path = None
        if save_plots_toggle.value:
            target_path = os.path.join(PLOT_DIR, f'clustermap_{method}.png')
        correlations = corr_matrix(frame, cols=chosen, method=method)
        plot_clustermap(correlations, fname=target_path)
        if target_path:
            print('Saved clustermap to', target_path)

def on_pairplot_clicked(b):
    """Handle a click on "Show Pairplot".

    Draws a pairplot of the selected numeric columns on a sample sized by the
    UI controls, coloured by ``readmit_30`` when that column exists. A
    positive "Sample n" is passed as the row count; otherwise only the
    sample fraction is used.
    """
    with out:
        clear_output()
        frame = global_df.get('df')
        if frame is None:
            print('Load data first')
            return
        chosen = list(numeric_select.value)
        if not chosen:
            print('Select numeric columns for pairplot (2-6 recommended)')
            return
        requested_n = sample_n_int.value
        row_cap = requested_n if requested_n and requested_n > 0 else None
        target_path = None
        if save_plots_toggle.value:
            target_path = os.path.join(PLOT_DIR, 'pairplot_sample.png')
        hue_column = 'readmit_30' if 'readmit_30' in frame.columns else None
        pairplot_sample(
            frame,
            cols=chosen,
            hue=hue_column,
            sample_frac=sample_frac_slider.value,
            sample_n=row_cap,
            fname=target_path,
        )
        if target_path:
            print('Saved pairplot to', target_path)

def on_jointplot_clicked(b):
    """Handle a click on "Show Jointplot".

    Draws a regression jointplot of the first two selected numeric columns,
    coloured by ``readmit_30`` when that column exists. Prints a hint and
    stops when no data is loaded or fewer than two columns are selected.
    """
    with out:
        clear_output()
        frame = global_df.get('df')
        if frame is None:
            print('Load data first')
            return
        chosen = list(numeric_select.value)
        if len(chosen) < 2:
            print('Select at least two numeric columns for a jointplot')
            return
        # Only the first two selections are used; any further ones are ignored
        x, y = chosen[0], chosen[1]
        target_path = None
        if save_plots_toggle.value:
            target_path = os.path.join(PLOT_DIR, f'jointplot_{x}_{y}.png')
        hue_column = 'readmit_30' if 'readmit_30' in frame.columns else None
        jointplot_pair(frame, x=x, y=y, hue=hue_column, kind='reg', fname=target_path)
        if target_path:
            print('Saved jointplot to', target_path)

# Bind each button to its callback; the callbacks write into the shared `out` area
load_button.on_click(on_load_clicked)
heatmap_button.on_click(on_heatmap_clicked)
clustermap_button.on_click(on_clustermap_clicked)
pairplot_button.on_click(on_pairplot_clicked)
jointplot_button.on_click(on_jointplot_clicked)

# Render the assembled widget tree below this cell
print('Interactive Seaborn Correlation Explorer UI:')
display(ui)


Interactive Seaborn Correlation Explorer UI:


VBox(children=(HBox(children=(Text(value='C:/Users/reinacherc/Downloads/diabetes+130-us+hospitals+for+years+19…

Usage instructions:
1. Edit the CSV Path field above so it points to your data file. The pre-filled default is the absolute path from `DATA_PATH` and will only work on the machine it was written for.
2. Click 'Load Data' — the column selectors will populate.
3. Choose numeric columns (left) and optional categorical columns (right).
4. Adjust method, sampling and toggles, then click the desired plot button.
5. If 'Save plots to eda_plots' is checked, plots will be saved to the directory configured as `PLOT_DIR` (default `../eda_plots`).

You can still run the non-interactive `run_eda(DATA_PATH)` cell (below) to perform the full EDA flow programmatically.


In [19]:
# Optionally run the full non-interactive EDA flow programmatically
# As written this cell is a no-op: the run_eda call is commented out, so the
# try body only executes `pass`. Uncomment the call to run the flow.
try:
    # Only run if you want the full EDA immediately
    # df = run_eda(DATA_PATH)
    pass
except Exception as e:
    print('Run EDA failed:', e)


Done — the EDA notebook now includes the interactive Seaborn-based correlation explorer. Open this notebook in Jupyter, run the top cells (install dependencies if necessary), load your data, and use the widget UI to explore correlations visually. If you'd like, I can also patch your original PatientReadmission_EDA.ipynb inside a GitHub repo and open a PR — give me the repo owner/name and branch info and I'll prepare a PR.