### <div class="alert alert-info" align = center> Imports</div>

In [None]:
import os, sys, warnings, subprocess, json, hashlib, shutil, glob, pathlib, time, joblib
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import chi2,mutual_info_classif
from sklearn.preprocessing import LabelEncoder


# Settings
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 100)
plt.style.use("seaborn-v0_8")   # clean default style
sns.set_palette("muted")        # consistent colors

#Start a timer to check the execution time of the notebook.
start_time = time.time()

### <div class="alert alert-info" align = center> Functions</div>

In [None]:
def label_default(status):
    """Translate a raw loan status into a binary default label.

    Returns 1 for the statuses treated as risky, 0 for "Fully Paid", and
    NaN for every other value so ambiguous rows can be dropped later.
    """
    label_by_status = {
        "Charged Off": 1,
        "Default": 1,
        "Late (31-120 days)": 1,
        "Late (16-30 days)": 1,
        "Fully Paid": 0,
    }
    # Anything not listed is ambiguous and maps to NaN
    return label_by_status.get(status, np.nan)

In [None]:
def _fitted_impute_columns(imputer, num_default, cat_default):
    """Return the (numeric, categorical) column lists a fitted imputer was built with.

    Looks for a pipeline step named "imputer" holding a ColumnTransformer with
    "num"/"cat" entries (the layout clean_and_reduce itself creates). Falls back
    to the supplied defaults when that layout is not found.
    """
    steps = getattr(imputer, "named_steps", None)
    transformer = steps["imputer"] if steps is not None and "imputer" in steps else imputer
    fitted = {
        name: list(cols)
        for name, _, cols in getattr(transformer, "transformers_", [])
        if name in ("num", "cat")
    }
    return fitted.get("num", num_default), fitted.get("cat", cat_default)


def _restore_numeric_dtype(values, dtype):
    """Convert imputed values back to numbers, using `dtype` when the cast is possible."""
    numeric = pd.to_numeric(values)
    try:
        return numeric.astype(dtype)
    except (TypeError, ValueError):
        # e.g. an integer extension dtype that cannot hold a fractional median
        return numeric


def clean_and_reduce(
    df,
    na_thresh: float = 0.99,         # drop cols with >99% NA
    cat_freq_thresh: float = 0.5,    # object→category if unique_ratio < 0.5
    add_missing_ind: bool = True,    # add __is_missing flags for NA-heavy cols
    ind_thresh: float = 0.20,        # add indicator if >=20% missing
    impute: bool = True,             # run SimpleImputer (no row drops)
    fitted_imputer=None,             # pass a previously fitted imputer to reuse
    return_artifacts: bool = True    # return metadata incl. dropped cols & imputer
):
    """
    Memory-savvy reducer + optional imputation without dropping rows.

    Steps:
      1) Drop cols whose missing fraction is > na_thresh, and constant cols.
      2) Downcast numerics (floats to float32, ints to the smallest integer
         type); object→category when unique_ratio < cat_freq_thresh.
      3) Optionally add missingness indicators <col>__is_missing (int8) for
         columns whose missing fraction is >= ind_thresh.
      4) Optionally impute: median for numerics, most_frequent for categoricals.

    Parameters:
      df: input DataFrame; it is not modified.
      fitted_imputer: an already fitted pipeline to reuse. When given, it is
        only used for transform(), and the output columns are taken from the
        columns it was fitted on.
      return_artifacts: when True return (df_out, artifacts), else df_out only.

    Returns:
      df_out, (optional) artifacts dict with keys dropped_cols,
      indicators_added, num_cols, cat_cols, imputer, rows_before, cols_before,
      rows_after, cols_after.

    Raises:
      ValueError: if fitted_imputer expects columns that are absent after
        the reduction steps.

    Notes:
      With impute=True only numeric and object/category columns (plus the
      indicators) are returned: the transformer is built with
      remainder="drop", so columns of any other dtype are left out.
    """
    n0, c0 = len(df), df.shape[1]

    # 1) Drop NA-heavy + constant columns
    na_frac = df.isna().mean()
    drop_na = na_frac[na_frac > na_thresh].index.tolist()
    drop_const = [c for c in df.columns if df[c].nunique(dropna=False) <= 1]
    dropped_cols = sorted(set(drop_na + drop_const))
    # drop() returns a new frame; copy() in the other branch keeps the caller's df untouched
    df_red = df.drop(columns=dropped_cols) if dropped_cols else df.copy()

    # 2) Downcast numerics; object→category (repetitive only)
    for col in df_red.select_dtypes(include="number").columns:
        s = df_red[col]
        if pd.api.types.is_float_dtype(s):
            df_red[col] = s.astype("float32")
        elif pd.api.types.is_integer_dtype(s):
            df_red[col] = pd.to_numeric(s, downcast="integer")

    n_rows = max(len(df_red), 1)  # avoid division by zero on an empty frame
    for col in df_red.select_dtypes(include="object").columns:
        if df_red[col].nunique(dropna=False) / n_rows < cat_freq_thresh:
            df_red[col] = df_red[col].astype("category")

    # 3) Add missingness indicators (no row drops)
    indicators = []
    if add_missing_ind:
        # fractions are computed once, before any indicator column is added
        miss = df_red.isna().mean()
        for c, frac in miss.items():
            if frac >= ind_thresh:
                ind_name = f"{c}__is_missing"
                df_red[ind_name] = df_red[c].isna().astype("int8")
                indicators.append(ind_name)

    # Identify columns for imputation (exclude indicators)
    num_cols = df_red.select_dtypes(include="number").columns.tolist()
    cat_cols = df_red.select_dtypes(include=["category", "object"]).columns.tolist()
    num_cols = [c for c in num_cols if not c.endswith("__is_missing")]
    cat_cols = [c for c in cat_cols if not c.endswith("__is_missing")]

    # 4) Impute (median for numeric, most_frequent for categorical)
    imputer_pipe = None
    if impute:
        if fitted_imputer is None:
            pre = ColumnTransformer(
                transformers=[
                    ("num", SimpleImputer(strategy="median"), num_cols),
                    ("cat", SimpleImputer(strategy="most_frequent"), cat_cols),
                ],
                remainder="drop",
                n_jobs=-1,
            )
            imputer_pipe = Pipeline([("imputer", pre)])
            X_imp = imputer_pipe.fit_transform(df_red)
        else:
            imputer_pipe = fitted_imputer
            # The transformed array follows the columns seen at fit time, so
            # label the output with those rather than with this frame's dtypes.
            num_cols, cat_cols = _fitted_impute_columns(imputer_pipe, num_cols, cat_cols)
            absent = [c for c in num_cols + cat_cols if c not in df_red.columns]
            if absent:
                raise ValueError(
                    f"fitted_imputer expects columns missing from the reduced frame: {absent}"
                )
            X_imp = imputer_pipe.transform(df_red)

        # rebuild dataframe in transformer order (numeric block, then categorical)
        imp_cols = num_cols + cat_cols
        df_out = pd.DataFrame(X_imp, columns=imp_cols, index=df_red.index)

        # Stacking numeric and categorical blocks yields an object array, so
        # numeric columns must be converted back explicitly or they stay object.
        for c in num_cols:
            df_out[c] = _restore_numeric_dtype(df_out[c], df_red[c].dtype)

        # cast back categories
        for c in cat_cols:
            df_out[c] = df_out[c].astype("category")

        # reattach indicators
        for ind in indicators:
            df_out[ind] = df_red[ind].astype("int8")
    else:
        df_out = df_red

    artifacts = {
        "dropped_cols": dropped_cols,
        "indicators_added": indicators,
        "num_cols": num_cols,
        "cat_cols": cat_cols,
        "imputer": imputer_pipe,
        "rows_before": n0,
        "cols_before": c0,
        "rows_after": len(df_out),
        "cols_after": df_out.shape[1],
    }

    return (df_out, artifacts) if return_artifacts else df_out



In [None]:
def find_weak_categoricals(df: pd.DataFrame, target: str, min_freq: int = 2):
    """
    Returns a DataFrame with categorical features ranked by signal:
      - chi2_min_p: min p-value across one-hot levels (lower is stronger)
      - mi_mean: mean mutual information across one-hot levels (higher is stronger)
      - n_levels: number of one-hot columns that were scored for the feature
      - kept_levels: up to five of those one-hot column names

    Parameters:
      df: input frame; rows with a missing target are ignored.
      target: name of the label column.
      min_freq: levels with count < min_freq are collapsed into '__rare__'
        to stabilize stats. Missing values are left missing.

    Raises:
      ValueError: if target is not a column of df.

    Object/category columns other than the target and the `__is_missing`
    indicators are scored. An empty frame with the result columns is returned
    when there is nothing to score.
    """
    result_cols = ["feature", "chi2_min_p", "mi_mean", "n_levels", "kept_levels"]

    # 0) checks
    if target not in df.columns:
        raise ValueError(f"target '{target}' not in DataFrame columns")
    if df[target].isna().any():
        # simple fix: drop NAs in target only (no row imputation here)
        df = df.loc[~df[target].isna()].copy()

    # 1) collect categorical columns (exclude target + indicators)
    cat_cols = df.select_dtypes(include=["object", "category"]).columns.tolist()
    cat_cols = [c for c in cat_cols if c != target and not c.endswith("__is_missing")]
    if not cat_cols:
        return pd.DataFrame(columns=result_cols)

    # 2) + 3) collapse rare levels, then one-hot encode each feature separately.
    # The owning feature is recorded per one-hot column, because it cannot be
    # recovered from "<col>_<level>" when the level itself contains "_".
    encoded, owners = [], []
    for c in cat_cols:
        s = df[c].astype("category")
        counts = s.value_counts()
        rare = counts[counts < min_freq].index
        if len(rare) > 0:
            s = s.astype(object).where(~s.isin(rare), "__rare__").astype("category")
        dummies = pd.get_dummies(s, prefix=c, dummy_na=False)  # keep all levels
        encoded.append(dummies)
        owners.extend([c] * dummies.shape[1])

    X = pd.concat(encoded, axis=1)
    if X.shape[1] == 0:
        return pd.DataFrame(columns=result_cols)

    # remove zero-variance columns (can happen after rare collapsing);
    # filtered by position so duplicate one-hot names stay unambiguous
    varying = (X.nunique(axis=0) > 1).to_numpy()
    X = X.loc[:, varying]
    owners = [owner for owner, keep in zip(owners, varying) if keep]
    if X.shape[1] == 0:
        return pd.DataFrame(columns=result_cols)

    # 4) chi2 + MI (one-hot values are non-negative, as chi2 requires)
    y = LabelEncoder().fit_transform(df[target])
    chi_stats, chi_pvals = chi2(X, y)
    mi_vals = mutual_info_classif(X, y, discrete_features=True, random_state=42)

    # 5) aggregate back to original feature names
    df_scores = pd.DataFrame({
        "oh_col": X.columns,
        "chi_p": chi_pvals,
        "mi": mi_vals,
        "feature": owners,
    })

    agg = (df_scores
           .groupby("feature")
           .agg(chi2_min_p=("chi_p", "min"),
                mi_mean=("mi", "mean"),
                n_levels=("oh_col", "count"))
           .reset_index()
           .sort_values(["mi_mean", "chi2_min_p"], ascending=[False, True]))

    # keep a compact view of which levels survived (optional but handy)
    kept_levels = (df_scores.groupby("feature")["oh_col"]
                   .apply(lambda s: ", ".join(s.head(5)) + (" ..." if len(s) > 5 else "")))
    agg["kept_levels"] = agg["feature"].map(kept_levels)

    return agg

In [None]:
def plot_default_prob_grid(
    df,
    features,
    target="target_default",
    n_cols=3,
    bins=6,
    strategy="quantile",        # "quantile" or "uniform"
    max_cats=12,                # limit for wide categoricals
    figsize_per_plot=(6.0, 4.2),
    decimals=4                  # bin label precision when not integer-like
    ):
    """
    Draw one bar chart per feature showing the mean of `target` per group.

    Numeric features with more than two distinct values are binned first
    (pd.qcut when strategy == "quantile", pd.cut otherwise). Everything else
    is grouped by its own values, limited to the `max_cats` most frequent.
    Features absent from `df` are reported and skipped, and leftover axes in
    the grid are removed. Returns None; the figure is shown with plt.show().
    """
    present = [name for name in features if name in df.columns]
    absent = [name for name in features if name not in df.columns]
    if absent:
        print("Skipping missing columns:", absent)
    if not present:
        print("No valid features to plot.")
        return

    grid_rows = int(np.ceil(len(present) / n_cols))
    fig_size = (figsize_per_plot[0] * n_cols, figsize_per_plot[1] * grid_rows)
    fig, axes = plt.subplots(grid_rows, n_cols, figsize=fig_size)
    axes = np.atleast_1d(axes).ravel()

    def _whole_numbers_only(col: pd.Series) -> bool:
        # integer dtype, or every non-null value is finite and whole
        if pd.api.types.is_integer_dtype(col):
            return True
        vals = col.dropna().values
        if vals.size == 0:
            return False
        return np.all(np.isfinite(vals)) and np.all(np.abs(vals - np.round(vals)) < 1e-9)

    def _int_label(interval):
        # widen to whole numbers: floor the left edge, ceil the right edge
        return f"{int(np.floor(interval.left))} – {int(np.ceil(interval.right))}"

    def _float_label(interval):
        return f"{interval.left:.{decimals}f} – {interval.right:.{decimals}f}"

    for ax, feat in zip(axes, present):
        col = df[feat]

        if pd.api.types.is_numeric_dtype(col) and col.dropna().nunique() > 2:
            # Numeric with more than two distinct values: bin first
            if strategy == "quantile":
                binned = pd.qcut(col, q=bins, duplicates="drop")
            else:
                binned = pd.cut(col, bins=bins)

            make_label = _int_label if _whole_numbers_only(col) else _float_label
            binned = binned.cat.rename_categories(make_label)

            # Mean target per bin; rows that fell in no bin are dropped
            joined = df[[target]].join(binned.rename("__bin__"))
            joined = joined.dropna(subset=["__bin__"])
            rates = joined.groupby("__bin__")[target].mean().reset_index(name="p_default")
            x_col = "__bin__"
            title = f"{feat} (binned)"
        else:
            # Categorical or binary: mean target per value, most frequent values only
            frequent = df[feat].value_counts(dropna=False).index[:max_cats]
            subset = df[df[feat].isin(frequent)]
            rates = subset.groupby(feat, observed=True)[target].mean()
            rates = rates.reset_index(name="p_default")
            rates = rates.sort_values("p_default", ascending=False)
            x_col = feat
            title = f"{feat}"

        sns.barplot(x=x_col, y="p_default", data=rates, ax=ax)
        ax.set_title(title)
        ax.set_xlabel(feat)
        ax.set_ylabel("P(Default=1)")
        ax.tick_params(axis="x", rotation=45, labelsize=9)

    # Remove any unused axes
    for spare in axes[len(present):]:
        fig.delaxes(spare)

    plt.tight_layout()
    plt.show()


### <div class="alert alert-info" align = center> Data Wrangling</div>

### <div class="alert alert-info" > Load The Data</div>

In [None]:
# Smart, idempotent fetch with checksum + metadata

import subprocess, sys, hashlib, pathlib, glob, shutil, time, json
from pathlib import Path

# 0) Ensure kagglehub installed in THIS kernel.
# pip is run through sys.executable so the package lands in the interpreter
# running this cell; check=True raises CalledProcessError if pip fails.
subprocess.run([sys.executable, "-m", "pip", "install", "-q", "--upgrade", "kagglehub"], check=True)
import kagglehub

def sha256_of(path, chunk=1024*1024):
    """Return the hex SHA-256 digest of the file at `path`, read `chunk` bytes at a time."""
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # iter() with a sentinel stops at the first empty read (end of file)
        for piece in iter(lambda: stream.read(chunk), b""):
            digest.update(piece)
    return digest.hexdigest()

def write_meta_if_changed(meta_file: Path, meta: dict, change_keys=("cache_sha256","local_sha256","cached_csv")):
    """Write metadata only when data changed. Keeps audit trail in .history.jsonl.

    Parameters:
      meta_file: path of the JSON metadata file.
      meta: metadata to store.
      change_keys: keys compared against the stored metadata; a difference in
        any of them (or no usable stored metadata) counts as a change.

    On a change, `meta` overwrites `meta_file` and is appended as one JSON
    line to the sibling file obtained by replacing the suffix with
    ".history.jsonl". An existing file that cannot be read, is not valid
    JSON, or does not contain a JSON object is treated as having no
    previous metadata.
    """
    prev = {}
    if meta_file.exists():
        try:
            loaded = json.loads(meta_file.read_text(encoding="utf-8"))
        except (OSError, ValueError):
            # unreadable or corrupt file: fall through and rewrite it
            loaded = None
        if isinstance(loaded, dict):
            prev = loaded

    changed = (not prev) or any(prev.get(k) != meta.get(k) for k in change_keys)
    if changed:
        meta_file.write_text(json.dumps(meta, indent=2), encoding="utf-8")
        hist = meta_file.with_suffix(".history.jsonl")
        with open(hist, "a", encoding="utf-8") as fh:
            fh.write(json.dumps(meta) + "\n")
        print(f"📝 Wrote metadata: {meta_file}")
    else:
        print(f"✅ Metadata unchanged: {meta_file}")

# 1) Repo layout: the raw CSV and its metadata live under <cwd>/data/raw
proj = pathlib.Path().cwd()
raw_dir = proj / "data" / "raw"
raw_dir.mkdir(parents=True, exist_ok=True)
local_file = raw_dir / "loan_default_probability_raw.csv"
meta_file  = raw_dir / ".dataset_meta.json"

# 2) Download to cache (kagglehub caches under user dir)
cache_path = kagglehub.dataset_download("adarshsng/lending-club-loan-data-csv")
cache_path = pathlib.Path(cache_path)

# Pick the CSV we need from cache.
# The filter looks at the file NAME only: the cache directory is derived from
# the dataset slug, which itself contains "loan", so testing the full path
# would accept every CSV. Sorting makes the choice deterministic.
candidate = None
for f in sorted(cache_path.glob("*.csv")):
    # adjust filter as needed to target the main loan-level table
    csv_name = f.name.lower()
    if "loan" in csv_name and "dictionary" not in csv_name:
        candidate = f
        break
if candidate is None:
    raise FileNotFoundError("Expected loan CSV not found in downloaded dataset.")

# 3) Compare checksum; copy only if different or local missing
cache_hash = sha256_of(candidate)
local_hash = sha256_of(local_file) if local_file.exists() else None

if local_hash == cache_hash:
    print(f"✅ Up-to-date: {local_file}")
else:
    shutil.copy(candidate, local_file)
    print(f"⬇️  Updated: {local_file}")

# 4) Write metadata (only if changed)
meta = {
    "source": "kagglehub:adarshsng/lending-club-loan-data-csv",
    "cached_csv": str(candidate),
    "cache_sha256": cache_hash,
    "local_path": str(local_file),
    # re-hashed after the copy so the stored value reflects the file on disk
    "local_sha256": sha256_of(local_file),
    "fetched_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}
# Try to capture version from cache path (if present): any path component
# shaped like "v<digits>"
parts = [p for p in candidate.parts if p.lower().startswith("v") and p[1:].isdigit()]
if parts:
    meta["dataset_version_hint"] = parts[0]

write_meta_if_changed(meta_file, meta)


In [None]:
# Read only the first 100,000 rows of the local raw CSV.
loan_default = pd.read_csv(local_file,nrows=100_000)

<div class="alert alert-info" ><strong>We are going to look into the details of our data.</strong><br><br>

This includes info(), describe(), shape, checking for nulls, and more</div>

In [None]:
# Column names, dtypes and non-null counts
loan_default.info()

In [None]:
# Summary statistics (numeric columns by default)
loan_default.describe()

In [None]:
# (rows, columns) of the loaded frame
loan_default.shape

In [None]:
# Total number of missing cells: per-column NA counts, summed over all columns
loan_default.isna().sum().sum()

In [None]:
USE_SAMPLE = True
parq_in = "data/processed/loan_default_slim.parquet"

# Load the slim parquet; in sample mode keep a reproducible 10% of the rows
df_in = pd.read_parquet(parq_in)
if USE_SAMPLE:
    df_in = df_in.sample(frac=0.1, random_state=42)

# First pass: reduce + impute, fitting a fresh imputer on df_in, then persist it
df_imp, art = clean_and_reduce(df_in, na_thresh=0.95, ind_thresh=0.05, impute=True)
pathlib.Path("models").mkdir(exist_ok=True)
joblib.dump(art["imputer"], "models/imputer_simple.joblib")

# Save output under a name that reflects the mode
if USE_SAMPLE:
    out_path = "data/processed/loan_default_imputed_sample.parquet"
else:
    out_path = "data/processed/loan_default_imputed_full.parquet"
df_imp.to_parquet(out_path, index=False)

# Full-data mode: reload the saved imputer and apply it without refitting.
# NOTE(review): in this branch df_in is already the full frame, so this
# re-imputes the same data with the imputer fitted just above.
if not USE_SAMPLE:
    from joblib import load
    imputer = load("models/imputer_simple.joblib")
    df_imp_full, art_full = clean_and_reduce(
        df_in,
        na_thresh=0.95,
        ind_thresh=0.05,
        impute=True,
        fitted_imputer=imputer,
    )
    df_imp_full.to_parquet("data/processed/loan_default_imputed_full.parquet", index=False)



In [None]:
# (rows, columns) after reduction and imputation
df_imp.shape

In [None]:
# First five rows of the imputed frame
df_imp.head()

In [None]:
# Dtypes and non-null counts of the imputed frame
df_imp.info()

<div class="alert alert-info" ><strong> Now that we have a cleaner dataset</strong><br>
Next, we check for categorical features that carry no statistically significant signal for our analysis.</div>

In [None]:
# Score every categorical feature against the loan_status label
agg = find_weak_categoricals(df_imp, target="loan_status", min_freq=5)

# A feature is "weak" only if BOTH tests agree (thresholds are tunable):
# even its best level has a high chi2 p-value, and its mean MI is ~0
high_p_value = agg["chi2_min_p"] > 0.9
negligible_mi = agg["mi_mean"] < 1e-4
weak = agg[high_p_value & negligible_mi]

print(f"Identified {len(weak)} weak categorical features to drop")
print(weak[["feature","chi2_min_p","mi_mean"]].head())

# Remove the weak features; names not present in df_imp are ignored
weak_names = weak["feature"].tolist()
df_pruned = df_imp.drop(columns=weak_names, errors="ignore")

print("Before:", df_imp.shape, "After:", df_pruned.shape)


In [None]:
# Persist the pruned frame; the file name depends on USE_SAMPLE.
# Note: this rebinds out_path, which earlier held the imputed-data path.
out_path = "data/processed/loan_default_pruned_sample.parquet" if USE_SAMPLE else "data/processed/loan_default_pruned_full.parquet"
df_pruned.to_parquet(out_path, index=False)
print(f"✅ Saved pruned dataset: {out_path}")

In [None]:
# 1) Select numeric columns only ("number" already covers the specific
#    float/int dtypes listed after it)
num_df = df_pruned.select_dtypes(include=["number","float32","float64","int32","int64"])

# 2) Compute correlation matrix (Spearman handles skewed/ordinal better than Pearson)
corr = num_df.corr(method="spearman")

# 3) Mask upper triangle (to make heatmap readable); np.triu keeps the
#    diagonal in the mask, so only the strictly lower triangle is drawn
mask = np.triu(np.ones_like(corr, dtype=bool))

plt.figure(figsize=(16,12))
sns.heatmap(corr, mask=mask, cmap="coolwarm", center=0,
            square=True, linewidths=0.5, cbar_kws={"shrink": .6})
plt.title("Numeric Feature Correlation Heatmap", fontsize=16)
plt.show()


In [None]:
# Absolute Spearman correlation above this counts as "highly correlated"
THRESH = 0.85

corr_matrix = corr.abs()

# Strictly-lower-triangle view: row i is flagged when it exceeds the threshold
# against any EARLIER column, so the first member of each correlated group is kept.
exceeds_earlier = np.tril(corr_matrix.to_numpy() > THRESH, k=-1)
to_drop = set(corr_matrix.columns[exceeds_earlier.any(axis=1)])

print(f"Suggest dropping {len(to_drop)} highly correlated features:")
print(list(to_drop)[:15])

df_uncorr = df_pruned.drop(columns=list(to_drop))


In [None]:
# Hand-picked list of missingness indicators to drop.
# NOTE: df_uncorr is rebuilt from df_pruned here, so the threshold-based
# df_uncorr computed in the previous cell is replaced by this manual selection.
drop_corr = [
    'sec_app_inq_last_6mths__is_missing',
    'sec_app_collections_12_mths_ex_med__is_missing',
    'sec_app_mort_acc__is_missing',
    'sec_app_chargeoff_within_12_mths__is_missing',
    'sec_app_earliest_cr_line__is_missing',
    'sec_app_open_act_il__is_missing',
    'dti_joint__is_missing',
    'sec_app_open_acc__is_missing',
    'sec_app_revol_util__is_missing',
    'sec_app_num_rev_accts__is_missing',
    'revol_bal_joint__is_missing',
    'verification_status_joint__is_missing'
]

# errors="ignore": names that are not columns of df_pruned are skipped silently
df_uncorr = df_pruned.drop(columns=drop_corr, errors="ignore")
print("Before:", df_pruned.shape, "After:", df_uncorr.shape)

In [None]:
# `target` is first assigned much later in the notebook, so a top-to-bottom
# run would fail here with NameError. At this stage the only label column in
# df_uncorr is the raw loan status (target_default is derived afterwards).
target = "loan_status"

# Share of rows per label value
df_uncorr[target].value_counts(normalize=True).plot(kind="bar")
plt.title("Target Distribution")
plt.show()

In [None]:
# Summary statistics for numeric columns, with tail percentiles (1/5/95/99%)
# added; transposed so each feature is one row
num_cols = df_uncorr.select_dtypes(include=["number","float32","float64","int32","int64"]).columns
df_uncorr[num_cols].describe(percentiles=[0.01,0.05,0.95,0.99]).T

In [None]:
# Histogram of the per-row missing fraction (mean of isna() across columns)
df_uncorr.isna().mean(axis=1).plot(kind="hist", bins=30)
plt.title("Fraction of Missing per Row")
plt.show()

In [None]:
# Cardinality of each object/category column (nunique ignores NaN by default)
cat_cols = df_uncorr.select_dtypes(include=["object","category"]).columns
for col in cat_cols:
    print(col, df_uncorr[col].nunique(), "unique values")

In [None]:
# `target` is first assigned much later in the notebook, so define it here to
# keep a top-to-bottom run working. The label available in df_uncorr at this
# point is the raw loan status.
target = "loan_status"

# Features only; categoricals are label-encoded so every column is numeric
X_enc = df_uncorr.drop(columns=[target]).copy()
for c in X_enc.select_dtypes(include=["object","category"]).columns:
    # astype(str) gives the encoder plain strings to sort and encode
    X_enc[c] = LabelEncoder().fit_transform(X_enc[c].astype(str))

y_enc = df_uncorr[target]

# Mutual information between each feature and the label (fixed seed)
mi = mutual_info_classif(X_enc, y_enc, discrete_features="auto", random_state=42)


In [None]:
# Pair each MI score with its feature name (mi is aligned with X_enc's
# columns) and rank from strongest to weakest
mi_df = (pd.DataFrame({"feature": X_enc.columns, "mi": mi})
           .sort_values("mi", ascending=False))
# display() is provided by the IPython/Jupyter session, not imported here
display(mi_df.head(20))     # top signals
display(mi_df.tail(20))     # weakest

# Plot top-30
mi_df.head(30).plot.bar(x="feature", y="mi", figsize=(10,4), rot=75)

# Drop very weak features (e.g., MI ~ 0); the label column is never in this
# list because it was excluded from X_enc
weak_feats = mi_df.query("mi <= 1e-4")["feature"].tolist()
df_uncorr2 = df_uncorr.drop(columns=weak_feats, errors="ignore")
print("Dropped:", len(weak_feats), "| New shape:", df_uncorr2.shape)

# Save for audit
mi_df.to_csv("data/processed/mi_rank_100k.csv", index=False)


<div style="border: 2px solid #4CAF50; border-radius: 10px; padding: 15px; background-color: #f9fff9;">

## 🧹 Wrangling & EDA Summary  

We started with **145 raw features** from the LendingClub loan dataset. Through systematic wrangling and exploratory data analysis, we reduced noise and redundancy to reach a **lean 80-feature dataset** ready for modeling.  

### 🔎 Key Steps Taken  
- **Dropped NA-heavy & constant columns**  
  Columns with >95% missing values (the `na_thresh=0.95` used in this notebook) or no variance were removed.  

- **Removed weak categorical features**  
  Used Chi² tests and Mutual Information to identify and drop categorical variables with no measurable relationship to the target.  

- **Pruned highly correlated features**  
  Applied correlation heatmaps (Spearman) and removed strongly collinear indicators (e.g., multiple `__is_missing` flags for secondary applicants).  

- **Filtered by Mutual Information**  
  Calculated MI scores on a 100k subset and dropped features with near-zero predictive contribution.  

### 📉 Feature Reduction  
- Initial: **145 features**  
- After cleaning + pruning: **80 features**  
- **45% dimensionality reduction** while preserving signal  

### ✅ Outcome  
We now have a **lean, efficient dataset** that balances predictive potential with interpretability and compute efficiency. This forms the foundation for:  
- More reliable EDA insights  
- Faster model training  
- Reduced risk of overfitting  

</div>


In [None]:
# Derive the binary label from loan_status via label_default:
# 1 = risky statuses, 0 = "Fully Paid", NaN = any other status
df_uncorr2["target_default"] = df_uncorr2["loan_status"].apply(label_default)

# Class shares, including the NaN (ambiguous) share
print(df_uncorr2["target_default"].value_counts(dropna=False, normalize=True))


In [None]:
# Quick look at what columns are left after pruning
print(df_uncorr2.columns.tolist()[:30])   # first 30 cols
print(df_uncorr2.select_dtypes(include=["number"]).columns[:10])  # numeric sample
print(df_uncorr2.select_dtypes(include=["category","object"]).columns[:10])  # categorical sample


In [None]:
# From here on the label is the derived binary column
target = "target_default"

# 1) Categorical (binary) → countplot + default rate bars
cat_feat = "emp_length__is_missing"  # 0/1 missingness indicator
plt.figure(figsize=(7,4))
sns.countplot(x=cat_feat, hue=target, data=df_uncorr2, order=[0,1])
plt.title(f"{cat_feat} vs {target}")
plt.xlabel(f"{cat_feat} (0 = present, 1 = missing)")
plt.ylabel("Count")
plt.show()

# Optional: default rate by category (probability bars).
# The group mean skips NaN labels; reindex([0,1]) fixes the bar order.
rate = (df_uncorr2.groupby(cat_feat)[target].mean() * 100).reindex([0,1])
rate.plot(kind="bar", figsize=(5,3), rot=0)
plt.title(f"P({target}=1) by {cat_feat}")
plt.ylabel("Default Rate (%)")
plt.show()

# 2) Numeric → boxplot vs target
num_feat = "int_rate"  # numeric feature to compare across label values
plt.figure(figsize=(7,4))
sns.boxplot(x=target, y=num_feat, data=df_uncorr2)
plt.title(f"{num_feat} vs {target}")
plt.xlabel(target)
plt.ylabel(num_feat)
plt.show()


<div class="alert alert-info" >Group continuous features into bins, then calculate the probability of Default within each bin.</div>

In [None]:
target = "target_default"

# Continuous features are quantile-binned into (up to) 5 bins
num_feats = ["int_rate", "loan_amnt", "dti", "revol_util", "total_acc", "delinq_2yrs"]
# 0/1 missingness indicators are plotted per value.
# NOTE: dti_joint__is_missing is in the manual drop list applied earlier, so
# it may be absent; plot_default_prob_grid reports and skips absent columns.
cat_feats = ["emp_length__is_missing", "annual_inc_joint__is_missing", "dti_joint__is_missing"]

plot_default_prob_grid(df_uncorr2, num_feats, target=target, n_cols=3, bins=5, strategy="quantile")
plot_default_prob_grid(df_uncorr2, cat_feats, target=target, n_cols=3)

