In [1]:
import os
from pathlib import Path
import sys

# ----- repo root -----
# Locate the repository root so the notebook is not tied to one machine:
#   1. the MOBPY_REPO_ROOT environment variable, when set;
#   2. otherwise the nearest ancestor of the current directory that
#      contains src/MOBPY (the package imported below);
#   3. otherwise the original author's checkout path (previous behaviour).
_FALLBACK_REPO_ROOT = "/Users/chentahung/Desktop/git/mob-py/"


def _locate_repo_root() -> Path:
    """Return the directory to treat as the repository root (order above)."""
    override = os.environ.get("MOBPY_REPO_ROOT")
    if override:
        return Path(override).expanduser()
    here = Path.cwd()
    for candidate in (here, *here.parents):
        if (candidate / "src" / "MOBPY").is_dir():
            return candidate
    return Path(_FALLBACK_REPO_ROOT)


REPO_ROOT = _locate_repo_root().resolve()
# Relative paths used later (e.g. the data CSV) resolve against the repo root.
os.chdir(REPO_ROOT)
repo_src = Path("src").resolve()
# Make the in-repo package importable without installing it.
if str(repo_src) not in sys.path:
    sys.path.insert(0, str(repo_src))

# plotting + data
import numpy as np
import pandas as pd

from MOBPY.binning.mob import MonotonicBinner
from MOBPY.core.constraints import BinningConstraints
from MOBPY.plot.csd_gcm import (
    plot_csd_only,
    plot_csd_pava_step,
    plot_gcm_on_csd,
)
from MOBPY.plot.mob_plot import MOBPlot

# output folder for images
# Built from the working directory (the repo root after the chdir above)
# instead of repeating a machine-specific absolute path.
IMG_DIR = (Path.cwd() / "doc" / "images").resolve()
IMG_DIR.mkdir(parents=True, exist_ok=True)


#### 1) Load data & prep target

In [2]:
# Read the credit data and recode the target in one chain.
# target: default minus 1, bounded to [0, 1], stored as int -> {0,1}
df = pd.read_csv("data/german_data_credit_cat.csv").assign(
    default=lambda frame: (frame["default"] - 1).clip(lower=0, upper=1).astype(int)
)
df.head()


Unnamed: 0,Statusofexistingcheckingaccount,Durationinmonth,Credithistory,Purpose,Creditamount,Savingsaccountbonds,Presentemploymentsince,Installmentrate,Personalstatussex,Otherdebtors,...,Property,Age,installmentplans,Housing,existingcredits,Job,Numberofpeople,Telephone,foreignworker,default
0,A11,6,A34,A43,1169,A65,A75,4,A93,A101,...,A121,67,A143,A152,2,A173,1,A192,A201,0
1,A12,48,A32,A43,5951,A61,A73,2,A92,A101,...,A121,22,A143,A152,1,A173,1,A191,A201,1
2,A14,12,A34,A46,2096,A61,A74,2,A93,A101,...,A121,49,A143,A152,1,A172,2,A191,A201,0
3,A11,42,A32,A42,7882,A61,A74,2,A93,A103,...,A122,45,A143,A153,1,A173,2,A191,A201,0
4,A11,24,A33,A40,4870,A61,A73,3,A93,A101,...,A124,53,A143,A153,2,A173,2,A191,A201,1


#### 2) Fit the binner

In [5]:
# Constraint settings, kept in one place so they are easy to tweak.
constraint_settings = {
    "max_bins": 6,
    "min_bins": 2,
    "min_samples": 0.05,    # 5% of clean rows per bin
    "initial_pvalue": 0.4,  # starting merge threshold (lib may anneal effectively)
    "maximize_bins": True,  # classic MOB style
}
cons = BinningConstraints(**constraint_settings)

# Binner configuration: which column to bin against which target, and how.
binner_settings = {
    "df": df,
    "x": "Durationinmonth",   # <- choose any numeric X you want to bin
    "y": "default",           # <- binary target
    "metric": "mean",         # mean-only (median reserved for future work)
    "sign": "auto",
    "strict": True,
    "constraints": cons,
    "exclude_values": None,   # or e.g. [0] if you want to treat a special value separately
    "sort_kind": "quicksort",
}
binner = MonotonicBinner(**binner_settings).fit()

bins = binner.bins_()
summary = binner.summary_()

# Show each table under its caption.
for caption, table in (
    ("Bins (clean, no Missing/Excluded):", bins),
    ("Summary (MOB-style when y is binary):", summary),
):
    print(caption)
    display(table)


Bins (clean, no Missing/Excluded):


Unnamed: 0,left,right,n,sum,mean,std,min,max
0,-inf,9.0,94,10.0,0.106383,0.30998,0.0,1.0
1,9.0,12.0,86,17.0,0.197674,0.400581,0.0,1.0
2,12.0,16.0,251,62.0,0.247012,0.432135,0.0,1.0
3,16.0,36.0,399,129.0,0.323308,0.468327,0.0,1.0
4,36.0,45.0,100,42.0,0.42,0.496045,0.0,1.0
5,45.0,inf,70,40.0,0.571429,0.498445,0.0,1.0


Summary (MOB-style when y is binary):


Unnamed: 0,left,right,interval,nsamples,bads,goods,bad_rate,woe,iv_grp
0,-inf,9.0,"(-inf, 9)",94,10.0,84.0,0.106383,1.23903,0.106168
1,9.0,12.0,"[9, 12)",86,17.0,69.0,0.197674,0.532779,0.021833
2,12.0,16.0,"[12, 16)",251,62.0,189.0,0.247012,0.262876,0.016439
3,16.0,36.0,"[16, 36)",399,129.0,270.0,0.323308,-0.109755,0.004917
4,36.0,45.0,"[36, 45)",100,42.0,58.0,0.42,-0.526824,0.030513
5,45.0,inf,"[45, inf)",70,40.0,30.0,0.571429,-1.129922,0.103088


#### 3) Save bins & summary (optional)

In [None]:
# Optional export: flip SAVE_TABLES to True to write both tables into IMG_DIR.
# Off by default, so running the notebook writes no CSV files unless asked.
SAVE_TABLES = False
if SAVE_TABLES:
    bins.to_csv(IMG_DIR / "bins.csv", index=False)
    summary.to_csv(IMG_DIR / "summary.csv", index=False)

#### 4) Plot: MOB summary (WoE bars + bad-rate line)

In [7]:
# Draw the MOB summary figure from the summary table and write it as a PNG.
summary_png = IMG_DIR / "mob_summary.png"
MOBPlot.plot_bins_summary(summary, savepath=str(summary_png), dpi=150)
print("Saved:", summary_png)


Saved: /Users/chentahung/Desktop/git/mob-py/doc/images/mob_summary.png


#### 5) Plot: CSD-only (true cumulative sum diagram)

In [12]:
# x-grouped counts/sums taken from the fitted binner's PAVA step
# (already sorted by x, per the original note); reused by the later plots.
pava_groups = binner._pava.groups_
csd_png = IMG_DIR / "csd_only.png"
plot_csd_only(groups_df=pava_groups, savepath=str(csd_png), dpi=150)
print("Saved:", csd_png)


Saved: /Users/chentahung/Desktop/git/mob-py/doc/images/csd_only.png


#### 6) Plot: CSD + PAVA step (with optional merged overlay)

In [13]:
# Pre-merge PAVA blocks, exported as dicts (left/right/n/sum/...).
pava_blocks_dicts = binner._pava.export_blocks(as_dict=True)

# Optional overlay: rebuild the final merged bins as block-like dicts,
# walking the four needed columns of the bins table in parallel.
merged_blocks_dicts = [
    {"left": float(lo), "right": float(hi), "n": int(count), "sum": float(total)}
    for lo, hi, count, total in zip(bins["left"], bins["right"], bins["n"], bins["sum"])
]

step_png = IMG_DIR / "csd_pava_step.png"
plot_csd_pava_step(
    groups_df=pava_groups,
    pava_blocks=pava_blocks_dicts,
    merged_blocks=merged_blocks_dicts,   # remove this argument to hide the overlay
    x_name=binner.x,
    y_name=binner.y,
    savepath=str(step_png),
    dpi=150,
)
print("Saved:", step_png)


Saved: /Users/chentahung/Desktop/git/mob-py/doc/images/csd_pava_step.png


#### 7) Plot: GCM on CSD (blue cumulative mean vs red PAVA step)

In [14]:
# Draw the GCM-on-CSD figure from the grouped data and the PAVA blocks, then save it.
gcm_png = IMG_DIR / "gcm_on_csd.png"
plot_gcm_on_csd(
    groups_df=pava_groups,
    pava_blocks=pava_blocks_dicts,
    x_name=binner.x,
    y_name=binner.y,
    savepath=str(gcm_png),
    dpi=150,
)
print("Saved:", gcm_png)


Saved: /Users/chentahung/Desktop/git/mob-py/doc/images/gcm_on_csd.png


#### 8) Transform raw X to bins

In [15]:
# Give every raw X value the interval label of its bin,
# e.g. "(-inf, 9)" ... "[45, inf)" for the bins fitted above.
raw_x = df[binner.x]
labels = binner.transform(raw_x, assign="interval")
labels.head()


0    (-inf, 9)
1    [45, inf)
2     [12, 16)
3     [36, 45)
4     [16, 36)
Name: Durationinmonth, dtype: object