In [1]:
import fastwoe
import fastwoe.fastwoe_rs as rs
import numpy as np
import pandas as pd
from fastwoe import FastWoe, WoePreprocessor

# Show where the package and its compiled extension module were imported from,
# so it is clear which installation the cells below exercise.
print("fastwoe package:", fastwoe.__file__)
print("extension:", rs.__file__)


fastwoe package: /Users/bryan.bosire/anaconda_projects/fastwoe/python/fastwoe/__init__.py
extension: /Users/bryan.bosire/anaconda_projects/fastwoe/python/fastwoe/fastwoe_rs.abi3.so


In [2]:
# Single-feature API: fit on one categorical column, then score new values.
categories = ["A", "A", "B", "C"]
target = [1, 0, 0, 1]
# "Z" is deliberately absent from the training categories above.
queries = ["A", "B", "Z"]

model = FastWoe(smoothing=0.5, default_woe=0.0)
model.fit(categories, target)

# Score the same query values three ways: WOE, probability, and the
# fitted per-category mapping.
woe_values = model.transform(queries)
proba = model.predict_proba(queries)
mapping = model.get_mapping()

print("woe_values:", woe_values)
print("proba:", proba)
print("mapping:", mapping)

woe_values: [0.0, -1.0986122886681098, 0.0]
proba: [0.5, 0.25, 0.5]
mapping: [WoeRow(category="A", event_count=1, non_event_count=1, woe=0.000000, woe_se=1.154701), WoeRow(category="B", event_count=0, non_event_count=1, woe=-1.098612, woe_se=1.632993), WoeRow(category="C", event_count=1, non_event_count=0, woe=1.098612, woe_se=1.632993)]


In [3]:
# Matrix API (binary target): every row holds one value per feature.
target = [1, 0, 0, 1]
rows = [["A", "x"], ["A", "y"], ["B", "x"], ["C", "z"]]

model = FastWoe()
X_woe = model.fit_transform_matrix(
    rows,
    target,
    feature_names=["cat", "bucket"],
    as_frame=True,
)
proba = model.predict_proba_matrix(rows)
cat_mapping = model.get_feature_mapping("cat")

# Flatten the per-category mapping rows into a DataFrame, one column per field
# (the tuple order fixes the column order).
mapping_fields = ("category", "event_count", "non_event_count", "woe", "woe_se")
cat_mapping_df = pd.DataFrame(
    [{field: getattr(row, field) for field in mapping_fields} for row in cat_mapping]
)

print(X_woe.head())
print("proba:", proba)
print(cat_mapping_df)

        cat    bucket
0  0.000000  0.000000
1  0.000000 -1.098612
2 -1.098612  0.000000
3  1.098612  1.098612
proba: [0.5, 0.25, 0.25, 0.8999999999999999]
  category  event_count  non_event_count       woe    woe_se
0        A            1                1  0.000000  1.154701
1        B            0                1 -1.098612  1.632993
2        C            1                0  1.098612  1.632993


In [4]:
# Matrix API (multiclass target)
# Five rows with two categorical features each; `labels` holds one of three
# class names ("c0", "c1", "c2") per row.
rows = [["A", "x"], ["A", "y"], ["B", "x"], ["C", "z"], ["B", "y"]]
labels = ["c0", "c1", "c2", "c0", "c1"]

model = FastWoe()
model.fit_matrix_multiclass(rows, labels, feature_names=["cat", "bucket"])
# Scored on the training rows themselves; only the first entry is printed below.
all_probs = model.predict_proba_matrix_multiclass(rows)
classes = model.get_class_labels()
# With as_frame=True the recorded output is a table with one column per
# (feature, class) pair, named "<feature>_class_<label>".
X_woe_multi = model.transform_matrix_multiclass(rows, as_frame=True)

print("classes:", classes)
print("first probability row:", all_probs[0])
print(X_woe_multi.head())

classes: ['c0', 'c1', 'c2']
first probability row: [0.7044274570168803, 0.14088549140337603, 0.15468705157974377]
   cat_class_c0  bucket_class_c0  cat_class_c1  bucket_class_c1  cat_class_c2  \
0      0.251314         0.251314      0.251314        -1.358123     -0.820981   
1      0.251314        -1.358123      0.251314         1.860752     -0.820981   
2     -1.358123         0.251314      0.251314        -1.358123      0.788457   
3      1.349927         1.349927     -0.847298        -0.847298     -0.310155   
4     -1.358123        -1.358123      0.251314         1.860752      0.788457   

   bucket_class_c2  
0         0.788457  
1        -0.820981  
2         0.788457  
3        -0.310155  
4        -0.820981  


In [5]:
# Confidence intervals
model = FastWoe()
model.fit(["A", "B", "A"], [1, 0, 1])
# Query one category seen during fit ("A") and one that was not ("Z").
# Each printed result is a 3-tuple; presumably (prediction, lower bound,
# upper bound) for the given alpha -- TODO confirm against the library docs.
ci = model.predict_ci(["A", "Z"], alpha=0.05)
print(ci)

[(0.8695652173913044, 0.24245934371319505, 0.9928501128522621), (0.6666666666666666, 0.07533681599414054, 0.9800378843248612)]


In [6]:
# Numeric preprocessing
# A single numeric column whose values sit in three well-separated groups
# (around 0.2, around 10, and 20).
rows = [[0.1], [0.2], [0.3], [10.0], [10.2], [20.0]]
pre = WoePreprocessor(n_bins=3, binning_method="kmeans")
# numerical_features=[0] presumably refers to column index 0; in the recorded
# output every value is replaced by a "bin_<k>" label.
rows_binned = pre.fit_transform(rows, numerical_features=[0])
print(rows_binned)

[['bin_0'], ['bin_0'], ['bin_0'], ['bin_1'], ['bin_1'], ['bin_2']]


In [7]:
# End-to-end high-cardinality example
np.random.seed(42)
n = 350

# Draw the three synthetic columns in a fixed order (choice, randint, binomial)
# so the seeded global stream produces the same data on every run.
category_col = np.random.choice(["A", "B", "C", "D"], size=n, p=[0.35, 0.30, 0.25, 0.10])
high_card_col = [f"cat_{i}" for i in np.random.randint(0, 50, size=n)]
target_col = np.random.binomial(1, 0.3, size=n)
data = pd.DataFrame(
    {
        "category": category_col,
        "high_card_cat": high_card_col,
        "target": target_col,
    }
)

# Preprocess the two feature columns; only "high_card_cat" is passed as a
# categorical feature to reduce.
feature_cols = ["category", "high_card_cat"]
pre = WoePreprocessor(max_categories=10, min_count=5)
X = pre.fit_transform(data[feature_cols], cat_features=["high_card_cat"])

woe = FastWoe()
X_woe = woe.fit_transform_matrix(
    X,
    data["target"],
    feature_names=list(feature_cols),
    as_frame=True,
)

print("X_woe shape:", X_woe.shape)
print(X_woe.head())

# Turn the fitted mapping for "category" into a table and derive the
# per-category row count and event rate from the two count columns.
rows = woe.get_feature_mapping("category")
mapping_fields = ("category", "event_count", "non_event_count", "woe", "woe_se")
mapping_df = pd.DataFrame(
    [{field: getattr(r, field) for field in mapping_fields} for r in rows]
).assign(
    count=lambda frame: frame["event_count"] + frame["non_event_count"],
    event_rate=lambda frame: frame["event_count"] / frame["count"],
)

print("\nCategory mapping:")
print(mapping_df[["category", "count", "event_rate", "woe", "woe_se"]].sort_values("category"))

X_woe shape: (350, 2)
   category  high_card_cat
0  0.026372       0.033646
1  0.275743       0.386302
2 -0.076859       0.754026
3  0.026372      -0.008114
4 -0.044598       0.033646

Category mapping:
  category  count  event_rate       woe    woe_se
0        A    131    0.305344 -0.044598  0.188733
1        B    103    0.320388  0.026372  0.209846
2        C     84    0.297619 -0.076859  0.236691
3        D     32    0.375000  0.275743  0.358860


## README Examples Validation
The cells below mirror the examples in the project README; run them to check that each example behaves as documented, end to end.


In [8]:
# README: Assumption-Risk Diagnostics
rows = [["A", "x"], ["A", "y"], ["B", "x"], ["C", "z"]]
target = [1, 0, 0, 1]

model = FastWoe()
model.fit_matrix(rows, target, feature_names=["f0", "f1"])
# The diagnostics come back as a dict-like object: it is read via .keys() and
# indexed by "at_risk" below.
diagnostics = model.get_assumption_diagnostics()
# Second instance built with the warning switched off; it is never fitted here.
quiet_model = FastWoe(warn_on_assumption_risk=False)

print("diagnostics keys:", sorted(diagnostics.keys()))
print("at_risk:", diagnostics["at_risk"])
# NOTE(review): this reads a private (underscore-prefixed) attribute, and the
# recorded output for this cell has no "quiet_model warning flag:" line --
# confirm the attribute exists and re-run the cell.
print("quiet_model warning flag:", quiet_model._warn_on_assumption_risk)


diagnostics keys: ['at_risk', 'dependence', 'n_features', 'n_rows', 'sparsity']
at_risk: True


In [9]:
# README: IV Analysis
rows = [["A", "x"], ["A", "y"], ["B", "x"], ["C", "z"]]
target = [1, 0, 0, 1]

model = FastWoe()
model.fit_matrix(rows, target, feature_names=["cat", "bucket"])

# Three views of the binary-target IV analysis: all features, a single named
# feature, and all features returned as a DataFrame (no alpha passed).
iv_rows = model.get_iv_analysis(alpha=0.05)
iv_cat_only = model.get_iv_analysis(feature_name="cat", alpha=0.05)
iv_df = model.get_iv_analysis(as_frame=True)

# The same model object is re-fitted with multiclass labels; the binary results
# above were already captured in their own variables.
model.fit_matrix_multiclass(rows, ["c0", "c1", "c2", "c0"], feature_names=["cat", "bucket"])
# Presumably the IV analysis for the class labelled "c0" -- confirm in the docs.
iv_c0 = model.get_iv_analysis_multiclass("c0", alpha=0.05)

# Only the result lengths are printed for the list-style results.
print("iv_rows:", len(iv_rows))
print("iv_cat_only:", len(iv_cat_only))
print(iv_df)
print("iv_c0:", len(iv_c0))


iv_rows: 2
iv_cat_only: 1
  feature        iv     iv_se  iv_ci_lower  iv_ci_upper  iv_significance
0     cat  0.627778  1.052571          0.0      2.69078  Not Significant
1  bucket  0.627778  1.052571          0.0      2.69078  Not Significant
iv_c0: 2


In [10]:
# README: High-cardinality preprocessing
# In each column, one value appears twice and the other two appear once.
rows = [
    ["cat_1", "segment_a"],
    ["cat_1", "segment_b"],
    ["cat_2", "segment_a"],
    ["cat_99", "segment_z"],
]

target = [1, 0, 0, 1]
pre = WoePreprocessor(top_p=0.9, min_count=2, max_categories=20)
# No feature lists are passed here. In the recorded output the values that
# occur only once come back as "__other__", and the summary names the columns
# "feature_0" / "feature_1".
rows_reduced = pre.fit_transform(rows)
summary = pre.get_reduction_summary(as_frame=True)

# The reduced rows are then fitted with FastWoe under explicit feature names.
model = FastWoe()
model.fit_matrix(rows_reduced, target, feature_names=["merchant", "segment"])

print(rows_reduced)
print(summary)


[['cat_1', 'segment_a'], ['cat_1', '__other__'], ['__other__', 'segment_a'], ['__other__', '__other__']]
     feature  original_unique  reduced_unique  coverage
0  feature_0                3               2       0.5
1  feature_1                3               2       0.5


In [11]:
# README: Quantile numeric binning
# Column 0 holds numbers plus one None entry; column 1 holds string labels.
rows = [[1000.0, "A"], [1200.0, "B"], [1400.0, "C"], [None, "D"]]
pre = WoePreprocessor(n_bins=3, binning_method="quantile")
# In the recorded output the None entry becomes "__missing__" and the values
# in column 1 come back unchanged.
rows_binned = pre.fit_transform(rows, numerical_features=[0], cat_features=[1])
print(rows_binned)


[['bin_0', 'A'], ['bin_1', 'B'], ['bin_2', 'C'], ['__missing__', 'D']]


In [12]:
# README: FAISS optional numeric binning (falls back to kmeans if unavailable)
# Six values in three well-separated groups; the recorded output assigns the
# groups to bin_0, bin_1 and bin_2 respectively.
rows = [[0.1], [0.2], [0.3], [10.0], [10.2], [20.0]]
pre = WoePreprocessor(n_bins=3, binning_method="faiss")
rows_binned = pre.fit_transform(rows, numerical_features=[0])
print(rows_binned)


[['bin_0'], ['bin_0'], ['bin_0'], ['bin_1'], ['bin_1'], ['bin_2']]




In [13]:
# README: Supervised tree-style numeric binning
# Two value clusters with opposite targets: the three low values are labelled 0
# and the three high values are labelled 1.
rows = [[float(value)] for value in (1000, 1100, 1200, 2000, 2100, 2200)]
y = [0] * 3 + [1] * 3

# The target is passed alongside the rows because this binning is supervised.
pre = WoePreprocessor(binning_method="tree", n_bins=2)
rows_binned = pre.fit_transform(rows, target=y, numerical_features=[0])
print(rows_binned)


[['bin_0'], ['bin_0'], ['bin_0'], ['bin_1'], ['bin_1'], ['bin_1']]


In [14]:
# README: Monotonic numerical binning
# Values 1.0 through 6.0 in a single column; the target is 0 for the first two
# rows and 1 for the rest.
rows = [[float(value)] for value in range(1, 7)]
y = [0, 0] + [1] * 4

# Keyword options for the fit, gathered in one place and unpacked into the call.
binning_options = {
    "numerical_features": [0],
    "target": y,
    "monotonic_constraints": "increasing",
}
pre = WoePreprocessor(n_bins=4, binning_method="quantile")
rows_binned = pre.fit_transform(rows, **binning_options)
print(rows_binned)


[['bin_0'], ['bin_0'], ['bin_1'], ['bin_2'], ['bin_3'], ['bin_3']]


In [15]:
# README: Pandas output mode
# Two-row DataFrame input with two string columns.
X = pd.DataFrame({"cat": ["A", "B"], "bucket": ["x", "y"]})
y = [1, 0]

model = FastWoe()
# Feature names are taken straight from the DataFrame's column index.
model.fit_matrix(X, y, feature_names=X.columns)

# as_frame=True is passed to get tabular results (printed below).
X_woe_df = model.transform_matrix(X, as_frame=True)
ci_df = model.predict_ci_matrix(X, as_frame=True)

# Re-fit the same model object with multiclass labels before requesting the
# per-class probabilities; the binary results above are already stored.
model.fit_matrix_multiclass(X, ["c0", "c1"], feature_names=X.columns)
proba_multi_df = model.predict_proba_matrix_multiclass(X, as_frame=True)

print(X_woe_df)
print(ci_df)
print(proba_multi_df)


        cat    bucket
0  1.098612  1.098612
1 -1.098612 -1.098612
   prediction  lower_ci  upper_ci
0         0.9  0.088740  0.998799
1         0.1  0.001201  0.911260
   proba_c0  proba_c1
0  0.987805  0.012195
1  0.012195  0.987805
