In [5]:
"""
Design 1 (updated, no leakage from history into label):
- Population: CURRENT contraceptive users
- Target: HIGH_RISK_DISCONTINUE, defined ONLY from intention variables
- Features: demographics, fertility, method characteristics, history (as predictors only)
"""

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# -------------------------------------------------------------------
# 1. Load data
# -------------------------------------------------------------------

# Relative path to the merged dataset CSV.
DATA_PATH = "../../data/interim/merged_dataset.csv"  # adjust if needed
df = pd.read_csv(DATA_PATH)

# Report the size and the column names of what was just read.
for _label, _value in (
    ("Raw data shape:", df.shape),
    ("Columns:", df.columns.tolist()),
):
    print(_label, _value)

# -------------------------------------------------------------------
# 2. Helper: safe membership check on mixed code/label columns
# -------------------------------------------------------------------

def is_in(series: pd.Series, values) -> pd.Series:
    """
    Robust membership check that works if the column has mixed types
    (e.g. numeric codes and string labels).

    Elements of `series` and entries of `values` are compared as text.
    In addition to the plain `str()` form, an integer-normalised form is
    compared, so that a code column loaded as float (e.g. 3.0, which
    `str()` renders as "3.0") still matches the code "3". Surrounding
    whitespace is ignored in that normalised comparison.

    Parameters:
        series: column to test; any dtype.
        values: iterable of accepted codes / labels (strings or numbers).

    Returns:
        Boolean Series aligned to `series.index`; True where the element
        matches any entry of `values`.
    """
    def _normalize(value) -> str:
        # 3.0 -> "3"; anything else (NaN, labels, 2.5, ...) -> stripped text.
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            return str(int(value))
        return str(value).strip()

    values = list(values)
    # Accept both spellings of every value so that everything the plain
    # string comparison used to match keeps matching.
    accepted = {str(v) for v in values} | {_normalize(v) for v in values}

    raw_match = series.astype(str).isin(accepted)
    normalized_match = series.map(_normalize).isin(accepted)
    return raw_match | normalized_match

# -------------------------------------------------------------------
# 3. Filter to CURRENT users only
#    Both the code '3' and the label 'Current user' count as current use.
# -------------------------------------------------------------------

# Fail early with a clear message when the status column is absent.
_has_use_type = "CURRENT_USE_TYPE" in df.columns
if not _has_use_type:
    raise ValueError("Column 'CURRENT_USE_TYPE' not found in the dataset.")

current_user_values = ["3", "Current user"]

# Boolean row mask, then an independent copy of the matching rows.
_is_current_user = is_in(df["CURRENT_USE_TYPE"], current_user_values)
df_current = df.loc[_is_current_user].copy()

print("Filtered to current users. Shape:", df_current.shape)

# -------------------------------------------------------------------
# 4. Construct HIGH_RISK_DISCONTINUE target (intention-based only)
#    Labelling rule:
#    - 1 (high risk): some intention variable signals stopping / being unsure
#    - 0 (low risk):  some intention variable signals continuing and none
#                     signals high risk
#    - NaN:           no usable intention signal (row is dropped further down)
# -------------------------------------------------------------------

# 4.1 Signals taken from CONTRACEPTIVE_USE_AND_INTENTION
if "CONTRACEPTIVE_USE_AND_INTENTION" not in df_current.columns:
    # Column missing: it contributes no signal in either direction.
    intention_high_risk = pd.Series(False, index=df_current.index)
    intention_low_risk = pd.Series(False, index=df_current.index)
else:
    high_risk_intention_values = [
        "3",                           # numeric code (example) for "Using but intends to stop"
        "Using but intends to stop",
        "Using but unsure",
    ]
    low_risk_intention_values = [
        "1",                           # numeric code (example) for "Using and intends to continue"
        "Using and intends to continue",
    ]
    _use_and_intention = df_current["CONTRACEPTIVE_USE_AND_INTENTION"]
    intention_high_risk = is_in(_use_and_intention, high_risk_intention_values)
    intention_low_risk = is_in(_use_and_intention, low_risk_intention_values)

# 4.2 Secondary signals taken from INTENTION_USE
if "INTENTION_USE" not in df_current.columns:
    # Column missing: it contributes no signal in either direction.
    intention_use_high_risk = pd.Series(False, index=df_current.index)
    intention_use_low_risk = pd.Series(False, index=df_current.index)
else:
    high_risk_INTENTION_USE_values = [
        "4", "5", "7",   # example codes: "no intention"/"undecided"/"intends to stop"
        "No intention",
        "Intends to stop",
        "Undecided",
    ]
    low_risk_INTENTION_USE_values = [
        "1", "2", "3",
        "Intends to continue",
        "Intends to use",
    ]
    _intention_use = df_current["INTENTION_USE"]
    intention_use_high_risk = is_in(_intention_use, high_risk_INTENTION_USE_values)
    intention_use_low_risk = is_in(_intention_use, low_risk_INTENTION_USE_values)

# A row is flagged when either intention column flags it
# (only intention columns feed the label, never history columns).
any_intention_high_risk = intention_high_risk | intention_use_high_risk
any_intention_low_risk = intention_low_risk | intention_use_low_risk

# 4.3 Final target. np.select takes the first condition that holds, so a row
#     carrying both a high-risk and a low-risk signal is labelled high risk.
target = np.select(
    [any_intention_high_risk.to_numpy(), any_intention_low_risk.to_numpy()],
    [1.0, 0.0],
    default=np.nan,
)

df_current["HIGH_RISK_DISCONTINUE"] = target

print("\nTarget value counts (including NaN):")
print(df_current["HIGH_RISK_DISCONTINUE"].value_counts(dropna=False))

# Modelling sample: only rows whose label could be determined.
_has_label = df_current["HIGH_RISK_DISCONTINUE"].notna()
df_model_d1 = df_current.loc[_has_label].copy()
df_model_d1["HIGH_RISK_DISCONTINUE"] = df_model_d1["HIGH_RISK_DISCONTINUE"].astype(int)

print("\nDesign 1 modeling sample shape:", df_model_d1.shape)
print("Target distribution (%):")
_target_share = df_model_d1["HIGH_RISK_DISCONTINUE"].value_counts(normalize=True)
print((_target_share * 100).round(2))

# -------------------------------------------------------------------
# 5. Define feature set (intention columns are kept out of the features,
#    since the label is built from them)
#    History variables (LAST_METHOD_DISCONTINUED, REASON_DISCONTINUED)
#    are allowed as predictors; they play no part in the label.
# -------------------------------------------------------------------

# Demographic / household predictors
feature_cols_demo = [
    "AGE",
    "REGION",
    "EDUC_LEVEL",
    "RELIGION",
    "ETHNICITY",
    "MARITAL_STATUS",
    "RESIDING_WITH_PARTNER",
    "HOUSEHOLD_HEAD_SEX",
    "OCCUPATION",
    "HUSBANDS_EDUC",
    "HUSBAND_AGE",
    "PARTNER_EDUC",
    "SMOKE_CIGAR",
]

# Fertility predictors
feature_cols_fertility = [
    "PARITY",
    "DESIRE_FOR_MORE_CHILDREN",
    "WANT_LAST_CHILD",
    "WANT_LAST_PREGNANCY",
]

# Method-related predictors
feature_cols_method = [
    "CONTRACEPTIVE_METHOD",
    "MONTH_USE_CURRENT_METHOD",
    "PATTERN_USE",
    "TOLD_ABT_SIDE_EFFECTS",
    "LAST_SOURCE_TYPE",
    "LAST_METHOD_DISCONTINUED",      # history as feature (OK now)
    "REASON_DISCONTINUED",           # history as feature (OK now)
    "HSBND_DESIRE_FOR_MORE_CHILDREN",
]

# Columns that define the target (or are the target) and must never be features
leakage_cols = [
    "CONTRACEPTIVE_USE_AND_INTENTION",
    "INTENTION_USE",
    "HIGH_RISK_DISCONTINUE",
]

all_candidate_features = [
    *feature_cols_demo,
    *feature_cols_fertility,
    *feature_cols_method,
]

# Keep a candidate only when it exists in the data and is not a leakage column.
feature_cols = []
for _name in all_candidate_features:
    if _name in leakage_cols:
        continue
    if _name in df_model_d1.columns:
        feature_cols.append(_name)

print("\nNumber of selected features:", len(feature_cols))
print("Selected feature columns:")
print(feature_cols)

TARGET_D1 = "HIGH_RISK_DISCONTINUE"
X = df_model_d1.loc[:, feature_cols].copy()
y = df_model_d1[TARGET_D1].copy()

print("\nFinal X shape:", X.shape)
print("Final y shape:", y.shape)

# -------------------------------------------------------------------
# 6. Train-test split (stratified)
# -------------------------------------------------------------------

# 80/20 split with a fixed seed; stratify=y keeps the class proportions
# of the target in both parts.
_split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = _split_parts

print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train target distribution (%):")
_train_share = y_train.value_counts(normalize=True)
print((_train_share * 100).round(2))

# -------------------------------------------------------------------
# 7. Identify numeric vs categorical features
# -------------------------------------------------------------------

all_features = X_train.columns.tolist()

# Route every column into exactly one of the two lists, based on its dtype
# in the training frame.
numeric_features = []
categorical_features = []
for _col in all_features:
    if pd.api.types.is_numeric_dtype(X_train[_col]):
        numeric_features.append(_col)
    else:
        categorical_features.append(_col)

print("\nNumeric features:", numeric_features)
print("Categorical features:", categorical_features)

# -------------------------------------------------------------------
# 8. Preprocessing: imputation + one-hot encoding
# -------------------------------------------------------------------

# Numeric columns: fill missing values with the median.
_numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
]
numeric_transformer = Pipeline(steps=_numeric_steps)

# Categorical columns: fill missing values with the most frequent value, then
# one-hot encode; categories unseen during fit are ignored at transform time.
_categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=_categorical_steps)

_column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=_column_routes)

# -------------------------------------------------------------------
# 9. Train XGBoost baseline
# -------------------------------------------------------------------

# Hyper-parameters of the unweighted baseline booster.
_xgb_baseline_params = dict(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
    random_state=42,
)
xgb_clf = XGBClassifier(**_xgb_baseline_params)

# Preprocessing and model chained into a single estimator.
_xgb_steps = [
    ("preprocess", preprocessor),
    ("model", xgb_clf),
]
xgb_pipeline = Pipeline(steps=_xgb_steps)

print("\n=== Training XGBoost baseline ===")
# fit() returns the pipeline itself, so prediction can be chained onto it.
y_pred_xgb = xgb_pipeline.fit(X_train, y_train).predict(X_test)

print("\n=== XGBoost performance (HIGH_RISK_DISCONTINUE) ===")
_xgb_report = classification_report(y_test, y_pred_xgb, digits=3)
print(_xgb_report)
print("Confusion matrix (XGBoost):")
_xgb_confusion = confusion_matrix(y_test, y_pred_xgb)
print(_xgb_confusion)

# -------------------------------------------------------------------
# 10. Train Decision Tree baseline
# -------------------------------------------------------------------

# Shallow tree with large leaves; class_weight="balanced" reweights the
# classes during fitting.
_dt_baseline_params = dict(
    max_depth=5,
    min_samples_leaf=50,
    random_state=42,
    class_weight="balanced",
)
dt_clf = DecisionTreeClassifier(**_dt_baseline_params)

_dt_steps = [
    ("preprocess", preprocessor),
    ("model", dt_clf),
]
dt_pipeline = Pipeline(steps=_dt_steps)

print("\n=== Training Decision Tree baseline ===")
# fit() returns the pipeline itself, so prediction can be chained onto it.
y_pred_dt = dt_pipeline.fit(X_train, y_train).predict(X_test)

print("\n=== Decision Tree performance (HIGH_RISK_DISCONTINUE) ===")
_dt_report = classification_report(y_test, y_pred_dt, digits=3)
print(_dt_report)
print("Confusion matrix (Decision Tree):")
_dt_confusion = confusion_matrix(y_test, y_pred_dt)
print(_dt_confusion)

# -------------------------------------------------------------------
# 11. Hybrid model: XGBoost + Decision Tree override on low-confidence cases
# -------------------------------------------------------------------

print("\n=== Training hybrid components ===")

# Booster used inside the hybrid.
_xgb_hybrid_params = dict(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
    random_state=42,
)
xgb_hybrid = XGBClassifier(**_xgb_hybrid_params)
xgb_pipeline_hybrid = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", xgb_hybrid),
])

# Tree used inside the hybrid.
_dt_hybrid_params = dict(
    max_depth=5,
    min_samples_leaf=50,
    random_state=42,
    class_weight="balanced",
)
dt_hybrid = DecisionTreeClassifier(**_dt_hybrid_params)
dt_pipeline_hybrid = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", dt_hybrid),
])

# Fit the booster first, then the tree.
xgb_pipeline_hybrid.fit(X_train, y_train)
dt_pipeline_hybrid.fit(X_train, y_train)

# XGBoost: probability of class 1 and the label it implies at cut-off 0.5
_xgb_test_proba = xgb_pipeline_hybrid.predict_proba(X_test)
proba_xgb = _xgb_test_proba[:, 1]  # P(y=1)
pred_xgb = (proba_xgb >= 0.5).astype(int)

# Decision Tree: hard labels
pred_dt = dt_pipeline_hybrid.predict(X_test)

# XGBoost counts as "unsure" when its probability lies closer than
# CONF_MARGIN to 0.5; those rows take the Decision Tree label instead.
CONF_MARGIN = 0.1  # tune this: smaller = fewer overrides
confidence_xgb = np.abs(proba_xgb - 0.5)
use_dt = confidence_xgb < CONF_MARGIN

hybrid_pred = np.where(use_dt, pred_dt, pred_xgb)

print("\n=== Hybrid (XGBoost + Decision Tree) performance ===")
_hybrid_report = classification_report(y_test, hybrid_pred, digits=3)
print(_hybrid_report)
print("Confusion matrix (Hybrid):")
_hybrid_confusion = confusion_matrix(y_test, hybrid_pred)
print(_hybrid_confusion)

# Share of test rows whose final label came from the Decision Tree.
override_rate = use_dt.mean() * 100
print(f"\nFraction of test cases overridden by Decision Tree: {override_rate:.2f}%")

"""
You now have:
- df_model_d1: current users with intention-based HIGH_RISK_DISCONTINUE
- X_train, X_test, y_train, y_test
- xgb_pipeline, dt_pipeline, and a hybrid model

Next steps:
- Inspect especially recall/precision for class 1 (high-risk).
- Tune XGBoost, Decision Tree, and CONF_MARGIN for your desired trade-off.
"""

Raw data shape: (6612, 31)
Columns: ['CASEID', 'AGE', 'AGE_GRP', 'REGION', 'EDUC_LEVEL', 'RELIGION', 'ETHNICITY', 'EDUC', 'HOUSEHOLD_HEAD_SEX', 'PARITY', 'CONTRACEPTIVE_METHOD', 'CURRENT_USE_TYPE', 'LAST_SOURCE_TYPE', 'MONTH_USE_CURRENT_METHOD', 'LAST_METHOD_DISCONTINUED', 'REASON_DISCONTINUED', 'PATTERN_USE', 'INTENTION_USE', 'CONTRACEPTIVE_USE_AND_INTENTION', 'WANT_LAST_CHILD', 'WANT_LAST_PREGNANCY', 'TOLD_ABT_SIDE_EFFECTS', 'SMOKE_CIGAR', 'MARITAL_STATUS', 'RESIDING_WITH_PARTNER', 'DESIRE_FOR_MORE_CHILDREN', 'HSBND_DESIRE_FOR_MORE_CHILDREN', 'OCCUPATION', 'HUSBANDS_EDUC', 'HUSBAND_AGE', 'PARTNER_EDUC']
Filtered to current users. Shape: (3521, 31)

Target value counts (including NaN):
HIGH_RISK_DISCONTINUE
0.0    2999
NaN     316
1.0     206
Name: count, dtype: int64

Design 1 modeling sample shape: (3205, 32)
Target distribution (%):
HIGH_RISK_DISCONTINUE
0    93.57
1     6.43
Name: proportion, dtype: float64

Number of selected features: 25
Selected feature columns:
['AGE', 'REGION

'\nYou now have:\n- df_model_d1: current users with intention-based HIGH_RISK_DISCONTINUE\n- X_train, X_test, y_train, y_test\n- xgb_pipeline, dt_pipeline, and a hybrid model\n\nNext steps:\n- Inspect especially recall/precision for class 1 (high-risk).\n- Tune XGBoost, Decision Tree, and CONF_MARGIN for your desired trade-off.\n'

In [6]:
import os

import joblib

# Create the output folders first so the dumps below do not fail when the
# folders are missing (for example on a fresh checkout). This mirrors the
# os.makedirs(..., exist_ok=True) call used by the high-recall export cell.
_processed_dir = '../../data/processed'
_models_dir = '../models'
os.makedirs(_processed_dir, exist_ok=True)
os.makedirs(_models_dir, exist_ok=True)

# Train/test split and the full modelling frame.
joblib.dump((X_train, X_test, y_train, y_test), '../../data/processed/discontinuation_design1_data_v2.pkl')
joblib.dump(df_model_d1, '../../data/processed/discontinuation_design1_full_data_v2.pkl')

# Fitted pipelines: the two hybrid components, then the two baselines.
joblib.dump(xgb_pipeline_hybrid, '../models/discontinuation_design1_xgb_hybrid_model_d1_v2.pkl')
joblib.dump(dt_pipeline_hybrid, '../models/discontinuation_design1_dt_hybrid_model_d1_v2.pkl')
joblib.dump(xgb_pipeline, '../models/discontinuation_design1_xgb_model_d1_v2.pkl')
joblib.dump(dt_pipeline, '../models/discontinuation_design1_dt_model_d1_v2.pkl')

['../models/discontinuation_design1_dt_model_d1_v2.pkl']

In [7]:
"""
Model v2: High-recall variant for HIGH_RISK_DISCONTINUE

Requirements:
- Assumes X_train, X_test, y_train, y_test, preprocessor, numeric_features, categorical_features
  are already defined from the earlier code.
- Does NOT change any earlier variables or models; it creates new ones with suffix `_v2`.

Goals:
- Increase recall for the high-risk class (1), targeting ~0.85 recall.
- Use:
    * class weighting (scale_pos_weight) for XGBoost
    * class_weight for Decision Tree
    * lower decision threshold for XGBoost
    * "upgrade-only" hybrid override by Decision Tree
"""

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import classification_report, confusion_matrix
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

# -------------------------------------------------------------------
# 1. Compute class imbalance and boosted scale_pos_weight
# -------------------------------------------------------------------

# Number of training rows per class.
pos_v2 = y_train.eq(1).sum()
neg_v2 = y_train.eq(0).sum()

# Negative-to-positive ratio; the denominator is floored at 1 so an empty
# positive class cannot cause a division by zero.
_pos_floor = max(pos_v2, 1)
scale_pos_weight_v2 = neg_v2 / _pos_floor

print("Baseline scale_pos_weight:", scale_pos_weight_v2)

# Push the weight 50% higher than the plain ratio to emphasize class 1 more.
scale_pos_weight_v2 = scale_pos_weight_v2 * 1.5
print("Boosted scale_pos_weight (v2):", scale_pos_weight_v2)

# -------------------------------------------------------------------
# 2. Define v2 models (weighted XGBoost + stronger DT for class 1)
# -------------------------------------------------------------------

from sklearn.base import clone

# Each v2 pipeline gets its OWN unfitted copy of the preprocessor.
# Passing the `preprocessor` object itself would make fit() below refit, in
# place, the very instance that the earlier pipelines hold as their
# "preprocess" step, which contradicts this cell's promise not to change
# earlier variables or models.

# XGBoost v2 (weighted)
xgb_clf_v2 = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
    random_state=42,
    scale_pos_weight=scale_pos_weight_v2  # emphasize class 1
)

xgb_pipeline_v2 = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", xgb_clf_v2)
])

# Decision Tree v2 (more depth + class weight)
dt_clf_v2 = DecisionTreeClassifier(
    max_depth=6,             # slightly deeper than v1
    min_samples_leaf=20,     # smaller leaves, more granularity
    random_state=42,
    class_weight={0: 1.0, 1: 3.0}  # emphasize class 1
)

dt_pipeline_v2 = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", dt_clf_v2)
])

# -------------------------------------------------------------------
# 3. Fit v2 models
# -------------------------------------------------------------------

# --- XGBoost v2: fit, then label the test set with the default predict() ---
print("\n=== Training XGBoost v2 (weighted) ===")
y_pred_xgb_v2 = xgb_pipeline_v2.fit(X_train, y_train).predict(X_test)

print("\n=== XGBoost v2 performance (baseline threshold 0.5) ===")
_xgb_v2_report = classification_report(y_test, y_pred_xgb_v2, digits=3)
print(_xgb_v2_report)
print("Confusion matrix (XGBoost v2 @0.5):")
_xgb_v2_confusion = confusion_matrix(y_test, y_pred_xgb_v2)
print(_xgb_v2_confusion)

# --- Decision Tree v2: same procedure ---
print("\n=== Training Decision Tree v2 (weighted) ===")
y_pred_dt_v2 = dt_pipeline_v2.fit(X_train, y_train).predict(X_test)

print("\n=== Decision Tree v2 performance (default threshold) ===")
_dt_v2_report = classification_report(y_test, y_pred_dt_v2, digits=3)
print(_dt_v2_report)
print("Confusion matrix (Decision Tree v2):")
_dt_v2_confusion = confusion_matrix(y_test, y_pred_dt_v2)
print(_dt_v2_confusion)

# -------------------------------------------------------------------
# 4. Threshold scan for XGBoost v2 (no hybrid)
#    to understand recall/precision trade-off on class 1
#
#    NOTE(review): the scan reads X_test / y_test. If the operating
#    threshold is chosen from these reports, the test metrics reported for
#    that threshold are optimistically biased; consider scanning on a
#    validation split (or cross-validated predictions) instead.
# -------------------------------------------------------------------

from sklearn.metrics import classification_report

def evaluate_threshold_xgb_v2(thresh, proba=None):
    """
    Print the classification report of XGBoost v2 on the test set when
    class 1 is predicted for P(y=1) >= thresh.

    Parameters:
        thresh: decision threshold applied to the class-1 probability.
        proba:  optional precomputed class-1 probabilities for X_test.
                When omitted they are computed from xgb_pipeline_v2, so
                calling with `thresh` alone works as before.
    """
    if proba is None:
        proba = xgb_pipeline_v2.predict_proba(X_test)[:, 1]
    pred = (np.asarray(proba) >= thresh).astype(int)
    print(f"\n=== XGBoost v2 @ THRESH={thresh:.2f} ===")
    print(classification_report(y_test, pred, digits=3))

print("\n=== Threshold scan for XGBoost v2 ===")
# The probabilities do not depend on the threshold: compute them once and
# reuse them for every threshold instead of re-running the pipeline each time.
_scan_proba_v2 = xgb_pipeline_v2.predict_proba(X_test)[:, 1]
for t in [0.50, 0.40, 0.30, 0.25, 0.20, 0.15]:
    evaluate_threshold_xgb_v2(t, proba=_scan_proba_v2)

# -------------------------------------------------------------------
# 5. High-recall Hybrid v2: upgrade-only rule
#    Strategy:
#      - Use a lower threshold for XGBoost v2 (e.g., 0.20)
#      - Define a low-confidence zone around that threshold
#      - If in low-confidence zone AND DT says 1, upgrade to 1
# -------------------------------------------------------------------

# Choose a threshold for high recall (adjust after looking at the scan above)
THRESH_V2 = 0.20   # starting point; consider 0.15 if recall still too low
CONF_MARGIN_V2 = 0.15  # low-confidence band around THRESH_V2

# Probabilities from XGBoost v2
proba_xgb_v2 = xgb_pipeline_v2.predict_proba(X_test)[:, 1]  # P(y=1)

# Base XGBoost v2 prediction with lowered threshold
pred_xgb_v2 = (proba_xgb_v2 >= THRESH_V2).astype(int)

# Decision Tree v2 prediction
pred_dt_v2 = dt_pipeline_v2.predict(X_test)

# Confidence relative to THRESH_V2: a case is "low-confidence" when its
# probability lies within CONF_MARGIN_V2 of the threshold (on either side).
confidence_xgb_v2 = np.abs(proba_xgb_v2 - THRESH_V2)
use_dt_v2 = confidence_xgb_v2 < CONF_MARGIN_V2

# UPGRADE-ONLY HYBRID:
#   Start from XGBoost v2 prediction.
#   If XGBoost is low-confidence AND DT predicts 1, set final prediction to 1.
#   Predictions are never downgraded from 1 to 0.
hybrid_pred_v2 = pred_xgb_v2.copy()
hybrid_pred_v2[(use_dt_v2) & (pred_dt_v2 == 1)] = 1

print("\n=== Hybrid v2 (XGBoost v2 + Decision Tree v2, upgrade-only) ===")
print(classification_report(y_test, hybrid_pred_v2, digits=3))
print("Confusion matrix (Hybrid v2):")
print(confusion_matrix(y_test, hybrid_pred_v2))

# Share of test cases inside the low-confidence band. This also counts cases
# XGBoost already labels 1 (nothing to upgrade) and cases where the tree
# predicts 0, so it is an upper bound on the tree's influence.
override_rate_v2 = use_dt_v2.mean() * 100
print(f"\nFraction of test cases where DT v2 can upgrade to 1: {override_rate_v2:.2f}%")

# More precise breakdown of what the upgrade rule actually does.
eligible_v2 = use_dt_v2 & (pred_xgb_v2 == 0)    # in band and currently labelled 0
upgraded_v2 = eligible_v2 & (pred_dt_v2 == 1)   # label really flipped 0 -> 1
print(f"Fraction in band AND below THRESH_V2 (upgrade possible): {eligible_v2.mean() * 100:.2f}%")
print(f"Test cases actually upgraded 0 -> 1 by DT v2: {int(upgraded_v2.sum())} ({upgraded_v2.mean() * 100:.2f}%)")

"""
Interpretation notes (for documentation):

- XGBoost v2 uses boosted scale_pos_weight_v2 to penalize errors on class 1 more.
- Decision Tree v2 uses class_weight={0:1, 1:3} and a deeper tree to better capture
  high-risk patterns.
- The threshold scan shows how recall and precision on class 1 change as we lower THRESH.
- Hybrid v2 starts from the low-threshold XGBoost v2 predictions and only allows
  the Decision Tree v2 to "upgrade" uncertain cases to high-risk (class 1) in a
  low-confidence band around THRESH_V2.

To target a specific recall (e.g., ~0.85), adjust:
- scale_pos_weight_v2 (e.g., try 2x instead of 1.5x),
- THRESH_V2 (e.g., 0.20 → 0.15),
- CONF_MARGIN_V2 (e.g., 0.15 → 0.20),
and re-run this block to compare the resulting classification_report
for class 1 (high-risk).
"""

Baseline scale_pos_weight: 14.539393939393939
Boosted scale_pos_weight (v2): 21.80909090909091

=== Training XGBoost v2 (weighted) ===

=== XGBoost v2 performance (baseline threshold 0.5) ===
              precision    recall  f1-score   support

           0      0.969     0.887     0.926       600
           1      0.261     0.585     0.361        41

    accuracy                          0.867       641
   macro avg      0.615     0.736     0.643       641
weighted avg      0.924     0.867     0.890       641

Confusion matrix (XGBoost v2 @0.5):
[[532  68]
 [ 17  24]]

=== Training Decision Tree v2 (weighted) ===

=== Decision Tree v2 performance (default threshold) ===
              precision    recall  f1-score   support

           0      0.973     0.892     0.930       600
           1      0.286     0.634     0.394        41

    accuracy                          0.875       641
   macro avg      0.629     0.763     0.662       641
weighted avg      0.929     0.875     0.896   

'\nInterpretation notes (for documentation):\n\n- XGBoost v2 uses boosted scale_pos_weight_v2 to penalize errors on class 1 more.\n- Decision Tree v2 uses class_weight={0:1, 1:3} and a deeper tree to better capture\n  high-risk patterns.\n- The threshold scan shows how recall and precision on class 1 change as we lower THRESH.\n- Hybrid v2 starts from the low-threshold XGBoost v2 predictions and only allows\n  the Decision Tree v2 to "upgrade" uncertain cases to high-risk (class 1) in a\n  low-confidence band around THRESH_V2.\n\nTo target a specific recall (e.g., ~0.85), adjust:\n- scale_pos_weight_v2 (e.g., try 2x instead of 1.5x),\n- THRESH_V2 (e.g., 0.20 → 0.15),\n- CONF_MARGIN_V2 (e.g., 0.15 → 0.20),\nand re-run this block to compare the resulting classification_report\nfor class 1 (high-risk).\n'

In [8]:
"""
Model v3: Slightly more aggressive high-recall variant

Goal: Increase recall for class 1 a bit beyond Hybrid v2 (~0.805),
      by lowering THRESH further and widening CONF_MARGIN slightly.
Assumes:
- xgb_pipeline_v2, dt_pipeline_v2, X_test, y_test already defined and fitted (from v2).
"""

from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# More aggressive threshold towards class 1
THRESH_V3 = 0.15       # was 0.20 in v2
CONF_MARGIN_V3 = 0.20  # was 0.15 in v2

# When the margin exceeds the threshold, the band's lower edge
# (THRESH_V3 - CONF_MARGIN_V3) is negative, so EVERY case below the threshold
# is "low-confidence" and the confidence gate no longer filters anything.
if CONF_MARGIN_V3 > THRESH_V3:
    print(
        "NOTE: CONF_MARGIN_V3 > THRESH_V3 - every case below the threshold is inside "
        "the band, so Hybrid v3 reduces to (XGBoost v2 >= THRESH_V3) OR (Decision Tree v2 == 1)."
    )

# Probabilities from XGBoost v2
proba_xgb_v3 = xgb_pipeline_v2.predict_proba(X_test)[:, 1]  # P(y=1)

# Base XGBoost v2 prediction with lower threshold
pred_xgb_v3 = (proba_xgb_v3 >= THRESH_V3).astype(int)

# Decision Tree v2 prediction (same model as in v2)
pred_dt_v3 = dt_pipeline_v2.predict(X_test)

# Confidence relative to THRESH_V3: a case is "low-confidence" when its
# probability lies within CONF_MARGIN_V3 of the threshold (on either side).
confidence_xgb_v3 = np.abs(proba_xgb_v3 - THRESH_V3)
use_dt_v3 = confidence_xgb_v3 < CONF_MARGIN_V3

# UPGRADE-ONLY HYBRID v3:
# Start from XGBoost v2 prediction.
# If XGBoost is low-confidence AND DT predicts 1, set final prediction to 1.
# Predictions are never downgraded from 1 to 0.
hybrid_pred_v3 = pred_xgb_v3.copy()
hybrid_pred_v3[(use_dt_v3) & (pred_dt_v3 == 1)] = 1

print("\n=== Hybrid v3 (more aggressive high-recall) ===")
print(classification_report(y_test, hybrid_pred_v3, digits=3))
print("Confusion matrix (Hybrid v3):")
print(confusion_matrix(y_test, hybrid_pred_v3))

# Share of test cases inside the low-confidence band. This also counts cases
# XGBoost already labels 1 (nothing to upgrade) and cases where the tree
# predicts 0, so it is an upper bound on the tree's influence.
override_rate_v3 = use_dt_v3.mean() * 100
print(f"\nFraction of test cases where DT v2 can upgrade to 1 (v3): {override_rate_v3:.2f}%")

# More precise breakdown of what the upgrade rule actually does.
eligible_v3 = use_dt_v3 & (pred_xgb_v3 == 0)    # in band and currently labelled 0
upgraded_v3 = eligible_v3 & (pred_dt_v3 == 1)   # label really flipped 0 -> 1
print(f"Fraction in band AND below THRESH_V3 (upgrade possible): {eligible_v3.mean() * 100:.2f}%")
print(f"Test cases actually upgraded 0 -> 1 by DT v2 (v3): {int(upgraded_v3.sum())} ({upgraded_v3.mean() * 100:.2f}%)")


=== Hybrid v3 (more aggressive high-recall) ===
              precision    recall  f1-score   support

           0      0.990     0.818     0.896       600
           1      0.248     0.878     0.387        41

    accuracy                          0.822       641
   macro avg      0.619     0.848     0.642       641
weighted avg      0.942     0.822     0.863       641

Confusion matrix (Hybrid v3):
[[491 109]
 [  5  36]]

Fraction of test cases where DT v2 can upgrade to 1 (v3): 82.37%


In [9]:
"""
Export block for high-recall pipelines:
- Renames v2 pipelines to xgb_high_recall and dt_high_recall (used in Hybrid v3).
- Saves them plus a Hybrid v3 config file.
Assumes:
- xgb_pipeline_v2, dt_pipeline_v2, scale_pos_weight_v2, THRESH_V3, CONF_MARGIN_V3
  are already defined and fitted.
"""

import joblib
import json
import os

# -------------------------------------------------------------------
# 1. Alias pipelines with high-recall names
# -------------------------------------------------------------------

# Plain aliases: the same fitted objects under new names (no copy is made).
xgb_high_recall, dt_high_recall = xgb_pipeline_v2, dt_pipeline_v2

# -------------------------------------------------------------------
# 2. Define output directory
# -------------------------------------------------------------------

MODEL_DIR = "models_high_risk_v3"
os.makedirs(MODEL_DIR, exist_ok=True)  # no error when it already exists

# -------------------------------------------------------------------
# 3. Save the fitted high-recall pipelines
# -------------------------------------------------------------------

xgb_path = os.path.join(MODEL_DIR, "xgb_high_recall.joblib")
dt_path = os.path.join(MODEL_DIR, "dt_high_recall.joblib")

# Write both pipelines first, then report where they went.
for _pipeline, _path in ((xgb_high_recall, xgb_path), (dt_high_recall, dt_path)):
    joblib.dump(_pipeline, _path)

print(f"Saved XGBoost high-recall pipeline to: {xgb_path}")
print(f"Saved Decision Tree high-recall pipeline to: {dt_path}")

# -------------------------------------------------------------------
# 4. Save Hybrid v3 configuration (thresholds, weights, names)
# -------------------------------------------------------------------

# Free-text description of the inference procedure, stored with the numbers.
_notes = "".join([
    "xgb_high_recall: XGBoost v2 trained with boosted scale_pos_weight_v2. ",
    "dt_high_recall: DecisionTree v2 trained with class_weight={0:1.0, 1:3.0}. ",
    "Inference steps: ",
    "1) Use xgb_high_recall to get P(y=1). ",
    "2) Base prediction = 1 if P>=threshold_v3 else 0. ",
    "3) Compute |P - threshold_v3|; if < conf_margin_v3 and dt_high_recall predicts 1, ",
    "   then final label = 1 (upgrade-only).",
])

# Keyword order is the key order written to the JSON file.
hybrid_v3_config = dict(
    description="Hybrid v3: high-recall configuration for HIGH_RISK_DISCONTINUE",
    xgb_model_file="xgb_high_recall.joblib",
    dt_model_file="dt_high_recall.joblib",
    scale_pos_weight_v2=float(scale_pos_weight_v2),
    threshold_v3=float(THRESH_V3),          # e.g. 0.15
    conf_margin_v3=float(CONF_MARGIN_V3),   # e.g. 0.20
    hybrid_rule="upgrade_only_if_low_confidence_and_dt_predicts_1",
    target_name="HIGH_RISK_DISCONTINUE",
    notes=_notes,
)

config_path = os.path.join(MODEL_DIR, "hybrid_v3_config.json")
with open(config_path, "w") as f:
    f.write(json.dumps(hybrid_v3_config, indent=2))

print(f"Saved Hybrid v3 config to: {config_path}")

Saved XGBoost high-recall pipeline to: models_high_risk_v3\xgb_high_recall.joblib
Saved Decision Tree high-recall pipeline to: models_high_risk_v3\dt_high_recall.joblib
Saved Hybrid v3 config to: models_high_risk_v3\hybrid_v3_config.json
