In [2]:
"""
Design 1 (updated, no leakage from history into label):
- Population: CURRENT contraceptive users
- Target: HIGH_RISK_DISCONTINUE, defined ONLY from intention variables
- Features: demographics, fertility, method characteristics, history (as predictors only)
"""

import numpy as np
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

from xgboost import XGBClassifier

# -------------------------------------------------------------------
# 1. Load data
# -------------------------------------------------------------------

# Location of the merged dataset CSV, relative to the notebook's working
# directory (adjust if needed).
DATA_PATH = "../../data/interim/merged_dataset.csv"
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Sanity check of what was loaded: dimensions first, then the column names.
raw_shape = df.shape
raw_columns = df.columns.tolist()
print("Raw data shape:", raw_shape)
print("Columns:", raw_columns)

# -------------------------------------------------------------------
# 2. Helper: safe membership check on mixed code/label columns
# -------------------------------------------------------------------

def is_in(series: pd.Series, values) -> pd.Series:
    """
    Robust membership check that works if the column has mixed types
    (e.g. numeric codes and string labels).

    Column entries and candidate ``values`` are compared through a canonical
    string form: surrounding whitespace is stripped and integer-valued floats
    are rendered without the trailing ``.0``. pandas stores integer codes as
    floats as soon as the column also holds missing values, so without this
    step a code ``3`` held as ``3.0`` would become ``"3.0"`` and never match
    ``"3"``.

    Parameters
    ----------
    series : pd.Series
        Column to test; may mix numbers, strings and missing values.
    values : iterable
        Accepted codes and/or labels.

    Returns
    -------
    pd.Series
        Boolean mask aligned with ``series.index``; True where the entry's
        canonical form equals the canonical form of one of ``values``.
    """
    def _canonical(value) -> str:
        # 3.0 -> "3". NaN and +/-inf are not integer-valued, so they fall
        # through to the plain string conversion ("nan", "inf").
        if isinstance(value, (float, np.floating)) and float(value).is_integer():
            return str(int(value))
        return str(value).strip()

    accepted = {_canonical(v) for v in values}
    # Convert to object dtype first so every entry (including categorical or
    # extension-typed ones) is mapped element by element.
    return series.astype(object).map(_canonical).isin(accepted)

# -------------------------------------------------------------------
# 3. Filter to CURRENT users only
#    Treat '3' and 'Current user' as current user status.
# -------------------------------------------------------------------

# Guard clause: the filter below cannot work without the use-type column.
if "CURRENT_USE_TYPE" not in df.columns:
    raise ValueError("Column 'CURRENT_USE_TYPE' not found in the dataset.")

# The status may appear either as a code or as its text label.
current_user_values = ["3", "Current user"]
is_current_user = is_in(df["CURRENT_USE_TYPE"], current_user_values)

# Independent copy so later column assignments do not touch `df`.
df_current = df.loc[is_current_user].copy()

print("Filtered to current users. Shape:", df_current.shape)

# -------------------------------------------------------------------
# 4. Construct HIGH_RISK_DISCONTINUE target (intention-based only)
#    Logic:
#    - High risk (1) if intention suggests "using but intends to stop" or similar
#    - Low risk (0) if intention suggests "using and intends to continue"
#    - Others => NaN (dropped later)
#    - A high-risk signal in either column wins over a low-risk signal
# -------------------------------------------------------------------

# The label is derived from these two columns only. If neither is present,
# every row would be labelled NaN and the modeling sample would silently end
# up empty, so fail fast in the same way as the CURRENT_USE_TYPE check.
intention_source_cols = ["CONTRACEPTIVE_USE_AND_INTENTION", "INTENTION_USE"]
if not any(col in df_current.columns for col in intention_source_cols):
    raise ValueError(
        "Neither 'CONTRACEPTIVE_USE_AND_INTENTION' nor 'INTENTION_USE' "
        "found in the dataset."
    )

# 4.1 Intention-based risk using CONTRACEPTIVE_USE_AND_INTENTION
# NOTE(review): the numeric codes are marked as examples; confirm them
# against the codebook of the dataset before trusting the label.
if "CONTRACEPTIVE_USE_AND_INTENTION" in df_current.columns:
    high_risk_intention_values = [
        "3",                           # numeric code (example) for "Using but intends to stop"
        "Using but intends to stop",
        "Using but unsure",
    ]
    low_risk_intention_values = [
        "1",                           # numeric code (example) for "Using and intends to continue"
        "Using and intends to continue",
    ]
    intention_high_risk = is_in(df_current["CONTRACEPTIVE_USE_AND_INTENTION"],
                                high_risk_intention_values)
    intention_low_risk = is_in(df_current["CONTRACEPTIVE_USE_AND_INTENTION"],
                               low_risk_intention_values)
else:
    # Column absent: it contributes no signal in either direction.
    intention_high_risk = pd.Series(False, index=df_current.index)
    intention_low_risk = pd.Series(False, index=df_current.index)

# 4.2 Additional INTENTION_USE-based risk (secondary hint)
if "INTENTION_USE" in df_current.columns:
    high_risk_INTENTION_USE_values = [
        "4", "5", "7",   # example codes: "no intention"/"undecided"/"intends to stop"
        "No intention",
        "Intends to stop",
        "Undecided",
    ]
    low_risk_INTENTION_USE_values = [
        "1", "2", "3",
        "Intends to continue",
        "Intends to use",
    ]
    intention_use_high_risk = is_in(df_current["INTENTION_USE"],
                                    high_risk_INTENTION_USE_values)
    intention_use_low_risk = is_in(df_current["INTENTION_USE"],
                                   low_risk_INTENTION_USE_values)
else:
    # Column absent: it contributes no signal in either direction.
    intention_use_high_risk = pd.Series(False, index=df_current.index)
    intention_use_low_risk = pd.Series(False, index=df_current.index)

# Combine intention signals (ONLY intentions, no history in label)
any_intention_high_risk = intention_high_risk | intention_use_high_risk
any_intention_low_risk = intention_low_risk | intention_use_low_risk

# 4.3 Final target: HIGH_RISK_DISCONTINUE (intention-based)
# The inner np.where is only consulted where the high-risk mask is False, so
# it does not need to re-test "not high risk".
target = np.where(
    any_intention_high_risk,
    1,
    np.where(any_intention_low_risk, 0, np.nan),
)

df_current["HIGH_RISK_DISCONTINUE"] = target

print("\nTarget value counts (including NaN):")
print(df_current["HIGH_RISK_DISCONTINUE"].value_counts(dropna=False))

# Keep only rows with defined target
df_model_d1 = df_current.dropna(subset=["HIGH_RISK_DISCONTINUE"]).copy()

# A classifier needs both classes. An empty or single-class sample means the
# codes/labels above did not match this dataset, so stop with a clear message
# instead of failing later inside the split or the model fit.
if df_model_d1["HIGH_RISK_DISCONTINUE"].nunique() < 2:
    raise ValueError(
        "HIGH_RISK_DISCONTINUE needs both classes (0 and 1) among the "
        "labelled rows; check the intention codes/labels."
    )

df_model_d1["HIGH_RISK_DISCONTINUE"] = df_model_d1["HIGH_RISK_DISCONTINUE"].astype(int)

print("\nDesign 1 modeling sample shape:", df_model_d1.shape)
print("Target distribution (%):")
print((df_model_d1["HIGH_RISK_DISCONTINUE"].value_counts(normalize=True) * 100).round(2))

# -------------------------------------------------------------------
# 5. Define feature set (no intention columns to avoid label leakage)
#    History variables (LAST_METHOD_DISCONTINUED, REASON_DISCONTINUED) are
#    allowed as predictors; they play no part in the label definition.
# -------------------------------------------------------------------

# Demographic block.
feature_cols_demo = [
    "AGE",
    "REGION",
    "EDUC_LEVEL",
    "RELIGION",
    "ETHNICITY",
    "MARITAL_STATUS",
    "RESIDING_WITH_PARTNER",
    "HOUSEHOLD_HEAD_SEX",
    "OCCUPATION",
    "HUSBANDS_EDUC",
    "HUSBAND_AGE",
    "PARTNER_EDUC",
    "SMOKE_CIGAR",
]

# Fertility block.
feature_cols_fertility = [
    "PARITY",
    "DESIRE_FOR_MORE_CHILDREN",
    "WANT_LAST_CHILD",
    "WANT_LAST_PREGNANCY",
]

# Method characteristics and history block.
feature_cols_method = [
    "CONTRACEPTIVE_METHOD",
    "MONTH_USE_CURRENT_METHOD",
    "PATTERN_USE",
    "TOLD_ABT_SIDE_EFFECTS",
    "LAST_SOURCE_TYPE",
    "LAST_METHOD_DISCONTINUED",      # history, predictor only
    "REASON_DISCONTINUED",           # history, predictor only
    "HSBND_DESIRE_FOR_MORE_CHILDREN",
]

# Never usable as predictors: the two columns the label is built from, and
# the label itself.
leakage_cols = [
    "CONTRACEPTIVE_USE_AND_INTENTION",
    "INTENTION_USE",
    "HIGH_RISK_DISCONTINUE",
]

all_candidate_features = [
    *feature_cols_demo,
    *feature_cols_fertility,
    *feature_cols_method,
]

# Keep a candidate only when it exists in the modeling frame and is not a
# leakage column; the candidate order is preserved.
feature_cols = [
    name
    for name in all_candidate_features
    if not (name in leakage_cols or name not in df_model_d1.columns)
]

print("\nNumber of selected features:", len(feature_cols))
print("Selected feature columns:")
print(feature_cols)

TARGET_D1 = "HIGH_RISK_DISCONTINUE"

# Independent copies, so later steps cannot alter df_model_d1 by accident.
X = df_model_d1.loc[:, feature_cols].copy()
y = df_model_d1.loc[:, TARGET_D1].copy()

print("\nFinal X shape:", X.shape)
print("Final y shape:", y.shape)

# -------------------------------------------------------------------
# 6. Train-test split (stratified)
# -------------------------------------------------------------------

# 80/20 hold-out with a fixed seed; stratifying on y keeps the class
# proportions the same in both parts.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

print("\nTrain shape:", X_train.shape, "Test shape:", X_test.shape)
print("Train target distribution (%):")
train_class_share = y_train.value_counts(normalize=True) * 100
print(train_class_share.round(2))

# -------------------------------------------------------------------
# 7. Identify numeric vs categorical features
# -------------------------------------------------------------------

all_features = list(X_train.columns)

# One dtype test per column; anything pandas does not report as numeric is
# treated as categorical.
is_numeric_column = {
    name: pd.api.types.is_numeric_dtype(X_train[name])
    for name in all_features
}
numeric_features = [name for name, flag in is_numeric_column.items() if flag]
categorical_features = [name for name, flag in is_numeric_column.items() if not flag]

print("\nNumeric features:", numeric_features)
print("Categorical features:", categorical_features)

# -------------------------------------------------------------------
# 8. Preprocessing: imputation + one-hot encoding
# -------------------------------------------------------------------

# Numeric columns: fill gaps with the column median.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" means categories not seen during fit do not
# raise an error at transform time.
categorical_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group to its own transformer.
column_routes = [
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

# -------------------------------------------------------------------
# 9. Train XGBoost baseline
# -------------------------------------------------------------------

# The positive class is rare, and the Decision Tree baseline already
# compensates with class_weight="balanced". Give XGBoost the matching
# correction: weight positives by (#negatives / #positives), the value
# suggested in the XGBoost parameter docs for unbalanced classes.
n_negative_train = int((y_train == 0).sum())
n_positive_train = int((y_train == 1).sum())
xgb_scale_pos_weight = (
    n_negative_train / n_positive_train if n_positive_train else 1.0
)

xgb_clf = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="binary:logistic",
    tree_method="hist",
    eval_metric="logloss",
    random_state=42,
    scale_pos_weight=xgb_scale_pos_weight,
)

# Pipeline.fit fits its steps in place, so handing the same `preprocessor`
# object to several pipelines would make them share (and overwrite) one
# fitted transformer. clone() gives this pipeline its own unfitted copy.
xgb_pipeline = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", xgb_clf)
])

print("\n=== Training XGBoost baseline ===")
xgb_pipeline.fit(X_train, y_train)
y_pred_xgb = xgb_pipeline.predict(X_test)

print("\n=== XGBoost performance (HIGH_RISK_DISCONTINUE) ===")
print(classification_report(y_test, y_pred_xgb, digits=3))
print("Confusion matrix (XGBoost):")
print(confusion_matrix(y_test, y_pred_xgb))

# -------------------------------------------------------------------
# 10. Train Decision Tree baseline
# -------------------------------------------------------------------

# Shallow tree with large leaves; class_weight="balanced" re-weights the
# classes inversely to their frequency in y_train.
dt_clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=50,
    random_state=42,
    class_weight="balanced"
)

# Pipeline.fit fits its steps in place, so handing the same `preprocessor`
# object to several pipelines would make them share (and overwrite) one
# fitted transformer. clone() gives this pipeline its own unfitted copy.
dt_pipeline = Pipeline(steps=[
    ("preprocess", clone(preprocessor)),
    ("model", dt_clf)
])

print("\n=== Training Decision Tree baseline ===")
dt_pipeline.fit(X_train, y_train)
y_pred_dt = dt_pipeline.predict(X_test)

print("\n=== Decision Tree performance (HIGH_RISK_DISCONTINUE) ===")
print(classification_report(y_test, y_pred_dt, digits=3))
print("Confusion matrix (Decision Tree):")
print(confusion_matrix(y_test, y_pred_dt))

# -------------------------------------------------------------------
# 11. Hybrid model: XGBoost + Decision Tree override on low-confidence cases
# -------------------------------------------------------------------

print("\n=== Training hybrid components ===")

# The hybrid components use the same configuration as the two baselines.
# clone() returns unfitted deep copies carrying identical hyperparameters,
# so the settings live in one place and each hybrid pipeline owns its
# preprocessor instead of sharing one fitted-in-place object.
xgb_pipeline_hybrid = clone(xgb_pipeline)
dt_pipeline_hybrid = clone(dt_pipeline)

# Handles to the final estimators, kept under their established names.
xgb_hybrid = xgb_pipeline_hybrid.named_steps["model"]
dt_hybrid = dt_pipeline_hybrid.named_steps["model"]

xgb_pipeline_hybrid.fit(X_train, y_train)
dt_pipeline_hybrid.fit(X_train, y_train)

# Probabilities from XGBoost
proba_xgb = xgb_pipeline_hybrid.predict_proba(X_test)[:, 1]  # P(y=1)
pred_xgb = (proba_xgb >= 0.5).astype(int)

# Predictions from Decision Tree
pred_dt = dt_pipeline_hybrid.predict(X_test)

# Confidence margin for hybrid switching: XGBoost is considered unsure when
# its probability lies within CONF_MARGIN of the 0.5 decision threshold.
# Smaller margin = fewer overrides. Tune it on a validation split or with
# cross-validation on the training data, not on X_test, otherwise the
# reported test metrics become optimistic.
CONF_MARGIN = 0.1
confidence_xgb = np.abs(proba_xgb - 0.5)
use_dt = confidence_xgb < CONF_MARGIN

# Take the tree's prediction where XGBoost is unsure, XGBoost's otherwise.
hybrid_pred = np.where(use_dt, pred_dt, pred_xgb)

print("\n=== Hybrid (XGBoost + Decision Tree) performance ===")
print(classification_report(y_test, hybrid_pred, digits=3))
print("Confusion matrix (Hybrid):")
print(confusion_matrix(y_test, hybrid_pred))

override_rate = use_dt.mean() * 100
print(f"\nFraction of test cases overridden by Decision Tree: {override_rate:.2f}%")

"""
You now have:
- df_model_d1: current users with intention-based HIGH_RISK_DISCONTINUE
- X_train, X_test, y_train, y_test
- xgb_pipeline, dt_pipeline, and a hybrid model

Next steps:
- Inspect especially recall/precision for class 1 (high-risk).
- Tune XGBoost, Decision Tree, and CONF_MARGIN for your desired trade-off.
"""

Raw data shape: (6612, 31)
Columns: ['CASEID', 'AGE', 'AGE_GRP', 'REGION', 'EDUC_LEVEL', 'RELIGION', 'ETHNICITY', 'EDUC', 'HOUSEHOLD_HEAD_SEX', 'PARITY', 'CONTRACEPTIVE_METHOD', 'CURRENT_USE_TYPE', 'LAST_SOURCE_TYPE', 'MONTH_USE_CURRENT_METHOD', 'LAST_METHOD_DISCONTINUED', 'REASON_DISCONTINUED', 'PATTERN_USE', 'INTENTION_USE', 'CONTRACEPTIVE_USE_AND_INTENTION', 'WANT_LAST_CHILD', 'WANT_LAST_PREGNANCY', 'TOLD_ABT_SIDE_EFFECTS', 'SMOKE_CIGAR', 'MARITAL_STATUS', 'RESIDING_WITH_PARTNER', 'DESIRE_FOR_MORE_CHILDREN', 'HSBND_DESIRE_FOR_MORE_CHILDREN', 'OCCUPATION', 'HUSBANDS_EDUC', 'HUSBAND_AGE', 'PARTNER_EDUC']
Filtered to current users. Shape: (3521, 31)

Target value counts (including NaN):
HIGH_RISK_DISCONTINUE
0.0    2999
NaN     316
1.0     206
Name: count, dtype: int64

Design 1 modeling sample shape: (3205, 32)
Target distribution (%):
HIGH_RISK_DISCONTINUE
0    93.57
1     6.43
Name: proportion, dtype: float64

Number of selected features: 25
Selected feature columns:
['AGE', 'REGION

'\nYou now have:\n- df_model_d1: current users with intention-based HIGH_RISK_DISCONTINUE\n- X_train, X_test, y_train, y_test\n- xgb_pipeline, dt_pipeline, and a hybrid model\n\nNext steps:\n- Inspect especially recall/precision for class 1 (high-risk).\n- Tune XGBoost, Decision Tree, and CONF_MARGIN for your desired trade-off.\n'

In [3]:
import joblib

# Data artifacts: the train/test split (as one tuple) and the full labelled
# modeling frame.
split_bundle = (X_train, X_test, y_train, y_test)
joblib.dump(split_bundle, filename='../../data/processed/discontinuation_design1_data_v2.pkl')
joblib.dump(df_model_d1, filename='../../data/processed/discontinuation_design1_full_data_v2.pkl')

# Model artifacts: the two hybrid components first, then the two baselines.
joblib.dump(xgb_pipeline_hybrid, filename='../models/discontinuation_design1_xgb_hybrid_model_d1_v2.pkl')
joblib.dump(dt_pipeline_hybrid, filename='../models/discontinuation_design1_dt_hybrid_model_d1_v2.pkl')
joblib.dump(xgb_pipeline, filename='../models/discontinuation_design1_xgb_model_d1_v2.pkl')
# Left as the cell's final expression, so its return value (the list of
# written file names) is shown as the cell output.
joblib.dump(dt_pipeline, filename='../models/discontinuation_design1_dt_model_d1_v2.pkl')

['../models/discontinuation_design1_dt_model_d1_v2.pkl']