In [None]:
# Data Preprocessing Notebook

# This notebook is designed to preprocess the dataset for machine learning tasks. It includes steps for data cleaning, feature engineering, and model training.

In [None]:
# Step 0: Imports & basic config

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix

from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier

pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 200)
sns.set_theme(style="whitegrid", context="notebook")

In [None]:
# Step 1: Load Data

DATA_PATH = "../../data/interim/merged_dataset.csv"

df = pd.read_csv(DATA_PATH)

In [None]:
# Step 2: Standardize mixed code/label columns

# Example mapping dictionaries â€“ **adjust to your exact coding scheme**

educ_level_map = {
    "0": 0, "1": 1, "2": 2, "3": 3, "4": 4, "5": 5,
    "Primary": 1, "Elementary": 1,
    "Secondary": 3,
    "College": 5, "University": 5
}

# ... (other mapping dictionaries) ...

  return col.replace(mapping)
  return col.replace(mapping)
  return col.replace(mapping)
  return col.replace(mapping)
  return col.replace(mapping)


Unnamed: 0,EDUC_LEVEL,HOUSEHOLD_HEAD_SEX,CURRENT_USE_TYPE,PATTERN_USE,CONTRACEPTIVE_USE_AND_INTENTION,WANT_LAST_CHILD,WANT_LAST_PREGNANCY,TOLD_ABT_SIDE_EFFECTS,SMOKE_CIGAR,RESIDING_WITH_PARTNER
0,2.0,1,0,4.0,4,1.0,1,,0,1.0
1,2.0,1,0,4.0,4,1.0,1,,0,1.0
2,1.0,1,3,1.0,1,1.0,1,1.0,0,1.0
3,2.0,1,0,4.0,4,1.0,1,,0,1.0
4,1.0,1,0,4.0,4,1.0,1,,1,1.0


In [None]:
# Step 4: Target selection for this notebook (you can change)
TARGET = "CONTRACEPTIVE_USE_AND_INTENTION"  # or "CONTRACEPTIVE_METHOD", "CURRENT_USE_TYPE"

# Drop rows with missing target (simplest choice)
df_model = df_std.dropna(subset=[TARGET]).copy()
print("Shape after dropping rows with missing target:", df_model.shape)

Shape after dropping rows with missing target: (6612, 31)


In [None]:
# Step 5: Identify column types after standardization

numeric_features = [
    "AGE", "AGE_GRP", "REGION", "RELIGION", "ETHNICITY", "EDUC",
    "HOUSEHOLD_HEAD_SEX", "PARITY", "CONTRACEPTIVE_METHOD",
    "CURRENT_USE_TYPE", "PATTERN_USE", "CONTRACEPTIVE_USE_AND_INTENTION",
    "WANT_LAST_PREGNANCY", "SMOKE_CIGAR", "MARITAL_STATUS",
    "DESIRE_FOR_MORE_CHILDREN", "OCCUPATION"
]

# Keep only numeric columns that actually exist
numeric_features = [c for c in numeric_features if c in df_model.columns]

categorical_features = [
    "CASEID", "LAST_SOURCE_TYPE", "MONTH_USE_CURRENT_METHOD",
    "LAST_METHOD_DISCONTINUED", "REASON_DISCONTINUED",
    "INTENTION_USE", "TOLD_ABT_SIDE_EFFECTS", "RESIDING_WITH_PARTNER",
    "HSBND_DESIRE_FOR_MORE_CHILDREN", "HUSBANDS_EDUC", "HUSBAND_AGE",
    "PARTNER_EDUC"
]

Numeric features: ['AGE', 'AGE_GRP', 'REGION', 'RELIGION', 'ETHNICITY', 'EDUC', 'HOUSEHOLD_HEAD_SEX', 'PARITY', 'CONTRACEPTIVE_METHOD', 'CURRENT_USE_TYPE', 'PATTERN_USE', 'CONTRACEPTIVE_USE_AND_INTENTION', 'WANT_LAST_PREGNANCY', 'SMOKE_CIGAR', 'MARITAL_STATUS', 'DESIRE_FOR_MORE_CHILDREN', 'OCCUPATION']
Categorical features: ['CASEID', 'LAST_SOURCE_TYPE', 'MONTH_USE_CURRENT_METHOD', 'LAST_METHOD_DISCONTINUED', 'REASON_DISCONTINUED', 'INTENTION_USE', 'TOLD_ABT_SIDE_EFFECTS', 'RESIDING_WITH_PARTNER', 'HSBND_DESIRE_FOR_MORE_CHILDREN', 'HUSBANDS_EDUC', 'HUSBAND_AGE', 'PARTNER_EDUC']


In [None]:
# Step 6: Mode imputation for some key columns if you want to do it upfront

def fill_mode(col):
    mode_val = col.mode(dropna=True)
    if len(mode_val) == 0:
        return col
    return col.fillna(mode_val.iloc[0])

for col in ["CASEID", "LAST_SOURCE_TYPE", "INTENTION_USE", "TOLD_ABT_SIDE_EFFECTS",
            "LAST_METHOD_DISCONTINUED", "CONTRACEPTIVE_METHOD", "AGE_GRP"]:
    if col in df_model.columns:
        df_model[col] = fill_mode(df_model[col])

# Display missing values after imputation

display(df_model[["CASEID", "AGE_GRP", "LAST_SOURCE_TYPE", "INTENTION_USE",
                  "TOLD_ABT_SIDE_EFFECTS", "LAST_METHOD_DISCONTINUED",
                  "CONTRACEPTIVE_METHOD"]].isna().sum())

CASEID                      0
AGE_GRP                     0
LAST_SOURCE_TYPE            0
INTENTION_USE               0
TOLD_ABT_SIDE_EFFECTS       0
LAST_METHOD_DISCONTINUED    0
CONTRACEPTIVE_METHOD        0
dtype: int64

In [None]:
# Step 7: Rare category handling

def group_rare_categories(series, min_freq=0.02):
    freqs = series.value_counts(normalize=True)
    rare_cats = freqs[freqs < min_freq].index
    return series.where(~series.isin(rare_cats), other="Other")

high_cardinality_cols = [
    "CASEID", "MONTH_USE_CURRENT_METHOD", "REASON_DISCONTINUED",
    "HUSBAND_AGE", "OCCUPATION"
]

for col in high_cardinality_cols:
    if col in df_model.columns:
        df_model[col] = group_rare_categories(df_model[col].astype(str), min_freq=0.02)

# Display the updated dataframe

display(df_model[high_cardinality_cols].head())

Unnamed: 0,CASEID,MONTH_USE_CURRENT_METHOD,REASON_DISCONTINUED,HUSBAND_AGE,OCCUPATION
0,Other,,,27,0
1,Other,,,Other,91
2,Other,Other,,25,0
3,Other,,,24,0
4,Other,,,32,0


In [None]:
# Step 8: Drop highly correlated features where necessary

# Example: drop AGE_GRP if both AGE and AGE_GRP present
if "AGE" in df_model.columns and "AGE_GRP" in df_model.columns:
    df_model = df_model.drop(columns=["AGE_GRP"])

# If target is CONTRACEPTIVE_USE_AND_INTENTION, remove highly correlated CURRENT_USE_TYPE
if TARGET == "CONTRACEPTIVE_USE_AND_INTENTION" and "CURRENT_USE_TYPE" in df_model.columns:
    df_model = df_model.drop(columns=["CURRENT_USE_TYPE"])


In [None]:
# Step 9: Reconfirm features after drops
numeric_features = [c for c in numeric_features if c in df_model.columns and c != TARGET]
categorical_features = [c for c in categorical_features if c in df_model.columns and c != TARGET]

print("Final numeric features:", numeric_features)
print("Final categorical features:", categorical_features)

Final numeric features: ['AGE', 'REGION', 'RELIGION', 'ETHNICITY', 'EDUC', 'HOUSEHOLD_HEAD_SEX', 'PARITY', 'CONTRACEPTIVE_METHOD', 'PATTERN_USE', 'WANT_LAST_PREGNANCY', 'SMOKE_CIGAR', 'MARITAL_STATUS', 'DESIRE_FOR_MORE_CHILDREN', 'OCCUPATION']
Final categorical features: ['CASEID', 'LAST_SOURCE_TYPE', 'MONTH_USE_CURRENT_METHOD', 'LAST_METHOD_DISCONTINUED', 'REASON_DISCONTINUED', 'INTENTION_USE', 'TOLD_ABT_SIDE_EFFECTS', 'RESIDING_WITH_PARTNER', 'HSBND_DESIRE_FOR_MORE_CHILDREN', 'HUSBANDS_EDUC', 'HUSBAND_AGE', 'PARTNER_EDUC']


In [None]:
# Step 10: Define preprocessing transformers

numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median"))
    # no scaler needed for tree-based models
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

In [None]:
# Step 11: Train-test split with stratification

X = df_model.drop(columns=[TARGET])
y = df_model[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)
print("Train target distribution:")
display(y_train.value_counts(normalize=True) * 100)

Train shape: (5289, 28)
Test shape: (1323, 28)
Train target distribution:


CONTRACEPTIVE_USE_AND_INTENTION
1    45.698620
4    24.522594
2    15.106825
3    14.671961
Name: proportion, dtype: float64

In [18]:
import joblib
joblib.dump(df_model, "../../data/processed/preprocessed_full_data.pkl")
joblib.dump((X_train, X_test, y_train, y_test), "../../data/processed/train_test_data.pkl")

['../../data/processed/train_test_data.pkl']

In [14]:
# Step 12: XGBoost pipeline

xgb_clf = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",  # or 'binary:logistic' for binary targets
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42
)

xgb_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", xgb_clf)
])

# xgb_pipeline.fit(X_train, y_train)
# y_pred_xgb = xgb_pipeline.predict(X_test)

# print("=== XGBoost performance ===")
# print(classification_report(y_test, y_pred_xgb))

In [15]:
# Step 13: Decision Tree pipeline

dt_clf = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=50,
    random_state=42,
    class_weight="balanced"  # helpful for imbalance
)

dt_pipeline = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", dt_clf)
])

# dt_pipeline.fit(X_train, y_train)
# y_pred_dt = dt_pipeline.predict(X_test)

# print("=== Decision Tree performance ===")
# print(classification_report(y_test, y_pred_dt))

In [16]:
# Step 14: Simple hybrid combiner

# Make sure we can handle non-numeric labels consistently
# le = LabelEncoder()
# y_train_enc = le.fit_transform(y_train)
# y_test_enc = le.transform(y_test)

# Refit XGBoost with encoded labels, if needed
xgb_clf_hybrid = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    objective="multi:softprob",
    tree_method="hist",
    eval_metric="mlogloss",
    random_state=42
)

xgb_pipeline_hybrid = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", xgb_clf_hybrid)
])

# xgb_pipeline_hybrid.fit(X_train, y_train_enc)

dt_clf_hybrid = DecisionTreeClassifier(
    max_depth=5,
    min_samples_leaf=50,
    random_state=42,
    class_weight="balanced"
)

dt_pipeline_hybrid = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", dt_clf_hybrid)
])

# dt_pipeline_hybrid.fit(X_train, y_train_enc)

# Get probabilities from XGBoost
# proba_xgb = xgb_pipeline_hybrid.predict_proba(X_test)
# pred_xgb_enc = proba_xgb.argmax(axis=1)
# conf_xgb = proba_xgb.max(axis=1)

# # Tree predictions
# pred_dt_enc = dt_pipeline_hybrid.predict(X_test)

# # Hybrid: if XGBoost confidence < threshold, use Decision Tree
# CONF_THRESHOLD = 0.6
# hybrid_pred_enc = np.where(conf_xgb < CONF_THRESHOLD, pred_dt_enc, pred_xgb_enc)

# # Decode back to original labels
# hybrid_pred = le.inverse_transform(hybrid_pred_enc)

# print("=== Hybrid (XGBoost + Decision Tree) performance ===")
# print(classification_report(y_test, hybrid_pred))

In [17]:
import joblib

joblib.dump(preprocessor, "preprocessor.joblib")
joblib.dump(xgb_pipeline, "xgb_pipeline.joblib")
joblib.dump(dt_pipeline, "dt_pipeline.joblib")
joblib.dump(xgb_pipeline_hybrid, "xgb_pipeline_hybrid.joblib")
joblib.dump(dt_pipeline_hybrid, "dt_pipeline_hybrid.joblib")

['dt_pipeline_hybrid.joblib']