**1: Import Libraries and Setup**

In [None]:
# Core libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.preprocessing import (
    StandardScaler,
    MinMaxScaler,
    RobustScaler,
    LabelEncoder,
)
from sklearn.metrics import classification_report
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import tensorflow as tf


# Scikit-learn utilities

import warnings

# Configure settings
warnings.filterwarnings("ignore")
plt.style.use("seaborn-v0_8-whitegrid")
sns.set_palette("husl")

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)
tf.random.set_seed(42)

print("✅ All libraries imported successfully!")
print("🎯 Week 2: Feature Engineering & Deep Learning Preparation")
print("=" * 70)


✅ All libraries imported successfully!
🎯 Week 2: Feature Engineering & Deep Learning Preparation


**Import dataset again (same as week 1)**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ucimlrepo import fetch_ucirepo

sns.set_style("whitegrid")

# fetch dataset
datset = fetch_ucirepo(id=891)  # CDC Diabetes Health Indicators
# data (as pandas dataframes)
X = datset.data.features  # feature set
y = datset.data.targets  # target variable
# combine features and target into a single dataframe for easier analysis
df = pd.concat([X, y], axis=1)
# df.head()
# Programmatically merge df columns with datset.variables metadata
try:
    if (
        isinstance(datset.variables, pd.DataFrame)
        and "name" in datset.variables.columns
    ):
        df_columns = pd.DataFrame({"name": df.columns})
        merged = pd.merge(datset.variables, df_columns, on="name", how="right")
        pd.set_option("display.max_colwidth", True)
        display(merged)
    else:
        print("datset.variables is not a DataFrame with a 'name' column.")
        print(datset.variables)
except Exception as e:
    print("Error combining variables:", e)


Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,HighBP,Feature,Binary,,0 = no high BP 1 = high BP,,no
1,HighChol,Feature,Binary,,0 = no high cholesterol 1 = high cholesterol,,no
2,CholCheck,Feature,Binary,,0 = no cholesterol check in 5 years 1 = yes cholesterol check in 5 years,,no
3,BMI,Feature,Integer,,Body Mass Index,,no
4,Smoker,Feature,Binary,,Have you smoked at least 100 cigarettes in your entire life? [Note: 5 packs = 100 cigarettes] 0 = no 1 = yes,,no
5,Stroke,Feature,Binary,,(Ever told) you had a stroke. 0 = no 1 = yes,,no
6,HeartDiseaseorAttack,Feature,Binary,,coronary heart disease (CHD) or myocardial infarction (MI) 0 = no 1 = yes,,no
7,PhysActivity,Feature,Binary,,physical activity in past 30 days - not including job 0 = no 1 = yes,,no
8,Fruits,Feature,Binary,,Consume Fruit 1 or more times per day 0 = no 1 = yes,,no
9,Veggies,Feature,Binary,,Consume Vegetables 1 or more times per day 0 = no 1 = yes,,no


#### 2. Data Cleanup

**2.1 Remove duplicate rows**

In [None]:
print("====DELETE DUPLICATE ROWS=====")
## Original dataset shape
print("Original dataset (df) shape:", df.shape)
# Duplicate rows:
print("Duplicate rows:", df.duplicated().sum())
# Drop duplicates
df_dedupe = df.drop_duplicates()

# Cleaned dataset shape
print("Cleaned dataset (df_dedupe) shape:", df_dedupe.shape)

# Compare original and cleaned dataset
print("Number of rows removed:", len(df) - len(df_dedupe))

print("\n✅ All duplicate rows removed")


====DELETE DUPLICATE ROWS=====
Original dataset (df) shape: (253680, 22)
Duplicate rows: 24206
Cleaned dataset (df_dedupe) shape: (229474, 22)
Number of rows removed: 24206

✅ All duplicate rows removed


**2.2 Optimize Data Sets**

- Optimizing data types reduces the memory footprint of your dataset, which is crucial when working with large data or limited resources.
Downcasting numeric columns and converting suitable object columns to category types can significantly decrease RAM usage.
Lower memory usage enables faster data processing, more efficient model training, and the ability to handle larger datasets.
This step is especially important for scalable machine learning pipelines and neural network training, where memory efficiency directly impacts performance and feasibility.

In [None]:
# Analyze current data types
print("=== DATA TYPE OPTIMIZATION ===")
print("Current data types:")
print(df_dedupe.dtypes.value_counts())

# Check for float columns that could be integers
print("\nAnalyzing float64 columns for potential integer conversion:")
float_columns = df_dedupe.select_dtypes(include=["float64"]).columns
if float_columns.empty:
    print("\n---No float64 columns found that could be converted to integers.\n")
else:
    for col in float_columns:
        # Check if all values are whole numbers
        if df_dedupe[col].apply(lambda x: x.is_integer()).all():
            print(
                f"  {col}: Can be converted to integer since all values are whole numbers"
            )
            df_dedupe[col] = df_dedupe[col].astype("int32")
        else:
            print(
                f"  {col}: Must remain float64 as it contains one or more decimal values"
            )
            # Check range to determine if int32 or int64 is appropriate
            min_val, max_val = df_dedupe[col].min(), df_dedupe[col].max()
            print(f"   Range: {min_val} to {max_val}")

# Optimize integer columns
print(f"\nOptimizing integer columns:")
int_columns = df_dedupe.select_dtypes(include=["int64"]).columns
for col in int_columns:
    min_val, max_val = df_dedupe[col].min(), df_dedupe[col].max()
    if min_val >= 0 and max_val <= 255:
        df_dedupe[col] = df_dedupe[col].astype("uint8")
        print(f"  {col}: Optimized to uint8 (range: {min_val}-{max_val})")
    elif min_val >= -128 and max_val <= 127:
        df_dedupe[col] = df_dedupe[col].astype("int8")
        print(f"  {col}: Optimized to int8 (range: {min_val}-{max_val})")
    elif min_val >= 0 and max_val <= 65535:
        df_dedupe[col] = df_dedupe[col].astype("uint16")
        print(f"  {col}: Optimized to uint16 (range: {min_val}-{max_val})")
    else:
        df_dedupe[col] = df_dedupe[col].astype("int32")
        print(f"  {col}: Optimized to int32 (range: {min_val}-{max_val})")

print(
    f"\nMemory usage after optimization: {df_dedupe.memory_usage(deep=True).sum() / 1024**2:.2f} MB"
)
print(
    f"Memory reduction:--> {((df.memory_usage(deep=True).sum() - df_dedupe.memory_usage(deep=True).sum()) / df.memory_usage(deep=True).sum() * 100):.2f}%"
)


=== DATA TYPE OPTIMIZATION ===
Current data types:
int64    22
Name: count, dtype: int64

Analyzing float64 columns for potential integer conversion:

---No float64 columns found that could be converted to integers.


Optimizing integer columns:
  HighBP: Optimized to uint8 (range: 0-1)
  HighChol: Optimized to uint8 (range: 0-1)
  CholCheck: Optimized to uint8 (range: 0-1)
  BMI: Optimized to uint8 (range: 12-98)
  Smoker: Optimized to uint8 (range: 0-1)
  Stroke: Optimized to uint8 (range: 0-1)
  HeartDiseaseorAttack: Optimized to uint8 (range: 0-1)
  PhysActivity: Optimized to uint8 (range: 0-1)
  Fruits: Optimized to uint8 (range: 0-1)
  Veggies: Optimized to uint8 (range: 0-1)
  HvyAlcoholConsump: Optimized to uint8 (range: 0-1)
  AnyHealthcare: Optimized to uint8 (range: 0-1)
  NoDocbcCost: Optimized to uint8 (range: 0-1)
  GenHlth: Optimized to uint8 (range: 1-5)
  MentHlth: Optimized to uint8 (range: 0-30)
  PhysHlth: Optimized to uint8 (range: 0-30)
  DiffWalk: Optimized to ui

🏷️ **1. Categorical Feature Encoding**
- Q: Which categorical features in the dataset have more than two unique values?
- Q: Apply integer-encoding to these high-cardinality features. Why is this strategy suitable for a subsequent neural network with an embedding layer?
- Q: Display the first 5 rows of the transformed data to show the new integer labels.


In [None]:
import pandas as pd


def get_column_types(df, cardinality_threshold=20):
    categorical_binary = []
    categorical = []
    numeric = []
    for col in df.columns:
        dtype = df[col].dtype
        nunique = df[col].nunique()
        if dtype == "object" or dtype.name == "category":
            if nunique == 2:
                categorical_binary.append(col)
            else:
                categorical.append(col)
        elif pd.api.types.is_integer_dtype(dtype):
            if nunique == 2:
                categorical_binary.append(col)
            elif nunique < cardinality_threshold:
                categorical.append(col)
            else:
                numeric.append(col)
        elif pd.api.types.is_float_dtype(dtype):
            numeric.append(col)
    return categorical_binary, categorical, numeric


cat_bin_cols, cat_cols, num_cols = get_column_types(df_dedupe)
print("Categorical binary features with 2 unique values:", cat_bin_cols)
print("Categorical (non-binary) features with more than 2 unique values:", cat_cols)
print("Numeric/continuous columns:", num_cols)


Categorical binary features with 2 unique values: ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'DiffWalk', 'Sex', 'Diabetes_binary']
Categorical (non-binary) features with more than 2 unique values: ['GenHlth', 'Age', 'Education', 'Income']
Numeric/continuous columns: ['BMI', 'MentHlth', 'PhysHlth']


🔎 Identify high-cardinality categorical features

In [None]:
# Identify high cardinality categorical features to encode
def identify_high_cardinality_features(df, low_threshold, up_threshold):
    """
    Identify categorical features with more than threshold unique values. High-cardinality categorical variables are good candidates for integer encoding + embeddings in neural networks.

    Args:
        df (pd.DataFrame): The dataset
        threshold (int): Threshold for high cardinality

    Returns:
        list: High cardinality categorical features
    """
    high_cardinality = []

    for col in df.columns:
        if col != "Diabetes_binary":  # Exclude target
            unique_count = df[col].nunique()
            # Consider as categorical if it has discrete values and not too many
            if low_threshold < unique_count <= up_threshold:
                high_cardinality.append(col)
    return high_cardinality


# Invoke the function with desired thresholds
low_threshold = 2
up_threshold = 20
high_cat_candidates = identify_high_cardinality_features(
    df_dedupe, low_threshold, up_threshold
)
print(
    f"High-cardinality categorical features (more than {low_threshold} unique values):",
    high_cat_candidates,
)


High-cardinality categorical features (more than 2 unique values): ['GenHlth', 'Age', 'Education', 'Income']


🔤 Integer encoding for categorical features

In [None]:
def apply_integer_encoding(df, categorical_features):
    """
    Apply integer encoding to categorical features.

    Args:
        df (pd.DataFrame): The dataset
        categorical_features (list): List of categorical features to encode

    Returns:
        tuple: (encoded_dataframe, encoding_mappings)
    """
    df_encoded = df.copy()
    encoding_mappings = {}

    for feature in categorical_features:
        if feature in df.columns:
            # Create label encoder
            le = LabelEncoder()
            df_encoded[feature] = le.fit_transform(df[feature].astype(str))

            # Store the mapping for reference
            encoding_mappings[feature] = {
                "encoder": le,
                "mapping": dict(zip(le.classes_, le.transform(le.classes_))),
            }

    return df_encoded, encoding_mappings


# Apply integer encoding
df_encoded, enc_maps = apply_integer_encoding(df_dedupe, high_cat_candidates)
print("✅ Integer encoding applied to high-cardinality categorical features.")
print("Encoded DataFrame shape:", df_encoded.shape)
print("Sample encoding mappings:")
for feature, mapping in enc_maps.items():
    print(f"  {feature}: {mapping['mapping']}")


✅ Integer encoding applied to high-cardinality categorical features.
Encoded DataFrame shape: (229474, 22)
Sample encoding mappings:
  GenHlth: {'1': np.int64(0), '2': np.int64(1), '3': np.int64(2), '4': np.int64(3), '5': np.int64(4)}
  Age: {'1': np.int64(0), '10': np.int64(1), '11': np.int64(2), '12': np.int64(3), '13': np.int64(4), '2': np.int64(5), '3': np.int64(6), '4': np.int64(7), '5': np.int64(8), '6': np.int64(9), '7': np.int64(10), '8': np.int64(11), '9': np.int64(12)}
  Education: {'1': np.int64(0), '2': np.int64(1), '3': np.int64(2), '4': np.int64(3), '5': np.int64(4), '6': np.int64(5)}
  Income: {'1': np.int64(0), '2': np.int64(1), '3': np.int64(2), '4': np.int64(3), '5': np.int64(4), '6': np.int64(5), '7': np.int64(6), '8': np.int64(7)}


📏 Analyze scaling requirements

In [None]:
def analyze_scaling_requirements(df, target_col="Diabetes_binary"):
    """
    ML models (especially gradient-based ones like neural nets, logistic regression, SVMs) perform poorly if features are on very different scales. This step decides which columns need normalization.

    Args:
        df (pd.DataFrame): The dataset
        target_col (str): Name of the target column

    Returns:
        dict: Scaling analysis results
    """
    numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    if target_col in numerical_cols:
        numerical_cols.remove(target_col)

    scaling_analysis = {}

    for col in numerical_cols:
        col_stats = {
            "mean": df[col].mean(),
            "std": df[col].std(),
            "min": df[col].min(),
            "max": df[col].max(),
            "range": df[col].max() - df[col].min(),
            "scale_factor": df[col].max()
            / (df[col].min() + 1e-8),  # Avoid division by zero
        }

        # Determine if scaling is needed
        needs_scaling = False
        scaling_reason = []

        # Check for large range
        if col_stats["range"] > 1000:
            needs_scaling = True
            scaling_reason.append("Large range of values")

        # Check for different scales compared to other features
        if col_stats["scale_factor"] > 100:
            needs_scaling = True
            scaling_reason.append("Values span multiple orders of magnitude")

        # Check standard deviation
        if col_stats["std"] > 100:
            needs_scaling = True
            scaling_reason.append("High standard deviation")

        col_stats["needs_scaling"] = needs_scaling
        col_stats["scaling_reason"] = scaling_reason

        scaling_analysis[col] = col_stats

    return scaling_analysis


# 3) Analyze scaling needs
scaling_report = analyze_scaling_requirements(df_encoded, target_col="Diabetes_binary")
features_to_scale = [k for k, v in scaling_report.items() if v["needs_scaling"]]
print("Features identified for scaling:", features_to_scale)
for feature, stats in scaling_report.items():
    print(f"\nFeature: {feature}")
    for stat, value in stats.items():
        if stat != "scaling_reason":
            print(f"  {stat}: {value}")
    if stats["needs_scaling"]:
        print(f"  Scaling needed due to: {', '.join(stats['scaling_reason'])}")
    else:
        print("  No scaling needed.")


Features identified for scaling: ['HighBP', 'HighChol', 'CholCheck', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']

Feature: HighBP
  mean: 0.45434341145402096
  std: 0.4979121973713543
  min: 0
  max: 1
  range: 1
  scale_factor: 100000000.0
  needs_scaling: True
  Scaling needed due to: Values span multiple orders of magnitude

Feature: HighChol
  mean: 0.44164044728378815
  std: 0.49658356519075136
  min: 0
  max: 1
  range: 1
  scale_factor: 100000000.0
  needs_scaling: True
  Scaling needed due to: Values span multiple orders of magnitude

Feature: CholCheck
  mean: 0.9594812484203004
  std: 0.19717289815963487
  min: 0
  max: 1
  range: 1
  scale_factor: 100000000.0
  needs_scaling: True
  Scaling needed due to: Values span multiple orders of magnitude

Feature: BMI
  mean: 28.687507081412274
  std: 6.789204221

🔧 Apply feature scaling

In [None]:
def apply_feature_scaling(df, features_to_scale, scaling_method="standard"):
    """
    Apply scaling to specified features. Scaling prevents features with large magnitudes from dominating the model. Choice depends on algorithm:
    StandardScaler → good for algorithms assuming Gaussian-like distributions (LogReg, NN).
    MinMaxScaler → good for bounded features or distance-based algorithms (KNN, Neural Nets with ReLU).

    Args:
        df (pd.DataFrame): The dataset
        features_to_scale (list): Features to scale
        scaling_method (str): 'standard' or 'minmax'

    Returns:
        tuple: (scaled_dataframe, scaler_objects)
    """
    df_scaled = df.copy()
    scalers = {}

    for feature in features_to_scale:
        if feature in df.columns:
            if scaling_method == "standard":
                scaler = StandardScaler()
            elif scaling_method == "minmax":
                scaler = MinMaxScaler()
            else:
                raise ValueError("scaling_method must be 'standard' or 'minmax'")

            # Fit and transform the feature
            df_scaled[feature] = scaler.fit_transform(df[[feature]])
            scalers[feature] = scaler

    return df_scaled, scalers


# Apply scaling
df_scaled, scalers = apply_feature_scaling(
    df_encoded, features_to_scale, scaling_method="standard"
)
print("✅ Feature scaling applied using StandardScaler.")
print("Scaled DataFrame shape:", df_scaled.shape)


✅ Feature scaling applied using StandardScaler.
Scaled DataFrame shape: (229474, 22)


✂️ Stratified train/val/test split

In [None]:
def perform_stratified_split(
    df,
    target_col="Diabetes_binary",
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=42,
):
    """
    Perform stratified splitting of the dataset. For imbalanced data (like diabetes prediction where positives < negatives), stratification ensures all splits reflect original class proportions. Otherwise, validation/test may contain very few positive cases.

    Args:
        df (pd.DataFrame): The dataset
        target_col (str): Name of the target column
        train_size (float): Proportion for training set
        val_size (float): Proportion for validation set
        test_size (float): Proportion for test set
        random_state (int): Random state for reproducibility

    Returns:
        tuple: (X_train, X_val, X_test, y_train, y_val, y_test)
    """
    # Validate split sizes
    if abs(train_size + val_size + test_size - 1.0) > 1e-6:
        raise ValueError("Split sizes must sum to 1.0")

    # Separate features and target
    X = df.drop(columns=[target_col])
    y = df[target_col]

    # First split: separate test set
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=test_size, stratify=y, random_state=random_state
    )

    # Second split: separate train and validation from remaining data
    val_size_adjusted = val_size / (train_size + val_size)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp,
        y_temp,
        test_size=val_size_adjusted,
        stratify=y_temp,
        random_state=random_state,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test


# Split data
X_train, X_val, X_test, y_train, y_val, y_test = perform_stratified_split(
    df_scaled,
    target_col="Diabetes_binary",
    train_size=0.7,
    val_size=0.15,
    test_size=0.15,
    random_state=42,
)
print("✅ Data split into train, validation, and test sets.")
print(
    f"Train set shape: {X_train.shape}, Validation set shape: {X_val.shape}, Test set shape: {X_test.shape}"
)


✅ Data split into train, validation, and test sets.
Train set shape: (160631, 21), Validation set shape: (34421, 21), Test set shape: (34422, 21)


✅ Verify stratification

In [None]:
def verify_stratification(
    y_train, y_val, y_test, set_names=["Train", "Validation", "Test"]
):
    """
    Verify that stratification was successful by comparing class distributions.

    Args:
        y_train, y_val, y_test: Target arrays for each set
        set_names (list): Names for the sets

    Returns:
        pd.DataFrame: Class distribution comparison
    """
    distributions = []

    for y_set, name in zip([y_train, y_val, y_test], set_names):
        value_counts = pd.Series(y_set).value_counts().sort_index()
        proportions = pd.Series(y_set).value_counts(normalize=True).sort_index()

        distributions.append(
            {
                "Set": name,
                "Total_Samples": len(y_set),
                "Class_0_Count": value_counts.get(0, 0),
                "Class_1_Count": value_counts.get(1, 0),
                "Class_0_Proportion": proportions.get(0, 0),
                "Class_1_Proportion": proportions.get(1, 0),
            }
        )

    return pd.DataFrame(distributions)


# Verify stratification
verify_stratification(y_train, y_val, y_test)
print("✅ Stratification verified across splits.")


✅ Stratification verified across splits.


🧾 Feature engineering summary

In [None]:
import json
from pprint import pprint
import pandas as pd


def generate_feature_engineering_summary(
    original_df,
    processed_df,
    encoding_mappings,
    scalers,
    split_info,
    *,
    as_dfs=False,
    verbose=False,
    mapping_preview_n=5,
):
    """
    Generate a comprehensive summary of feature engineering steps.

    Args:
        original_df (pd.DataFrame): Original dataset
        processed_df (pd.DataFrame): Processed dataset
        encoding_mappings (dict): {feature: {'encoder': LabelEncoder, 'mapping': {orig:str->int}}}
        scalers (dict): {feature: fitted scaler}
        split_info (dict): Any metadata about splits (sizes, ratios, etc.)
        as_dfs (bool): If True, also return tidy DataFrames for display
        verbose (bool): If True, pretty-print a human-readable summary
        mapping_preview_n (int): How many mapping items to preview per encoded feature

    Returns:
        dict (always): 'summary' dict with details
        dict of DataFrames (optional, when as_dfs=True):
            {'overview': df, 'encoding': df, 'scaling': df, 'splits': df}
    """
    summary = {
        "data_transformation": {
            "original_shape": original_df.shape,
            "processed_shape": processed_df.shape,
            "features_encoded": len(encoding_mappings),
            "features_scaled": len(scalers),
        },
        "encoding_details": {},
        "scaling_details": {},
        "data_splits": split_info if isinstance(split_info, dict) else {},
    }

    # Encoding details
    for feature, mapping in encoding_mappings.items():
        mp = mapping.get("mapping", {})
        preview_items = list(mp.items())[:mapping_preview_n]
        summary["encoding_details"][feature] = {
            "original_unique_values": len(mp),
            "encoding_type": "Integer Encoding",
            "mapping_preview": dict(preview_items),
        }

    # Scaling details
    for feature, scaler in scalers.items():
        if hasattr(scaler, "mean_"):  # StandardScaler
            summary["scaling_details"][feature] = {
                "scaling_type": "Standard Scaling",
                "mean": float(scaler.mean_[0]),
                "scale": float(scaler.scale_[0]),
            }
        elif hasattr(scaler, "min_"):  # MinMaxScaler
            summary["scaling_details"][feature] = {
                "scaling_type": "MinMax Scaling",
                "min": float(scaler.min_[0]),
                "scale": float(scaler.scale_[0]),
            }
        else:
            summary["scaling_details"][feature] = {
                "scaling_type": type(scaler).__name__,
                "details": "Scaler does not expose mean_/min_ attributes",
            }

    if verbose:
        print("=== Data Transformation ===")
        print(f"Original shape:  {summary['data_transformation']['original_shape']}")
        print(f"Processed shape: {summary['data_transformation']['processed_shape']}")
        print(f"Features encoded: {summary['data_transformation']['features_encoded']}")
        print(
            f"Features scaled:  {summary['data_transformation']['features_scaled']}\n"
        )

        print("=== Encoding Details ===")
        if summary["encoding_details"]:
            for feat, det in summary["encoding_details"].items():
                print(
                    f"- {feat}: {det['encoding_type']} (unique={det['original_unique_values']})"
                )
                print(f"  preview: {det['mapping_preview']}")
        else:
            print("  (none)")
        print()

        print("=== Scaling Details ===")
        if summary["scaling_details"]:
            for feat, det in summary["scaling_details"].items():
                print(
                    f"- {feat}: {det['scaling_type']} | { {k:v for k,v in det.items() if k!='scaling_type'} }"
                )
        else:
            print("  (none)")
        print()

        if summary["data_splits"]:
            print("=== Data Splits ===")
            pprint(summary["data_splits"])
            print()

    if not as_dfs:
        return summary

    # Build nice DataFrames for display
    overview_df = pd.DataFrame([summary["data_transformation"]])

    if summary["encoding_details"]:
        enc_rows = []
        for feat, det in summary["encoding_details"].items():
            enc_rows.append(
                {
                    "feature": feat,
                    "encoding_type": det["encoding_type"],
                    "original_unique_values": det["original_unique_values"],
                    "mapping_preview": json.dumps(det["mapping_preview"]),
                }
            )
        encoding_df = pd.DataFrame(enc_rows).sort_values("feature")
    else:
        encoding_df = pd.DataFrame(
            columns=[
                "feature",
                "encoding_type",
                "original_unique_values",
                "mapping_preview",
            ]
        )

    if summary["scaling_details"]:
        sc_rows = []
        for feat, det in summary["scaling_details"].items():
            row = {"feature": feat, "scaling_type": det.get("scaling_type", "")}
            for k, v in det.items():
                if k != "scaling_type":
                    row[k] = v
            sc_rows.append(row)
        scaling_df = pd.DataFrame(sc_rows).sort_values("feature")
    else:
        scaling_df = pd.DataFrame(
            columns=["feature", "scaling_type", "mean", "scale", "min"]
        )

    if summary["data_splits"]:
        splits_df = pd.DataFrame([summary["data_splits"]])
    else:
        splits_df = pd.DataFrame()

    return summary, {
        "overview": overview_df,
        "encoding": encoding_df,
        "scaling": scaling_df,
        "splits": splits_df,
    }


# Summarize feature engineering
split_info = {
    "train_size": len(y_train),
    "val_size": len(y_val),
    "test_size": len(y_test),
}
summary = generate_feature_engineering_summary(
    df_dedupe, df_scaled, enc_maps, scalers, split_info
)
print("=== Feature Engineering Summary ===")
for key, value in summary.items():
    print(f"{key}: {value}")


=== Feature Engineering Summary ===
data_transformation: {'original_shape': (229474, 22), 'processed_shape': (229474, 22), 'features_encoded': 4, 'features_scaled': 20}
encoding_details: {'GenHlth': {'original_unique_values': 5, 'encoding_type': 'Integer Encoding', 'mapping_preview': {'1': np.int64(0), '2': np.int64(1), '3': np.int64(2), '4': np.int64(3), '5': np.int64(4)}}, 'Age': {'original_unique_values': 13, 'encoding_type': 'Integer Encoding', 'mapping_preview': {'1': np.int64(0), '10': np.int64(1), '11': np.int64(2), '12': np.int64(3), '13': np.int64(4)}}, 'Education': {'original_unique_values': 6, 'encoding_type': 'Integer Encoding', 'mapping_preview': {'1': np.int64(0), '2': np.int64(1), '3': np.int64(2), '4': np.int64(3), '5': np.int64(4)}}, 'Income': {'original_unique_values': 8, 'encoding_type': 'Integer Encoding', 'mapping_preview': {'1': np.int64(0), '2': np.int64(1), '3': np.int64(2), '4': np.int64(3), '5': np.int64(4)}}}
scaling_details: {'HighBP': {'scaling_type': 'Stan

🔁 Before/after comparison (stats only)

In [None]:
def create_before_after_comparison(original_df, processed_df, features_to_compare):
    """
    Create visualizations comparing features before and after processing.

    (Note: This function returns stats; you can add your own visualizations in separate cells.)

    Args:
        original_df (pd.DataFrame): Original dataset
        processed_df (pd.DataFrame): Processed dataset
        features_to_compare (list): Features to compare

    Returns:
        dict: Comparison statistics and visualizations
    """
    comparisons = {}

    for feature in features_to_compare:
        if feature in original_df.columns and feature in processed_df.columns:
            comparison = {
                "original_stats": {
                    "mean": original_df[feature].mean(),
                    "std": original_df[feature].std(),
                    "min": original_df[feature].min(),
                    "max": original_df[feature].max(),
                },
                "processed_stats": {
                    "mean": processed_df[feature].mean(),
                    "std": processed_df[feature].std(),
                    "min": processed_df[feature].min(),
                    "max": processed_df[feature].max(),
                },
            }
            comparisons[feature] = comparison

    return comparisons


# Compare before/after for selected features
Compare_stats = create_before_after_comparison(df, df_scaled, features_to_scale[:5])
print("Comparison stats for selected features:")
for feature, stats in Compare_stats.items():
    print(f"- {feature}:")
    print(f"  Original: {stats['original_stats']}")
    print(f"  Processed: {stats['processed_stats']}")


Comparison stats for selected features:
- HighBP:
  Original: {'mean': np.float64(0.4290011037527594), 'std': np.float64(0.4949344626899013), 'min': np.int64(0), 'max': np.int64(1)}
  Processed: {'mean': np.float64(-1.367369166492327e-16), 'std': np.float64(1.0000021789032056), 'min': np.float64(-0.9124990386316569), 'max': np.float64(1.0958915655402288)}
- HighChol:
  Original: {'mean': np.float64(0.4241209397666351), 'std': np.float64(0.49420980465688485), 'min': np.int64(0), 'max': np.int64(1)}
  Processed: {'mean': np.float64(-5.152405554898623e-17), 'std': np.float64(1.0000021789032054), 'min': np.float64(-0.8893596980116083), 'max': np.float64(1.1244044476444752)}
- CholCheck:
  Original: {'mean': np.float64(0.9626695048880479), 'std': np.float64(0.1895707543627255), 'min': np.int64(0), 'max': np.int64(1)}
  Processed: {'mean': np.float64(-7.134099999090401e-17), 'std': np.float64(1.0000021789032056), 'min': np.float64(-4.866202951788296), 'max': np.float64(0.20549903279979448)}
