### Preprocessing Pipeline for pre_campaign model

This pipeline was developed from the findings of the exploratory data analysis (EDA).  
It automates preprocessing of the dataset by applying the required transformations in the correct order.  
The transformations it applies are the ones identified as necessary during EDA.  
Instead of repeating these steps manually every time, the pipeline ensures consistency and reproducibility.  
This prevents mistakes such as applying transformations in the wrong order or forgetting a step.  
It also makes the workflow dynamic and reusable, so the same pipeline can be applied to new data in the future.

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import PowerTransformer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from scipy.stats.mstats import winsorize
import os
import time
import joblib

In [21]:
# dynamic_customer_pipeline_fixed.py
# Custom Transformers
class NumericCaster(BaseEstimator, TransformerMixin):
    """Cast every numeric column of a DataFrame to float.

    The transformer is stateless: ``fit`` learns nothing, and ``transform``
    works on a copy so the caller's frame is not modified.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` in which all numeric columns are float."""
        casted = X.copy()
        # Only columns pandas already treats as numeric are touched.
        numeric_names = casted.select_dtypes(include=[np.number]).columns
        casted[numeric_names] = casted[numeric_names].astype(float)
        return casted

class AgeGroupCreator(BaseEstimator, TransformerMixin):
    """Create an 'age_group' column from 'age' and drop 'age'.

    Groups:
        - 'Young':  age <= 40
        - 'Middle': 40 < age <= 65
        - 'Old':    age > 65

    The outer bin edges are open-ended. With the previous closed edges
    (18 and 95) any age outside that range became NaN, which the downstream
    one-hot step cannot tell apart from the dropped reference category.
    Ages inside 18-95 are grouped exactly as before; a missing age still
    yields a missing group.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'age' replaced by 'age_group'.

        If ``X`` has no 'age' column, an unchanged copy is returned.
        """
        if 'age' not in X.columns:
            return X.copy()
        age_group = pd.cut(
            X['age'],
            bins=[-np.inf, 40, 65, np.inf],
            labels=['Young', 'Middle', 'Old'],
        )
        # drop() returns a new frame, so the caller's data is never mutated.
        return X.drop(columns=['age']).assign(age_group=age_group)

class BalanceProcessor(BaseEstimator, TransformerMixin):
    """
    Handle balance column:
    - Create balance_group ('low' if balance <= median_val, else 'high');
      median_val defaults to 407
    - Apply Yeo-Johnson transform
    - Winsorize outliers (0.10, 0.10)
    - Standardize using z-score

    The winsorization bounds are learned in ``fit`` and re-applied in
    ``transform``. Previously ``transform`` winsorized each incoming batch on
    its own, so the clipping limits depended on the batch being scored (and a
    single row was never clipped at all). Output on the data used for fitting
    is unchanged.

    Parameters
    ----------
    median_val : numeric, default=407
        Threshold separating the 'low' and 'high' balance groups.

    Attributes
    ----------
    pt : PowerTransformer
        Yeo-Johnson transformer fitted on the balance column.
    scaler : StandardScaler
        Scaler fitted on the transformed, winsorized balance.
    clip_lower_, clip_upper_ : float
        Winsorization bounds learned in ``fit`` (only set when the fitted
        frame contains a 'balance' column).
    """
    def __init__(self, median_val=407):
        self.median_val = median_val
        self.pt = PowerTransformer(method='yeo-johnson', standardize=False)
        self.scaler = StandardScaler()

    @staticmethod
    def _clean_balance(X):
        """Return 'balance' as a numeric Series; unparseable/missing values become 0."""
        return pd.to_numeric(X['balance'], errors='coerce').fillna(0)

    def fit(self, X, y=None):
        """Fit the power transform, the clipping bounds and the scaler on 'balance'.

        Does nothing (beyond returning ``self``) when ``X`` has no 'balance' column.
        """
        if 'balance' in X.columns:
            bal_2d = self._clean_balance(X).values.reshape(-1, 1)
            self.pt.fit(bal_2d)
            transformed = self.pt.transform(bal_2d).flatten()
            winsorized = winsorize(transformed, limits=(0.10, 0.10))
            # After winsorizing, the extremes of the array ARE the clipping
            # limits; store them so transform() can reuse them on any batch.
            self.clip_lower_ = float(winsorized.min())
            self.clip_upper_ = float(winsorized.max())
            self.scaler.fit(np.asarray(winsorized).reshape(-1, 1))
        return self

    def transform(self, X):
        """Return a copy of ``X`` with 'balance_group' added and 'balance' rescaled.

        The frame is returned as an unchanged copy when it has no 'balance' column.
        """
        X = X.copy()
        if 'balance' in X.columns:
            bal = self._clean_balance(X)
            X['balance_group'] = np.where(bal <= self.median_val, 'low', 'high')

            # Yeo-Johnson
            bal_t = self.pt.transform(bal.values.reshape(-1, 1)).flatten()

            # Winsorize using the bounds learned during fit
            lower = getattr(self, 'clip_lower_', None)
            upper = getattr(self, 'clip_upper_', None)
            if lower is None or upper is None:
                # Instance fitted (e.g. unpickled) before bounds were stored:
                # fall back to the legacy per-batch winsorization.
                bal_t = np.asarray(winsorize(bal_t, limits=(0.10, 0.10)))
            else:
                bal_t = np.clip(bal_t, lower, upper)

            # Standardize
            X['balance'] = self.scaler.transform(bal_t.reshape(-1, 1)).flatten()
        return X

class OneHotInPlace(BaseEstimator, TransformerMixin):
    """One-hot encode selected categorical columns (first category dropped).

    Each encoded column is removed from the frame and its dummy columns,
    named ``<column>_<category>``, are appended at the end.

    Parameters
    ----------
    columns : list of str
        Candidate columns to encode. Columns absent from the frame passed to
        ``fit`` are skipped.

    Attributes
    ----------
    encoders : dict
        Maps column name -> fitted OneHotEncoder. Rebuilt on every ``fit``.
    """
    def __init__(self, columns):
        self.columns = columns
        self.encoders = {}

    def fit(self, X, y=None):
        """Fit one encoder per requested column that is present in ``X``."""
        # Build a fresh dict so encoders from a previous fit cannot leak into
        # this one (e.g. for a column the new training frame no longer has).
        fitted = {}
        for col in self.columns:
            if col in X.columns:
                enc = OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore')
                fitted[col] = enc.fit(X[[col]])
        self.encoders = fitted
        return self

    def transform(self, X):
        """Return a new frame with the fitted columns replaced by dummy columns.

        Fitted columns that are missing from ``X`` are ignored.
        """
        present = [col for col in self.encoders if col in X.columns]
        dummy_frames = []
        for col in present:
            enc = self.encoders[col]
            # drop='first' removes the first category, so the remaining
            # categories line up with the encoder's output columns.
            new_cols = [f"{col}_{cat}" for cat in enc.categories_[0][1:]]
            dummy_frames.append(
                pd.DataFrame(enc.transform(X[[col]]), columns=new_cols, index=X.index)
            )
        # Single concat at the end: same column order as appending one by one,
        # without re-copying the frame for every encoded column.
        return pd.concat([X.drop(columns=present), *dummy_frames], axis=1)

# Pipeline Builder
def build_pipeline():
    """Assemble the pre_campaign preprocessing pipeline.

    Steps, in order: cast numeric columns to float, derive 'age_group' from
    'age', process 'balance' (which also adds 'balance_group'), then one-hot
    encode the categorical columns, including the two derived ones.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The unfitted pipeline.
    """
    columns_to_encode = [
        'job', 'marital', 'education', 'default',
        'housing', 'loan', 'age_group', 'balance_group'
    ]
    steps = [
        ('numeric_cast', NumericCaster()),
        ('age_group', AgeGroupCreator()),
        ('balance_processing', BalanceProcessor(median_val=407)),
        ('one_hot', OneHotInPlace(columns=columns_to_encode)),
    ]
    return Pipeline(steps)

# Run Pipeline
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific path; consider a configurable data directory.
    csv_path = r"C:\Users\USER\Documents\Customer Intention\df_pre_camp.csv"
    if not os.path.exists(csv_path):
        raise FileNotFoundError(f"Dataset not found at {csv_path}")

    df = pd.read_csv(csv_path)
    pipe = build_pipeline()

    # Fit and apply the pipeline exactly once, timing that run.
    # (It was previously fitted twice, and only the redundant second run was timed.)
    # perf_counter is a monotonic clock intended for measuring durations.
    start_time = time.perf_counter()
    df_transformed = pipe.fit_transform(df)
    end_time = time.perf_counter()

    print("\n✅ Transformation complete!")
    print(f"Shape: {df_transformed.shape}")
    print(f"Pipeline execution time: {end_time - start_time:.2f} seconds")

    # Save the fitted pipeline for future reuse
    pipeline_path = r"C:\Users\USER\Documents\Customer Intention\pre_campaign_preprocessing_dynamic_pipeline.pkl"
    joblib.dump(pipe, pipeline_path)
    print(f"✅ pre_campaign_preprocessing_dynamic_pipeline saved for reuse at: {pipeline_path}")


✅ Transformation complete!
Shape: (40000, 24)
Pipeline execution time: 0.19 seconds
✅ pre_campaign_preprocessing_dynamic_pipeline saved for reuse at: C:\Users\USER\Documents\Customer Intention\pre_campaign_preprocessing_dynamic_pipeline.pkl


In [13]:
df_transformed.head()

Unnamed: 0,balance,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing_yes,loan_yes,age_group_Old,age_group_Young,balance_group_low
0,1.173558,no,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.850451,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,-0.899427,no,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,0.639102,no,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.901809,no,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


The result above shows the first 5 rows of the transformed dataset.

In [15]:
# Convert 'y' column: 'no' → 0, 'yes' → 1
# Guarded so the cell can be re-run safely: mapping an already-encoded 0/1
# column through {'no': 0, 'yes': 1} would turn every value into NaN.
if not pd.api.types.is_numeric_dtype(df_transformed['y']):
    y_encoded = df_transformed['y'].map({'no': 0, 'yes': 1})
    # Any label other than 'no'/'yes' (including a missing one) maps to NaN;
    # fail loudly instead of silently writing NaN into the target.
    unexpected = df_transformed.loc[y_encoded.isna(), 'y'].unique()
    if len(unexpected) > 0:
        raise ValueError(f"Unexpected labels in 'y': {list(unexpected)}")
    df_transformed['y'] = y_encoded

unique_values = df_transformed['y'].unique()
print("Distinct values are:")
print(unique_values)

Distinct values are:
[0 1]


In [17]:
df_transformed.head()

Unnamed: 0,balance,y,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,...,marital_single,education_secondary,education_tertiary,education_unknown,default_yes,housing_yes,loan_yes,age_group_Old,age_group_Young,balance_group_low
0,1.173558,0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,-0.850451,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
2,-0.899427,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0
3,0.639102,0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
4,-0.901809,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0


In [19]:
# Save the transformed dataset.
# df_transformed is written as CSV; index=False keeps the row index out of the file.
# NOTE(review): output_path is an absolute, machine-specific path — adjust it
# (or derive it from a configurable data directory) before running elsewhere.

output_path = r"C:\Users\USER\Documents\Customer Intention\df_ML1.csv"
df_transformed.to_csv(output_path, index=False)
print(f"ML1 dataset saved to: {output_path}")

ML1 dataset saved to: C:\Users\USER\Documents\Customer Intention\df_ML1.csv


In [None]:
# Next step: development of the pre_campaign model.