### Preprocessing Pipeline for post_campaign model

The pipeline was developed from the findings of the exploratory data analysis (EDA), which identified the necessary transformations.  
It automates preprocessing of the dataset by applying those transformations in the correct order.  
Instead of repeating these steps manually every time, the pipeline ensures consistency and reproducibility.  
This prevents mistakes such as applying transformations in the wrong order or forgetting a step.  
It also makes the workflow dynamic and reusable, so the same pipeline can be applied to new data in the future.

#### Import libraries

In [None]:
import pandas as pd
import numpy as np
import joblib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from scipy.stats.mstats import winsorize
from sklearn.preprocessing import StandardScaler, PowerTransformer

In [1]:


# 1. Load dataset
# Raw string literal so the backslashes in the Windows path are not treated
# as escape sequences.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
data_path = r"C:\Users\USER\Documents\Customer Intention\term-deposit-marketing-2020.csv"
df = pd.read_csv(data_path)

# 2. Convert integer columns to float
def convert_int_to_float(df: pd.DataFrame) -> pd.DataFrame:
    """Cast every integer-typed column of ``df`` to float, in place.

    Columns are selected with ``np.integer`` so that all NumPy integer
    widths (int8 ... int64, signed and unsigned) are converted, not only
    the default ``int`` widths. Columns of any other dtype are left as is.

    Args:
        df: Frame to convert. It is modified in place.

    Returns:
        The same ``df`` object that was passed in.
    """
    int_cols = df.select_dtypes(include=[np.integer]).columns
    # Nothing to do when the frame has no integer columns.
    if len(int_cols) > 0:
        df[int_cols] = df[int_cols].astype(float)
    return df

# 3. Feature Engineering Transformer
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that adds grouped versions of raw columns.

    For each of ``age``, ``balance``, ``day`` and ``campaign`` that is
    present in the input, a companion ``<name>_group`` column is added.
    Source columns that are absent are skipped.
    """

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so the step can sit in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the available ``*_group`` columns added."""
        frame = X.copy()

        def bucketize(source, edges, names):
            # pd.cut with include_lowest=True: the first bin also contains
            # its lower edge; values outside the edges become NaN.
            return pd.cut(
                frame[source],
                bins=edges,
                labels=names,
                include_lowest=True,
            )

        if 'age' in frame.columns:
            frame['age_group'] = bucketize(
                'age', [18, 40, 65, 95], ['Young', 'Middle', 'Old']
            )

        if 'balance' in frame.columns:
            balance_threshold = 407  # fixed median
            is_below = frame['balance'] < balance_threshold
            frame['balance_group'] = np.where(is_below, 'Low', 'High')

        if 'day' in frame.columns:
            frame['day_group'] = bucketize(
                'day', [0, 9, 20, 31], ['Early', 'Mid', 'Late']
            )

        if 'campaign' in frame.columns:
            frame['campaign_group'] = bucketize(
                'campaign', [0, 1, 3, np.inf], ['Low', 'Moderate', 'High']
            )

        return frame

# 4. Numeric Transformer (reproduces the manual steps on the training data)
class NumericTransformer(BaseEstimator, TransformerMixin):
    """Per-column Yeo-Johnson -> winsorizing -> standard scaling.

    ``column_configs`` maps each numeric column to the steps it receives:
    ``'yeo'`` toggles a Yeo-Johnson power transform and ``'winsor'`` holds
    the ``(lower, upper)`` fractions passed to ``winsorize`` (``None``
    disables winsorizing). Every column is standard-scaled last.

    All statistics, including the winsorizing clip bounds, are learned in
    ``fit``. ``transform`` only applies them, so a row is transformed the
    same way regardless of which batch it arrives in. On the data used for
    fitting, clipping to the learned bounds yields the same values as
    winsorizing that data directly.
    """

    def __init__(self):
        self.column_configs = {
            'age': {'yeo': False, 'winsor': (0.002, 0.002)},
            'balance': {'yeo': True, 'winsor': (0.10, 0.10)},
            'day': {'yeo': False, 'winsor': None},
            'duration': {'yeo': True, 'winsor': (0.02, 0.02)},
            'campaign': {'yeo': True, 'winsor': None}
        }
        self.pt_transformers = {}   # column -> fitted PowerTransformer
        self.winsor_bounds = {}     # column -> (low, high) clip bounds
        self.scalers = {}           # column -> fitted StandardScaler

    def fit(self, X, y=None):
        """Learn power-transform, clip-bound and scaling statistics per column.

        Args:
            X: DataFrame containing every column named in ``column_configs``.
            y: Ignored; present for scikit-learn API compatibility.

        Returns:
            ``self``.
        """
        # Reset so a re-fit never mixes in state from an earlier fit.
        self.pt_transformers = {}
        self.winsor_bounds = {}
        self.scalers = {}

        for col, cfg in self.column_configs.items():
            values = X[[col]].values  # 2D, one column

            if cfg['yeo']:
                pt = PowerTransformer(method='yeo-johnson', standardize=False)
                pt.fit(values)
                self.pt_transformers[col] = pt
                values = pt.transform(values)

            if cfg['winsor']:
                # The extremes of the winsorized training data are exactly
                # the values winsorize() clipped to; keep them for reuse.
                winsorized = winsorize(values.flatten(), limits=cfg['winsor'])
                bounds = (float(winsorized.min()), float(winsorized.max()))
                self.winsor_bounds[col] = bounds
                values = np.clip(values, bounds[0], bounds[1])

            self.scalers[col] = StandardScaler().fit(values)

        return self

    def transform(self, X):
        """Apply the fitted steps and return a transformed copy of ``X``.

        Args:
            X: DataFrame containing every column named in ``column_configs``.

        Returns:
            A copy of ``X`` in which each configured column is replaced by
            its transformed values; other columns are left unchanged.
        """
        X = X.copy()
        for col, cfg in self.column_configs.items():
            values = X[[col]].values
            if cfg['yeo']:
                values = self.pt_transformers[col].transform(values)
            if cfg['winsor']:
                # Clip to the bounds learned in fit, not to this batch's
                # own percentiles.
                low, high = self.winsor_bounds[col]
                values = np.clip(values, low, high)
            values = self.scalers[col].transform(values)
            X[col] = values.ravel()
        return X

# 5. Build dynamic pipeline
def build_dynamic_pipeline(df):
    """Build the unfitted preprocessing pipeline for the post-campaign data.

    The pipeline first runs ``FeatureEngineeringTransformer`` and then a
    ``ColumnTransformer`` that sends the numeric columns through
    ``NumericTransformer`` and the categorical columns (raw and engineered)
    through most-frequent imputation followed by one-hot encoding. Columns
    not listed in either group, including the target ``y``, are dropped.

    Args:
        df: Raw dataset. Its integer columns are cast to float in place by
            ``convert_int_to_float``; the frame is not otherwise modified.

    Returns:
        The unfitted ``Pipeline``. A copy of ``df['y']`` (or ``None`` when
        there is no ``y`` column) is attached as ``pipeline.y_column``.
    """
    df = convert_int_to_float(df)

    # Keep the target aside so the caller can re-attach it after transforming.
    y_col = df['y'].copy() if 'y' in df.columns else None

    numeric_cols = ['age', 'balance', 'day', 'duration', 'campaign']
    categorical_cols = [
        'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month',
        'age_group', 'balance_group', 'day_group', 'campaign_group'
    ]

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', NumericTransformer(), numeric_cols),
            ('cat', Pipeline([
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('encoder', OneHotEncoder(drop=None, sparse_output=False, handle_unknown='ignore'))
            ]), categorical_cols)
        ],
        remainder='drop'
    )

    # The engineered *_group columns are created inside the pipeline, so no
    # feature engineering has to be run here.
    pipeline = Pipeline([
        ('feature_engineering', FeatureEngineeringTransformer()),
        ('preprocessor', preprocessor)
    ])

    # NOTE(review): as a plain instance attribute, the target is presumably
    # serialized together with the pipeline when it is dumped — confirm the
    # labels are meant to ship with the saved file.
    pipeline.y_column = y_col
    return pipeline

# 6. Fit, transform, and save
pipeline = build_dynamic_pipeline(df)
pipeline.fit(df)
df_processed = pipeline.transform(df)

# Column names: read them from the fitted ColumnTransformer, by transformer
# name, so they follow the order in which the output columns are emitted.
preprocessor = pipeline['preprocessor']
fitted_columns = {name: cols for name, _, cols in preprocessor.transformers_}
num_features = list(fitted_columns['num'])
cat_features = preprocessor.named_transformers_['cat']['encoder'].get_feature_names_out(
    fitted_columns['cat']
)
all_features = num_features + list(cat_features)

# Convert to DataFrame
df_processed = pd.DataFrame(df_processed, columns=all_features)

# Add target back (still holding its raw labels at this point)
if pipeline.y_column is not None:
    df_processed['y'] = pipeline.y_column.values

print("✅ Preprocessing complete!")
# Sanity check: the printed mean/std of each numeric feature should be ~0 / ~1.
print(df_processed[num_features].describe().T[['mean','std']].round(4))

# Save pipeline
save_path = r"C:\Users\USER\Documents\Customer Intention\post_campaign_dynamic_preprocessing_pipeline.pkl"
joblib.dump(pipeline, save_path)
print(f"✅ Pipeline saved successfully at:\n{save_path}")

✅ Preprocessing complete!
          mean  std
age       -0.0  1.0
balance    0.0  1.0
day        0.0  1.0
duration   0.0  1.0
campaign   0.0  1.0
✅ Pipeline saved successfully at:
C:\Users\USER\Documents\Customer Intention\post_campaign_dynamic_preprocessing_pipeline.pkl


In [3]:
# Preview the first rows of the processed frame ('y' still holds 'no'/'yes' here).
df_processed.head()

Unnamed: 0,age,balance,day,duration,campaign,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,age_group_Young,balance_group_High,balance_group_Low,day_group_Early,day_group_Late,day_group_Mid,campaign_group_High,campaign_group_Low,campaign_group_Moderate,y
0,1.821705,1.173558,-1.3309,0.429153,-1.140505,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,no
1,0.361477,-0.850451,-1.3309,-0.195075,-1.140505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,no
2,-0.785845,-0.899427,-1.3309,-0.921066,-1.140505,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,no
3,0.674383,0.639102,-1.3309,-0.72527,-1.140505,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,no
4,-0.785845,-0.901809,-1.3309,0.108862,-1.140505,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,no


#### Convert `y` to a numerical variable

In [5]:
# Convert 'y' column: 'no' → 0, 'yes' → 1
y_mapping = {'no': 0, 'yes': 1}
y_labels = df_processed['y']

if y_labels.isin(list(y_mapping)).all():
    df_processed['y'] = y_labels.map(y_mapping)
elif not y_labels.isin(list(y_mapping.values())).all():
    # Neither raw labels nor an already-encoded column: fail loudly instead
    # of letting .map() turn the unknown values into NaN.
    raise ValueError(
        "Column 'y' must contain only 'no'/'yes' (or already-encoded 0/1); "
        f"found: {y_labels.unique().tolist()}"
    )
# Otherwise the column is already 0/1 (e.g. the cell was re-run): leave it as is.

unique_values = df_processed['y'].unique()
print("Distinct values are:")
print(unique_values)

Distinct values are:
[0 1]


In [7]:
# Preview again to confirm that 'y' now holds the numeric encoding.
df_processed.head()

Unnamed: 0,age,balance,day,duration,campaign,job_admin,job_blue-collar,job_entrepreneur,job_housemaid,job_management,...,age_group_Young,balance_group_High,balance_group_Low,day_group_Early,day_group_Late,day_group_Mid,campaign_group_High,campaign_group_Low,campaign_group_Moderate,y
0,1.821705,1.173558,-1.3309,0.429153,-1.140505,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
1,0.361477,-0.850451,-1.3309,-0.195075,-1.140505,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0
2,-0.785845,-0.899427,-1.3309,-0.921066,-1.140505,0.0,0.0,1.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0
3,0.674383,0.639102,-1.3309,-0.72527,-1.140505,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0
4,-0.785845,-0.901809,-1.3309,0.108862,-1.140505,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0


####  Save the processed data

In [9]:
# Save the processed data.
# The frame is written as CSV; index=False keeps the row index out of the file.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
output_path = r"C:\Users\USER\Documents\Customer Intention\df_post_camp_shap2.csv"
df_processed.to_csv(output_path, index=False)
print(f"df_post_camp_shap dataset saved to: {output_path}")

df_post_camp_shap dataset saved to: C:\Users\USER\Documents\Customer Intention\df_post_camp_shap2.csv
