In [202]:
import pickle
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mutual_info_score
from sklearn.base import BaseEstimator, TransformerMixin

## Load splits from 01_etl_eda

In [203]:
with open("../data/interim/train_test_split.pkl", "rb") as file:
    splits = pickle.load(file)
    X_train, X_test = splits["X_train"], splits["X_test"]
    y_train, y_test = splits["y_train"], splits["y_test"],
    id_train, id_test = splits["id_train"], splits["id_test"]

## Exploratory Feature Analysis

In [204]:
numerical_columns = X_train.select_dtypes(include=[np.number]).columns

In [205]:
correlation_with_churn = {}
for col in numerical_columns:
    correlation_with_churn[col] = X_train[col].corr(y_train)

sorted_corr = sorted(correlation_with_churn.items(),
                     key=lambda x: abs(x[1]),
                     reverse=True)

sorted_corr

[('age', np.float64(0.2851929796484342)),
 ('active_member', np.float64(-0.15628518523682997)),
 ('balance', np.float64(0.12669185759034565)),
 ('products_number', np.float64(-0.03367289044061732)),
 ('credit_score', np.float64(-0.026802433035470526)),
 ('estimated_salary', np.float64(0.014295668356474703)),
 ('tenure', np.float64(-0.011972242857265092)),
 ('credit_card', np.float64(0.000908695778592704))]

## Mutual Information against churn

###  For categorical features

In [206]:
categorical_columns = X_train.select_dtypes(include=["object", "category"]).columns.to_list()

In [207]:
def mutual_info_for_categorical_scores(series):
    return mutual_info_score(y_train, series)

mutual_infos = X_train[list(categorical_columns)].apply(mutual_info_for_categorical_scores)
round(mutual_infos.sort_values(ascending=False), 2)

country    0.01
gender     0.01
dtype: float64

### For numerical features

In [208]:
mutual_infos = X_train[list(numerical_columns)].apply(mutual_info_for_categorical_scores)
round(mutual_infos.sort_values(ascending=False), 2)



estimated_salary    0.51
balance             0.36
age                 0.08
products_number     0.07
credit_score        0.03
active_member       0.01
tenure              0.00
credit_card         0.00
dtype: float64

## Feature creation

In [209]:
class FeatureCreator(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.feature_names_ = X.columns.tolist()
        return self

    def transform(self, X):
        X = X.copy()
        X["balance_exists"] = (X["balance"] > 0).astype(int)
        X["is_senior"] = (X["age"] > 60).astype(int)

        return X

    def get_feature_names_out(self, input_features=None):
        return np.array(self.feature_names_ + ["balance_exists", "is_senior"])

In [210]:
feature_creator = FeatureCreator()
X_train_featured = feature_creator.fit_transform(X_train)
X_test_featured = feature_creator.fit_transform(X_test)

In [211]:
X_train_featured

Unnamed: 0,credit_score,country,gender,age,tenure,balance,products_number,credit_card,active_member,estimated_salary,balance_exists,is_senior
8700,709,Germany,Male,23,8,73314.04,2,1,0,63446.47,1,0
4438,672,Germany,Male,68,0,126061.51,2,1,1,184936.77,1,1
4923,651,Germany,Male,34,2,90355.12,2,0,0,193597.94,1,0
17,549,Spain,Female,24,9,0.00,2,1,1,14406.41,0,0
5382,724,France,Female,40,6,110054.45,1,1,1,86950.72,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1751,541,France,Male,29,7,127504.57,1,0,0,86173.92,1,0
8826,412,France,Male,29,5,0.00,2,0,0,12510.53,0,0
1235,523,Germany,Male,63,6,116227.27,1,1,1,119404.63,1,1
1286,579,Spain,Male,37,5,152212.88,2,0,0,120219.14,1,0


### Sklearn Pipelines

In [212]:
numerical_features = ["credit_score", "age", "tenure",
                      "balance", "products_number"]
categorical_features = ["country", "gender"]
binary_features = ["credit_card", "active_member", "balance_exists", "is_senior"]

categorical_transformer = Pipeline(steps=[
    ("encoder", OneHotEncoder(sparse_output=False))
])

preprocessor = ColumnTransformer(transformers=[
        ("num", "passthrough", numerical_features),
        ("cat", categorical_transformer, categorical_features),
        ("bin", "passthrough", binary_features)
    ],
    remainder="drop"
)

In [213]:
preprocessor.fit(X_train_featured)

In [214]:
preprocessor.feature_names_in_

array(['credit_score', 'country', 'gender', 'age', 'tenure', 'balance',
       'products_number', 'credit_card', 'active_member',
       'estimated_salary', 'balance_exists', 'is_senior'], dtype=object)

In [215]:
preprocessor.get_feature_names_out()

array(['num__credit_score', 'num__age', 'num__tenure', 'num__balance',
       'num__products_number', 'cat__country_France',
       'cat__country_Germany', 'cat__country_Spain', 'cat__gender_Female',
       'cat__gender_Male', 'bin__credit_card', 'bin__active_member',
       'bin__balance_exists', 'bin__is_senior'], dtype=object)

In [216]:
X_train_processed = preprocessor.transform(X_train_featured)
X_test_processed = preprocessor.transform(X_test_featured)

In [217]:
list(zip(preprocessor.get_feature_names_out(), X_train_processed[1]))

[('num__credit_score', np.float64(672.0)),
 ('num__age', np.float64(68.0)),
 ('num__tenure', np.float64(0.0)),
 ('num__balance', np.float64(126061.51)),
 ('num__products_number', np.float64(2.0)),
 ('cat__country_France', np.float64(0.0)),
 ('cat__country_Germany', np.float64(1.0)),
 ('cat__country_Spain', np.float64(0.0)),
 ('cat__gender_Female', np.float64(0.0)),
 ('cat__gender_Male', np.float64(1.0)),
 ('bin__credit_card', np.float64(1.0)),
 ('bin__active_member', np.float64(1.0)),
 ('bin__balance_exists', np.float64(1.0)),
 ('bin__is_senior', np.float64(1.0))]

In [218]:
#  Save for 03_modeling.ipynb
with open("../data/interim/processed_data.pkl", "wb") as file:
    pickle.dump({
        "X_train_processed": X_train_processed,
        "X_test_processed": X_test_processed,
        "y_train": y_train,
        "y_test": y_test,
        "feature_names": preprocessor.get_feature_names_out(),
        "id_train": id_train,
        "id_test": id_test
    }, file)