Import all necessary libraries.

In [None]:
import numpy as np
import seaborn as sb
import matplotlib as mb
import matplotlib.pyplot as plt
import plotly as pl
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import feature_engine as fe

Load the CSV dataset and display a small random sample of it for inspection.

In [81]:
# Load the bank-churners dataset; the path is relative to the notebook's working directory.
bc = pd.read_csv("../assets/bank-churners.csv")
# Show a 2.5% random sample with a fixed seed so the preview is reproducible.
# The sample is only displayed, not stored: `bc` still holds the full dataset.
bc.sample(frac=0.025, random_state=5)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
1646,716870583,Existing Customer,36,F,3,Unknown,Married,Less than $40K,Blue,36,...,2786.0,1628,1158.0,1.314,2853,55,0.667,0.584,0.000385,0.999610
7880,708307533,Existing Customer,42,M,2,Uneducated,Married,$60K - $80K,Blue,30,...,3086.0,0,3086.0,0.808,4129,81,0.884,0.000,0.000197,0.999800
7586,713381433,Attrited Customer,38,F,1,High School,Married,Less than $40K,Blue,28,...,4196.0,731,3465.0,0.485,1868,30,0.200,0.174,0.996390,0.003609
9646,710667708,Existing Customer,47,M,2,Uneducated,Single,$60K - $80K,Blue,36,...,12510.0,1871,10639.0,0.748,14018,128,0.778,0.150,0.000185,0.999820
7165,708108333,Attrited Customer,47,F,3,Graduate,Married,Unknown,Blue,36,...,5590.0,0,5590.0,0.010,1507,32,0.000,0.000,0.991200,0.008800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3950,806947308,Attrited Customer,40,M,4,Post-Graduate,Married,$60K - $80K,Blue,35,...,20178.0,1553,18625.0,0.324,1735,44,0.375,0.077,0.997790,0.002211
9364,818239983,Existing Customer,53,M,2,Unknown,Divorced,$80K - $120K,Blue,48,...,15594.0,1590,14004.0,0.679,13016,118,0.616,0.102,0.000061,0.999940
5158,716256483,Existing Customer,50,F,2,Graduate,Single,Unknown,Blue,36,...,2611.0,1824,787.0,0.711,4232,80,0.633,0.699,0.000327,0.999670
3856,794682483,Existing Customer,43,F,3,Post-Graduate,Unknown,Less than $40K,Blue,23,...,4284.0,1094,3190.0,0.630,3095,64,0.524,0.255,0.000369,0.999630


Define transformers.

In [82]:
# drop specific columns
def drop_columns(bc):
    """Return a new frame without the columns this analysis does not use.

    Listed columns that are absent from ``bc`` are skipped silently
    (``errors="ignore"``), so the function also works on a frame that has
    already been trimmed.
    """
    unused_columns = [
        "CLIENTNUM",
        "Dependent_count",
        "Education_Level",
        "Marital_Status",
        "Card_Category",
        "Total_Relationship_Count",
        "Contacts_Count_12_mon",
        "Total_Revolving_Bal",
        "Avg_Open_To_Buy",
        "Total_Amt_Chng_Q4_Q1",
        "Total_Ct_Chng_Q4_Q1",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1",
        "Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2",
    ]
    trimmed = bc.drop(columns=unused_columns, errors="ignore")
    return trimmed

# convert data types
def convert_data_types(bc):
    """Cast the known columns to their expected dtypes and return a new frame.

    Only columns that are actually present in ``bc`` are converted, so the
    function can be applied to a frame from which some columns were already
    dropped. All other columns are passed through unchanged.

    The input frame is not modified: the casts are applied with a single
    ``DataFrame.astype`` call, which returns a new object.

    Parameters
    ----------
    bc : pandas.DataFrame
        Frame holding some or all of the columns listed in ``target_dtypes``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the present columns cast to ``str``/``int``/``float``.

    Raises
    ------
    Whatever ``DataFrame.astype`` raises when a value cannot be cast, e.g. a
    non-numeric entry in a column that is converted to ``int``.
    """
    target_dtypes = {
        "Attrition_Flag": str,
        "Customer_Age": int,
        "Gender": str,
        "Income_Category": str,
        "Months_on_book": int,
        "Months_Inactive_12_mon": int,
        "Credit_Limit": float,
        "Total_Trans_Amt": int,
        "Total_Trans_Ct": int,
        "Avg_Utilization_Ratio": float,
    }
    # astype raises KeyError for unknown labels, so restrict the mapping to
    # the columns that exist in this frame.
    present = {
        column: dtype
        for column, dtype in target_dtypes.items()
        if column in bc.columns
    }
    return bc.astype(present)

# remove outliers using IQR method
def remove_outliers(bc):
    """Return a copy of ``bc`` without rows outside the 1.5 * IQR fences.

    The numeric columns are processed one after another. For each column the
    quartiles are computed on the rows that survived the filters of the
    columns before it, and rows outside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    (bounds inclusive) are dropped. Columns missing from ``bc`` are skipped.
    """
    numeric_columns = (
        "Customer_Age",
        "Months_on_book",
        "Months_Inactive_12_mon",
        "Credit_Limit",
        "Total_Trans_Amt",
        "Total_Trans_Ct",
        "Avg_Utilization_Ratio",
    )
    filtered = bc.copy()

    for name in numeric_columns:
        if name not in filtered.columns:
            continue
        lower_quartile = filtered[name].quantile(0.25)
        upper_quartile = filtered[name].quantile(0.75)
        spread = upper_quartile - lower_quartile
        low_fence = lower_quartile - 1.5 * spread
        high_fence = upper_quartile + 1.5 * spread
        # between() is inclusive on both ends by default
        within_fences = filtered[name].between(low_fence, high_fence)
        filtered = filtered[within_fences]

    return filtered

# rename columns
def rename_columns(bc):
    """Return a new frame with the raw column labels replaced by friendlier ones.

    Labels that are not in the mapping, and mapping keys that are not in
    ``bc``, are left alone.
    """
    new_labels = {
        "Customer_Age": "Age",
        "Attrition_Flag": "Customer_Status",
        "Income_Category": "Income_Level",
        "Months_on_book": "Tenure_Months",
        "Months_Inactive_12_mon": "Inactive_Months_in_Last_12",
        "Total_Trans_Amt": "Total_Trans_Amount",
        "Total_Trans_Ct": "Total_Trans_Count",
    }
    relabelled = bc.rename(columns=new_labels)
    return relabelled

# capitalize column names
def capitalize_columns(bc):
    """Return a new frame whose column labels are title-cased.

    ``str.title`` upper-cases the first letter of every word and lower-cases
    the rest, treating ``_`` as a word separator, e.g.
    ``"Inactive_Months_in_Last_12"`` becomes ``"Inactive_Months_In_Last_12"``.

    The input frame is not modified; the relabelling is done through
    ``DataFrame.rename``, which returns a new object.

    Parameters
    ----------
    bc : pandas.DataFrame
        Frame whose column labels are all strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same data and title-cased column labels.
    """
    return bc.rename(columns=lambda label: label.title())

# drop missing values
def drop_missing_values(bc):
    """Return a new frame keeping only the rows of ``bc`` with no missing value."""
    complete_rows = bc.dropna()
    return complete_rows

# remove duplicates
def remove_duplicates(bc):
    """Return a new frame with fully duplicated rows removed (first occurrence kept)."""
    unique_rows = bc.drop_duplicates()
    return unique_rows

# round numerical values
def round_values(bc):
    """Return a new frame with numeric columns rounded to two decimal places.

    Non-numeric columns are passed through unchanged by ``DataFrame.round``.
    """
    rounded = bc.round(decimals=2)
    return rounded


# scale numerical values and encode categorical values
# The column labels below are the renamed, title-cased ones: e.g.
# "Inactive_Months_In_Last_12" only exists after rename_columns and
# capitalize_columns have both been applied to the frame.
scaling_transformer = ColumnTransformer(
    transformers=[
        # standardise the numeric features
        (
            "num",
            StandardScaler(),
            [
                "Age",
                "Tenure_Months",
                "Inactive_Months_In_Last_12",
                "Credit_Limit",
                "Total_Trans_Amount",
                "Total_Trans_Count",
                "Avg_Utilization_Ratio",
            ],
        ),
        # one-hot encode the categorical features
        (
            "cat",
            OneHotEncoder(),
            ["Customer_Status", "Gender", "Income_Level"],
        ),
    ]
)


# Wrap each cleaning function defined above in a scikit-learn
# FunctionTransformer. No extra arguments are passed, so every wrapper is
# built from the bare function with FunctionTransformer's defaults.
# Presumably these are meant to be used as steps of the imported Pipeline
# (not shown in this cell).
# NOTE(review): remove_outliers, drop_missing_values and remove_duplicates
# drop rows; confirm that is compatible with how the transformers are used.
drop_columns_transformer = FunctionTransformer(drop_columns)
convert_data_types_transformer = FunctionTransformer(convert_data_types)
remove_outliers_one_transformer = FunctionTransformer(remove_outliers)
rename_columns_transformer = FunctionTransformer(rename_columns)
capitalize_columns_transformer = FunctionTransformer(capitalize_columns)
drop_missing_values_transformer = FunctionTransformer(drop_missing_values)
remove_duplicates_transformer = FunctionTransformer(remove_duplicates)
round_values_transformer = FunctionTransformer(round_values)
