In the import section, we import our custom transformers so that the code defined there can be executed from within this notebook.

In [None]:
import numpy as np
import seaborn as sb
import matplotlib as mb
import matplotlib.pyplot as plt
import plotly as pl
import pandas as pd
import sklearn as sk
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
import feature_engine as fe
import sys
sys.path.append('../')
from assets.transformers import pipeline, scaling_transformer

Load the CSV file and display a random 2.5% sample of it (the sample is only previewed, not stored).

In [4]:
# Read the raw bank-churners dataset from the project's assets folder.
bc = pd.read_csv("../assets/bank-churners.csv")

# Preview a seeded 2.5% random sample. The sample is only displayed, not
# stored, so `bc` still refers to the full dataset afterwards.
bc.sample(random_state=5, frac=0.025)

Unnamed: 0,CLIENTNUM,Attrition_Flag,Customer_Age,Gender,Dependent_count,Education_Level,Marital_Status,Income_Category,Card_Category,Months_on_book,...,Credit_Limit,Total_Revolving_Bal,Avg_Open_To_Buy,Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1,Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2
1646,716870583,Existing Customer,36,F,3,Unknown,Married,Less than $40K,Blue,36,...,2786.0,1628,1158.0,1.314,2853,55,0.667,0.584,0.000385,0.999610
7880,708307533,Existing Customer,42,M,2,Uneducated,Married,$60K - $80K,Blue,30,...,3086.0,0,3086.0,0.808,4129,81,0.884,0.000,0.000197,0.999800
7586,713381433,Attrited Customer,38,F,1,High School,Married,Less than $40K,Blue,28,...,4196.0,731,3465.0,0.485,1868,30,0.200,0.174,0.996390,0.003609
9646,710667708,Existing Customer,47,M,2,Uneducated,Single,$60K - $80K,Blue,36,...,12510.0,1871,10639.0,0.748,14018,128,0.778,0.150,0.000185,0.999820
7165,708108333,Attrited Customer,47,F,3,Graduate,Married,Unknown,Blue,36,...,5590.0,0,5590.0,0.010,1507,32,0.000,0.000,0.991200,0.008800
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3950,806947308,Attrited Customer,40,M,4,Post-Graduate,Married,$60K - $80K,Blue,35,...,20178.0,1553,18625.0,0.324,1735,44,0.375,0.077,0.997790,0.002211
9364,818239983,Existing Customer,53,M,2,Unknown,Divorced,$80K - $120K,Blue,48,...,15594.0,1590,14004.0,0.679,13016,118,0.616,0.102,0.000061,0.999940
5158,716256483,Existing Customer,50,F,2,Graduate,Single,Unknown,Blue,36,...,2611.0,1824,787.0,0.711,4232,80,0.633,0.699,0.000327,0.999670
3856,794682483,Existing Customer,43,F,3,Post-Graduate,Unknown,Less than $40K,Blue,23,...,4284.0,1094,3190.0,0.630,3095,64,0.524,0.255,0.000369,0.999630


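Use the imported pipeline, which was recently fitted and saved to a local folder, to clean the data, then check the shape and the columns of the result. Note that the cell below calls `fit_transform`, so the pipeline is fitted again on this dataset.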

In [5]:
# Clean the raw frame with the pipeline imported from assets.transformers.
# NOTE(review): `fit_transform` fits the pipeline on `bc` here rather than
# reusing an earlier fit — confirm this is intended.
processed_bc = pipeline.fit_transform(bc)
# Sanity check: report the dimensions and column names of the cleaned frame.
print(f"Processed data shape: {processed_bc.shape}")
print("Columns after processing:", processed_bc.columns.tolist())

Processed data shape: (7847, 10)
Columns after processing: ['Customer_Status', 'Age', 'Gender', 'Income_Level', 'Tenure_Months', 'Inactive_Months_In_Last_12', 'Credit_Limit', 'Total_Trans_Amount', 'Total_Trans_Count', 'Avg_Utilization_Ratio']


Apply the scaling/encoding transformer to the processed data and print the resulting array (for future reference).

In [6]:
# Fit the scaling/encoding transformer imported from assets.transformers on the
# cleaned frame and keep the transformed result.
scaled_data = scaling_transformer.fit_transform(processed_bc)
# Report the dimensions of the transformed result.
print(f"Scaled data shape: {scaled_data.shape}")
# Bare last expression so the notebook displays the transformed data.
scaled_data

Scaled data shape: (7847, 17)


array([[-0.1734543 ,  0.43834308, -1.47491134, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.35538334,  1.14441052, -1.47491134, ...,  0.        ,
         1.        ,  0.        ],
       [ 0.61980216,  0.01470262, -1.47491134, ...,  1.        ,
         0.        ,  0.        ],
       ...,
       [ 0.75201157,  0.01470262,  0.84957737, ...,  0.        ,
         1.        ,  0.        ],
       [ 1.14863979,  1.56805099,  0.84957737, ...,  0.        ,
         0.        ,  1.        ],
       [-2.15659543,  0.01470262,  0.84957737, ...,  0.        ,
         0.        ,  0.        ]], shape=(7847, 17))

Create a loadable 2.5% sample of the processed data.

In [7]:
# Draw a seeded 2.5% random sample of the cleaned data to serve as the small,
# loadable dataset; the bare name on the last line displays it.
bc_load = processed_bc.sample(random_state=5, frac=0.025)
bc_load

Unnamed: 0,Customer_Status,Age,Gender,Income_Level,Tenure_Months,Inactive_Months_In_Last_12,Credit_Limit,Total_Trans_Amount,Total_Trans_Count,Avg_Utilization_Ratio
1301,Existing Customer,41,F,Unknown,30,1,5417.0,1768,56,0.35
9200,Attrited Customer,60,M,$40K - $60K,47,1,3735.0,8193,66,0.32
8945,Existing Customer,47,M,Less than $40K,34,1,8390.0,8316,92,0.00
4643,Existing Customer,44,M,$60K - $80K,37,2,1682.0,3309,67,0.55
7393,Existing Customer,47,F,Less than $40K,36,2,2500.0,4265,87,0.47
...,...,...,...,...,...,...,...,...,...,...
1706,Existing Customer,31,M,$40K - $60K,22,1,9096.0,2318,54,0.00
6084,Existing Customer,50,F,Less than $40K,38,3,2775.0,4608,84,0.53
1291,Existing Customer,38,F,Less than $40K,29,3,3590.0,1472,37,0.54
7459,Attrited Customer,39,F,$40K - $60K,31,3,3333.0,3348,50,0.56


Export the new dataset.

In [None]:
# Opt-in export: set the flag to True to write the small processed sample to
# the assets folder. It defaults to False so that "Restart & Run All" never
# (over)writes the CSV unless explicitly requested.
EXPORT_PROCESSED_SAMPLE_CSV = False

if EXPORT_PROCESSED_SAMPLE_CSV:
    # index=False keeps the sampled row labels out of the output file.
    bc_load.to_csv("../assets/processed-bank-churners.csv", index=False)

Create a larger (15%) sample of the processed data for Tableau.

In [8]:
# Draw a seeded 15% random sample of the cleaned data for use in Tableau;
# the bare name on the last line displays it.
bc_load_2 = processed_bc.sample(random_state=5, frac=0.15)
bc_load_2

Unnamed: 0,Customer_Status,Age,Gender,Income_Level,Tenure_Months,Inactive_Months_In_Last_12,Credit_Limit,Total_Trans_Amount,Total_Trans_Count,Avg_Utilization_Ratio
1301,Existing Customer,41,F,Unknown,30,1,5417.0,1768,56,0.35
9200,Attrited Customer,60,M,$40K - $60K,47,1,3735.0,8193,66,0.32
8945,Existing Customer,47,M,Less than $40K,34,1,8390.0,8316,92,0.00
4643,Existing Customer,44,M,$60K - $80K,37,2,1682.0,3309,67,0.55
7393,Existing Customer,47,F,Less than $40K,36,2,2500.0,4265,87,0.47
...,...,...,...,...,...,...,...,...,...,...
8764,Existing Customer,52,F,$40K - $60K,42,3,3416.0,8254,75,0.58
6757,Existing Customer,43,F,Less than $40K,33,4,1884.0,4591,85,0.64
7419,Attrited Customer,51,F,Less than $40K,41,3,2791.0,2584,49,0.34
6389,Existing Customer,37,F,Less than $40K,26,2,2586.0,4883,73,0.32


Write the dataframe to a new CSV file.

In [None]:
# Opt-in export: set the flag to True to write the larger Tableau sample to the
# assets folder. It defaults to False so that "Restart & Run All" never
# (over)writes the CSV unless explicitly requested.
EXPORT_TABLEAU_CSV = False

if EXPORT_TABLEAU_CSV:
    # index=False keeps the sampled row labels out of the output file.
    bc_load_2.to_csv("../assets/processed-bank-churners_tableau.csv", index=False)

Write the dataframe to a new Excel file.

In [None]:
# Opt-in export: set the flag to True to write the larger Tableau sample to an
# Excel workbook in the current working directory. It defaults to False so that
# "Restart & Run All" never (over)writes the file unless explicitly requested.
# Writing .xlsx requires an Excel writer engine (e.g. openpyxl) to be installed.
EXPORT_TABLEAU_XLSX = False

if EXPORT_TABLEAU_XLSX:
    # index=False keeps the sampled row labels out of the output file.
    bc_load_2.to_excel("processed_bank_churners.xlsx", index=False)