#### Import Required Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, FunctionTransformer, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import pickle

In [2]:
# Record the library versions this notebook was executed with.
print(
    f"Numpy version: {np.__version__}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

Numpy version: 2.3.5
Pandas version: 2.3.3


#### Import Dataset

In [3]:
# load dataset from a CSV file; the path is relative to the notebook's working directory
train = pd.read_csv("../data/train.csv")

In [4]:
# dataset shape as a (rows, columns) tuple
train.shape

(8000, 11)

In [5]:
# preview the first five rows (head() defaults to n=5)
train.head()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,753,France,Male,57,7,0.0,1,1,0,159475.08,1
1,739,Germany,Male,32,3,102128.27,1,1,0,63981.37,1
2,755,Germany,Female,37,0,113865.23,2,1,1,117396.25,0
3,561,France,Male,37,5,0.0,2,1,0,83093.25,0
4,692,Germany,Male,49,6,110540.43,2,0,1,107472.99,0


In [6]:
# preview the last five rows (tail() defaults to n=5)
train.tail()

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
7995,688,Spain,Female,35,6,0.0,1,1,0,25488.43,1
7996,712,Spain,Male,74,5,0.0,2,0,0,151425.82,0
7997,667,France,Female,37,9,71786.9,2,1,1,67734.79,0
7998,687,Spain,Male,35,8,100988.39,2,1,0,22247.27,0
7999,802,Spain,Male,51,7,0.0,1,0,1,40855.79,0


In [7]:
# print random five rows; the fixed random_state makes the same rows appear
# on every Restart & Run All instead of a different draw each time
train.sample(5, random_state=42)

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
2016,642,Germany,Female,30,5,129753.69,1,1,0,582.53,0
395,730,France,Female,35,0,155470.55,1,1,1,53718.28,0
91,642,France,Male,26,0,0.0,1,0,0,47472.68,0
6478,552,Germany,Male,50,4,121175.56,1,1,0,117505.07,1
1467,739,Spain,Male,40,1,109681.61,1,1,1,193321.3,0


In [8]:
# dataset information: column names, non-null counts, dtypes and memory usage
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   CreditScore      8000 non-null   int64  
 1   Geography        8000 non-null   object 
 2   Gender           8000 non-null   object 
 3   Age              8000 non-null   int64  
 4   Tenure           8000 non-null   int64  
 5   Balance          8000 non-null   float64
 6   NumOfProducts    8000 non-null   int64  
 7   HasCrCard        8000 non-null   int64  
 8   IsActiveMember   8000 non-null   int64  
 9   EstimatedSalary  8000 non-null   float64
 10  Exited           8000 non-null   int64  
dtypes: float64(2), int64(7), object(2)
memory usage: 687.6+ KB


In [9]:
# split the frame: "Exited" is the dependent variable, every other column is a feature
y_train = train["Exited"]
X_train = train.drop(columns="Exited")

In [10]:
# inspect the feature matrix (all columns of `train` except "Exited")
X_train

Unnamed: 0,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary
0,753,France,Male,57,7,0.00,1,1,0,159475.08
1,739,Germany,Male,32,3,102128.27,1,1,0,63981.37
2,755,Germany,Female,37,0,113865.23,2,1,1,117396.25
3,561,France,Male,37,5,0.00,2,1,0,83093.25
4,692,Germany,Male,49,6,110540.43,2,0,1,107472.99
...,...,...,...,...,...,...,...,...,...,...
7995,688,Spain,Female,35,6,0.00,1,1,0,25488.43
7996,712,Spain,Male,74,5,0.00,2,0,0,151425.82
7997,667,France,Female,37,9,71786.90,2,1,1,67734.79
7998,687,Spain,Male,35,8,100988.39,2,1,0,22247.27


#### Engineering New Features

In [11]:
def engg_features(X):
    """Return a new DataFrame: ``X`` plus six engineered feature columns.

    The input frame is left untouched. This matters because the function is
    wrapped in a ``FunctionTransformer``: mutating ``X`` in place would add
    the engineered columns to whatever frame the caller passed to the
    pipeline (e.g. the training features) as a hidden side effect.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain the columns ``Balance``, ``EstimatedSalary``, ``Tenure``,
        ``Age``, ``CreditScore`` and ``IsActiveMember``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X`` with these columns appended (in this order):
        ``BalanceSalaryRatio``, ``TenureByAge``, ``CreditScoreGivenAge``,
        ``HasBalance`` (1 if ``Balance > 0`` else 0), ``ActiveByAge`` and
        ``AgeCategory`` (``Young`` for (0, 35], ``MiddleAge`` for (35, 55],
        ``Senior`` above 55).

    Notes
    -----
    NOTE(review): the ratios are plain divisions, so a zero
    ``EstimatedSalary`` or ``Age`` produces inf/NaN, and an ``Age`` <= 0
    falls outside every bin and yields a missing ``AgeCategory``. Neither
    case is guarded here; confirm the data cannot contain such rows.
    """
    age = X["Age"]
    balance = X["Balance"]

    # DataFrame.assign builds and returns a new frame, so the caller's X
    # keeps its original columns.
    return X.assign(
        BalanceSalaryRatio=balance / X["EstimatedSalary"],
        TenureByAge=X["Tenure"] / age,
        CreditScoreGivenAge=X["CreditScore"] / age,
        HasBalance=np.where(balance > 0, 1, 0),
        ActiveByAge=X["IsActiveMember"] * age,
        AgeCategory=pd.cut(
            age,
            bins=[0, 35, 55, np.inf],
            labels=["Young", "MiddleAge", "Senior"],
        ),
    )
    
# Wrap the feature-engineering function so it can be used as a pipeline step.
feature_engg_transformer = FunctionTransformer(func=engg_features, validate=False)

#### Preprocessing Existing Features

In [12]:
# Column groups, one per treatment applied by the ColumnTransformer below.
ohe_cols = ["Geography", "AgeCategory"]        # one-hot encoded
ord_cols = ["Gender"]                          # ordinal encoded
drop_cols = ["HasCrCard", "EstimatedSalary"]   # removed from the output

preprocessor = ColumnTransformer(
    transformers=[
        # NOTE(review): with drop="first" and handle_unknown="ignore", a
        # category unseen during fit encodes as all zeros, which is the same
        # encoding as the dropped first level. Confirm this is acceptable.
        (
            "ohe",
            OneHotEncoder(drop="first", handle_unknown="ignore", sparse_output=False),
            ohe_cols,
        ),
        ("ord", OrdinalEncoder(), ord_cols),
        # log1p (log(1 + x)) is defined at 0, so zero balances are handled.
        ("log_transformer", FunctionTransformer(func=np.log1p), ["Balance"]),
        ("drop", "drop", drop_cols),
    ],
    # Columns not named above pass through unchanged.
    remainder="passthrough",
    # Keep plain column names, without the "<transformer>__" prefix.
    verbose_feature_names_out=False,
)

#### Pipeline Building

In [13]:
# Three stages: feature engineering -> column-wise encoding -> standardisation.
# The encoder and scaler are set to emit pandas DataFrames so the column
# names are carried through to the final output.
pipeline = Pipeline(
    steps=[
        ("feature_engg", feature_engg_transformer),
        ("preprocessor", preprocessor.set_output(transform="pandas")),
        ("scaler", StandardScaler().set_output(transform="pandas")),
    ]
)

# Fit every stage on the training features and transform them in one call.
X_processed = pipeline.fit_transform(X_train)

#### Dataset After Transformation

In [14]:
# inspect the output of pipeline.fit_transform on the training features
X_processed

Unnamed: 0,Geography_Germany,Geography_Spain,AgeCategory_Senior,AgeCategory_Young,Gender,Balance,CreditScore,Age,Tenure,NumOfProducts,IsActiveMember,BalanceSalaryRatio,TenureByAge,CreditScoreGivenAge,HasBalance,ActiveByAge
0,-0.578313,-0.577735,3.379698,-0.844213,0.907507,-1.328266,1.058568,1.715086,0.684723,-0.910256,-1.030206,-0.034927,-0.168165,-0.867237,-1.329364,-0.949038
1,1.729169,-0.577735,-0.295884,1.184535,0.907507,0.729917,0.913626,-0.659935,-0.696202,-0.910256,-1.030206,-0.021722,-0.493808,0.971402,0.752239,-0.949038
2,1.729169,-0.577735,-0.295884,-0.844213,-1.101919,0.749329,1.079274,-0.184931,-1.731895,0.808830,0.970680,-0.026903,-1.544467,0.471272,0.752239,0.764487
3,-0.578313,-0.577735,-0.295884,-0.844213,0.907507,-1.328266,-0.929207,-0.184931,-0.005739,0.808830,-1.030206,-0.034927,-0.030004,-0.504161,-1.329364,-0.949038
4,1.729169,-0.577735,-0.295884,-0.844213,0.907507,0.744041,0.427035,0.955079,0.339492,0.808830,0.970680,-0.026418,-0.172178,-0.697586,0.752239,1.320225
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-0.578313,1.730897,-0.295884,1.184535,-1.101919,-1.328266,0.385623,-0.374933,0.339492,-0.910256,-1.030206,-0.034927,0.376738,0.332068,-1.329364,-0.949038
7996,-0.578313,1.730897,3.379698,-0.844213,0.907507,-1.328266,0.634095,3.330101,-0.005739,0.808830,-1.030206,-0.034927,-0.787235,-1.534903,-1.329364,-0.949038
7997,-0.578313,-0.577735,-0.295884,-0.844213,-1.101919,0.667011,0.168210,-0.184931,1.375185,0.808830,0.970680,-0.026159,1.181567,0.028808,0.752239,0.764487
7998,-0.578313,1.730897,-0.295884,1.184535,0.907507,0.727914,0.375270,-0.374933,1.029954,0.808830,-1.030206,0.002626,1.017139,0.326753,0.752239,-0.949038


#### Export Pipeline

In [15]:
# Serialise the fitted pipeline. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE: pickle stores functions by reference (module + name), so the code that
# loads this file must have `engg_features` defined under the same module path.
with open("../models/pipeline.pkl", "wb") as model_file:
    pickle.dump(pipeline, model_file)