Import pandas, NumPy, and scikit-learn

In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

Load the dataset

* Data source: engineeredbank_churn.csv
* Description: Dataset after feature engineering including engagement, financial, and risk features.

In [31]:
# Load the feature-engineered churn dataset.
# Raw string: the Windows path contains backslashes (\c, \d, \e) that a normal
# string literal would try to parse as (invalid) escape sequences.
df = pd.read_csv(r'C:\customerchurnprediction\data\engineered\engineeredbank_churn.csv')

In [32]:
# Remove the 'gender' and 'country' columns. Reassign instead of mutating
# in place so the cell produces its result explicitly.
df = df.drop(columns=['gender', 'country'])

In [33]:
# Preview the frame. A notebook cell renders only its final expression, so the
# earlier results are shown explicitly with display() (provided by the
# IPython/Jupyter kernel); the column index remains the cell's output value.
display(df.head())
display(df.shape)
df.columns

Index(['credit_score', 'age', 'tenure', 'balance', 'products_number',
       'credit_card', 'active_member', 'estimated_salary', 'churn',
       'age_group', 'inactive_single_product', 'products_per_tenure',
       'zero_balance', 'high_balance', 'balance_per_product',
       'credit_score_band', 'early_customer', 'churn_risk_score'],
      dtype='object')

Separating features and the target variable

In [34]:
# Target vector: the 'churn' column; feature matrix: every remaining column.
y = df['churn']
x = df.drop('churn', axis=1)

Separating feature types

In [35]:
# Different feature types need different preprocessing techniques.
# Continuous / count features: median-imputed and standardised downstream.
# 'estimated_salary' is listed here so it is scaled like the other continuous
# columns instead of falling through the transformer's passthrough remainder.
numerical_features = [
    'age', 'tenure', 'balance', 'credit_score', 'products_number',
    'products_per_tenure', 'balance_per_product', 'churn_risk_score',
    'estimated_salary',
]
# 0/1 (or boolean) flags: not listed in any transformer, so they reach the
# model unchanged via the passthrough remainder.
binary_features = [
    'credit_card', 'active_member', 'inactive_single_product',
    'zero_balance', 'high_balance', 'early_customer',
]
# String-valued bands: one-hot encoded downstream.
categorical_features = ['age_group', 'credit_score_band']

Checking for missing values


In [36]:
# Count missing values per listed feature column. This is not the cell's last
# expression, so it is shown explicitly with display() (provided by the
# IPython/Jupyter kernel); otherwise the result would be silently discarded.
display(df[numerical_features + binary_features + categorical_features].isnull().sum())
# Column dtypes, rendered as the cell's output value.
df.dtypes

credit_score                 int64
age                          int64
tenure                       int64
balance                    float64
products_number              int64
credit_card                  int64
active_member                int64
estimated_salary           float64
churn                        int64
age_group                   object
inactive_single_product       bool
products_per_tenure        float64
zero_balance                 int64
high_balance                 int64
balance_per_product        float64
credit_score_band           object
early_customer               int64
churn_risk_score             int64
dtype: object

In [37]:
# Count infinite values in each numeric column.
df.select_dtypes(include=[np.number]).pipe(np.isinf).sum()

credit_score           0
age                    0
tenure                 0
balance                0
products_number        0
credit_card            0
active_member          0
estimated_salary       0
churn                  0
products_per_tenure    0
zero_balance           0
high_balance           0
balance_per_product    0
early_customer         0
churn_risk_score       0
dtype: int64

Train-test split (stratified)

In [38]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

Build a preprocessing pipeline

In [39]:
# Numeric columns: fill gaps with the median, then standardise.
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode, dropping the first level of each feature.
categorical_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', drop='first')),
])

# Route each column group to its pipeline; any column not listed in either
# group is passed through unchanged.
preprocessor = ColumnTransformer(
    [
        ('num', numeric_pipeline, numerical_features),
        ('cat', categorical_pipeline, categorical_features),
    ],
    remainder='passthrough',
)

Sanity check: fit the preprocessor on the training set only, then apply it to the test set

In [40]:
# Fit the preprocessor on the training split only, then reuse the fitted
# transformer on the test split.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# A notebook cell renders only its final expression, so the training shape is
# shown explicitly with display() (provided by the IPython/Jupyter kernel);
# the test shape remains the cell's output value.
display(X_train_processed.shape)
X_test_processed.shape

(2000, 23)

Save model datasets

In [42]:
# Persist the raw (un-transformed) splits, one CSV per split. y_test is written
# to its own y_test.csv so it cannot overwrite the X_test.csv features file.
# Raw strings keep the backslashes in the Windows paths from being parsed as
# escape sequences.
X_train.to_csv(r'C:\customerchurnprediction\data\model_input\X_train.csv', index=False)
X_test.to_csv(r'C:\customerchurnprediction\data\model_input\X_test.csv', index=False)
y_train.to_csv(r'C:\customerchurnprediction\data\model_input\y_train.csv', index=False)
y_test.to_csv(r'C:\customerchurnprediction\data\model_input\y_test.csv', index=False)