In [1]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline

import custom_helpers as ch

np.random.seed(0)

In [2]:
data = pd.read_csv('https://raw.githubusercontent.com/DPintacasi/BA1_Churn/main/data/train_month_3_with_target.csv',parse_dates = [29,30,32], index_col = 'client_id')


In [3]:
#non sample-dependent transformations
date_features = data.select_dtypes(include = ['datetime64'])
data['customer_since_all_years'] = (2018-data.customer_since_all.dt.year)
data['customer_since_bank_years'] = (2018-data.customer_since_bank.dt.year)
data['customer_age'] = (2018-data.customer_birth_date.dt.year)

#initialise target and select features

selected_col = ['homebanking_active'
              ,'has_homebanking'
              ,'bal_mortgage_loan'
              ,'has_life_insurance_decreasing_cap'
              ,'has_mortgage_loan'
              ,'has_current_account'
              ,'cap_life_insurance_decreasing_cap'
              ,'bal_savings_account'
              ,'bal_current_account'
              ,'has_personal_loan'
              ,'bal_personal_loan'
              ,'customer_since_all_years'
              ,'customer_since_bank_years'
              ,'customer_age'   
         ]
y = data.target
X = data.drop(columns = ['target'])
X = X[selected_col]
print(X.dtypes)
display(X.head(5))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

homebanking_active                     int64
has_homebanking                        int64
bal_mortgage_loan                      int64
has_life_insurance_decreasing_cap      int64
has_mortgage_loan                      int64
has_current_account                    int64
cap_life_insurance_decreasing_cap      int64
bal_savings_account                    int64
bal_current_account                    int64
has_personal_loan                      int64
bal_personal_loan                      int64
customer_since_all_years             float64
customer_since_bank_years            float64
customer_age                           int64
dtype: object


Unnamed: 0_level_0,homebanking_active,has_homebanking,bal_mortgage_loan,has_life_insurance_decreasing_cap,has_mortgage_loan,has_current_account,cap_life_insurance_decreasing_cap,bal_savings_account,bal_current_account,has_personal_loan,bal_personal_loan,customer_since_all_years,customer_since_bank_years,customer_age
client_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
910df42ad36243aa4ce16324cd7b15b0,0,0,0,0,0,1,0,22000,590,0,0,35.0,24.0,75
4e19dc3a54323c5bbfc374664b950cd1,1,1,0,0,0,1,0,10570,940,0,0,1.0,1.0,24
f5d08db1b86c0cb0f566bf446cff1fb4,1,1,0,0,0,1,0,15200,1210,0,0,38.0,38.0,82
26170ecf63653e215c52f4262c1c4859,0,0,0,0,0,0,0,29020,0,0,0,20.0,5.0,72
c078009957dffb64f20e61b41220a976,0,0,0,0,0,0,0,13650,0,0,0,6.0,6.0,22


In [4]:
X_train.columns

Index(['homebanking_active', 'has_homebanking', 'bal_mortgage_loan',
       'has_life_insurance_decreasing_cap', 'has_mortgage_loan',
       'has_current_account', 'cap_life_insurance_decreasing_cap',
       'bal_savings_account', 'bal_current_account', 'has_personal_loan',
       'bal_personal_loan', 'customer_since_all_years',
       'customer_since_bank_years', 'customer_age'],
      dtype='object')

In [5]:
#sample dependent column specific preprocessing
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer

binner = Pipeline(steps=[
    ('impute',SimpleImputer(missing_values=np.nan, strategy='median'))
    ,('bin',KBinsDiscretizer(n_bins=5, encode='ordinal'))
])

columns_to_bin = ['customer_since_all_years','customer_since_bank_years','customer_age']

preprocessor = ColumnTransformer(
    transformers=[
        ("bin", binner, ['customer_age']),
        ('impute',SimpleImputer(missing_values=np.nan, strategy='mean'),['customer_since_all_years','customer_since_bank_years'])
    ],
    remainder = "passthrough"
)

In [6]:
#classifier
lr = LogisticRegression(class_weight = 'balanced')
# lr = LogisticRegression()


#pipeline
pipe = Pipeline(
    steps=[("preprocessor", preprocessor),("classifier", lr)]
)


# train 
clf = pipe.fit(X_train,y_train)

# make prediction on test
y_pred_test = clf.predict(X_test)
y_pred_test_probs = clf.predict_proba(X_test)
y_pred_test_probs = [x[1] for x in y_pred_test_probs]

In [9]:
ch.evaluate(y_test, y_pred_test, clf.predict_proba(X_test))

------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.98      0.71      0.82     30889
        Churn       0.06      0.57      0.10       960

     accuracy                           0.70     31849
    macro avg       0.52      0.64      0.46     31849
 weighted avg       0.95      0.70      0.80     31849

AUC: 0.64 

------------------------------------------------------------
No. of TP (precision@250): 40
AUC: 0.500
------------------------------------------------------------
