In [1]:
%matplotlib inline
import datetime as dt

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, FunctionTransformer, KBinsDiscretizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline

import custom_helpers as ch

# Seed NumPy's global RNG so numpy-driven randomness further down is repeatable.
np.random.seed(0)

# Three monthly snapshots (months 3, 2 and 1), loaded in that order through the
# project helper. Judging by the file name, only the month-3 file carries `target`.
df3, df2, df1 = map(
    ch.load_data,
    (
        '../data/train_month_3_with_target.csv',
        '../data/train_month_2.csv',
        '../data/train_month_1.csv',
    ),
)

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------


In [2]:
#non sample-dependent transformations
def sample_agnostic_transformation(df3,df2,df1,mode='train'):
    """Build the feature frame (and labels) from the monthly snapshots.

    Only transformations that do not depend on the sample (no fitting, no
    statistics learned from the data) are applied here, so the function can be
    used on both training and scoring data.

    Parameters
    ----------
    df3 : pandas.DataFrame
        Most recent snapshot. Must contain every column in ``selected_col``
        plus ``bal_current_account``; in ``'train'`` mode it must also
        contain ``target``.
    df2 : pandas.DataFrame
        Previous snapshot; only ``bal_current_account`` is read from it.
    df1 : pandas.DataFrame
        Accepted for interface symmetry; not used by the current feature set.
    mode : {'train', 'test'}, default 'train'
        ``'train'`` returns the ``target`` column as ``y``;
        ``'test'`` returns the placeholder ``0`` as ``y``.

    Returns
    -------
    X : pandas.DataFrame
        The selected columns plus ``bal_current_account_delta_-1``.
    y : pandas.Series or int
        ``df3.target`` in train mode, ``0`` in test mode.

    Raises
    ------
    ValueError
        If ``mode`` is neither ``'train'`` nor ``'test'``.
    """
    selected_col = [
        'homebanking_active',
        'bal_mortgage_loan',
        'has_life_insurance_decreasing_cap',
        'has_current_account',
        'bal_savings_account',
        'has_personal_loan',
        'customer_since_all_years',
        'customer_age',
        'customer_children',
        'customer_education',
    ]

    if mode == 'train':
        y = df3.target
    elif mode == 'test':
        y = 0
    else:
        raise ValueError("mode must be 'train' or 'test', got %r" % (mode,))

    # `target` is not among the selected columns, so selecting straight from
    # df3 works for both modes. The explicit copy keeps the column assignment
    # below from writing into (or warning about) a slice of df3.
    X = df3[selected_col].copy()

    # Month-over-month change of the current-account balance. The subtraction
    # aligns on the row index, so df3 and df2 must share the same index.
    # (A matching delta for bal_savings_account was tried and is disabled.)
    X['bal_current_account_delta_-1'] = df3['bal_current_account']-df2['bal_current_account']

    return X, y

In [3]:
# Build the training feature frame and labels from the three monthly snapshots.
X, y = sample_agnostic_transformation(df3, df2, df1, mode='train')
X

Unnamed: 0,homebanking_active,bal_mortgage_loan,has_life_insurance_decreasing_cap,has_current_account,bal_savings_account,has_personal_loan,customer_since_all_years,customer_age,customer_children,customer_education,bal_current_account_delta_-1
0,False,0,False,True,22000,False,35.0,75,unknown,0.0,-1520
1,True,0,False,True,10570,False,1.0,24,mature,0.0,-120
2,True,0,False,True,15200,False,38.0,82,unknown,0.0,-10
3,False,0,False,False,29020,False,20.0,72,unknown,0.0,0
4,False,0,False,False,13650,False,6.0,22,mature,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...
63692,True,0,False,True,17060,False,20.0,36,no,0.0,-170
63693,False,13030,False,True,22400,False,29.0,53,unknown,1.0,460
63694,False,179550,True,True,6820,False,27.0,42,unknown,3.0,-310
63695,False,0,False,False,5690,False,4.0,67,no,0.0,0


In [4]:
# Hold out 33% of the rows for evaluation. Stratifying on y keeps the class
# ratio the same in both splits; random_state pins the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, stratify=y, random_state=42
)

# dtype-based column selectors used when wiring up the preprocessing step.
# (make_column_selector is imported in the notebook's top import cell.)
num_col = make_column_selector(dtype_include='number', dtype_exclude='bool')
cat_col = make_column_selector(dtype_include='category')
bool_col = make_column_selector(dtype_include='bool')
date_col = make_column_selector(dtype_include='datetime64')
obj_col = make_column_selector(dtype_include='object')

In [5]:
# Numeric columns: fill missing values with the column median, then standardise.
# `Pipeline` here is the imblearn Pipeline imported at the top of the notebook.
numeric_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scale', StandardScaler()),
])

# Categorical columns: one-hot encode, dropping the first level of each feature;
# categories unseen during fit are ignored at transform time.
categorical_transformer = OneHotEncoder(drop='first', handle_unknown="ignore")

# Route columns by dtype. Object and datetime columns are dropped; anything not
# matched by a selector (e.g. bool columns) is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        ('drop_ID', 'drop', obj_col),
        ('drop_dates', 'drop', date_col),
        ('cat', categorical_transformer, cat_col),
        ('num', numeric_transformer, num_col),
    ],
    remainder="passthrough",
)

# Materialise the transformed training matrix as a DataFrame for inspection.
f = pd.DataFrame(preprocessor.fit_transform(X_train))

In [6]:
# Class-weighted logistic regression.
# NOTE(review): tol=0.5 is far looser than scikit-learn's default (1e-4) and makes
# max_iter=10000 largely moot -- confirm this is intentional.
lr = LogisticRegression(max_iter=10000, tol=0.5, class_weight='balanced')

pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("logistic", lr),
    ]
)

# Regularisation strengths to try: 10 values, log-spaced from 1e-10 to 1e3.
param_grid = {
    "logistic__C": np.logspace(-10, 3, 10),
}

gridscorer = ch.gridscorer()  # custom scorer (precision@250)

# n_jobs=-2 uses all CPU cores but one.
search = GridSearchCV(pipe, param_grid, scoring=gridscorer, n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# GridSearchCV's default refit=True has already refit the best pipeline on the
# whole of (X_train, y_train), so it is used as-is instead of fitting it again.
clf = search.best_estimator_

# Evaluate the selected model on the held-out split.
ch.evaluate(clf, X_test, y_test)

Best parameter (CV score=29.400):
{'logistic__C': 0.0016681005372000592}
------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.98      0.67      0.80     20390
        Churn       0.06      0.65      0.11       631

     accuracy                           0.67     21021
    macro avg       0.52      0.66      0.45     21021
 weighted avg       0.96      0.67      0.78     21021

------------------------------------------------------------
AUC: 0.66
No. of TP (precision@250): 40
------------------------------------------------------------
