In [1]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline

import custom_helpers as ch

# Seed NumPy's legacy global RNG so code drawing from np.random is repeatable between runs.
np.random.seed(0)

In [2]:
# Load the month-3 training file (the one that includes the target column)
# through the project helper; its log output below reports date parsing and
# dtype casting.
df = ch.load_data('../data/train_month_3_with_target.csv')
# print(df.info())

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------


In [3]:
# Sample-independent step: pure column selection, nothing is fitted on the data.
def sample_agnostic_transformation(data):
    """Pick the model's feature columns and split off the target when present.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding at least every column named in the feature list below.
        It may additionally carry a ``target`` column.

    Returns
    -------
    X : pandas.DataFrame
        The feature columns, in the order they are listed below.
    y : pandas.Series or int
        ``data.target`` if a ``target`` column exists; otherwise the
        placeholder value ``0``.
    """
    # Active feature set. Candidates that are present in the data but left
    # switched off here: has_homebanking, has_mortgage_loan,
    # cap_life_insurance_decreasing_cap, bal_current_account,
    # bal_personal_loan, customer_since_bank_years, has_savings_account,
    # visits_distinct_so.
    features = [
        'homebanking_active',
        'bal_mortgage_loan',
        'has_life_insurance_decreasing_cap',
        'has_current_account',
        'bal_savings_account',
        'has_personal_loan',
        'customer_since_all_years',
        'customer_age',
        'customer_children',
        'customer_education',
    ]

    # Unlabelled data: only the features can be returned, with 0 standing in for y.
    if 'target' not in data.columns:
        return data[features], 0

    labels = data.target
    predictors = data.drop(columns=['target'])[features]
    return predictors, labels

# Split the loaded frame into the selected feature columns (X) and the target (y).
X, y = sample_agnostic_transformation(df)

In [4]:
# Sample-dependent preprocessing starts here.
# Hold out 33% of the rows as a test set, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
    stratify=y,
)

# Column groups by dtype, read off the training frame; the preprocessing
# step routes columns through different transformers based on these.
bool_col = X_train.select_dtypes(include='bool').columns
cat_col = X_train.select_dtypes(include='category').columns
num_col = X_train.select_dtypes(include='number', exclude='bool').columns
obj_col = X_train.select_dtypes(include='object').columns
date_col = X_train.select_dtypes(include='datetime64').columns

In [5]:
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer

# Numeric columns: replace NaN with the column median, then standardise.
numeric_transformer = Pipeline(
    steps=[
        ('impute', SimpleImputer(strategy='median', missing_values=np.nan)),
        ('scale', StandardScaler()),
    ]
)

# Categorical columns: one-hot encode, dropping the first level of each feature.
categorical_transformer = OneHotEncoder(handle_unknown="ignore", drop='first')

# Route each dtype group to its treatment. Columns in none of the listed
# groups (e.g. any boolean ones) are passed through unchanged.
preprocessor = ColumnTransformer(
    remainder="passthrough",
    transformers=[
        ('drop_ID', 'drop', obj_col),
        ('drop_dates', 'drop', date_col),
        ('cat', categorical_transformer, cat_col),
        ('num', numeric_transformer, num_col),
    ],
)

# Fit the preprocessor on the training split and wrap the transformed
# matrix in a DataFrame for inspection.
f = pd.DataFrame(preprocessor.fit_transform(X_train))

In [6]:
# Class-weighted logistic regression as the final estimator.
# NOTE(review): tol=0.5 is a very loose stopping tolerance next to
# max_iter=10000 — confirm it is intentional; left unchanged so results match.
lr = LogisticRegression(max_iter=10000, tol=0.5, class_weight='balanced')

# Preprocessing and model in one pipeline, so each CV fold fits the
# preprocessor on its own training part only.
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("logistic", lr),
    ]
)

# Inverse regularisation strength C: 10 values log-spaced from 1e-10 to 1e3.
param_grid = {
    "logistic__C": np.logspace(-10, 3, 10)
}

gridscorer = ch.gridscorer()  # custom scorer (precision@250)

search = GridSearchCV(pipe, param_grid, scoring=gridscorer, n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

# GridSearchCV uses refit=True by default, so best_estimator_ has already been
# refitted on all of X_train with the best parameters; fitting it again would
# only repeat that work.
clf = search.best_estimator_

# Evaluate the selected pipeline on the held-out test split.
ch.evaluate(clf, X_test, y_test)

Best parameter (CV score=29.600):
{'logistic__C': 0.0016681005372000592}
------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.98      0.67      0.80     20390
        Churn       0.06      0.65      0.11       631

     accuracy                           0.67     21021
    macro avg       0.52      0.66      0.45     21021
 weighted avg       0.96      0.67      0.78     21021

------------------------------------------------------------
AUC: 0.66
No. of TP (precision@250): 40
------------------------------------------------------------
