In [1]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, KBinsDiscretizer
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline

import custom_helpers as ch

np.random.seed(0)

In [2]:
df = ch.load_data('../data/train_month_3_with_target.csv')
print(df.info())

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63697 entries, 0 to 63696
Data columns (total 43 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   client_id                          63697 non-null  object        
 1   homebanking_active                 63697 non-null  bool          
 2   has_homebanking                    63697 non-null  bool          
 3   has_insurance_21                   63697 non-null  bool          
 4   has_insurance_23                   63697 non-null  bool          
 5   has_life_insurance_fixed_cap       63697 non-null  bool          
 6   has_life_insurance_decreasing_cap  63697 non-null  bool          
 7   has_fire_car_other_

In [3]:
#non sample-dependent transformations
def sample_agnostic_transformation(data):

    # data.drop(columns = colinear_features, inplace = True)
    
    # num_col = data.select_dtypes(include = 'number', exclude = 'bool').columns
    # for col in num_col:
    #     data[col] = data[col].fillna(value = np.nan)
    
    # selected_col = ['client_id', 'homebanking_active', 'has_homebanking',
    #    'has_insurance_21', 'has_insurance_23', 'has_life_insurance_fixed_cap',
    #    'has_life_insurance_decreasing_cap', 'has_fire_car_other_insurance',
    #    'has_personal_loan', 'has_mortgage_loan', 'has_current_account',
    #    'has_pension_saving', 'has_savings_account',
    #    'has_savings_account_starter', 'has_current_account_starter',
    #    'bal_insurance_21', 'bal_insurance_23', 'cap_life_insurance_fixed_cap',
    #    'cap_life_insurance_decreasing_cap', 'prem_fire_car_other_insurance',
    #    'bal_personal_loan', 'bal_mortgage_loan', 'bal_current_account',
    #    'bal_pension_saving', 'bal_savings_account',
    #    'bal_savings_account_starter', 'bal_current_account_starter',
    #    'visits_distinct_so', 'visits_distinct_so_areas', 'customer_since_all',
    #    'customer_since_bank', 'customer_gender', 'customer_birth_date',
    #    'customer_postal_code', 'customer_occupation_code',
    #    'customer_self_employed', 'customer_education', 'customer_children',
    #    'customer_relationship', 'target', 'customer_since_all_years',
    #    'customer_since_bank_years', 'customer_age']
    
    
    if 'target' in data.columns:
        y = data.target
        X = data.drop(columns = ['target'])
        # X = X[selected_col]
    # else:
    #     X = data[selected_col]
    #     y = 0
        
    return X, y

X, y = sample_agnostic_transformation(df)

In [5]:
# sample dependent column specific preprocessing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42, stratify = y)


num_col = X_train.select_dtypes(include = 'number', exclude = 'bool').columns
cat_col = X_train.select_dtypes(include = 'category').columns
bool_col = X_train.select_dtypes(include = 'bool').columns
date_col = X_train.select_dtypes(include = 'datetime64').columns
obj_col = X_train.select_dtypes(include = 'object').columns

colinear_features = ['bal_insurance_23',
     'bal_insurance_21',
     'bal_savings_account_starter',
     'has_homebanking',
     'customer_since_bank_years',
     'cap_life_insurance_decreasing_cap',
     'has_mortgage_loan',
     'has_fire_car_other_insurance',
     'bal_pension_saving',
     'bal_personal_loan']


numeric_transformer = Pipeline(steps = [
    ('impute',SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scale', StandardScaler())
])

categorical_transformer = OneHotEncoder(drop = 'first',handle_unknown="ignore")


preprocessor = ColumnTransformer(
    transformers=[
        ('drop_ID','drop',obj_col),
        ('drop_dates','drop',date_col),
        ('drop_colinear', 'drop', colinear_features),
        ('cat',categorical_transformer,cat_col),
        ('num',numeric_transformer,num_col)
    ],
    remainder = "passthrough"
)

# f = preprocessor.fit_transform(X_train)
# f = pd.DataFrame(f)
# f.T

In [14]:
# FIRST ROUND TEST

from sklearn.decomposition import PCA, TruncatedSVD

lr = LogisticRegression(max_iter=10000, tol=0.1, class_weight = 'balanced')
# decomposer = PCA()
decomposer = TruncatedSVD(random_state = 42)

#pipeline
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("decomp",decomposer),
        ("logistic", lr)]
)

param_grid = {
    "decomp__n_components": [2, 5, 10, 15, 30],
    # "logistic__penalty":["l1","l2"],
    "logistic__C": np.logspace(-4, 4, 4)
}

search = GridSearchCV(pipe, param_grid, scoring = 'precision', n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=0.062):
{'decomp__n_components': 5, 'logistic__C': 0.0001}


In [15]:
# train 

lr = LogisticRegression(max_iter=10000, tol=0.1, class_weight = 'balanced', C = search.best_params_['logistic__C'])
# decomposer = PCA()
decomposer = TruncatedSVD(n_components = search.best_params_['decomp__n_components'], random_state = 42)

#pipeline
pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("decomp",decomposer),
        ("logistic", lr)]
)

clf = pipe.fit(X_train,y_train)

# make prediction on test
y_pred_test = clf.predict(X_test)
y_pred_test_probs = clf.predict_proba(X_test)

ch.evaluate(y_test, y_pred_test, y_pred_test_probs)

------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.98      0.76      0.86     30892
        Churn       0.06      0.52      0.11       957

     accuracy                           0.75     31849
    macro avg       0.52      0.64      0.48     31849
 weighted avg       0.95      0.75      0.83     31849

AUC: 0.64 

------------------------------------------------------------
No. of TP (precision@250): 34
AUC: 0.500
------------------------------------------------------------


In [7]:
# # run on submission data
# data_sub = pd.read_csv('../data/test_month_3.csv',parse_dates = [29,30,32], index_col = 'client_id')
# X_sub, y_sub = sample_agnostic_transformation(data_sub)

# #pipeline
# pipe = Pipeline(
#     steps=[("preprocessor", preprocessor),("classifier", lrcv_5)]
# )


# # train 
# clf = pipe.fit(X_train,y_train)

# # make prediction on test
# y_pred_sub = clf.predict(X_sub)
# y_pred_test_sub = clf.predict_proba(X_sub)
# y_pred_test_sub_pos = [x[1] for x in y_pred_test_sub]

# df = pd.DataFrame({'ID': X_sub.index,'PROB':y_pred_test_sub_pos})
# today
# df.to_csv(f'../output/lr_{today.month}{today.day}.csv', index = False)