In [1]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline

import custom_helpers as ch

np.random.seed(0)

df3 = ch.load_data('../data/train_month_3_with_target.csv')
df2 = ch.load_data('../data/train_month_2.csv')
df1 = ch.load_data('../data/train_month_1.csv')
# print(df.info())

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------


In [2]:
#non sample-dependent transformations
def sample_agnostic_transformation(df3,df2,df1,mode='train'):

    selected_col = [
                'homebanking_active'
                ,'bal_mortgage_loan'
                ,'has_life_insurance_decreasing_cap'
                ,'has_current_account'
                ,'bal_savings_account'
                ,'has_personal_loan'
                ,'customer_since_all_years'
                ,'customer_age'
                ,'customer_children'
                ,'customer_education'
                ,'visits_distinct_so'
         ]
    
    if mode == 'train':
        y = df3.target
        X = df3.drop(columns = ['target'])
        X = X[selected_col]
    
    if mode =='test':
        X = df3[selected_col]
        y = 0
        
    # add deltas
    
    X['bal_savings_account_delta_-1'] = df3['bal_savings_account']-df2['bal_savings_account']
    # X['bal_current_account_delta_-1'] = df3.loc[:,'bal_current_account']-df2.loc[:,'bal_current_account']

        
    return X, y

In [3]:
X, y = sample_agnostic_transformation(df3,df2,df1)
X

Unnamed: 0,homebanking_active,bal_mortgage_loan,has_life_insurance_decreasing_cap,has_current_account,bal_savings_account,has_personal_loan,customer_since_all_years,customer_age,customer_children,customer_education,visits_distinct_so,bal_savings_account_delta_-1
0,False,0,False,True,22000,False,35.0,75,unknown,0.0,1.0,2000
1,True,0,False,True,10570,False,1.0,24,mature,0.0,1.0,280
2,True,0,False,True,15200,False,38.0,82,unknown,0.0,1.0,-800
3,False,0,False,False,29020,False,20.0,72,unknown,0.0,1.0,0
4,False,0,False,False,13650,False,6.0,22,mature,0.0,1.0,100
...,...,...,...,...,...,...,...,...,...,...,...,...
63692,True,0,False,True,17060,False,20.0,36,no,0.0,2.0,250
63693,False,13030,False,True,22400,False,29.0,53,unknown,1.0,2.0,-100
63694,False,179550,True,True,6820,False,27.0,42,unknown,3.0,3.0,200
63695,False,0,False,False,5690,False,4.0,67,no,0.0,1.0,50


In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify = y, random_state=42)

from sklearn.compose import make_column_selector

num_col = make_column_selector(dtype_include = 'number', dtype_exclude = 'bool')
cat_col = make_column_selector(dtype_include = 'category')
bool_col = make_column_selector(dtype_include = 'bool')
date_col = make_column_selector(dtype_include = 'datetime64')
obj_col = make_column_selector(dtype_include = 'object')

In [5]:
from sklearn.preprocessing import FunctionTransformer, KBinsDiscretizer

numeric_transformer = Pipeline(steps = [
    ('impute',SimpleImputer(missing_values=np.nan, strategy='median')),
    ('scale', StandardScaler())
])

categorical_transformer = OneHotEncoder(drop = 'first',handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('drop_ID','drop',obj_col),
        ('drop_dates','drop',date_col),
        ('cat',categorical_transformer,cat_col),
        ('num',numeric_transformer,num_col)
    ],
    remainder = "passthrough"
)

f = preprocessor.fit_transform(X_train)
f = pd.DataFrame(f)
f

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14,15,16,17,18,19,20,21,22,23
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,-0.250244,0.735191,-0.772135,-0.082961,-0.46078,-0.044213,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,-0.973525,0.371543,-1.682183,-0.46078,-0.044213,0.0,0.0,0.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,-0.682806,-0.156309,-1.682183,-0.46078,-0.044213,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,-1.020989,0.635468,-1.332353,-0.46078,-0.044213,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,-0.250244,-0.965049,0.107617,-1.232402,-0.46078,-0.044213,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42671,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,-0.887072,-0.420234,-1.632208,-0.46078,-0.044213,0.0,0.0,0.0,0.0
42672,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,-0.275121,0.371543,-1.582232,-0.46078,-3.419685,0.0,0.0,1.0,0.0
42673,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,2.050630,2.043072,0.466772,-0.46078,-0.044213,0.0,0.0,0.0,0.0
42674,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-0.250244,0.185961,-1.299987,0.466772,-0.46078,-0.044213,0.0,0.0,0.0,0.0


In [6]:
lr = LogisticRegression(max_iter=10000, tol=0.5, class_weight = 'balanced')

pipe = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("logistic", lr)]
)

param_grid = {
        "logistic__C": np.logspace(-10,3,10)
    }

gridscorer = ch.gridscorer() # customer scorer (precision@250)

search = GridSearchCV(pipe, param_grid, scoring = gridscorer, n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

clf = search.best_estimator_.fit(X_train,y_train)

# make prediction on test

ch.evaluate(clf, X_test, y_test)

Best parameter (CV score=29.600):
{'logistic__C': 1e-10}
------------------------------------------------------------
Performance Over Whole Set
------------------------------------------------------------
               precision    recall  f1-score   support

Did not Churn       0.99      0.49      0.66     20390
        Churn       0.05      0.77      0.09       631

     accuracy                           0.50     21021
    macro avg       0.52      0.63      0.37     21021
 weighted avg       0.96      0.50      0.64     21021

------------------------------------------------------------
AUC: 0.63
No. of TP (precision@250): 36
------------------------------------------------------------


In [7]:
# run on submission data

df3_sub = ch.load_data('../data/test_month_3.csv')
df2_sub = ch.load_data('../data/test_month_2.csv')
df1_sub = ch.load_data('../data/test_month_1.csv')
X_sub, y_sub = sample_agnostic_transformation(df3_sub,df2_sub,df1_sub,mode='test')

# make prediction on test
y_pred_test_sub_pos = clf.predict_proba(X_sub)[:,1]

df = pd.DataFrame({'ID': df3_sub.client_id,'PROB':y_pred_test_sub_pos})
df
today = dt.datetime.today()
df.to_csv(f'../output/lrtemp_{today.month}{today.day}.csv', index = False)

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------
------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['bal_savings_account_delta_-1'] = df3['bal_savings_account']-df2['bal_savings_account']
