In [4]:
%matplotlib inline
import numpy as np

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.pipeline import Pipeline
import xgboost as xgb

import custom_helpers as ch

# load data with helper function
data = ch.load_data('../data/train_month_3_with_target.csv')

------------------------------------------------------------
loading data...
transforming dates...
cast types into bool, object, categorical...
data loaded and casted
------------------------------------------------------------


# Preprocessing

In [5]:
y = data.target
X = data.drop(columns = ['target'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, stratify = y, random_state=42)

num_col = X_train.select_dtypes(include = 'number', exclude = 'bool').columns
cat_col = X_train.select_dtypes(include = 'category').columns
bool_col = X_train.select_dtypes(include = 'bool').columns
date_col = X_train.select_dtypes(include = 'datetime64').columns
obj_col = X_train.select_dtypes(include = 'object').columns

# found during exploratory phase
colinear_features = ['bal_insurance_23',
     'bal_insurance_21',
     'bal_savings_account_starter',
     'has_homebanking',
     'customer_since_bank_years',
     'cap_life_insurance_decreasing_cap',
     'has_mortgage_loan',
     'has_fire_car_other_insurance',
     'bal_pension_saving',
     'bal_personal_loan']


numeric_transformer = Pipeline(steps = [
    ('impute',SimpleImputer(missing_values=np.nan, strategy='median'))
])

categorical_transformer = OneHotEncoder(handle_unknown="ignore")


preprocessor = ColumnTransformer(
    transformers=[
        ('drop_ID','drop',obj_col),
        ('drop_dates','drop',date_col),
        ('drop_colinear', 'drop', colinear_features),
        ('cat',categorical_transformer,cat_col),
        ('num',numeric_transformer,num_col)
    ],
    remainder = "passthrough"
)

# First Pass without Parameter Tuning

In [7]:
#classifier
xg = xgb.XGBClassifier(objective='binary:logistic', missing = None, seed = 42)

pipe = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("classifer", xg)]
)

clf = pipe.fit(X_train, y_train)

ch.evaluate(clf, X_test, y_test)





XGBoostError: [22:52:44] ../src/c_api/c_api_utils.h:161: Invalid missing value: null
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000120ad0a54 dmlc::LogMessageFatal::~LogMessageFatal() + 116
  [bt] (1) 2   libxgboost.dylib                    0x0000000120aca28e xgboost::GetMissing(xgboost::Json const&) + 286
  [bt] (2) 3   libxgboost.dylib                    0x0000000120ad8454 void InplacePredictImpl<xgboost::data::ArrayAdapter>(std::__1::shared_ptr<xgboost::data::ArrayAdapter>, std::__1::shared_ptr<xgboost::DMatrix>, char const*, xgboost::Learner*, unsigned long, unsigned long, unsigned long long const**, unsigned long long*, float const**) + 516
  [bt] (3) 4   libxgboost.dylib                    0x0000000120ad7f28 XGBoosterPredictFromDense + 344
  [bt] (4) 5   libffi.7.dylib                      0x0000000109c41ead ffi_call_unix64 + 85



In [4]:
#pipeline
pipe = Pipeline(
    steps=[("preprocessor", preprocessor),
           ("classifer", rf)]
)


gridscorer = ch.gridscorer() # custom scorer (precision@250)

search = GridSearchCV(pipe, param_grid, scoring = gridscorer, n_jobs=-2)
search.fit(X_train, y_train)
print("Best parameter (CV score=%0.3f):" % search.best_score_)
print(search.best_params_)

Best parameter (CV score=26.000):
{'balancer': 'passthrough', 'forest__class_weight': 'balanced'}


### Fit RF with best parameters

In [5]:
clf = search.best_estimator_.fit(X_train, y_train)
#clf = search.best_estimator_

# make prediction on test
y_pred_test = clf.predict(X_test)
y_pred_test_probs = clf.predict_proba(X_test)

ch.evaluate(clf, X_test, y_test)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('drop_ID', 'drop',
                                                  Index(['client_id', 'customer_postal_code'], dtype='object')),
                                                 ('drop_dates', 'drop',
                                                  Index(['customer_since_all', 'customer_since_bank', 'customer_birth_date'], dtype='object')),
                                                 ('drop_colinear', 'drop',
                                                  ['bal_insurance_23',
                                                   'bal_insurance_21',
                                                   'bal_savin...
       'bal_pension_saving', 'bal_savings_account',
       'bal_savings_account_starter', 'bal_current_account_starter',
       'visits_distinct_so', 'visits_distinct_so_areas',
       'customer_since_all_years', 'customer_sinc

In [6]:
# # run on submission data
# X_sub = ch.load_data('../data/test_month_3.csv')

# # make prediction on test
# y_pred_sub = clf.predict(X_sub)
# y_pred_test_sub = clf.predict_proba(X_sub)
# y_pred_test_sub_pos = y_pred_test_sub[:,1]

# df = pd.DataFrame({'ID': X_sub.client_id,'PROB':y_pred_test_sub_pos})
# today = dt.datetime.today()
# df.to_csv(f'../output/rf_{today.month}{today.day}.csv', index = False)