In [31]:
import gc
import glob
import inspect
import os
import pickle

from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib

from numpy.random import randint

from scipy.stats import linregress

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel, SelectPercentile, chi2
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin

In [6]:
# Root directory of the challenge archive. Set the AMEX_DATA_DIR environment
# variable to run the notebook on another machine without editing this cell;
# the original location remains the default.
path = Path(os.environ.get('AMEX_DATA_DIR', '/home/jovyan/workspace/amex-challenge/archive'))

In [7]:
class DateTransformer(BaseEstimator, TransformerMixin):
    """Replace the ``S_2`` date column with float seconds since the Unix epoch.

    The transformer is stateless: ``fit`` learns nothing and ``transform``
    returns a new frame, leaving the frame passed in by the caller untouched.
    """

    def fit(self, X, y=None):
        """No-op fit; returns ``self`` so the transformer can sit in a Pipeline."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` whose ``S_2`` column holds epoch seconds.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with an ``S_2`` column that ``pd.to_datetime`` can parse.
        y : ignored

        Returns
        -------
        pandas.DataFrame
            New frame with the same columns; ``S_2`` is float seconds since
            1970-01-01 (missing dates become NaN).
        """
        statement_dates = pd.to_datetime(X["S_2"])
        if statement_dates.dt.tz is not None:
            # The epoch origin below is naive, so express aware values in UTC first.
            statement_dates = statement_dates.dt.tz_convert("UTC").dt.tz_localize(None)
        # Timedelta division is independent of the datetime resolution and of the
        # platform's default integer width, unlike ``astype(int) / 10**9``.
        epoch_seconds = (statement_dates - pd.Timestamp("1970-01-01")) / pd.Timedelta(seconds=1)
        # ``assign`` builds a new frame instead of writing into the caller's one.
        return X.assign(S_2=epoch_seconds)
    
class CoalesceTransformer(BaseEstimator, TransformerMixin):
    """Collapse each customer's rows to the single row with the greatest ``S_2``."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X, y=None):
        """Sort by ``S_2`` and keep the last row of every ``customer_ID`` group."""
        ordered_by_date = X.sort_values('S_2')
        latest_per_customer = ordered_by_date.groupby("customer_ID").tail(1)
        return latest_per_customer
    
def amex_metric(y, y_pred, **kwargs):
    """Compute the AMEX default-prediction metric: mean of normalized weighted
    Gini and the default rate captured in the top 4% (by weight) of predictions.

    Inputs are aligned by *position*, never by pandas index. This matters when
    ``y`` is an indexed Series (e.g. indexed by customer id) and ``y_pred`` is a
    plain array, as happens inside scikit-learn scorers: index-based alignment
    would pair every label with NaN and produce a meaningless score.

    Parameters
    ----------
    y : array-like or DataFrame
        True binary labels (1 = positive). A DataFrame is read from its
        ``'target'`` column when present.
    y_pred : array-like or DataFrame
        Scores, higher meaning more likely positive. A DataFrame is read from
        its ``'prediction'`` column when present.
    **kwargs
        Accepted and ignored, so extra scorer keyword arguments do not fail.

    Returns
    -------
    float
        ``0.5 * (normalized_gini + top_four_percent_captured)``. The result is
        NaN when ``y`` contains no positive labels.

    Raises
    ------
    ValueError
        If ``y`` and ``y_pred`` do not contain the same number of values.
    """
    def as_vector(data, column):
        # Drop any index so the two inputs can only be matched positionally.
        if isinstance(data, pd.DataFrame) and column in data.columns:
            data = data[column]
        return np.asarray(data, dtype=float).ravel()

    target = as_vector(y, 'target')
    prediction = as_vector(y_pred, 'prediction')
    if target.shape != prediction.shape:
        raise ValueError(
            f"y and y_pred must have the same length, got {target.shape[0]} and {prediction.shape[0]}"
        )

    def ranked(scores):
        # Rows ordered from highest to lowest score; negatives weigh 20x to undo
        # the down-sampling of non-defaulters in the competition data.
        df = (pd.DataFrame({'target': target, 'prediction': scores})
              .sort_values('prediction', ascending=False))
        df['weight'] = np.where(df['target'] == 0, 20, 1)
        return df

    def top_four_percent_captured():
        df = ranked(prediction)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        captured = df.loc[df['weight'].cumsum() <= four_pct_cutoff, 'target']
        return (captured == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(scores):
        df = ranked(scores)
        random = (df['weight'] / df['weight'].sum()).cumsum()
        weighted_pos = df['target'] * df['weight']
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * df['weight']).sum()

    # Normalize by the Gini of a perfect ranking (the labels scored by themselves).
    g = weighted_gini(prediction) / weighted_gini(target)
    d = top_four_percent_captured()

    return 0.5 * (g + d)

# The AMEX metric ranks customers by score, so it needs predicted probabilities.
# make_scorer feeds hard predict() labels by default, which collapses the
# ranking into two tied groups. Newer scikit-learn releases select the
# estimator output with `response_method`; older ones use `needs_proba`.
# Probe the signature, because either keyword would otherwise be silently
# forwarded to amex_metric's **kwargs and ignored.
if "response_method" in inspect.signature(make_scorer).parameters:
    amex_scorer = make_scorer(amex_metric, response_method="predict_proba")
else:
    amex_scorer = make_scorer(amex_metric, needs_proba=True)

In [8]:
# Read the statement history and the labels, index both by customer, and attach
# each customer's label to every one of their statement rows.
train_data = pd.read_feather(path / 'data/train_data.ftr').set_index("customer_ID")
train_labels = pd.read_feather(path / 'data/train_labels.ftr').set_index("customer_ID")
joined = train_data.join(train_labels)


In [26]:
# Columns to one-hot encode.
categorical_cols = [
    'B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120',
    'D_126', 'D_63', 'D_64', 'D_66', 'D_68',
]
# Column-name sets computed on the full joined frame (label column included).
numerical_cols = set(joined.select_dtypes(include=np.number).columns)
non_numeric_cols = set(joined.columns) - numerical_cols
cols_with_null = set(joined.columns[joined.isna().any()])

In [27]:
# Stateless steps (fit learns nothing), applied once to the full data set
# before any train/validation split.
non_leaking_pipe = Pipeline(steps=[
    ('coalesce', CoalesceTransformer()),  # keep each customer's latest row
    ('date', DateTransformer()),          # S_2 -> numeric timestamp
])

In [28]:
joined_processed = non_leaking_pipe.fit_transform(joined)
# Everything downstream assumes exactly one row per customer; fail loudly if
# the coalescing step did not deliver that.
assert joined_processed.index.is_unique, "expected one row per customer_ID after coalescing"

In [42]:
# Split the processed frame into the label vector and the feature matrix.
y = joined_processed["target"]
X = joined_processed.drop(columns=["target"])

In [50]:
# For columns with missing values: the imputed values side by side with
# boolean "was missing" indicator columns.
handle_null_numerical = FeatureUnion([
    ('features', SimpleImputer()),
    # error_on_new=False: a column may have NaNs only in a validation fold;
    # the default (True) would make transform raise there and fail the fold.
    ('indicators', MissingIndicator(error_on_new=False)),
])

# One-hot encode the categorical columns, impute and flag the numeric columns
# that contain nulls, and pass every other column through unchanged.
preprocessor = ColumnTransformer(
    [
        ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        # sorted(): set iteration order of strings changes between kernel
        # sessions, which would reshuffle the output feature order on every run.
        ('numerical_imputer', handle_null_numerical, sorted(cols_with_null - non_numeric_cols)),
    ],
    remainder="passthrough",
)

In [51]:
# Full model: preprocessing -> model-based feature selection -> classifier.
# Both gradient-boosting models get a fixed random_state so that repeated runs
# produce the same selected features and the same scores.
# NOTE: the final step is scikit-learn's GradientBoostingClassifier, not
# XGBoost; the step name 'xgb' is kept so existing parameter paths still work.
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selection', SelectFromModel(
        GradientBoostingClassifier(n_estimators=30, verbose=True, random_state=42))),
    ('xgb', GradientBoostingClassifier(verbose=True, random_state=42)),
])

In [52]:
# 5-fold stratified cross-validation (no shuffling), one worker per fold,
# reporting the built-in classification metrics plus the AMEX metric on both
# the training and the validation part of every fold.
cv = StratifiedKFold(n_splits=5, shuffle=False, random_state=None)
scorer = {
    **{metric: metric for metric in ('precision', 'recall', 'f1', 'accuracy', 'balanced_accuracy')},
    "amex": amex_scorer,
}
scores = cross_validate(
    pipe, X, y,
    cv=cv,
    n_jobs=5,
    scoring=scorer,
    return_train_score=True,
    verbose=10,
)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed: 18.2min remaining: 27.4min
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed: 18.3min remaining: 12.2min
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 18.4min remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed: 18.4min finished


In [53]:
# Show the dict returned by cross_validate: per-fold fit/score times and the
# train/test value of every metric in `scorer`.
scores

{'fit_time': array([1086.59127951, 1093.40652776, 1072.86420822, 1089.91100478,
        1088.84847474]),
 'score_time': array([1.153548  , 1.19327378, 1.1656158 , 1.17159557, 1.17122173]),
 'test_precision': array([0.7721007 , 0.79288773, 0.79676343, 0.79796573, 0.79013445]),
 'train_precision': array([0.79455744, 0.79187123, 0.79123406, 0.79148355, 0.79376923]),
 'test_recall': array([0.78717496, 0.79462257, 0.79344442, 0.79558174, 0.78388386]),
 'train_recall': array([0.79765837, 0.79234605, 0.79284046, 0.79187486, 0.7922746 ]),
 'test_f1': array([0.77956496, 0.7937542 , 0.79510046, 0.79677195, 0.78699675]),
 'train_f1': array([0.79610488, 0.79210857, 0.79203644, 0.79167916, 0.79302121]),
 'test_accuracy': array([0.88472811, 0.89307388, 0.89410893, 0.89491404, 0.89013096]),
 'train_accuracy': array([0.89420369, 0.8923079 , 0.8921935 , 0.89209029, 0.89291288]),
 'test_balanced_accuracy': array([0.85299468, 0.86104829, 0.8613634 , 0.86260114, 0.85556867]),
 'train_balanced_accuracy': a

[CV] START .....................................................................
[CV] END  accuracy: (train=nan, test=nan) amex: (train=nan, test=nan) balanced_accuracy: (train=nan, test=nan) f1: (train=nan, test=nan) precision: (train=nan, test=nan) recall: (train=nan, test=nan) total time=   0.3s
[CV] START .....................................................................
[CV] END  accuracy: (train=nan, test=nan) amex: (train=nan, test=nan) balanced_accuracy: (train=nan, test=nan) f1: (train=nan, test=nan) precision: (train=nan, test=nan) recall: (train=nan, test=nan) total time=   3.0s
[CV] START .....................................................................
      Iter       Train Loss   Remaining Time 
         1           1.0479           13.12m
         2           0.9737           12.67m
         3           0.9137           12.22m
         4           0.8650           11.77m
         5           0.8237           11.31m
         6           0.7887           10.86m
   