In [12]:
import pickle
import numpy as np
import pandas as pd
import glob 
import gc
import os

from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt 
import matplotlib

from numpy.random import randint

from scipy.stats import linregress 

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
path = Path('/home/jovyan/workspace/amex-challenge/archive')

In [3]:
class DateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X["S_2"] = pd.to_datetime(X["S_2"]).astype(int)/ 10**9
        return X
    
class CoalesceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.sort_values('S_2').groupby("customer_ID").tail(1)
        return X

In [4]:
train_data = pd.read_feather(path / f'data/train_data.ftr')
train_labels = pd.read_feather(path / f'data/train_labels.ftr')
train_data = train_data.set_index("customer_ID")
train_labels = train_labels.set_index("customer_ID")
joined = train_data.join(train_labels)


In [6]:
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
numerical_cols = set(joined.select_dtypes(include=(np.number)).columns)
non_numeric_cols = set(joined.columns).difference(numerical_cols)
cols_with_null = set(joined.columns[joined.isna().any()].tolist())

In [7]:
non_leaking_pipe = Pipeline([
    ('coalesce', CoalesceTransformer()),
    ('date', DateTransformer()),
])

In [8]:
joined_processed = non_leaking_pipe.fit_transform(joined)

In [9]:
y = joined_processed.target
X = joined_processed.drop("target", axis=1)

In [10]:
handle_null_numerical = FeatureUnion([
         ('features', SimpleImputer(strategy='mean')),
         ('indicators', MissingIndicator())
])

preprocessor = ColumnTransformer([
    ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numerical_imputer', handle_null_numerical, list(cols_with_null - non_numeric_cols))
], 
remainder="passthrough")

In [13]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('xgb', GradientBoostingClassifier(verbose=True))
])

In [14]:
cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
scores = cross_validate(pipe, X, y, cv=cv, n_jobs=5,
                        scoring=('precision', 'recall', 'f1', 'accuracy', 'balanced_accuracy'),
                        return_train_score=True)

In [15]:
scores

{'fit_time': array([2709.66118097, 2707.46165633, 2721.52244163, 2716.5065341 ,
        2715.95931387]),
 'score_time': array([1.1730206 , 1.15367079, 1.12866497, 1.12701178, 1.12319946]),
 'test_precision': array([0.76521159, 0.8027463 , 0.80547422, 0.80904565, 0.79952389]),
 'train_precision': array([0.80475559, 0.80191019, 0.80106654, 0.80249696, 0.80365818]),
 'test_recall': array([0.82020534, 0.80190188, 0.79866195, 0.80012624, 0.79141595]),
 'train_recall': array([0.80426459, 0.79932044, 0.79958343, 0.79787089, 0.80052176]),
 'test_f1': array([0.79175467, 0.80232387, 0.80205362, 0.80456123, 0.79544926]),
 'train_f1': array([0.80451001, 0.80061322, 0.8003243 , 0.80017724, 0.8020869 ]),
 'test_accuracy': array([0.88827996, 0.89768258, 0.89792227, 0.89934846, 0.89460896]),
 'train_accuracy': array([0.89879334, 0.89691118, 0.89669055, 0.89681612, 0.89770681]),
 'test_balanced_accuracy': array([0.86613572, 0.86652572, 0.86563352, 0.86707137, 0.86104017]),
 'train_balanced_accuracy': a