In [None]:
import pickle
import numpy as np
import pandas as pd
import glob 
import gc
import os


from datetime import datetime
from pathlib import Path

import matplotlib.pyplot as plt 
import matplotlib

from numpy.random import randint

from scipy.stats import linregress 
from scipy.special import logsumexp 

from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn import preprocessing
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel, SelectPercentile, chi2, f_regression
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
path = Path('/home/jovyan/workspace/amex-challenge/archive')

In [3]:
class DateTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X["S_2"] = pd.to_datetime(X["S_2"]).astype(int)/ 10**9
        return X
    
class CoalesceTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        numerical_cols = joined.select_dtypes(include=(np.number)).columns.to_list()
        numerical_cols.remove('S_2')
        group = X.sort_values('S_2').groupby("customer_ID")
        def applyFunc(v):
            return v[list(numerical_cols)].fillna(0).pct_change().fillna(0).clip(upper=1).mean()
        X_g = group.apply(applyFunc).add_suffix('_pct')
        X = group.tail(1)
        X = pd.concat([X_g, X], axis=1)
        return X
    
def amex_metric(y, y_pred, **kwargs):
    y_true_pd = pd.DataFrame(y, columns=['target'])
    y_pred_pd = pd.DataFrame(y_pred, columns=['prediction'])
    
    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true_pd, y_pred_pd)
    d = top_four_percent_captured(y_true_pd, y_pred_pd)

    return 0.5 * (g + d)

amex_scorer = make_scorer(amex_metric)

In [4]:
# train_data = pd.read_feather(path / f'data/train_data.ftr')
# train_labels = pd.read_feather(path / f'data/train_labels.ftr')
# train_data = train_data.set_index("customer_ID")
# train_labels = train_labels.set_index("customer_ID")
# joined = train_data.join(train_labels)


In [5]:
# categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
# numerical_cols = set(joined.select_dtypes(include=(np.number)).columns) - set('target')
# non_numeric_cols = set(joined.columns).difference(numerical_cols)
# cols_with_null = set(joined.columns[joined.isna().any()].tolist())

In [6]:
# non_leaking_pipe = Pipeline([
#     ('date', DateTransformer()),
#     ('coalesce', CoalesceTransformer()),
# ])

In [7]:
# joined_processed = non_leaking_pipe.fit_transform(joined)

In [8]:
# joined_processed = joined_processed.reset_index()

In [9]:
# joined_processed.to_feather(path / f'train_slopes.ftr')

In [10]:
joined_processed = pd.read_feather(path / f'train_slopes.ftr')
joined_processed = joined_processed.set_index('customer_ID')

In [11]:
y = joined_processed.target
X = joined_processed.drop("target", axis=1)

In [12]:
# Fix all inf values
X.replace([np.inf, -np.inf], 2, inplace=True)

In [13]:
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
numerical_cols = set(X.select_dtypes(include=(np.number)).columns)
non_numeric_cols = set(X.columns).difference(numerical_cols)
cols_with_null = set(X.columns[X.isna().any()].tolist())

In [14]:
handle_null_numerical = FeatureUnion([
         ('features', SimpleImputer()),
         ('indicators', MissingIndicator())
])

preprocessor = ColumnTransformer([
    ('categorical_encoder', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
    ('numerical_imputer', handle_null_numerical, list(cols_with_null - non_numeric_cols)),
    ('scaler', StandardScaler(), list(numerical_cols))
], 
remainder="passthrough")

In [15]:
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('impute', SimpleImputer(strategy='median')),
    ('skb', SelectPercentile(score_func=f_regression, percentile=40)),
    ('feature_selection_sfm', SelectFromModel(GradientBoostingClassifier(n_estimators=30,verbose=True))),
    ('xgb', GradientBoostingClassifier(verbose=True))
])

In [16]:
cv = StratifiedKFold(n_splits=5, random_state=None, shuffle=False)
scorer = {'precision': 'precision', 'recall':'recall', 'f1':'f1', 'accuracy':'accuracy', 'balanced_accuracy': 'balanced_accuracy', "amex": amex_scorer}
scores = cross_validate(pipe, X, y, cv=cv, n_jobs=5,
                        scoring=scorer,
                        return_train_score=True, verbose=10)

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   27.3s remaining:   40.9s
[Parallel(n_jobs=5)]: Done   3 out of   5 | elapsed:   27.3s remaining:   18.2s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   27.3s remaining:    0.0s


TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.

The exit codes of the workers are {SIGKILL(-9)}

In [None]:
scores