In [None]:
# Import packages
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import os, gc
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns

from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from xgboost import plot_importance

# Import xgb modules
import xgboost as xgb

# **Read File**

In [None]:
# Location of the training parquet file, relative to the notebook's working directory.
TRAIN_PATH: str = "../input/pa-amex-default-reducing-dataset-size/train.parquet"

In [None]:
def read_file(path = '', usecols = None):
    """Load a parquet file and keep only the latest row of every customer.

    Parameters
    ----------
    path : str
        Parquet file to read.
    usecols : list of str, optional
        Columns to load. ``None`` loads every column. The data that is read
        must contain ``customer_ID`` and ``S_2`` because both are used below.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_ID`` (the row with the greatest ``S_2``),
        with a fresh 0..n-1 index.

    Notes
    -----
    Prints the raw shape, the ``S_2`` range and the resulting customer/column
    counts as a side effect.
    NOTE(review): whether ``customer_ID`` survives as a column after
    ``groupby(...).nth(-1).reset_index(drop=True)`` appears to depend on the
    installed pandas version -- confirm before relying on it downstream.
    """
    # `columns=None` is the pandas default, so a single call covers both the
    # "all columns" and the "selected columns" case.
    frame = pd.read_parquet(path, columns=usecols)

    print('shape of data:', frame.shape)
    first_date = frame['S_2'].min()
    last_date = frame['S_2'].max()
    print("The training data begins on {} and ends on {}.".format(first_date, last_date))

    # Order each customer's rows by S_2 so that the last row of every group is
    # the most recent one. Rebinding the name lets the unsorted copy be freed
    # right away.
    frame = frame.sort_values(['customer_ID','S_2'])
    latest = frame.groupby(['customer_ID']).nth(-1).reset_index(drop=True)

    n_customers, n_columns = latest.shape
    print("There are {:,.0f} customers in the training set and {} features.".format(n_customers, n_columns))

    # Release the full history before handing back the reduced frame.
    del frame
    gc.collect()
    return latest

# Load the training parquet and reduce it to one (latest) row per customer.
print('Reading train data...')
train = read_file(TRAIN_PATH)

# **Evaluation Metric**

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose ``target`` equals 0 are given weight 20; every other row gets
    weight 1. Both components are computed on the rows sorted by
    ``prediction`` in descending order.

    Parameters
    ----------
    y_true : pandas.DataFrame
        Frame with a ``target`` column.
    y_pred : pandas.DataFrame
        Frame with a ``prediction`` column. It is joined to ``y_true`` with
        ``pd.concat(..., axis='columns')``, i.e. rows are matched by index,
        so both frames must share the same index.

    Returns
    -------
    float
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the predictions
        divided by the weighted Gini of a perfect ranking, and ``D`` is the
        share of all ``target == 1`` rows found within the first 4% of the
        total weight.

    Notes
    -----
    Neither input frame is modified. If ``y_true`` contains no row with
    ``target == 1`` the divisions below have a zero denominator and the
    result is not a finite number.
    """
    negative_weight = 20  # weight applied to rows with target == 0
    top_fraction = 0.04   # share of total weight inspected by the capture rate

    def ranked_frame(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores by index, rank by score (best first) and
        # attach the per-row weight. Vectorized instead of a row-wise apply.
        ranked = (pd.concat([truth, scores], axis='columns')
                  .sort_values('prediction', ascending=False))
        ranked['weight'] = np.where(ranked['target'] == 0, negative_weight, 1)
        return ranked

    def top_four_percent_captured(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Fraction of all positives that sit inside the top 4% of cumulative weight.
        ranked = ranked_frame(truth, scores)
        cutoff = int(top_fraction * ranked['weight'].sum())
        within_cutoff = ranked['weight'].cumsum() <= cutoff
        is_positive = ranked['target'] == 1
        return (within_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Weighted area between the cumulative-positives curve and the
        # cumulative-weight ("random") curve.
        ranked = ranked_frame(truth, scores)
        weight = ranked['weight']
        random = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Using the labels themselves as scores yields the best achievable Gini.
        perfect_scores = truth.rename(columns={'target': 'prediction'})
        return weighted_gini(truth, scores) / weighted_gini(truth, perfect_scores)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [None]:
# Features are every column except the customer identifier, the statement
# date and the label; the label itself becomes y.
features = [col for col in train.columns if col not in ('customer_ID', 'target', 'S_2')]
X = train[features]
y = train['target']

# Hold out 20% of the customers, keeping the class balance of y in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=100,
    stratify=y,
)

In [None]:
# Hyper-parameters collected in one place so they are easy to read and tweak.
# NOTE(review): `use_label_encoder` and `tree_method='gpu_hist'` look deprecated
# in recent XGBoost releases -- confirm against the installed version.
xgb_params = {
    'objective': 'binary:logistic',
    'n_estimators': 50000,
    'seed': 123,
    'use_label_encoder': False,
    'eval_metric': 'aucpr',
    'early_stopping_rounds': 50,
    'tree_method': 'gpu_hist',
    'enable_categorical': True,
}
xg_cl = XGBClassifier(**xgb_params)
# NOTE:
# (n_estimators, early_stopping_rounds) = (50000, 50)
# performs better than
# (n_estimators, early_stopping_rounds) = (10, 10)

In [None]:
%%time
# Fit the classifier on the training split. The held-out split is supplied as
# eval_set; presumably it drives the early stopping configured on the
# classifier (early_stopping_rounds=50, eval_metric='aucpr').
# NOTE(review): the same X_test / y_test is reused below to report accuracy and
# the competition metric, so those scores may be optimistic -- consider a
# separate validation split for early stopping.
xg_cl.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)

In [None]:
# Hard class labels for the held-out split, scored with plain accuracy.
preds = xg_cl.predict(X_test)
accuracy = accuracy_score(y_test, preds)
print(f'accuracy: {accuracy: .2%}')

# Positive-class probabilities in a one-column frame that shares y_test's
# index, which is the layout amex_metric expects for its `y_pred` argument.
y_pred = pd.DataFrame(
    {'prediction': xg_cl.predict_proba(X_test)[:, 1]},
    index=y_test.index,
)

In [None]:
def plot_features(booster, figsize, max_num_features=20):
    """Plot feature importances of a fitted XGBoost model on a new figure.

    Parameters
    ----------
    booster
        Fitted model handed to ``xgboost.plot_importance``.
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.
    max_num_features : int, default 20
        Upper bound on the number of features drawn.

    Returns
    -------
    Whatever ``plot_importance`` returns (the axes it drew on).
    """
    _fig, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_axes = plot_importance(
        booster=booster,
        ax=axes,
        max_num_features=max_num_features,
    )
    return importance_axes
# Draw the importance chart for the fitted classifier (at most 20 features,
# the helper's default) on a 10 x 14 figure and render it.
plot_features(xg_cl, figsize=(10, 14))
plt.show()

In [None]:
# Report the competition metric on the held-out split, using the true labels
# as a one-column frame and the predicted positive-class probabilities.
print('Metric Evaluation Values\n')
holdout_score = amex_metric(y_test.to_frame(), y_pred)
print(f'{holdout_score}')