In [1]:
import pandas as pd
import numpy as np
# import pickle
from matplotlib import pyplot as plt
import os, gc
import plotly.express as px
import plotly.graph_objects as go
from lightgbm import LGBMClassifier, early_stopping, log_evaluation, plot_importance

from sklearn.model_selection import KFold,StratifiedKFold,train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, roc_curve, auc

# **Read File**

In [2]:
# Location of the training parquet file; handed to read_file() when the data is loaded.
TRAIN_PATH = '../input/pa-amex-default-dataset/train.parquet'

In [3]:
%%time
def read_file(path = '', usecols = None):
    """Load a statement-level parquet file and keep each customer's latest statement.

    Parameters
    ----------
    path : str
        Location of the parquet file, passed straight to ``pd.read_parquet``.
    usecols : list of str, optional
        Columns to load. ``None`` (the default) loads every column. The
        selection must contain ``'customer_ID'`` and ``'S_2'`` because both
        are used for sorting and de-duplication below.

    Returns
    -------
    pandas.DataFrame
        One row per ``customer_ID`` -- the row with the greatest ``S_2`` --
        ordered by ``customer_ID``, with a fresh 0..n-1 index and
        ``customer_ID`` as the first column.
    """
    # LOAD DATAFRAME (columns=None means "all columns", so one call covers both cases)
    df = pd.read_parquet(path, columns=usecols)

    print('shape of data:', df.shape)
    print("The training data begins on {} and ends on {}.".format(df['S_2'].min(),df['S_2'].max()))

    df = df.sort_values(['customer_ID','S_2'])
    # Keep the last (most recent) row per customer. drop_duplicates is used instead of
    # groupby().nth(-1).reset_index(): the shape of nth()'s result depends on the pandas
    # version, and on pandas >= 2.0 the reset_index() call would add a spurious 'index'
    # column that downstream code would pick up as a model feature.
    df_out = (df.drop_duplicates(subset='customer_ID', keep='last')
                .reset_index(drop=True))
    # Preserve the historical layout with the customer key as the leading column.
    if df_out.columns[0] != 'customer_ID':
        df_out = df_out[['customer_ID'] + [c for c in df_out.columns if c != 'customer_ID']]

    print("There are {:,.0f} customers in the training set and {} features.".format(df_out.shape[0],df_out.shape[1]))
    # The statement-level frame is no longer needed; release it before returning.
    del df
    _ = gc.collect()
    return df_out

# Announce the load, then build the modelling table with one row per customer.
print('Reading train data...')

train = read_file(path = TRAIN_PATH)

In [4]:
# Column-by-column summary; max_cols is raised so the per-column listing is not
# truncated, and show_counts asks for the non-null count of every column.
train.info(max_cols = 200, show_counts = True)

In [5]:
# Preview the first rows of the per-customer table.
train.head()

# **Evaluation Metric**

In [6]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``'target'`` column.
    y_pred : pd.DataFrame
        Frame with a ``'prediction'`` column. It is joined to ``y_true``
        column-wise with ``pd.concat``, so both frames must share an index.

    Returns
    -------
    float
        ``0.5 * (G + D)`` where ``G`` is the normalized weighted Gini and
        ``D`` the share of positives found within the top 4% of total weight.
    """

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Rows ordered from highest to lowest prediction, with the sample weight
        # attached: 20 where target == 0, otherwise 1.
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = np.where(ranked['target'] == 0, 20, 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        cutoff = int(0.04 * ranked['weight'].sum())
        # Rows whose running weight still fits inside the 4% budget.
        within_budget = ranked['weight'].cumsum() <= cutoff
        is_positive = ranked['target'] == 1
        return (within_budget & is_positive).sum() / is_positive.sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        ranked = _ranked(y_true, y_pred)
        weight = ranked['weight']
        # Cumulative share of total weight seen so far (the "random" baseline).
        random_share = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        # Cumulative share of weighted positives found so far.
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random_share) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Scoring the labels against themselves gives the best attainable Gini.
        perfect = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, perfect)

    gini_part = normalized_weighted_gini(y_true, y_pred)
    capture_part = top_four_percent_captured(y_true, y_pred)
    return 0.5 * (gini_part + capture_part)

In [7]:
def plot_roc(y_val, y_prob, gini_scores=None, template=None):
    """Draw one ROC curve per cross-validation fold on a single Plotly figure.

    Parameters
    ----------
    y_val : sequence
        Per-fold true labels; element ``i`` belongs to fold ``i + 1``.
    y_prob : sequence
        Per-fold predicted scores, aligned with ``y_val``.
    gini_scores : sequence of float, optional
        Per-fold score shown as "Gini" in the legend. When omitted, the
        notebook-level list ``gini`` is used, as the function always did.
    template : dict or plotly template, optional
        Layout template for the figure. When omitted, the notebook-level
        ``temp`` is used, so it must be defined before the call.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Fall back to the notebook globals this function previously read implicitly.
    if gini_scores is None:
        gini_scores = gini
    if template is None:
        template = temp

    palette = px.colors.qualitative.Prism[::-1]
    fig = go.Figure()
    # Diagonal reference line: the ROC curve of a random classifier.
    fig.add_trace(go.Scatter(x=np.linspace(0,1,11), y=np.linspace(0,1,11),
                             name='Random Chance',mode='lines', showlegend=False,
                             line=dict(color="Black", width=1, dash="dot")))
    for i, fold_truth in enumerate(y_val):
        fpr, tpr, _ = roc_curve(fold_truth, y_prob[i])
        roc_auc = auc(fpr, tpr)
        # Wrap around the palette so more folds than colours no longer raises IndexError.
        fold_color = palette[(i + 1) % len(palette)]
        fig.add_trace(go.Scatter(x=fpr, y=tpr, line=dict(color=fold_color, width=3),
                                 hovertemplate = 'True positive rate = %{y:.3f}<br>False positive rate = %{x:.3f}',
                                 name='Fold {}:  Gini = {:.3f}, AUC = {:.3f}'.format(i+1, gini_scores[i],roc_auc)))
    fig.update_layout(template=template, title="Cross-Validation ROC Curves",
                      hovermode="x unified", width=700,height=600,
                      xaxis_title='False Positive Rate (1 - Specificity)',
                      yaxis_title='True Positive Rate (Sensitivity)',
                      legend=dict(orientation='v', y=.07, x=1, xanchor="right",
                                  bordercolor="black", borderwidth=.5))
    fig.show()

In [9]:
%%time

# Model inputs: every column except the customer key, the label and the statement date.
features = [x for x in train.columns.values if x not in ['customer_ID', 'target', 'S_2']]
X, y = train[features], train['target']

oof = []                                    # one frame of out-of-fold predictions per fold
y_valid, gbm_val_probs, gini = [], [], []   # per-fold labels, scores and amex_metric values
ft_importance = pd.DataFrame(index=X.columns)

# The hyper-parameters are the same for every fold, so they are built once.
params = {'boosting_type': 'gbdt',
          'n_estimators': 500,
          'num_leaves': 50,
          'learning_rate': 0.05,
          'colsample_bytree': 0.9,
          'min_child_samples': 2000,
          'max_bins': 500,
          'reg_alpha': 2,
          'objective': 'binary',
          'random_state': 21}

sk_fold = KFold(n_splits=5, shuffle=True, random_state=21)
# Alternative splitter, kept for reference:
# sk_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=21)
for fold, (train_idx, val_idx) in enumerate(sk_fold.split(X, y)):

    print("\nFold {}".format(fold+1))

    # split() yields positional indices, so select with .iloc; label-based .loc only
    # gives the same rows while the frame happens to carry a default 0..n-1 index.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]
    print("Train shape: {}, {}, Valid shape: {}, {}\n".format(
        X_train.shape, y_train.shape, X_val.shape, y_val.shape))

    # The held-out fold is part of eval_set so that early_stopping has something to
    # monitor: LightGBM skips the training set when deciding whether to stop, which
    # made the callback a no-op while only (X_train, y_train) was supplied.
    gbm = LGBMClassifier(**params).fit(X_train, y_train,
                                       eval_set=[(X_train, y_train), (X_val, y_val)],
                                       callbacks=[early_stopping(200), log_evaluation(500)],
                                       eval_metric=['auc','binary_logloss'])
    gbm_prob = gbm.predict_proba(X_val)[:,1]
    gbm_val_probs.append(gbm_prob)
    y_valid.append(y_val)

    # Save OOF
    fold_oof = train.iloc[val_idx][['customer_ID','target']].copy()
    fold_oof['prediction'] = gbm_prob
    oof.append(fold_oof)

    # amex_metric joins its two inputs on the index, so both get a fresh 0..n-1 index.
    y_pred=pd.DataFrame(data={'prediction':gbm_prob})
    y_true=pd.DataFrame(data={'target':y_val.reset_index(drop=True)})
    gini_score=amex_metric(y_true = y_true, y_pred = y_pred)
    gini.append(gini_score)

    auc_score=roc_auc_score(y_val, gbm_prob)
    ft_importance["Importance_Fold"+str(fold)]=gbm.feature_importances_
    print("Validation Gini: {:.5f}, AUC: {:.4f}".format(gini_score,auc_score))

    # Drop the fold's slices before the next iteration allocates new ones.
    del X_train, y_train, X_val, y_val
    _ = gc.collect()

In [10]:
# Show the collected out-of-fold results (a list with one frame per fold at this point).
oof

In [11]:
# Stack the per-fold frames into a single table keyed by customer_ID.
# The isinstance guard keeps the cell re-runnable: after the first run `oof` is
# already the combined DataFrame, and passing that to pd.concat would raise.
if isinstance(oof, list):
    oof = pd.concat(oof,axis = 0,ignore_index = True).set_index('customer_ID')
oof

In [12]:
# Score the pooled out-of-fold predictions with the competition metric.
oof_target = oof[['target']]
oof_prediction = oof[['prediction']]
acc = amex_metric(oof_target, oof_prediction)
print('OVERALL CV Evaluation Metric =', acc)

In [13]:
# Shared Plotly template (font and default figure size) used by the plots below;
# it has to exist before plot_roc is called because plot_roc reads it.
base_layout = go.Layout(
    font={"family": "Franklin Gothic", "size": 12},
    height=500,
    width=1000,
)
temp = {"layout": base_layout}
plot_roc(y_valid, gbm_val_probs)

In [14]:
# Preview the per-fold feature importances (one column per fold, one row per feature).
ft_importance.head()

In [15]:
import seaborn as sns

# Collapse the per-fold importances to the 50 largest averages, smallest first so the
# most important feature ends up at the top of the chart. The isinstance guard keeps
# the cell re-runnable: after the first run `ft_importance` is already that Series.
if isinstance(ft_importance, pd.DataFrame):
    ft_importance['avg']=ft_importance.drop(columns='avg', errors='ignore').mean(axis=1)
    ft_importance=ft_importance['avg'].nlargest(50).sort_values(ascending=True)

n_top=len(ft_importance)
pal=sns.color_palette("YlGnBu", 65).as_hex()
# Reverse the palette and use exactly one colour per plotted feature.
top_colors=pal[::-1][:n_top]

fig=go.Figure()
# One horizontal "stem" per feature. The values are read positionally because the
# Series is indexed by feature name, so integer keys must not be used as labels.
for i, importance in enumerate(ft_importance.to_numpy()):
    fig.add_shape(dict(type="line", y0=i, y1=i, x0=0, x1=importance,
                       line_color=top_colors[i],opacity=0.8,line_width=4))
fig.add_trace(go.Scatter(x=ft_importance, y=ft_importance.index, mode='markers',
                         marker_color=top_colors, marker_size=8,
                         hovertemplate='%{y} Importance = %{x:.0f}<extra></extra>'))
fig.update_layout(template=temp,title='LGBM Feature Importance<br>Top 50',
                  margin=dict(l=150,t=80),
                  xaxis=dict(title='Importance', zeroline=False),
                  yaxis_showgrid=False, height=1000, width=800)
fig.show()