In [1]:
%%time
import datetime
import random
import gc
import warnings
from joblib import dump, load
import uuid
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearnex import patch_sklearn
patch_sklearn()
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, KFold, StratifiedKFold
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, jaccard_score, recall_score, precision_score, accuracy_score, make_scorer, roc_auc_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, OrdinalEncoder
import os

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


CPU times: user 1.49 s, sys: 456 ms, total: 1.95 s
Wall time: 2.74 s


In [None]:
def convert_date_sort(frame : pd.DataFrame, primary_key : str, date_types : list = None):
    """Return a sorted copy of ``frame``, optionally parsing date columns first.

    Parameters
    ----------
    frame : pd.DataFrame
        Input table; it is copied, never modified.
    primary_key : str
        Column used as the leading sort key.
    date_types : list, optional
        Columns to parse with ``pd.to_datetime``. When given, they are also
        used as secondary sort keys, in the order listed.

    Returns
    -------
    pd.DataFrame
        The copy, sorted by ``primary_key`` and then by the date columns.
    """
    result = frame.copy()
    sort_keys = [primary_key]
    if date_types is not None:
        for column in date_types:
            result[column] = pd.to_datetime(result[column])
        # Dates come after the key so rows are ordered in time within each key.
        sort_keys = sort_keys + date_types
    return result.sort_values(sort_keys)

def feature_engineer_test(frame :pd.DataFrame, cat_features : list, primary_key : str, count_col : str = 'P_2'):
    """Collapse a multi-row-per-key table into one feature row per key.

    Parameters
    ----------
    frame : pd.DataFrame
        Input rows. The 'first'/'last' aggregates follow the row order of
        ``frame`` within each key, so sort it beforehand if order matters.
    cat_features : list
        Columns aggregated as categoricals ('first', 'last', 'nunique').
        They are excluded from the numeric aggregates even if numeric.
    primary_key : str
        Grouping column; it is the first column of the result.
    count_col : str, default 'P_2'
        Column whose non-null count per key becomes ``num_statements``.
        If it is not in ``frame`` the plain row count per key is used.

    Returns
    -------
    pd.DataFrame
        One row per key with: ``<col>_<agg>`` numeric aggregates,
        ``<col>_last_life_sub`` (last - first), ``<col>_last_lmean_sub``
        (last - mean), ``<col>_<agg>`` categorical aggregates and
        ``num_statements``.
    """
    # Numeric columns that are neither categorical nor the grouping key.
    excluded = set(cat_features) | {primary_key}
    num_features = [col for col in frame.select_dtypes('number').columns
                    if col not in excluded]

    grouped = frame.groupby(primary_key)
    parts = []

    if num_features:
        test_num_agg = grouped[num_features].agg(['first', 'mean', 'std', 'min', 'max', 'last'])
        test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]

        # Difference columns are collected first and attached in one concat,
        # which avoids fragmenting the frame with per-column inserts.
        derived = {}
        for feat in num_features:
            last_col = feat + '_last'
            derived[last_col + '_life_sub'] = test_num_agg[last_col] - test_num_agg[feat + '_first']
            derived[last_col + '_lmean_sub'] = test_num_agg[last_col] - test_num_agg[feat + '_mean']
        test_num_agg = pd.concat([test_num_agg, pd.DataFrame(derived)], axis=1)
        parts.append(test_num_agg)
        del test_num_agg, derived

    if len(cat_features) > 0:
        test_cat_agg = grouped[list(cat_features)].agg(['first', 'last', 'nunique'])
        test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]
        parts.append(test_cat_agg)
        del test_cat_agg

    # Number of rows per key (non-null count of count_col when available).
    if count_col in frame.columns:
        counts = grouped[count_col].count()
    else:
        counts = grouped.size()
    parts.append(counts.rename('num_statements').to_frame())
    del counts

    df = pd.concat(parts, axis=1)
    df = df.reset_index()

    del parts
    gc.collect()

    return df
    
def predictor(model, frame : pd.DataFrame, ids = None) :
    """Run ``model`` on the test features and build the two submission frames.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict_proba`` and ``predict``.
    frame : pd.DataFrame
        When ``ids`` is None: a DataFrame whose first column holds the
        identifiers and whose remaining columns are the features.
        When ``ids`` is given: the feature matrix only (DataFrame or array).
    ids : array-like, optional
        Identifiers, one per row of ``frame``.

    Returns
    -------
    list
        ``[submission_proba, submission_class]``; both frames have the columns
        'customer_ID' and 'prediction'.

    Raises
    ------
    ValueError
        If no identifiers can be determined, or their count does not match
        the number of feature rows.
    """
    if ids is None:
        if not isinstance(frame, pd.DataFrame):
            raise ValueError(
                "predictor needs customer IDs: pass a DataFrame whose first column "
                "is the ID, or supply `ids` explicitly."
            )
        # The ID column must not be fed to the model, so it is split off here.
        ids = frame.iloc[:, 0].values
        X = frame.iloc[:, 1:].values
    else:
        ids = np.asarray(ids)
        X = frame.values if isinstance(frame, pd.DataFrame) else np.asarray(frame)
        if len(ids) != len(X):
            raise ValueError(
                "ids has %d entries but frame has %d rows" % (len(ids), len(X))
            )

    pred_proba = model.predict_proba(X)[ : , 1]  # probability of the second class
    pred_class = model.predict(X)
    submission1  = pd.DataFrame({'customer_ID' : ids, 'prediction' : pred_proba})
    submission2  = pd.DataFrame({'customer_ID' : ids, 'prediction' : pred_class})
    del pred_proba, pred_class
    gc.collect()
    print('Predictions completed successfully 👌')
    return [submission1, submission2]


def compute_scores(model , X_val, Y_val = None) :
    """Evaluate a fitted binary classifier on a validation set.

    Parameters
    ----------
    model
        Fitted classifier exposing ``predict``, ``predict_proba`` and ``classes_``.
    X_val
        Validation features.
    Y_val : array-like, optional
        Validation labels. When omitted, the module-level ``Y_val`` (or,
        failing that, ``y_val``) is used so two-argument calls keep working.

    Returns
    -------
    dict
        Keys: acc, jaccard, recall, prec, auc, f1, comp (``amex_sk`` score),
        tn (fraction of the first class predicted correctly) and
        tp (fraction of the second class predicted correctly).

    Raises
    ------
    ValueError
        If no validation labels are passed and none are found globally.
    """
    if Y_val is None:
        # Legacy two-argument call: look the labels up in the notebook namespace.
        Y_val = globals().get('Y_val', globals().get('y_val'))
        if Y_val is None:
            raise ValueError("compute_scores: validation labels were not provided (pass Y_val).")

    # A plain array avoids pandas label-based indexing inside amex_sk.
    y_true = np.asarray(Y_val)

    pred = model.predict(X_val)
    pred_proba = model.predict_proba(X_val)[:,1]
    # Rows of the confusion matrix are true classes, in model.classes_ order.
    confusion = confusion_matrix(y_true, pred, labels = model.classes_)

    scores = dict(acc = accuracy_score(y_true, pred),
                  jaccard = jaccard_score(y_true, pred),
                  recall = recall_score(y_true, pred),
                  prec = precision_score(y_true, pred),
                  auc = roc_auc_score(y_true, pred_proba),
                  f1 = f1_score(y_true, pred),
                  comp = amex_sk(y_true, pred_proba),
                  tn = confusion[0,0]/sum(confusion[0]),
                  tp = confusion[1,1]/sum(confusion[1]))
    return scores


def amex_sk(target: np.ndarray, preds: np.ndarray) -> float:
    """Compute the score ``0.5 * (g + d)`` for binary targets and predicted scores.

    Rows are ranked by ``preds`` in descending order. Each negative carries a
    weight of 20 and each positive a weight of 1.

    * ``d`` is the share of all positives found in the top-ranked rows whose
      cumulative normalised weight stays within 4%.
    * ``g`` is the weighted Gini of the ranking divided by the Gini of a
      perfect ranking.

    Parameters
    ----------
    target : array-like
        Binary labels (0/1). pandas Series and lists are accepted; values are
        used positionally, any index is ignored.
    preds : array-like
        Predicted scores, same length as ``target``.

    Returns
    -------
    float
        The combined score. It is NaN/inf if ``target`` has no positives or
        no negatives, because the normalisers are then zero.

    Raises
    ------
    ValueError
        If ``target`` and ``preds`` differ in length.
    """
    # Coerce to flat arrays so that `target[indices]` below is positional even
    # when a pandas Series with a shuffled index is passed in.
    target = np.asarray(target).ravel()
    preds = np.asarray(preds).ravel()
    if target.shape[0] != preds.shape[0]:
        raise ValueError(
            "target and preds must have the same length, got %d and %d"
            % (target.shape[0], preds.shape[0])
        )

    n_pos = np.sum(target)
    n_neg = target.shape[0] - n_pos

    # Rank rows from highest to lowest predicted score.
    indices = np.argsort(preds)[::-1]
    preds, target = preds[indices], target[indices]

    weight = 20.0 - target * 19.0  # 20 for negatives, 1 for positives
    cum_norm_weight = (weight * (1 / weight.sum())).cumsum()
    four_pct_mask = cum_norm_weight <= 0.04
    d = np.sum(target[four_pct_mask]) / n_pos

    lorentz = (target * (1 / n_pos)).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # Gini obtained when every positive is ranked above every negative.
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    g = gini / gini_max
    return 0.5 * (g + d)



def export_results(model, sub : list = None, out_dir : str = '/kaggle/input') :
    """Persist the model and, optionally, the two submission frames.

    All files written by one call share a freshly generated UUID in their
    names so that repeated exports never overwrite each other.

    Parameters
    ----------
    model
        Object serialised with joblib ``dump`` to ``model_amex_<uuid>.joblib``.
    sub : list, optional
        ``[proba_frame, class_frame]`` as returned by ``predictor``; written
        to ``subimission_proba_amex_<uuid>.csv`` and
        ``subimission_class_amex_<uuid>.csv`` without the index.
    out_dir : str, default '/kaggle/input'
        Destination directory; created if missing.
        NOTE(review): Kaggle documents /kaggle/input as read-only, so on
        Kaggle this presumably needs to be '/kaggle/working' - confirm.
    """
    uid = str(uuid.uuid4())
    os.makedirs(out_dir, exist_ok = True)
    # The '.' before the extension was missing, giving '<uuid>joblib' file names.
    dump(model, os.path.join(out_dir, 'model_amex_' + uid + '.joblib'))
    if sub is not None :
        sub[0].to_csv(os.path.join(out_dir, 'subimission_proba_amex_' + uid + '.csv'), index =False)
        sub[1].to_csv(os.path.join(out_dir, 'subimission_class_amex_' + uid + '.csv'), index =False)
        print('Subimission completed successfully 👌')

In [None]:
# Directory holding train_data.parquet, test_data.parquet, train_labels.csv and
# features.csv; later cells read them through relative paths.
# Set the AMEX_DATA_DIR environment variable to run on another machine.
DATA_DIR = os.environ.get(
    'AMEX_DATA_DIR',
    '/home/drxc/.kaggle/Kaggle_competitions/Datasets/Amex-Default-Prediction',
)
os.chdir(DATA_DIR)
gc.collect()

In [None]:
%%time
# Read the input tables; all paths are relative to the current working directory.
train_data = pd.read_parquet('train_data.parquet')
print(train_data.shape)
# NOTE(review): `load` is joblib's loader (imported in the first cell), yet the
# file carries a .csv extension - confirm whether pd.read_csv was intended.
# `features` is not referenced again in the cells visible here.
features = load('features.csv')
test_data = pd.read_parquet('test_data.parquet')
labels = pd.read_csv('train_labels.csv')
# Columns handed to feature_engineer_test as `cat_features` further down.
cat = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

In [None]:
%%time
# Stack train and test rows into a single frame so the following cells process
# both together; ignore_index=True gives the result a fresh 0..n-1 index.
data = pd.concat([train_data, test_data], ignore_index=True)
# Drop the references to the two source frames and trigger a collection.
# Consequence: this cell cannot be re-run without re-running the load cell.
del train_data, test_data
gc.collect()

In [None]:
%%time
nacount = data.isna().any().count()/100
nullcols = nacount[nacount>= 30]

In [None]:
%%time
# Remove the columns selected in `nullcols` from the combined frame.
# DataFrame.drop(columns=...) expects column labels, so `nullcols` has to hold
# names rather than computed statistics.
data = data.drop(columns = nullcols)

In [None]:
# Parse S_2 as a date and sort by customer, then date, so the 'first'/'last'
# aggregates computed below follow each customer's rows in time order.
data = convert_date_sort(frame = data, primary_key = 'customer_ID', date_types = ['S_2'])
# Aggregate only the categorical columns still present in `data`: the
# null-column filter may have removed some of the names listed in `cat`, and
# selecting a missing column in the groupby raises KeyError.
present_cat = [col for col in cat if col in data.columns]
data = feature_engineer_test(frame = data, cat_features = present_cat, primary_key = 'customer_ID')

In [None]:
%%time
encoder = OrdinalEncoder(handle_unknown = 'use_encoded_value', unknown_value = np.nan)
scaler = MinMaxScaler()

data[data.columns[1:]] = encoder.fit_transform(data[data.columns[1:]])
data[data.columns[1:]] = data[data.columns[1:]].fillna(data[data.columns[1:]].median())

In [None]:
%%time
train_data = pd.merge(data, labels, on = 'customer_ID', how = 'inner')

In [None]:
%%time
test_data = data.set_index('customer_ID').drop(index=labels.custom_ID.unique().tolist()).reset_index()

In [None]:
%%time
X_train, X_val, y_train, y_val = train_test_split(X = train_data[data.columns[1:-1]], y=train_data[data.columns[-1]] , test_size=0.1, random_state=43)
X_test = test_data[data.columns[1:]]
del train_data, data, test_data
gc.collect()

In [None]:
%%time
# Learn the min/max scaling from the training split only, then apply that same
# fitted scaling to the validation and test features.
# NOTE(review): scikit-learn transformers return NumPy arrays by default, so
# after this cell the X_* names presumably no longer carry column labels - confirm.
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [None]:
%%time
xgb = XGBClassifier(base_score = 0.7, booster = 'gbtree', gamma = 10, 
                    importance_type = 'weight', learning_rate = 0.1, max_leaves = 60, 
                    n_jobs = -1, random_state = 43, reg_alpha = 0.3, 
                    reg_lambda = 0.7,  subsample = 0.5, sampling_method= 'uniform', 
                    verbosity = 3, tree_method = 'hist', objective  = 'reg:logistic', seed = 43)

xgb = xgb.fit(X_train, y_train)

scores = compute_scores(xgb , X_val)


In [None]:
%%time
# Predict on the test features, then write the model and both submission
# frames (probabilities and hard classes) to disk.
# NOTE(review): X_test was last assigned from scaler.transform(...) and holds
# features only - confirm the customer IDs still reach predictor, since the
# submission frames need a 'customer_ID' column.
sub = predictor(xgb, X_test)
export_results(xgb, sub = sub)