In [1]:
# Import libraries
import os
import warnings

import numpy as np
import pandas as pd

import gc  # Garbage collector

warnings.filterwarnings('ignore')

#### For EDA, refer: https://www.kaggle.com/code/awaldeep/first-look-eda/data

### Data pre-processing

In [2]:
# Reading feather format data(memory efficient, available on kaggle: https://www.kaggle.com/datasets/munumbutt/amexfeather) 
train_raw = pd.read_feather('../input/amexfeather/train_data.ftr')

In [3]:
train_raw.head(2)

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,,,0.002426,0.003706,0.003819,,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,,,0.003956,0.003166,0.005032,,0.009575,0.005493,0.009216,0


In [4]:
train_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Columns: 191 entries, customer_ID to target
dtypes: category(11), datetime64[ns](1), float16(177), int64(1), object(1)
memory usage: 2.0+ GB


In [5]:
# Missing values
tmp = train_raw.isna().sum().mul(100).div(len(train_raw)).sort_values(ascending=False)

### Handling missing values

In [6]:
# dropping columns with missing values >70%
missingDF = pd.DataFrame(tmp).reset_index()
drop_cols = missingDF[missingDF[0]>70]["index"].values
print(drop_cols)

['D_87' 'D_88' 'D_108' 'D_111' 'D_110' 'B_39' 'D_73' 'B_42' 'D_136'
 'D_138' 'D_137' 'D_135' 'D_134' 'R_9' 'B_29' 'D_106' 'D_132' 'D_49'
 'R_26' 'D_76' 'D_66' 'D_42' 'D_142' 'D_53' 'D_82']


In [7]:
train_raw

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,,,0.002426,0.003706,0.003819,,0.000569,0.000610,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,,,0.003956,0.003166,0.005032,,0.009575,0.005493,0.009216,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954102,0.091492,0.021652,1.009766,0.006817,0.123962,0.007599,0.009422,...,,,0.003269,0.007328,0.000427,,0.003429,0.006985,0.002604,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960449,0.002455,0.013687,1.002930,0.001372,0.117188,0.000685,0.005531,...,,,0.006119,0.004517,0.003201,,0.008423,0.006527,0.009598,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947266,0.002483,0.015190,1.000977,0.007607,0.117310,0.004654,0.009308,...,,,0.003672,0.004944,0.008888,,0.001670,0.008125,0.009827,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5531446,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-11-05,0.979492,0.416016,0.020813,0.828125,0.003487,0.090759,0.005341,0.025146,...,,,0.006836,0.003679,0.000457,,0.000906,0.001497,0.002775,0
5531447,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2017-12-23,0.984863,0.296631,0.007210,0.812500,0.005905,0.079895,0.002243,0.023697,...,,,0.003309,0.007095,0.007858,,0.002777,0.008224,0.008858,0
5531448,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-01-06,0.982910,0.444092,0.013153,0.815430,0.003456,0.100525,0.002111,0.012344,...,,,0.009956,0.009995,0.001088,,0.005692,0.006775,0.005566,0
5531449,fffff1d38b785cef84adeace64f8f83db3a0c31e8d92ea...,2018-02-06,0.969727,0.442627,0.009857,1.003906,0.005116,0.101807,0.009933,0.008575,...,,,0.005543,0.006565,0.009880,,0.008125,0.001168,0.003983,0


In [8]:
train_raw.drop(columns = drop_cols,axis=1, inplace=True)

In [9]:
# For categorical columns
cols = train_raw.columns
num_cols = train_raw._get_numeric_data().columns

categorical_columns = list(set(cols) - set(num_cols))
filtered_categorical_columns = list(set(train_raw[categorical_columns])-{"S_2","customer_ID"})

In [10]:
train_raw[filtered_categorical_columns].nunique()

D_126    3
D_116    2
D_63     6
D_68     7
B_30     3
D_64     5
D_120    2
D_117    7
B_38     7
D_114    2
dtype: int64

In [11]:
train_raw[filtered_categorical_columns].isna().sum().mul(100).div(len(train_raw))

D_126    2.111851
D_116    3.194749
D_63     0.000000
D_68     3.914036
B_30     0.036446
D_64     0.000000
D_120    3.194749
D_117    3.194749
B_38     0.036446
D_114    3.194749
dtype: float64

In [12]:
for i in filtered_categorical_columns:
    print(train_raw[i].value_counts())

1.0     4262414
0.0      891323
-1.0     260898
Name: D_126, dtype: int64
0.0    5348109
1.0       6626
Name: D_116, dtype: int64
CO    4119621
CR     930133
CL     438390
XZ      25786
XM      10556
XL       6965
Name: D_63, dtype: int64
6.0    2782455
5.0    1201706
3.0     484442
4.0     477187
2.0     220111
1.0     133122
0.0      15925
Name: D_68, dtype: int64
0.0    4710663
1.0     763955
2.0      54817
Name: B_30, dtype: int64
O     2913244
U     1523448
R      840112
       217442
-1      37205
Name: D_64, dtype: int64
0.0    4729723
1.0     625012
Name: D_120, dtype: int64
-1.0    1456084
3.0     1166400
4.0     1138666
2.0      666808
5.0      459290
6.0      344520
1.0      122967
Name: D_117, dtype: int64
2.0    1953232
3.0    1255315
1.0    1160047
5.0     444856
4.0     294917
7.0     259028
6.0     162040
Name: B_38, dtype: int64
1.0    3316478
0.0    2038257
Name: D_114, dtype: int64


In [13]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy="most_frequent")
transformed_df = pd.DataFrame(imputer.fit_transform(train_raw[filtered_categorical_columns]),columns = filtered_categorical_columns)

In [14]:
train_raw[filtered_categorical_columns] = transformed_df[filtered_categorical_columns]

In [15]:
# For numeric columns
numeric_columns = train_raw.select_dtypes(np.number).columns
train_raw[numeric_columns] = train_raw[numeric_columns].fillna(train_raw[numeric_columns].mean())

In [16]:
train_raw.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_131,D_133,R_28,D_139,D_140,D_141,D_143,D_144,D_145,target
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-03-09,0.938477,0.001734,0.008728,1.006836,0.009224,0.124023,0.008774,0.004707,...,0.00597,0.004345,0.001534,0.002426,0.003706,0.003819,0.000569,0.00061,0.002674,0
1,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-04-07,0.936523,0.005775,0.004925,1.000977,0.006153,0.126709,0.000798,0.002714,...,0.004837,0.007496,0.004932,0.003956,0.003166,0.005032,0.009575,0.005493,0.009216,0
2,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-05-28,0.954102,0.091492,0.021652,1.009766,0.006817,0.123962,0.007599,0.009422,...,0.005497,0.009224,0.009125,0.003269,0.007328,0.000427,0.003429,0.006985,0.002604,0
3,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-06-13,0.960449,0.002455,0.013687,1.00293,0.001372,0.117188,0.000685,0.005531,...,0.008263,0.007206,0.002409,0.006119,0.004517,0.003201,0.008423,0.006527,0.009598,0
4,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,2017-07-16,0.947266,0.002483,0.01519,1.000977,0.007607,0.11731,0.004654,0.009308,...,0.004848,0.006313,0.004463,0.003672,0.004944,0.008888,0.00167,0.008125,0.009827,0


In [17]:
# Handling date column

train_raw["S_2_day"] = train_raw["S_2"].dt.day
train_raw["S_2_month"] = train_raw["S_2"].dt.month
train_raw["S_2_year"] = train_raw["S_2"].dt.year


In [18]:
# considering only one data point per customer (latest one) as time series is not being used
train_raw = train_raw.groupby(['customer_ID']).nth(-1).reset_index(drop=True)

In [19]:
# drop S_2
train_raw.drop(columns=["S_2"], axis=1, inplace=True)

In [20]:
# converting pandas "categorical" dtype to numeric
cols = ["D_68", "B_30", "B_38", "D_114", "D_116", "D_117", "D_120", "D_126"]
train_raw[cols] = train_raw[cols].apply(pd.to_numeric, errors='coerce')

## Modeling

In [21]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from xgboost import XGBClassifier
import xgboost as xgb
from datetime import datetime, timedelta

In [22]:
# https://www.kaggle.com/code/inversion/amex-competition-metric-python

def amex_metric_official(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [23]:
X = train_raw.drop(columns=["target"],axis=1)
y = train_raw["target"]

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33,random_state=100)

In [25]:
# label encoding
from sklearn.preprocessing import OrdinalEncoder

categorical_columns = ["D_63","D_64"]

oe = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-999)
oe.fit(X_train[categorical_columns])

X_train_enc = oe.transform(X_train[categorical_columns])
X_test_enc = oe.transform(X_test[categorical_columns])

X_train[categorical_columns] = X_train_enc
X_test[categorical_columns] = X_test_enc

In [26]:
# X_train.to_csv("x_train.csv", index=False)
# X_test.to_csv("x_test.csv", index=False)
# y_train.to_csv("y_train.csv", index=False)
# y_test.to_csv("y_test.csv", index=False)

In [27]:
xgb_classifier = XGBClassifier(objective='binary:logistic', 
                      n_estimators=200,
                      eta=0.2,
                      seed=12,
                      learning_rate=0.02,
                      use_label_encoder=False,
                      eval_metric='aucpr',                      
#                       early_stopping_rounds=10,tree_method='gpu_hist',enable_categorical=True
                            )
xgb_classifier.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=None, enable_categorical=False, eta=0.2,
              eval_metric='aucpr', gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.02, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
              missing=nan, monotone_constraints='()', n_estimators=200,
              n_jobs=0, num_parallel_tree=1, predictor='auto', random_state=12,
              reg_alpha=0, ...)

In [28]:
y_pred = xgb_classifier.predict(X_test)

In [29]:
y_pred_prob = xgb_classifier.predict_proba(X_test)[:,1]


In [30]:
y_test = pd.DataFrame(y_test, columns=["target"])
y_pred = pd.DataFrame(y_pred, columns=["prediction"])
y_pred_prob = pd.DataFrame(y_pred_prob, columns=["prediction"])

In [31]:
# # computing metric score
amex_metric_official(y_test, y_pred_prob)

0.007136442433944101

In [32]:
# Compute accuracy
accuracy = metrics.accuracy_score(y_test["target"], y_pred["prediction"])
print(f'accuracy: {accuracy: .2%}')

accuracy:  89.35%


In [33]:
import joblib
joblib.dump(xgb_classifier, "xgb_classifier_v1.h5")

['xgb_classifier_v1.h5']

In [34]:
import joblib
joblib.dump(oe, "oe.h5")

['oe.h5']

In [35]:
# # load the model
# import joblib
# xgb_classifier = joblib.load("../input/01-starter-xgboost-implementation/xgb_classifier_v1.h5")

In [36]:
# submission in 02. xgboost implementation


## DO UPVOTE !