In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version',cudf.__version__)

In [2]:
# Random seed for the training pipeline (fold shuffling and the optional
# per-fold subsampling both read it)
SEED = 42

# # FILL NAN VALUE
# NAN_VALUE = -127 # will fit in int8

# Number of cross-validation folds; one model is trained and saved per fold
FOLDS = 5

# **Read File**

In [3]:
TRAIN_PATH = '../input/pa-amex-default-reducing-dataset-size/train.parquet'

In [4]:
def read_file(path = '', usecols = None):
    """Read a parquet file with cuDF and keep one row per customer.

    Parameters
    ----------
    path : str
        Location of the parquet file.
    usecols : list of str, optional
        Columns to load. ``None`` loads every column.

    Returns
    -------
    cudf.DataFrame
        The rows ordered by ``customer_ID`` then ``S_2``, reduced to the
        last row of every ``customer_ID`` group, with the resulting index
        moved back into the columns.
    """
    # columns=None is read_parquet's default, so one call covers both cases
    table = cudf.read_parquet(path, columns=usecols)

    # Rebind step by step so each intermediate frame can be released early
    table = table.sort_values(['customer_ID','S_2'])
    table = table.groupby(['customer_ID']).nth(-1).reset_index(drop=False)

    print('shape of data:', table.shape)

    return table

# Load the training table, reduced to one row per customer by read_file
print('Reading train data...')
train = read_file(path = TRAIN_PATH)

In [None]:
train.head()

# **Training Model**

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
import xgboost as xgb
print('XGB Version',xgb.__version__)

# Booster parameters handed to xgb.train for every fold.
xgb_parms = {
    'max_depth':4,
    'learning_rate':0.05,
    'subsample':0.8,
    'colsample_bytree':0.6,
    'eval_metric':'logloss',
    'objective':'binary:logistic',
    'tree_method':'gpu_hist',
    'predictor':'gpu_predictor',
    # Follow the notebook-wide SEED instead of a separate hard-coded value, so
    # changing SEED also reseeds the booster's row/column sampling.
    'random_state':SEED
}

In [None]:
class IterLoadForDMatrix(xgb.core.DataIter):
    """Batch iterator that hands a frame to XGBoost one slice at a time.

    Each call to ``next`` converts the next ``batch_size`` rows of ``df`` to a
    ``cudf.DataFrame`` and passes the feature columns and the target column to
    the callback supplied by XGBoost.

    Parameters
    ----------
    df : frame supporting ``len()`` and ``.iloc`` slicing
        Source rows (features and target together).
    features : list of str
        Column names used as model inputs.
    target : str
        Name of the label column.
    batch_size : int
        Number of rows per batch.
    """

    def __init__(self, df=None, features=None, target=None, batch_size=256*1024):
        self.df = df
        self.features = features
        self.target = target
        self.batch_size = batch_size
        # Total number of batches, counting a final partial one
        self.batches = int(np.ceil(len(df) / self.batch_size))
        # Index of the batch that the next call to next() will serve
        self.it = 0
        super().__init__()

    def reset(self):
        '''Rewind to the first batch.'''
        self.it = 0

    def next(self, input_data):
        '''Feed the next batch to ``input_data``; return 1, or 0 when exhausted.'''
        if self.it == self.batches:
            return 0

        start = self.it * self.batch_size
        stop = min(start + self.batch_size, len(self.df))
        chunk = cudf.DataFrame(self.df.iloc[start:stop])
        input_data(data=chunk[self.features], label=chunk[self.target])
        self.it += 1
        return 1

**Evaluation Metric**

In [None]:
def amex_metric_mod(y_true, y_pred):
    """NumPy implementation of the competition metric.

    Returns the mean of two terms:

    * the weighted Gini of the rows ranked by prediction, divided by the
      weighted Gini of the rows ranked by the true label;
    * the share of all positive labels found among the top-ranked rows whose
      cumulative weight stays within 4% of the total weight.

    Rows with label 0 carry weight 20; every other row carries weight 1.

    Parameters
    ----------
    y_true, y_pred : array-like, 1-D, same length
        True labels and predicted scores.
    """
    # Column 0 holds the label, column 1 the prediction
    stacked = np.array([y_true, y_pred]).T

    def _weighted_gini(sort_col):
        # Weighted Gini of the rows ordered (descending) by the given column
        ranked = stacked[stacked[:, sort_col].argsort()[::-1]]
        target = ranked[:, 0]
        w = np.where(target == 0, 20, 1)
        random_curve = np.cumsum(w / np.sum(w))
        weighted_pos = target * w
        lorentz_curve = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz_curve - random_curve) * w)

    # Share of positives captured within the top 4% of cumulative weight
    by_pred = stacked[stacked[:, 1].argsort()[::-1]]
    row_weights = np.where(by_pred[:, 0] == 0, 20, 1)
    within_cutoff = np.cumsum(row_weights) <= int(0.04 * np.sum(row_weights))
    captured = np.sum(by_pred[within_cutoff][:, 0]) / np.sum(by_pred[:, 0])

    gini_by_pred = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)

    return 0.5 * (gini_by_pred / gini_by_label + captured)

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Pandas implementation of the competition metric.

    Parameters
    ----------
    y_true : pd.DataFrame
        Frame with a ``target`` column.
    y_pred : pd.DataFrame
        Frame with a ``prediction`` column, sharing ``y_true``'s index.

    Returns
    -------
    float
        ``0.5 * (G + D)`` where ``G`` is the normalized weighted Gini and ``D``
        is the share of positives captured in the top 4% of cumulative weight.
    """

    def _ranked_with_weights(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Join on the index, order rows from highest to lowest prediction, and
        # weight target == 0 rows by 20 (all other rows weigh 1).
        ranked = pd.concat([truth, scores], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda t: 20 if t==0 else 1)
        return ranked

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Fraction of all target == 1 rows that fall inside the weighted top 4%
        ranked = _ranked_with_weights(y_true, y_pred)
        cutoff = int(0.04 * ranked['weight'].sum())
        in_top = ranked['weight'].cumsum() <= cutoff
        captured = (ranked.loc[in_top, 'target'] == 1).sum()
        return captured / (ranked['target'] == 1).sum()

    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Weighted area between the Lorentz curve and the random baseline
        ranked = _ranked_with_weights(y_true, y_pred)
        weight = ranked['weight']
        random_curve = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz_curve = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz_curve - random_curve) * weight).sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        # Divide by the Gini obtained when the labels themselves are the ranking
        ideal = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, ideal)

    gini_term = normalized_weighted_gini(y_true, y_pred)
    capture_term = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (gini_term + capture_term)

**Training**

In [None]:
# Cross-validated training: for every fold, fit an XGBoost model on the training
# part, save it, record its feature importances and its out-of-fold predictions.
importances = []
oof = []
# Move the training frame to host memory to free GPU memory for XGBoost.
# Guarded so re-running this cell after the conversion does not fail.
if not isinstance(train, pd.DataFrame):
    train = train.to_pandas()
# StratifiedKFold yields positional indices while rows are selected with the
# label-based .loc below; make sure labels and positions coincide.
if not train.index.equals(pd.RangeIndex(len(train))):
    train = train.reset_index(drop=True)
TRAIN_SUBSAMPLE = 1.0
gc.collect()

FEATURES = [x for x in train.columns.values if x not in ['customer_ID', 'target', 'S_2']]
skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)
for fold,(train_idx, valid_idx) in enumerate(skf.split(
            train, train.target )):

    # Train with subtrain fold
    if TRAIN_SUBSAMPLE<1.0:
        # A private generator produces the same draw as seeding the global one,
        # without overwriting NumPy's global random state.
        rng = np.random.RandomState(SEED)
        train_idx = rng.choice(train_idx,
                       int(len(train_idx)*TRAIN_SUBSAMPLE), replace=False)

    print('#'*25)
    print('### Fold',fold+1)
    print('### Train size',len(train_idx),'Valid size',len(valid_idx))
    print(f'### Training with {int(TRAIN_SUBSAMPLE*100)}% fold data...')
    print('#'*25)

    # For Fold K (Train, Valid): training rows are streamed in batches,
    # validation rows are loaded in one piece
    Xy_train = IterLoadForDMatrix(train.loc[train_idx], FEATURES, 'target')
    X_valid = train.loc[valid_idx, FEATURES]
    y_valid = train.loc[valid_idx, 'target']

    dtrain = xgb.DeviceQuantileDMatrix(Xy_train, max_bin=256)
    dvalid = xgb.DMatrix(data=X_valid, label=y_valid)

    # Train Model Fold K (early stopping watches the last entry of evals: 'valid')
    model = xgb.train(xgb_parms,
                dtrain=dtrain,
                evals=[(dtrain,'train'),(dvalid,'valid')],
                num_boost_round=9999,
                early_stopping_rounds=100,
                verbose_eval=100)
    model.save_model(f'fold{fold}.xgb')

    # Get Feature Importance For Fold K
    fold_scores = model.get_score(importance_type='weight')
    fold_importance = pd.DataFrame({'feature':fold_scores.keys(),
                                    f'importance_{fold}':fold_scores.values()})
    importances.append(fold_importance)

    # Infer OOF Fold K
    # NOTE(review): Booster.predict uses every trained tree, including the rounds
    # after the early-stopping optimum; pass
    # iteration_range=(0, model.best_iteration + 1) if best-iteration
    # predictions are wanted.
    oof_preds = model.predict(dvalid)
    y_pred=pd.DataFrame(data={'prediction':oof_preds})
    y_true=pd.DataFrame(data={'target':y_valid.reset_index(drop=True)})
    acc = amex_metric(y_true = y_true, y_pred = y_pred)
    print('Evaluation Metric =',acc,'\n')

    # SAVE OOF
    fold_oof = train.loc[valid_idx, ['customer_ID','target'] ].copy()
    fold_oof['prediction'] = oof_preds
    oof.append( fold_oof )

    del dtrain, Xy_train, fold_scores, fold_importance, fold_oof
    del X_valid, y_valid, dvalid, model
    _ = gc.collect()

print('#'*25)
oof = pd.concat(oof,axis=0,ignore_index=True).set_index('customer_ID')
# Alternative: acc = amex_metric_mod(oof.target.values, oof.prediction.values)
acc = amex_metric(oof['target'].to_frame(), oof['prediction'].to_frame())
print('OVERALL CV Evaluation Metric =',acc)

In [None]:
# Drop the training frame and reclaim its memory; the cells below in this
# notebook work from `oof` and `importances` only
del train
_ = gc.collect()

# **Out of Fold Prediction**

In [None]:
# Attach the OOF target/prediction to the list of distinct customers and save it.
# The join key is customer_ID cast to int64 on the left and the index of `oof`
# (customer_ID) on the right.
oof_xgb = (
    pd.read_parquet(TRAIN_PATH, columns=['customer_ID'])
    .drop_duplicates()
    .assign(customer_ID_hash=lambda ids: ids['customer_ID'].astype('int64'))
    .set_index('customer_ID_hash')
    .merge(oof, left_index=True, right_index=True)
    .sort_index()
    .reset_index(drop=True)
)
oof_xgb.to_csv('oof_xgb.csv',index=False)
oof_xgb.head()

In [None]:
# Histogram (100 bins) of the out-of-fold predictions
plt.hist(oof_xgb.prediction.values, bins=100)
plt.title('OOF Predictions')
plt.show()

In [None]:
# Release the OOF frames (pandas objects held in host RAM) now that they have
# been saved and plotted
del oof_xgb, oof
_ = gc.collect()

# **Feature Importance**

In [None]:
import matplotlib.pyplot as plt

# Average the per-fold importances into one ranking.
# get_score() omits features that a model never split on, so a fold can lack rows
# that other folds have. Join with how='outer' so such features are not dropped
# just because fold 0 did not use them, and count a missing entry as 0 splits.
df = importances[0].copy()
for per_fold in importances[1:]:
    df = df.merge(per_fold, on='feature', how='outer')
df = df.fillna(0)
# Column 0 is 'feature'; the remaining columns are the per-fold importances
df['importance'] = df.iloc[:,1:].mean(axis=1)
df = df.sort_values('importance',ascending=False)
df.to_csv('xgb_feature_importance.csv',index=False)

In [None]:
NUM_FEATURES = 25
# `df` is sorted by descending importance, so giving the first row the largest
# y position draws the most important feature at the top of the chart
bar_positions = np.arange(NUM_FEATURES,0,-1)
top_importance = df['importance'].values[:NUM_FEATURES]
top_names = df['feature'].values[:NUM_FEATURES]

plt.figure(figsize=(10,5*NUM_FEATURES//10))
plt.barh(bar_positions, top_importance, color = ['xkcd:sky blue'])
plt.yticks(bar_positions, top_names)
plt.title(f'XGB Feature Importance - Top {NUM_FEATURES}')
plt.show()

In [None]:
# ========================================
# ================= Test =================
# ========================================

In [None]:
# def read_file(path = '', usecols = None):
#     # LOAD DATAFRAME
#     if usecols is not None: df = cudf.read_parquet(path, columns=usecols)
#     else: df = cudf.read_parquet(path)
#     # REDUCE DTYPE FOR CUSTOMER AND DATE
#     df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
# #     df['customer_ID'] = df['customer_ID'].astype('int64')
    
#     df.S_2 = cudf.to_datetime( df.S_2 )
#     # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
#     #df = df.sort_values(['customer_ID','S_2'])
#     #df = df.reset_index(drop=True)
#     # FILL NAN
# #     df = df.fillna(NAN_VALUE)
#     print('shape of data:', df.shape)
    
#     return df

# test = read_file(path = '../input/amex-data-integer-dtypes-parquet-format/test.parquet', usecols = ['customer_ID','S_2'])
# test.head()

In [None]:
# test[['customer_ID']].drop_duplicates(ignore_index = True).sort_index().values.flatten()

In [None]:
# def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
#     chunk = len(customers)//NUM_PARTS
#     if verbose != '':
#         print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
#         print(f'There will be {chunk} customers in each part (except the last part).')
#         print('Below are number of rows in each part:')
#     rows = []

#     for k in range(NUM_PARTS):
#         if k==NUM_PARTS-1: cc = customers[k*chunk:]
#         else: cc = customers[k*chunk:(k+1)*chunk]
#         s = test.iloc[test.customer_ID.isin(cc)].shape[0]
#         rows.append(s)
#     if verbose != '': print( rows )
#     return rows,chunk

In [None]:
# # customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# customers = test.drop_duplicates(subset=['customer_ID']).sort_index()
# rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

In [None]:
# # CALCULATE SIZE OF EACH SEPARATE TEST PART

# # COMPUTE SIZE OF 4 PARTS FOR TEST DATA
# NUM_PARTS = 4
# # TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
# TEST_PATH = '../input/pa-amex-default-reducing-dataset-size/test.parquet'


# print(f'Reading test data...')
# test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
# customers = test[['customer_ID']].drop_duplicates(ignore_index = True).sort_index().values.flatten()
# rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

In [None]:
# # INFER TEST DATA IN PARTS
# skip_rows = 0
# skip_cust = 0
# test_preds = []

# for k in range(NUM_PARTS):
    
#     # READ PART OF TEST DATA
#     print(f'\nReading test data...')
#     test = read_file(path = TEST_PATH)
#     test = test.iloc[skip_rows:skip_rows+rows[k]]
#     skip_rows += rows[k]
#     print(f'=> Test part {k+1} has shape', test.shape )
    
#     # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
#     test = process_and_feature_engineer(test)
#     if k==NUM_PARTS-1: test = test.loc[customers[skip_cust:]]
#     else: test = test.loc[customers[skip_cust:skip_cust+num_cust]]
#     skip_cust += num_cust
    
#     # TEST DATA FOR XGB
#     X_test = test[FEATURES]
#     dtest = xgb.DMatrix(data=X_test)
#     test = test[['P_2_mean']] # reduce memory
#     del X_test
#     gc.collect()

#     # INFER XGB MODELS ON TEST DATA
#     model = xgb.Booster()
#     model.load_model(f'XGB_v{VER}_fold0.xgb')
#     preds = model.predict(dtest)
#     for f in range(1,FOLDS):
#         model.load_model(f'XGB_v{VER}_fold{f}.xgb')
#         preds += model.predict(dtest)
#     preds /= FOLDS
#     test_preds.append(preds)

#     # CLEAN MEMORY
#     del dtest, model
#     _ = gc.collect()