In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"]="0"
import tensorflow as tf
import tensorflow.keras.backend as K

import cupy, cudf # GPU LIBRARIES
import numpy as np, pandas as pd # CPU LIBRARIES
import matplotlib.pyplot as plt, gc

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

# Limit GPU Memory
# Cap how much memory TensorFlow may allocate on the first visible GPU.
# TensorFlow documents `memory_limit` in MB, so 1024*LIMIT reads as LIMIT GB.
# NOTE(review): presumably the cap leaves the rest of the card for cuDF/CuPy,
# which are imported in the same kernel - confirm.
LIMIT = 8
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
  try:
    # Replace the default device with a single virtual device of the capped size.
    tf.config.experimental.set_virtual_device_configuration(
        gpus[0],
        [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=1024*LIMIT)])
    logical_gpus = tf.config.experimental.list_logical_devices('GPU')
  except RuntimeError as e:
    # Best effort: the error is printed and the notebook continues without the cap.
    # NOTE(review): TensorFlow is expected to raise this when the GPU was already
    # initialised before this cell ran - confirm against the TF version in use.
    print(e)

# Initial Data

In [None]:
# Categorical feature columns. The order is significant: feature_eng moves these
# columns to the front of the frame in exactly this order.
cat_columns = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

# Reduce the size of the training data and apply feature engineering (this model does not use all of the feature engineering techniques that were used for the KNN, SVM, and CatBoost models)

In [None]:
# Train labels
# The customer key is replaced by an int64 parsed from the last 16 hex characters
# of the original string id, so it can be merged with frames keyed the same way.
targets = cudf.read_csv('/kaggle/input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')

# All column names
# Only one data row is read: the frame is used solely for its header. The names
# are passed as `names=` when later cells read the CSV in chunks with header=None.
train = cudf.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', nrows=1)
tot_columns  = train.columns

In [None]:
def feature_eng(train, targets = None):
    """Convert raw statement rows into fixed-length, model-ready customer sequences.

    Steps, in order: shrink the key and date columns, label-encode the
    categorical columns, downcast numeric dtypes, append padding rows so that
    customers with fewer than 13 rows reach 13, optionally attach the targets,
    fill missing numeric values with the column median, sort by customer and
    date, and move the categorical columns to the front.

    Categorical encoding after this function: 0 marks a padding row, 1 marks a
    missing value, and codes of 2 and above are observed categories.

    Parameters
    ----------
    train : cudf.DataFrame
        Raw rows with at least ``customer_ID`` (hex string), ``S_2`` (date) and
        every column listed in the module-level ``cat_columns``. The frame is
        modified in place (columns added, converted and removed) before new
        frames are built from it.
    targets : cudf.DataFrame, optional
        Frame with an int64 ``customer_ID`` and a ``target`` column. When given
        it is left-merged on ``customer_ID`` and ``target`` is stored as int8.

    Returns
    -------
    cudf.DataFrame
        Rows sorted by customer and date, with columns ordered as
        ``customer_ID``, the ``cat_columns``, then all remaining columns.
        The ``year``/``month``/``day`` helper columns are dropped.
    """
    # Reduce non feature columns
    train['customer_ID'] = train['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    train.S_2 = cudf.to_datetime(train.S_2)
    train['year'] = (train.S_2.dt.year-2000).astype('int8')
    train['month'] = (train.S_2.dt.month).astype('int8')
    train['day'] = (train.S_2.dt.day).astype('int8')
    # Delete the raw date time field
    del train['S_2']

    # Label encoding: string categories are mapped explicitly, missing values become 1
    d_63_catgories = {'CL':2, 'CO':3, 'CR':4, 'XL':5, 'XM':6, 'XZ':7}
    train['D_63'] = train.D_63.map(d_63_catgories).fillna(1).astype('int8')
    d_64_catgories = {'-1':2,'O':3, 'R':4, 'U':5}
    train['D_64'] = train.D_64.map(d_64_catgories).fillna(1).astype('int8')

    # The offsets below are positional, so they must be paired with the columns
    # in a fixed order. Iterating a set (as before) yields an arbitrary order for
    # strings that can change between interpreter sessions, which attached the
    # offsets to the wrong columns. Use the declaration order of cat_columns.
    # NOTE(review): each offset looks like "2 minus the column's smallest raw
    # code", so that observed values start at 2 - confirm against the data.
    offset_columns = [c for c in cat_columns if c not in ('D_63', 'D_64')]
    adding_val = [2,1,2,2,3,2,3,2,2]
    if len(offset_columns) != len(adding_val):
        raise ValueError(
            f'expected {len(adding_val)} numeric categorical columns, got {len(offset_columns)}')
    for c,s in zip(offset_columns, adding_val):
        train[c] = train[c] + s
        train[c] = train[c].fillna(1).astype('int8')

    # Reduce size of other columns
    skips = ['customer_ID','year','month','day']
    for col in train.columns:
        if col in skips: continue
        if str(train[col].dtype)=='int64':
            train[col] = train[col].astype('int32')
        if str(train[col].dtype)=='float64':
            train[col] = train[col].astype('float32')

    # Padding the sequences as 13 samples
    # `more` collects, for every customer with j < 13 rows, that customer's id
    # repeated 13-j times: one entry per padding row that has to be created.
    # Customers with more than 13 rows are not handled here.
    tmp = train[['customer_ID']].groupby('customer_ID').customer_ID.agg('count')
    more = cupy.array([],dtype='int64')
    for j in range(1,13):
        i = tmp.loc[tmp==j].index.values
        more = cupy.concatenate([more,cupy.repeat(i,13-j)])
    # Borrow the first len(more) rows as a template with the right columns and
    # dtypes; fillna(0) keeps NaN from surviving the "* 0" below.
    # NOTE(review): this assumes the frame has at least len(more) rows.
    df = train.iloc[:len(more)].copy().fillna(0)
    # Padding numerical columns with -1 (this also sets year/month/day to -1,
    # so padding rows sort before a customer's real rows)
    df = df * 0 - 1
    # Padding categorical columns with 0
    df[cat_columns] = (df[cat_columns] * 0).astype('int8')
    df['customer_ID'] = more
    train = cudf.concat([train,df],axis=0,ignore_index=True)

    # Integrate the targets
    if targets is not None:
        train = train.merge(targets,on='customer_ID',how='left')
        train.target = train.target.astype('int8')

    # Handle missing values
    # NOTE(review): the median is taken after the padding rows were appended, so
    # the -1 padding values take part in it, and it is computed per call (that
    # is, per file chunk) rather than over the whole data set.
    for col in train.columns:
        if col not in skips + cat_columns:
            train[col] = train[[col]].fillna((train[[col]].median()).astype('float32'))

    # Sort
    train = train.sort_values(['customer_ID','year','month','day']).reset_index(drop=True)

    # Drop non required columns
    train = train.drop(['year','month','day'],axis=1)

    # Rearrange the categorical columns to the beginning of the data frame
    temp = list(train.columns[1:])
    temp = ['customer_ID'] + cat_columns + [col for col in temp if col not in cat_columns]
    train = train[temp]

    return train

In [None]:
# Work out how many CSV rows belong to each of 10 consecutive customer groups,
# so the training file can later be read group by group with nrows/skiprows.
all_customers = pd.read_csv('/kaggle/input/amex-default-prediction/train_data.csv', usecols=['customer_ID'])
# Unique customers in order of first appearance in the file
customers = all_customers.drop_duplicates().sort_index().values.flatten()
each_raw_count = len(customers) // 10 
rows = []
for k in range(10):
    lo = k * each_raw_count
    # The last group also takes the remainder of the integer division
    hi = None if k == 9 else lo + each_raw_count
    part = customers[lo:hi]
    count = len(all_customers[all_customers['customer_ID'].isin(part)])
    rows.append(count)
rows

In [None]:
# Row counts per customer group, copied from the output of the previous cell
# because recomputing them is slow.
rows = [553403, 552855, 554025, 554330, 552004, 552378, 552822, 553151, 553493, 552990] # Found from above cell code (It is time consuming)

# Create train files => 10 files
for k, n_rows in enumerate(rows):
    # Skip the header line plus every row that belongs to an earlier group
    skip = 1 + sum(rows[:k])
    train = cudf.read_csv(
        '/kaggle/input/amex-default-prediction/train_data.csv',
        nrows=n_rows, skiprows=skip, header=None, names=tot_columns,
    )

    train = feature_eng(train, targets = targets)
    print(train)

    # One (customer_ID, target) row per customer, kept in the frame's row order
    tar = train[['customer_ID','target']].drop_duplicates().sort_index()
    tar.to_parquet(f'targets_{k+1}.pqt',index=False)

    # Drop the first (customer_ID) and last (target) columns, then stack the
    # remaining 188 features into one 13-step sequence per customer
    data = train.iloc[:,1:-1].values.reshape((-1,13,188))
    cupy.save(f'data_{k+1}',data.astype('float32'))

# Release the GPU frames once every file has been written
del train, tar, data
del targets
gc.collect()

# Model

In [None]:
# GRU Model
def build_model():
    """Build and compile the GRU classifier.

    The input has shape (13, 188). Each of the first 11 features is passed
    through its own Embedding(10, 4) layer; the embedded features are
    concatenated with the remaining 177 features, fed to a 128-unit GRU that
    returns only its final state, and then through Dense(64), Dense(32) and a
    single sigmoid output. The model is compiled with Adam (learning rate
    0.001) and binary cross-entropy.

    Returns
    -------
    tf.keras.Model
        The compiled model.
    """
    inp = tf.keras.Input(shape=(13,188))

    # One independent embedding per leading feature, created in feature order
    embeddings = [tf.keras.layers.Embedding(10,4)(inp[:,:,k]) for k in range(11)]
    x = tf.keras.layers.Concatenate()([inp[:,:,11:]]+embeddings)

    # RNN followed by the dense head
    x = tf.keras.layers.GRU(units=128, return_sequences=False)(x)
    for units in (64, 32):
        x = tf.keras.layers.Dense(units,activation='relu')(x)

    # Output Layer
    x = tf.keras.layers.Dense(1,activation='sigmoid')(x)

    # Compile the model
    model = tf.keras.Model(inputs=inp, outputs=x)
    model.compile(
        loss=tf.keras.losses.BinaryCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    )

    return model

In [None]:
# Learning rate scheduler
def lr_s (epoch):
    """Return the learning rate for a zero-based epoch index.

    The schedule is 1e-3 for epochs 0-4, 1e-4 for epochs 5-6 and 1e-5 from
    epoch 7 onwards.

    Parameters
    ----------
    epoch : int
        Zero-based epoch index.

    Returns
    -------
    float
        The learning rate to use for that epoch.
    """
    lr = [1e-3]*5 + [1e-4]*2 + [1e-5]*1
    # Clamp to the last entry so that training for more epochs than the table
    # has entries keeps the final rate instead of raising IndexError.
    return lr[min(epoch, len(lr) - 1)]
# Keras callback that asks lr_s for the learning rate of each epoch; the
# callback's own per-epoch message is switched off.
learning_rate_callback = tf.keras.callbacks.LearningRateScheduler(lr_s, verbose = False)

# Validation Metric

In [None]:
# COMPETITION METRIC FROM Konstantin Yakovlev
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the mean of the normalized weighted Gini and the top-4% capture rate.

    Rows with label 0 carry weight 20 and all other rows weight 1.

    Parameters
    ----------
    y_true : 1-D sequence of labels.
    y_pred : 1-D sequence of scores, same length as ``y_true``.

    Returns
    -------
    float
        ``0.5 * (gini_by_prediction / gini_by_label + top_four)``.
    """

    def _ranked(sort_col):
        # (label, score) pairs ordered by the chosen column, largest first
        pairs = np.array([y_true, y_pred]).T
        return pairs[pairs[:, sort_col].argsort()[::-1]]

    def _weighted_gini(sort_col):
        # Weighted area between the cumulative share of positives found and
        # the cumulative share of weight, under the given ordering
        ranked = _ranked(sort_col)
        w = np.where(ranked[:, 0] == 0, 20, 1)
        random_share = np.cumsum(w / np.sum(w))
        weighted_pos = ranked[:, 0] * w
        lorentz = np.cumsum(weighted_pos) / np.sum(weighted_pos)
        return np.sum((lorentz - random_share) * w)

    # Share of all positives that sit within the top 4% of total weight
    # when rows are ordered by score
    by_pred = _ranked(1)
    w_pred = np.where(by_pred[:, 0] == 0, 20, 1)
    in_top = np.cumsum(w_pred) <= int(0.04 * np.sum(w_pred))
    top_four = np.sum(by_pred[in_top][:, 0]) / np.sum(by_pred[:, 0])

    # Gini under the model's ordering, normalised by the Gini under the
    # ordering given by the labels themselves
    gini_ratio = _weighted_gini(1) / _weighted_gini(0)

    return 0.5 * (gini_ratio + top_four)

# Train the model

In [None]:
# Labels and out-of-fold predictions, accumulated across all folds
true = np.array([])
oof = np.array([])
VERBOSE = 1 

for fold in range(5):
    # Generate validation and training idx: two consecutive files are held out,
    # the remaining eight are used for training
    valid_idx = [2 * fold + 1, 2 * fold + 2]
    train_idx = [x for x in range(1, 11) if x not in valid_idx]

    print('='*25)
    print(f'##### Fold {fold+1} with valid files', valid_idx)

    # Read train data
    X_train = np.concatenate([np.load(f'data_{k}.npy') for k in train_idx], axis=0)
    y_train = pd.concat([pd.read_parquet(f'targets_{k}.pqt') for k in train_idx]).target.values
    print('### Training data shapes', X_train.shape, y_train.shape)

    # Read validation data
    X_valid = np.concatenate([np.load(f'data_{k}.npy') for k in valid_idx], axis=0)
    y_valid = pd.concat([pd.read_parquet(f'targets_{k}.pqt') for k in valid_idx]).target.values
    print('### Validation data shapes', X_valid.shape, y_valid.shape)
    print('='*25)

    # Train model: start from a fresh Keras session and a fresh model each fold
    K.clear_session()
    model = build_model()
    h = model.fit(
        X_train, y_train,
        validation_data = (X_valid, y_valid),
        batch_size=512,
        epochs=8,
        verbose=VERBOSE,
        callbacks = [learning_rate_callback],
    )
    model.save_weights(f'gru_fold_{fold+1}.h5')

    # Validate
    print('Validate...')
    preds = model.predict(X_valid, batch_size=512, verbose=VERBOSE).flatten()
    fold_score = amex_metric_mod(y_valid, preds)
    print(f'~~~~Fold {fold+1} Result: ', fold_score )

    # Keep this fold's labels and predictions for the overall score
    true = np.concatenate([true, y_valid])
    oof = np.concatenate([oof, preds])

    del model, X_train, y_train, X_valid, y_valid, preds
    gc.collect()

print('='*25)
print('Final Result: ', amex_metric_mod(true, oof))
print('='*25)
K.clear_session()

# Evaluation

In [None]:
# Work out how many CSV rows belong to each of 20 consecutive customer groups,
# so the test file can later be read group by group with nrows/skiprows.
all_customers = pd.read_csv('/kaggle/input/amex-default-prediction/test_data.csv', usecols=['customer_ID'])
# Unique customers in order of first appearance in the file
customers = all_customers.drop_duplicates().sort_index().values.flatten()
each_raw_count = len(customers) // 20 
rows = []
for k in range(20):
    lo = k * each_raw_count
    # The last group also takes the remainder of the integer division
    hi = None if k == 19 else lo + each_raw_count
    part = customers[lo:hi]
    count = len(all_customers[all_customers['customer_ID'].isin(part)])
    rows.append(count)
rows

In [None]:
# Row counts per customer group, copied from the output of the previous cell
# because recomputing them is slow.
rows = [567933,568482,569369,567886,567539,568041,568138,567596,568543,567539,568421,568745,568279,568333,568327,568901,568300,568001,567372,568017] # Get from the above cell code (Time consuming to execute)
test_customer_hashes = cupy.array([],dtype='int64')

# Create folds from test data
for k, n_rows in enumerate(rows):
    # Skip the header line plus every row that belongs to an earlier group
    skip = 1 + sum(rows[:k])
    test = cudf.read_csv(
        '/kaggle/input/amex-default-prediction/test_data.csv',
        nrows=n_rows, skiprows=skip, header=None, names=tot_columns,
    )

    test = feature_eng(test, targets = None)

    # Remember the customer order of this chunk so predictions can be matched
    # back to the submission rows later
    customer_idxs = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
    test_customer_hashes = cupy.concatenate([test_customer_hashes,customer_idxs])

    # Drop the customer_ID column and stack the 188 features into one 13-step
    # sequence per customer
    data = test.iloc[:,1:].values.reshape((-1,13,188))
    cupy.save(f'test_data_{k+1}',data.astype('float32'))

cupy.save('test_customer_hashes', test_customer_hashes)

del test, data
gc.collect()

In [None]:
# Prediction Generation
sub = cudf.read_csv('/kaggle/input/amex-default-prediction/sample_submission.csv')

# Rearrange submission rows so they follow the customer order of the saved test chunks
sub['hash'] = sub['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
test_hash_index = cupy.load('test_customer_hashes.npy')
sub = sub.set_index('hash').loc[test_hash_index].reset_index(drop=True)

# [start, end) is the range of submission rows covered by the current chunk
start = 0; end = 0
for k in range(20):
    K.clear_session()
    model = build_model()

    # Loading test data folds
    X_test = np.load(f'test_data_{k+1}.npy')
    end = start + X_test.shape[0]

    # Loading models: average the predictions of the five fold models
    for fold_no in range(1, 6):
        model.load_weights(f'gru_fold_{fold_no}.h5')
        fold_preds = model.predict(X_test, batch_size=512, verbose=0).flatten()
        if fold_no == 1:
            preds = fold_preds
        else:
            preds += fold_preds
    preds /= 5.0

    # .loc slices by label and includes both ends, hence end-1
    sub.loc[start:end-1,'prediction'] = preds
    start = end

    del model, X_test, preds, fold_preds
    gc.collect()

# Generate Submission

In [None]:
# Write the submission file without the index column, then preview the first rows
sub.to_csv('submission.csv',index=False)
sub.head()

In [None]:
# Sanity check: (rows, columns) of the submission frame
sub.shape