# LGBM GPU Starter - LB 0.795

This notebook builds and trains an LGBM model based on the features and data preprocessing methods introduced in the XGBoost starter notebook (2). It trains LGBM on GPU for better accuracy and shorter training time.

Results: CV: 0.794, LB: 0.795

References

(1) https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format

(2) https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793

(3) https://www.kaggle.com/competitions/amex-default-prediction/discussion/328606

# Load Libraries

In [1]:
# LOAD LIBRARIES
import pandas as pd, numpy as np # CPU libraries
import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

print('RAPIDS version',cudf.__version__)

In [2]:
# Notebook-wide configuration constants.

# VERSION NAME FOR SAVED MODEL FILES
# Embedded in the pickle file names written during training and read back
# during inference.
VER = 1

# TRAIN RANDOM SEED
# NOTE(review): not referenced by any code visible in this notebook — confirm
# whether it was meant to be passed to the model or the fold splitter.
SEED = 42

# FILL NAN VALUE
# Sentinel that replaces missing values after loading and again after
# feature engineering.
NAN_VALUE = -127 # will fit in int8

# FOLDS PER MODEL
# Number of per-fold model files that are loaded and averaged at inference.
FOLDS = 5

# Process and Feature Engineer Train Data
We will load @raddar's Kaggle dataset from [here][1] with discussion [here][2]. Then we will engineer features suggested by @huseyincot in his notebooks [here][3] and [here][4]. We will use [RAPIDS][5] and the GPU to create new features quickly.

[1]: https://www.kaggle.com/datasets/raddar/amex-data-integer-dtypes-parquet-format
[2]: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328514
[3]: https://www.kaggle.com/code/huseyincot/amex-catboost-0-793
[4]: https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
[5]: https://rapids.ai/

In [3]:
def read_file(path = '', usecols = None):
    """Read a parquet file into a cudf DataFrame and normalise key columns.

    Args:
        path: Location of the parquet file.
        usecols: Optional list of column names to load; every column is
            loaded when it is None.

    Returns:
        The loaded DataFrame with ``customer_ID`` reduced to an int64,
        ``S_2`` converted to datetime and missing values replaced by
        ``NAN_VALUE``.
    """
    # LOAD DATAFRAME (only pass `columns` when a subset was requested)
    read_kwargs = {} if usecols is None else {'columns': usecols}
    df = cudf.read_parquet(path, **read_kwargs)

    # REDUCE DTYPE FOR CUSTOMER AND DATE
    # Keep only the last 16 characters of the ID and parse them as hex.
    id_suffix = df['customer_ID'].str[-16:]
    df['customer_ID'] = id_suffix.str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime( df.S_2 )

    # NOTE(review): no sort by customer_ID / S_2 is performed here, so a later
    # agg('last') relies on the file's own row order — presumably the parquet
    # is already sorted; confirm against the dataset.

    # FILL NAN
    df = df.fillna(NAN_VALUE)
    print('shape of data:', df.shape)

    return df

In [4]:
def process_and_feature_engineer(df):
    """Aggregate per-row data into one row of features per customer.

    Numeric columns are summarised with mean/std/min/max/last and the
    categorical columns with count/last/nunique, grouped by ``customer_ID``.
    Resulting column names are ``<column>_<statistic>``.

    Args:
        df: DataFrame containing ``customer_ID``, ``S_2`` and the raw columns,
            including every categorical column listed below.

    Returns:
        The numeric and categorical aggregates concatenated column-wise.
    """
    # FEATURE ENGINEERING FROM
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    id_and_date = ('customer_ID', 'S_2')
    # Everything that is neither an identifier/date nor categorical is numeric.
    num_features = [
        name for name in df.columns
        if name not in id_and_date and name not in cat_features
    ]

    def _grouped_stats(frame, columns, stats):
        # Aggregate `columns` per customer and flatten the two-level column
        # labels into single '<column>_<stat>' strings.
        aggregated = frame.groupby("customer_ID")[columns].agg(stats)
        aggregated.columns = ['_'.join(parts) for parts in aggregated.columns]
        return aggregated

    num_part = _grouped_stats(df, num_features, ['mean', 'std', 'min', 'max', 'last'])
    cat_part = _grouped_stats(df, cat_features, ['count', 'last', 'nunique'])

    df = cudf.concat([num_part, cat_part], axis=1)
    # Drop the intermediate frames once they have been combined.
    del num_part, cat_part
    print('shape after engineering', df.shape )

    return df

In [5]:
# Load the full training parquet with cudf, then collapse it to one row of
# aggregated features per customer.
print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_file(path = TRAIN_PATH)

train = process_and_feature_engineer(train)

In [6]:
# Attach the training labels to the engineered feature table.
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
# Apply the same ID reduction as read_file so the keys match.
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
# NOTE(review): sort_index() orders by the Series' existing index, not by the
# ID values — confirm this is the intended ordering.
targets.index = targets['customer_ID'].sort_index()
targets = targets.drop('customer_ID', axis=1)
train = train.join(targets,on =['customer_ID'] ).sort_index()

del targets
gc.collect()

# The join may leave missing values; replace them with the same sentinel.
train = train.fillna(NAN_VALUE)

# FEATURES
# NOTE(review): `train` appears to be indexed by customer_ID (groupby result),
# in which case [1:-1] also skips the first feature column. This name is not
# used by the cells visible here (they use `features`) — confirm before use.
FEATURES = train.columns[1:-1]

In [7]:
# Label column as a raw array.
# NOTE(review): not referenced by the cells visible here — confirm it is needed.
target = train.target.values

# Faster metric Implementation

Reference: https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020


In [8]:
def amex_metric(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """Compute the score 0.5 * (G + D) for binary labels.

    Rows are ranked by descending prediction. Negatives carry weight 20 and
    positives weight 1. ``D`` is the share of all positives found within the
    top 4% of cumulative weight, and ``G`` is the weighted Gini coefficient
    divided by its maximum attainable value.

    Inputs are used as given (no conversion), so any array type offering the
    NumPy-style operations used below is accepted.

    Args:
        y_true: 1-D array of 0/1 labels.
        y_pred: 1-D array of scores with the same length; only their ordering
            influences the result.

    Returns:
        The metric value. It is not meaningful when ``y_true`` contains no
        positives or no negatives, because the normalisations divide by zero.
    """
    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descending prediction values
    indices = np.argsort(y_pred)[::-1]
    target = y_true[indices]

    # filter the top 4% by cumulative row weights
    # (weight is 20 for a negative row, 1 for a positive row)
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)

def lgb_amex_metric(y_true, y_pred):
    """Wrap ``amex_metric`` as a custom LightGBM evaluation function.

    Returns:
        A ``(name, value, flag)`` tuple: the label 'Score', the metric value
        and ``True`` (the third slot is LightGBM's "higher is better" flag).
    """
    metric_name = 'Score'
    metric_value = amex_metric(y_true, y_pred)
    higher_is_better = True
    return metric_name, metric_value, higher_is_better

In [9]:
import datetime
import warnings
import gc
import pickle

from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier, log_evaluation

# Train LightGBM

In [10]:
# Model inputs: every column except the identifier and the label.
features = [col for col in train.columns if col not in ('customer_ID', 'target')]
print("Number of Features :",len(features))
def lgbm_params(random_state=1, n_estimators=1200):
    """Create an unfitted LGBMClassifier with this notebook's settings.

    Args:
        random_state: Seed forwarded to the classifier.
        n_estimators: Number of boosting rounds.

    Returns:
        A new ``LGBMClassifier`` configured to train on the GPU device.
    """
    settings = {
        'n_estimators': n_estimators,
        # 'boosting_type': 'dart',  # left disabled, as in the original setup
        'learning_rate': 0.03,
        'reg_lambda': 50,
        'min_child_samples': 2400,
        'num_leaves': 95,
        'colsample_bytree': 0.19,
        'device': 'gpu',
        'random_state': random_state,
    }
    return LGBMClassifier(**settings)

In [11]:
# Cross-validated training: fit one LightGBM model per fold, save it to disk
# and record its validation score.
score_list = []
# Use the shared FOLDS constant so the number of saved models always matches
# the number of model files the inference cell loads.
kf = StratifiedKFold(n_splits=FOLDS)

for fold, (train_idx, valid_idx) in enumerate(kf.split(train, train.target.to_array())):
    print('#'*25)
    print('### Fold',fold)
    print('### Train size:',len(train_idx),', Validation size:',len(valid_idx))
    print('#'*25)
    # Drop references from the previous fold before building new matrices.
    X_tr, X_val, y_tr, y_val, model = None, None, None, None, None
    start_time = datetime.datetime.now()
    X_tr = train.iloc[train_idx][features].as_gpu_matrix()
    X_val = train.iloc[valid_idx][features].as_gpu_matrix()
    y_tr = cupy.asarray(train.iloc[train_idx]["target"])
    y_val = cupy.asarray(train.iloc[valid_idx]["target"])

    model = lgbm_params()
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=UserWarning)
        model.fit(X_tr, cupy.asnumpy(y_tr),
                  eval_set = [(X_val, cupy.asnumpy(y_val))],
                  eval_metric=[lgb_amex_metric],
                  callbacks=[log_evaluation(100)])
        file = f'LGBM_v{VER}_fold{fold}.pkl'
        # Context manager guarantees the file is flushed and closed even if
        # pickling fails.
        with open(file, 'wb') as model_file:
            pickle.dump(model, model_file)
    # Score the held-out fold on raw (pre-sigmoid) model outputs.
    y_val_pred = model.predict_proba(X_val, raw_score=True)
    score = amex_metric(y_val, y_val_pred)
    # Not used further in this cell; kept for inspection.
    n_trees = model.best_iteration_
    if n_trees is None: n_trees = model.n_estimators
    print(f"\n\n\nFold {fold} | Training Time: {str(datetime.datetime.now() - start_time)[-12:-7]} |"
          f" Score = {score:.5f}\n\n\n")
    score_list.append(score)

    del X_val, y_val, score, model
    gc.collect()

# Mean validation score across folds.
print(f"\n\n\nScore: {np.mean(cupy.asarray(score_list)):.5f}\n\n\n")

In [12]:
# Release the training table before the test data is loaded (presumably to
# lower peak memory use).
del train
gc.collect()

# Make Predictions
Since the test data is big, predicting all the results at once leads to a memory error. Split the data into 4 parts, make predictions for each part, and append them.

In [13]:
# CALCULATE SIZE OF EACH SEPARATE TEST PART
def get_rows(customers, test, NUM_PARTS = 4, verbose = ''):
    """Count how many rows of ``test`` belong to each consecutive customer part.

    ``customers`` is cut into ``NUM_PARTS`` consecutive slices of
    ``len(customers) // NUM_PARTS`` entries; the last slice also takes any
    remainder.

    Args:
        customers: Sliceable sequence of customer identifiers.
        test: Frame with a ``customer_ID`` column.
        NUM_PARTS: Number of slices to create.
        verbose: Label used in the progress messages; nothing is printed
            when it is the empty string.

    Returns:
        ``(rows, chunk)`` where ``rows`` lists the row count of every part
        and ``chunk`` is the number of customers per regular part.
    """
    chunk = len(customers) // NUM_PARTS
    announce = verbose != ''
    if announce:
        print(f'We will process {verbose} data as {NUM_PARTS} separate parts.')
        print(f'There will be {chunk} customers in each part (except the last part).')
        print('Below are number of rows in each part:')

    rows = []
    for part in range(NUM_PARTS):
        first = part * chunk
        # An open-ended slice lets the final part absorb the leftover customers.
        last = None if part == NUM_PARTS - 1 else first + chunk
        members = customers[first:last]
        belongs = test.customer_ID.isin(members)
        rows.append(test.loc[belongs].shape[0])

    if announce:
        print( rows )
    return rows, chunk

# COMPUTE SIZE OF 4 PARTS FOR TEST DATA
NUM_PARTS = 4
TEST_PATH =  '../input/amex-data-integer-dtypes-parquet-format/test.parquet'

# Only the ID and date columns are loaded here; they are enough to count rows.
print(f'Reading test data...')
test = read_file(path = TEST_PATH, usecols = ['customer_ID','S_2'])
# Unique customer IDs as a flat array.
customers = test[['customer_ID']].drop_duplicates().sort_index().values.flatten()
# rows[k] = number of rows in part k, num_cust = customers per regular part.
rows,num_cust = get_rows(customers, test[['customer_ID']], NUM_PARTS = NUM_PARTS, verbose = 'test')

In [14]:
# INFER TEST DATA IN PARTS
# Running offsets into the test rows / customer list, and collected outputs.
skip_rows = 0
skip_cust = 0
test_preds = []

for k in range(NUM_PARTS):

    # READ PART OF TEST DATA
    # The whole file is read again and then cut down to this part's rows.
    print(f'\nReading test data...')
    test = read_file(path = TEST_PATH)
    row_stop = skip_rows + rows[k]
    test = test.iloc[skip_rows:row_stop]
    skip_rows = row_stop
    print(f'=> Test part {k+1} has shape', test.shape )

    # PROCESS AND FEATURE ENGINEER PART OF TEST DATA
    test = process_and_feature_engineer(test)
    test = test.fillna(NAN_VALUE)
    # Reorder to the customer sequence of this part; the last part is open-ended.
    cust_stop = None if k == NUM_PARTS - 1 else skip_cust + num_cust
    test = test.loc[customers[skip_cust:cust_stop]]
    skip_cust += num_cust

    dtest = test[features].as_gpu_matrix()
    del test
    gc.collect()
    # To reduce memory further, `test = test[['P_2_mean']]` could be added if needed.

    # INFER LGBM MODELS ON TEST DATA
    # Sum the raw scores of every saved fold model, then average.
    preds = None
    for f in range(FOLDS):
        with open(f'LGBM_v{VER}_fold{f}.pkl', 'rb') as pickle_file:
            model = pickle.load(pickle_file)
        fold_scores = model.predict_proba(dtest,raw_score=True)
        preds = fold_scores if preds is None else preds + fold_scores
    preds = preds / FOLDS
    test_preds.append(preds)

# CLEAN MEMORY
del dtest, model
_ = gc.collect()

# Make submission

In [15]:
# Stitch the per-part predictions back into a single array.
test_predictions = np.concatenate(test_preds)

# NOTE(review): predictions are assigned positionally, so this assumes the
# sample submission lists customers in the same order as the processed test
# parts — confirm.
submission = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")
submission.loc[:, "prediction"] = test_predictions
submission.to_csv("submission.csv", index=False)

In [16]:
# Preview the first rows of the submission table.
submission.head()

# Future Ideas

- Use Optuna for hyperparameter tuning
- Change Boosting method to DART: slower but better accuracy