In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import gc
import os
import operator

from glob import glob

import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt

from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from scipy.sparse import csr_matrix, hstack
from sklearn.metrics import roc_auc_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/its-a-fraud/train.csv
/kaggle/input/its-a-fraud/test.csv
/kaggle/input/its-a-fraud/mock_submission.csv
/kaggle/input/transaction-fruad/Train_without_Data_balancing.csv
/kaggle/input/transaction-fruad/Test_without_Data_balancing_90null_without_drop.csv
/kaggle/input/transaction-fruad/Train_without_Data_balancing_90null_without_drop.csv
/kaggle/input/transaction-fruad/Test_without_Data_balancing_90null_with_drop .csv
/kaggle/input/transaction-fruad/Test_without_Data_balancing.csv
/kaggle/input/transaction-fruad/Train_without_Data_balancing_90null_with_drop .csv


In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [4]:
train=pd.read_csv("../input/transaction-fruad/Train_without_Data_balancing.csv")
train=train.drop(train.columns[0],axis=1)

In [5]:
Train_target_df=train.loc[:,"isFraud"]
train.drop("isFraud",axis=1,inplace=True)
x_train_df=train
y_train_df=Train_target_df

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_train_df, y_train_df, test_size=0.20, random_state=0)

In [7]:
feature_names=x_train.columns
dm1 = xgb.DMatrix(x_train, y_train, feature_names=feature_names)
dm1.save_binary('train.bin')
dm2 = xgb.DMatrix(x_test, y_test, feature_names=feature_names)
dm2.save_binary('validate.bin')

In [8]:
del dm1, x_train, y_train, dm2, x_test, y_test
gc.collect()

91

In [9]:
params = {
    'eta': 0.3,
    'tree_method': "hist",
    'grow_policy': "lossguide",
    'max_leaves': 1000,  
    'max_depth': 0, 
    'subsample': 0.9, 
    'alpha':1,
    'objective': 'binary:logistic', 
    'scale_pos_weight':100,
    'eval_metric': 'auc', 
    'nthread':4,
    'silent': 1
}

In [10]:
dmtrain = xgb.DMatrix('train.bin', feature_names=feature_names)
dmvalid = xgb.DMatrix('validate.bin', feature_names=feature_names)

In [11]:
def objective(params):
    num_round = int(params['n_estimators'])
    del params['n_estimators']
    watchlist = [(dmtrain, 'train'), (dmvalid, 'valid')]
    model = xgb.train(params, dmtrain, num_round, watchlist, maximize=True, early_stopping_rounds=20, verbose_eval=1)
    pred = model.predict(dmvalid, ntree_limit=model.best_ntree_limit)
    auc = roc_auc_score(dmvalid.get_label(), pred)
    del pred,model
    gc.collect()
    print(f"SCORE: {auc}")
    return { 'loss': 1-auc, 'status': STATUS_OK }

In [12]:
space = {
    'n_estimators': hp.quniform('n_estimators', 200, 600, 50),
    'eta': hp.quniform('eta', 0.025, 0.25, 0.025),
    'max_depth': hp.choice('max_depth', np.arange(1, 14, dtype=int)),
    'min_child_weight': hp.quniform('min_child_weight', 1, 10, 1),
    'subsample': hp.quniform('subsample', 0.7, 1, 0.05),
    'gamma': hp.quniform('gamma', 0.5, 1, 0.05),
    'colsample_bytree': hp.quniform('colsample_bytree', 0.7, 1, 0.05),
    'alpha' : hp.quniform('alpha', 0, 10, 1),
    'lambda': hp.quniform('lambda', 1, 2, 0.1),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 50, 200, 10),
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': "hist",
    'booster': 'gbtree'
}

In [13]:
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=100,
    trials=trials
)

[0]	train-auc:0.86233	valid-auc:0.84186
[1]	train-auc:0.88748	valid-auc:0.85662
[2]	train-auc:0.89415	valid-auc:0.85869
[3]	train-auc:0.91340	valid-auc:0.87660
[4]	train-auc:0.91711	valid-auc:0.87753
[5]	train-auc:0.92568	valid-auc:0.88406
[6]	train-auc:0.92879	valid-auc:0.88552
[7]	train-auc:0.93276	valid-auc:0.88996
[8]	train-auc:0.93695	valid-auc:0.89277
[9]	train-auc:0.94115	valid-auc:0.89635
[10]	train-auc:0.94375	valid-auc:0.89766
[11]	train-auc:0.94741	valid-auc:0.90104
[12]	train-auc:0.95084	valid-auc:0.90350
[13]	train-auc:0.95181	valid-auc:0.90356
[14]	train-auc:0.95429	valid-auc:0.90538
[15]	train-auc:0.95661	valid-auc:0.90680
[16]	train-auc:0.95853	valid-auc:0.90926
[17]	train-auc:0.96111	valid-auc:0.91080
[18]	train-auc:0.96254	valid-auc:0.91192
[19]	train-auc:0.96384	valid-auc:0.91381
[20]	train-auc:0.96496	valid-auc:0.91504
[21]	train-auc:0.96588	valid-auc:0.91597
[22]	train-auc:0.96691	valid-auc:0.91685
[23]	train-auc:0.96786	valid-auc:0.91726
[24]	train-auc:0.96878	val

In [14]:
best

{'alpha': 2.0,
 'colsample_bytree': 0.8500000000000001,
 'eta': 0.125,
 'gamma': 0.75,
 'lambda': 1.8,
 'max_depth': 12,
 'min_child_weight': 7.0,
 'n_estimators': 500.0,
 'scale_pos_weight': 60.0,
 'subsample': 0.7000000000000001}

In [15]:
trials

<hyperopt.base.Trials at 0x7f38893d7f90>

In [16]:
final_pera={
    'alpha': 8.0,
    'colsample_bytree': 0.8,
    'eta': 0.17500000000000002,
    'gamma': 0.6000000000000001,
    'lambda': 1.2000000000000002,
    'max_depth': 10,
    'min_child_weight': 1.0,
    'n_estimators': 550.0,
    'scale_pos_weight': 50.0,
    'subsample': 0.9,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'tree_method': "hist",
    'booster': 'gbtree'
}

In [17]:
for key,value in best.items():
    final_pera[key]=best[key]

In [18]:
final_pera

{'alpha': 2.0,
 'colsample_bytree': 0.8500000000000001,
 'eta': 0.125,
 'gamma': 0.75,
 'lambda': 1.8,
 'max_depth': 12,
 'min_child_weight': 7.0,
 'n_estimators': 500.0,
 'scale_pos_weight': 60.0,
 'subsample': 0.7000000000000001,
 'objective': 'binary:logistic',
 'eval_metric': 'auc',
 'tree_method': 'hist',
 'booster': 'gbtree'}

In [19]:
num_round = int(final_pera['n_estimators'])
del final_pera['n_estimators']
watchlist = [(dmtrain, 'train'), (dmvalid, 'valid')]
model = xgb.train(final_pera, dmtrain, num_round, watchlist, maximize=True, early_stopping_rounds=20, verbose_eval=1)
pred = model.predict(dmvalid, ntree_limit=model.best_ntree_limit)
auc = roc_auc_score(dmvalid.get_label(), pred)
print(f"SCORE: {auc}")

[0]	train-auc:0.90242	valid-auc:0.86641
[1]	train-auc:0.92518	valid-auc:0.89057
[2]	train-auc:0.93022	valid-auc:0.89670
[3]	train-auc:0.93886	valid-auc:0.90305
[4]	train-auc:0.94398	valid-auc:0.90689
[5]	train-auc:0.94742	valid-auc:0.90903
[6]	train-auc:0.95102	valid-auc:0.91123
[7]	train-auc:0.95331	valid-auc:0.91425
[8]	train-auc:0.95598	valid-auc:0.91580
[9]	train-auc:0.95841	valid-auc:0.91851
[10]	train-auc:0.96082	valid-auc:0.92014
[11]	train-auc:0.96226	valid-auc:0.92102
[12]	train-auc:0.96465	valid-auc:0.92260
[13]	train-auc:0.96555	valid-auc:0.92285
[14]	train-auc:0.96710	valid-auc:0.92431
[15]	train-auc:0.96856	valid-auc:0.92515
[16]	train-auc:0.96936	valid-auc:0.92629
[17]	train-auc:0.97070	valid-auc:0.92755
[18]	train-auc:0.97203	valid-auc:0.92848
[19]	train-auc:0.97304	valid-auc:0.92910
[20]	train-auc:0.97364	valid-auc:0.92927
[21]	train-auc:0.97416	valid-auc:0.92984
[22]	train-auc:0.97550	valid-auc:0.93067
[23]	train-auc:0.97616	valid-auc:0.93126
[24]	train-auc:0.97713	val

In [20]:
def output(model,inputfile,outputfile):
    test=pd.read_csv(inputfile)
    test=test.drop(test.columns[0],axis=1)
    dm_test = xgb.DMatrix(test, feature_names=feature_names)
    predictions_test = model.predict(dm_test,ntree_limit=model.best_ntree_limit)
    Test_df_predictions=pd.DataFrame(data=predictions_test,columns=["isFraud"])
    Test_df_predictions.reset_index(inplace=True)
    Test_df_predictions.rename(columns={"index":"Id"},inplace=True)
    Test_df_predictions.head()
    Test_df_predictions.to_csv(outputfile,index=False)

In [21]:
output(model,'../input/transaction-fruad/Test_without_Data_balancing.csv','./xg_predictions.csv')

1. https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

​

2. https://rdrr.io/cran/xgboost/man/xgb.train.html

​

3. https://www.kaggle.com/code/alexandrnikitin/xgboost-hyperparameter-optimization/script

​

4. https://github.com/hyperopt/hyperopt/wiki/FMin

​

5. https://www.youtube.com/watch?v=tdwgR1AqQ8Y

​

6. https://www.kaggle.com/code/corochann/optuna-tutorial-for-hyperparameter-optimization/notebook

