In [0]:
import time
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.lines as mlines
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import matplotlib.cm as cm
import warnings
import math
sns.set(style="whitegrid", color_codes=True)
%matplotlib inline
warnings.filterwarnings("ignore")
pd.set_option('max_colwidth',999)
pd.set_option('display.max_columns', 999)
pd.set_option("display.max_rows",999)

In [0]:
X_train = pd.read_csv("X_train.csv")
Y_train = pd.Series(pd.read_csv("Y_train.csv",header=None)[0])
X_test = pd.read_csv("X_test.csv")
Y_test = pd.Series(pd.read_csv("Y_test.csv",header=None)[0])

In [18]:
print("X_train :=", X_train.shape)
print("Y_train :=", Y_train.shape)
print("X_test :=", X_test.shape)
print("Y_test :=", Y_test.shape)

X_train := (109574, 24)
Y_train := (109574,)
X_test := (29429, 24)
Y_test := (29429,)


In [20]:
pd.Series(Y_train).value_counts()

0    97663
1    11911
Name: 0, dtype: int64

In [21]:
pd.Series(Y_test).value_counts()

0    27430
1     1999
Name: 0, dtype: int64

## **Hyper-Parameter Tuning** 

In [0]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV, cross_val_score, StratifiedKFold, learning_curve
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Light GBM

In [23]:

start_time = time.time()
params = {'boosting_type': 'gbdt',
          'objective': 'binary',
          'nthread': 3, 
          'num_leaves': 64,
          'learning_rate': 0.05,
          'max_bin': 512,
          'subsample_for_bin': 200,
          'subsample': 1,
          'subsample_freq': 1,
          'colsample_bytree': 0.8,
          'reg_alpha': 5,
          'reg_lambda': 10,
          'min_split_gain': 0.5,
          'min_child_weight': 1,
          'min_child_samples': 5,
          'scale_pos_weight': 1,
          'num_class' : 1}
gridParams = {
    'learning_rate': [0.01,0.05,0.005],
    'n_estimators': [80,100],
    'num_leaves': [6,8,12,16],
    'boosting_type' : ['gbdt'],
    'objective' : ['binary'],
    'random_state' : [501],
    'colsample_bytree' : [0.65, 0.66],
    'subsample' : [0.7,0.75],
    'reg_alpha' : [1,1.2],
    'reg_lambda' : [1,1.2,1.4],
    'max_depth'  :[2,3,5,7],
    'scale_pos_weight': [0.7,0.8,1]
    }

mdl = lgb.LGBMClassifier(boosting_type= 'gbdt',
          objective = 'binary',
          n_jobs = 4, 
          silent = True,
          max_bin = params['max_bin'],
          subsample_for_bin = params['subsample_for_bin'],
          subsample = params['subsample'],
          subsample_freq = params['subsample_freq'],
          min_split_gain = params['min_split_gain'],
          min_child_weight = params['min_child_weight'],
          min_child_samples = params['min_child_samples'],
          scale_pos_weight = params['scale_pos_weight'])


mdl.get_params().keys()


grid = GridSearchCV(mdl, gridParams,
                    scoring='roc_auc',
                    verbose=1,
                    n_jobs=-1)


grid.fit(X_train, Y_train)
LGB_Best = grid.best_estimator_
print("Train best score := ",grid.best_score_)
print(LGB_Best)
Pred_train=LGB_Best.predict(X_train)
Pred_test=LGB_Best.predict(X_test)

print(" Train Recall Score := ",metrics.recall_score(Y_train,Pred_train))
print(" Test Recall Score := ",metrics.recall_score(Y_test,Pred_test))
print(" Train Precision Score := ",metrics.precision_score(Y_train,Pred_train))
print(" Test Precision Score := ",metrics.precision_score(Y_test,Pred_test))
print(" Train AUC-ROC Score := ",metrics.roc_auc_score(Y_train,Pred_train))
print(" Test AUC-ROC Score := ",metrics.roc_auc_score(Y_test,Pred_test))
print(" Train F1 Score := ",metrics.f1_score(Y_train,Pred_train))
print(" Test F1 Score := ",metrics.f1_score(Y_test,Pred_test))

print("Train Confusion Matrix:\n",metrics.confusion_matrix(Y_train,Pred_train))
print("Test Confusion Matrix:\n",metrics.confusion_matrix(Y_test,Pred_test))
elapsed_time = time.time() - start_time
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

Fitting 3 folds for each of 6912 candidates, totalling 20736 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 446 tasks      | elapsed:  5.7min
[Parallel(n_jobs=-1)]: Done 796 tasks      | elapsed: 10.9min
[Parallel(n_jobs=-1)]: Done 1246 tasks      | elapsed: 17.3min
[Parallel(n_jobs=-1)]: Done 1796 tasks      | elapsed: 26.2min
[Parallel(n_jobs=-1)]: Done 2446 tasks      | elapsed: 37.0min
[Parallel(n_jobs=-1)]: Done 3196 tasks      | elapsed: 49.7min
[Parallel(n_jobs=-1)]: Done 4046 tasks      | elapsed: 62.7min
[Parallel(n_jobs=-1)]: Done 4996 tasks      | elapsed: 76.9min
[Parallel(n_jobs=-1)]: Done 6046 tasks      | elapsed: 94.9min
[Parallel(n_jobs=-1)]: Done 7196 tasks      | elapsed: 113.8min
[Parallel(n_jobs=-1)]: Done 8446 tasks      | elapsed: 132.0min
[Parallel(n_jobs=-1)]: Done 9796 tasks      | elapsed: 153.9min
[Parallel(n_jobs=-1)]: Done 11246 tasks    

Train best score :=  0.9724246202932918
LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=0.66,
               importance_type='split', learning_rate=0.05, max_bin=512,
               max_depth=7, min_child_samples=5, min_child_weight=1,
               min_split_gain=0.5, n_estimators=100, n_jobs=4, num_leaves=16,
               objective='binary', random_state=501, reg_alpha=1,
               reg_lambda=1.2, scale_pos_weight=1, silent=True, subsample=0.7,
               subsample_for_bin=200, subsample_freq=1)
 Train Recall Score :=  0.8767525816472168


NameError: ignored

In [24]:
Pred_train=LGB_Best.predict(X_train)
Pred_test=LGB_Best.predict(X_test)

print(" Train Recall Score := ",metrics.recall_score(Y_train,Pred_train))
print(" Test Recall Score := ",metrics.recall_score(Y_test,Pred_test))
print(" Train Precision Score := ",metrics.precision_score(Y_train,Pred_train))
print(" Test Precision Score := ",metrics.precision_score(Y_test,Pred_test))
print(" Train AUC-ROC Score := ",metrics.roc_auc_score(Y_train,Pred_train))
print(" Test AUC-ROC Score := ",metrics.roc_auc_score(Y_test,Pred_test))
print(" Train F1 Score := ",metrics.f1_score(Y_train,Pred_train))
print(" Test F1 Score := ",metrics.f1_score(Y_test,Pred_test))

print("Train Confusion Matrix:\n",metrics.confusion_matrix(Y_train,Pred_train))
print("Test Confusion Matrix:\n",metrics.confusion_matrix(Y_test,Pred_test))

 Train Recall Score :=  0.8767525816472168
 Test Recall Score :=  0.6053026513256629
 Train Precision Score :=  0.9135683667220715
 Test Precision Score :=  0.8432055749128919
 Train AUC-ROC Score :=  0.9333180804471095
 Test AUC-ROC Score :=  0.7985499767747526
 Train F1 Score :=  0.8947819381372633
 Test F1 Score :=  0.704717530576587
Train Confusion Matrix:
 [[96675   988]
 [ 1468 10443]]
Test Confusion Matrix:
 [[27205   225]
 [  789  1210]]


In [25]:
(X_test[(Pred_test == 1)&(Y_test == 1)]['AMOUNT_USD'].sum()/X_test[Y_test == 1]['AMOUNT_USD'].sum())*100

54.21072423536179

## Logistic Regression

In [28]:
LogisticRegression(C=10, class_weight={1: 0.7, 0: 0.3}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

LogisticRegression(C=10, class_weight={0: 0.3, 1: 0.7}, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

## **Ensemble Model**

In [0]:
lr = LogisticRegression(C=10, class_weight={1: 0.7, 0: 0.3}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [0]:
votingCPre = VotingClassifier(estimators=[('lgbm', LGB_Best),
                                          ('lr',lr)
                                         ], voting='soft', n_jobs=4)

votingCPre = votingCPre.fit(X_train,Y_train)
Pred_train=votingCPre.predict(X_train)
Pred_test=votingCPre.predict(X_test)

print(" Train Recall Score := ",metrics.recall_score(Y_train,Pred_train))
print(" Test Recall Score := ",metrics.recall_score(Y_test,Pred_test))
print(" Train Precision Score := ",metrics.precision_score(Y_train,Pred_train))
print(" Test Precision Score := ",metrics.precision_score(Y_test,Pred_test))
print(" Train AUC-ROC Score := ",metrics.roc_auc_score(Y_train,Pred_train))
print(" Test AUC-ROC Score := ",metrics.roc_auc_score(Y_test,Pred_test))
print(" Train F1 Score := ",metrics.f1_score(Y_train,Pred_train))
print(" Test F1 Score := ",metrics.f1_score(Y_test,Pred_test))

print("Train Confusion Matrix:\n",metrics.confusion_matrix(Y_train,Pred_train))
print("Test Confusion Matrix:\n",metrics.confusion_matrix(Y_test,Pred_test))

In [0]:
(X_test[(Pred_test == 1)&(Y_test == 1)]['AMOUNT_USD'].sum()/X_test[Y_test == 1]['AMOUNT_USD'].sum())*100

# **Experiments on Scaled Data**

In [0]:
lr = LogisticRegression(C=10, class_weight={1: 0.7, 0: 0.3}, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
          solver='warn', tol=0.0001, verbose=0, warm_start=False)

In [34]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = pd.DataFrame(scaler.transform(X_train),columns = X_train.columns,index = X_train.index)
X_train_scaled.head()

Unnamed: 0,USER_AMOUNT_Z-Score,USER_AMOUNT_hourOfDay_Z-Score,USER_AMOUNT_dayOfWeek_Z-Score,User_PHONE_COUNTRY_Count,User_Unique_Currency_Count,AMOUNT_USD,AMOUNT_USD-AMOUNT,AMOUNT_USD_First_Digit,AMOUNT_USD_Last_Digit,Currency,MERCHANT_CATEGORY_,MERCHANT_COUNTRY,ENTRY_METHOD,TYPE,SOURCE,HAS_E-MAIL,COUNTRY,KYC,FAILED_SIGN_IN_ATTEMPT,IS_CRYPTO,TERMS_VERSION,BIRTH_YEAR,Current_Created_User,trx_Month
0,0.996838,0.998787,0.998793,0.0,0.333333,3.556288e-08,0.003871,0.555556,1.0,0.25,0.994104,0.909169,0.324022,0.090343,0.065894,1.0,0.422392,0.000898,0.0,0.0,1.0,0.828125,0.19613,0.0
1,0.996836,0.998963,0.998816,0.0,0.333333,7.336905e-08,0.003871,0.111111,0.222222,0.25,0.994122,0.909165,1.0,0.090343,0.065894,1.0,0.508906,0.000898,0.0,0.0,0.0,0.75,0.10835,0.0
2,0.996832,0.998794,0.998821,1.0,0.0,7.363297e-08,0.003871,0.111111,0.666667,0.0,0.994122,0.909169,0.324022,0.090343,0.065894,1.0,1.0,0.000898,0.0,0.0,0.967078,0.875,0.070619,0.0
3,0.996792,0.998949,0.998804,1.0,0.0,1.163876e-07,0.003871,0.111111,0.444444,0.0,0.994161,0.909274,0.664804,0.23053,0.044171,1.0,1.0,0.000898,0.0,0.0,0.0,0.84375,0.040392,0.0
4,0.996783,0.998807,0.998793,1.0,0.0,8.3134e-08,0.003871,0.111111,0.0,0.0,0.994161,0.909274,0.664804,0.23053,0.044171,1.0,1.0,0.000898,0.0,0.0,0.0,0.84375,0.040395,0.0


In [35]:
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns = X_test.columns,index = X_test.index)
X_test_scaled.head()

Unnamed: 0,USER_AMOUNT_Z-Score,USER_AMOUNT_hourOfDay_Z-Score,USER_AMOUNT_dayOfWeek_Z-Score,User_PHONE_COUNTRY_Count,User_Unique_Currency_Count,AMOUNT_USD,AMOUNT_USD-AMOUNT,AMOUNT_USD_First_Digit,AMOUNT_USD_Last_Digit,Currency,MERCHANT_CATEGORY_,MERCHANT_COUNTRY,ENTRY_METHOD,TYPE,SOURCE,HAS_E-MAIL,COUNTRY,KYC,FAILED_SIGN_IN_ATTEMPT,IS_CRYPTO,TERMS_VERSION,BIRTH_YEAR,Current_Created_User,trx_Month
0,0.996841,0.998851,0.99879,1.0,0.333333,1.541938e-07,0.003871,0.222222,0.777778,0.0,0.994407,0.909274,0.664804,1.0,0.06517,1.0,1.0,0.000898,0.0,0.0,0.0,0.828125,0.116198,0.0
1,0.996818,0.998807,0.998789,1.0,0.166667,2.49534e-07,0.003871,0.333333,0.222222,0.0,0.994161,0.909274,0.664804,0.23053,0.04055,1.0,1.0,0.000898,0.0,0.0,0.205761,0.671875,0.050435,0.0
2,0.996834,0.998827,0.998827,1.0,0.166667,8.3134e-08,0.003871,0.111111,0.0,0.0,0.994161,0.909274,0.664804,0.23053,0.04055,1.0,1.0,0.000898,0.0,0.0,0.0,0.75,0.361027,0.0
3,0.99681,0.998814,0.998804,1.0,0.166667,2.203711e-08,0.003871,0.333333,0.444444,0.0,0.994122,0.909354,1.0,0.090343,0.065894,1.0,1.0,0.000898,0.0,0.0,0.0,0.75,0.361028,0.0
4,0.996811,0.998807,0.998743,0.0,0.0,6.663916e-09,0.003871,0.111111,0.111111,0.25,0.994171,0.909177,1.0,0.090343,0.065894,1.0,0.267176,0.000898,0.0,0.0,0.658436,0.703125,0.339989,0.0


### **Light-GBM**

In [36]:
LGB_Best.fit(X_train_scaled,Y_train)

Pred_train=LGB_Best.predict(X_train_scaled)
Pred_test=LGB_Best.predict(X_test_scaled)


print(" Train Recall Score := ",metrics.recall_score(Y_train,Pred_train))
print(" Test Recall Score := ",metrics.recall_score(Y_test,Pred_test))
print(" Train Precision Score := ",metrics.precision_score(Y_train,Pred_train))
print(" Test Precision Score := ",metrics.precision_score(Y_test,Pred_test))
print(" Train AUC-ROC Score := ",metrics.roc_auc_score(Y_train,Pred_train))
print(" Test AUC-ROC Score := ",metrics.roc_auc_score(Y_test,Pred_test))
print(" Train F1 Score := ",metrics.f1_score(Y_train,Pred_train))
print(" Test F1 Score := ",metrics.f1_score(Y_test,Pred_test))

print("Train Confusion Matrix:\n",metrics.confusion_matrix(Y_train,Pred_train))
print("Test Confusion Matrix:\n",metrics.confusion_matrix(Y_test,Pred_test))

 Train Recall Score :=  0.8740659894215431
 Test Recall Score :=  0.6123061530765382
 Train Precision Score :=  0.9126051893408135
 Test Precision Score :=  0.8464730290456431
 Train AUC-ROC Score :=  0.9319287075191023
 Test AUC-ROC Score :=  0.8021064123020315
 Train F1 Score :=  0.8929199365324413
 Test F1 Score :=  0.7105950653120464
Train Confusion Matrix:
 [[96666   997]
 [ 1500 10411]]
Test Confusion Matrix:
 [[27208   222]
 [  775  1224]]


In [37]:
(X_test[(Pred_test == 1)&(Y_test == 1)]['AMOUNT_USD'].sum()/X_test[Y_test == 1]['AMOUNT_USD'].sum())*100

58.477464686710455

### **Logistic Regression**

In [39]:
lr.fit(X_train_scaled,Y_train)
Pred_train=lr.predict(X_train_scaled)
Pred_test=lr.predict(X_test_scaled)

print(" Train Recall Score := ",metrics.recall_score(Y_train,Pred_train))
print(" Test Recall Score := ",metrics.recall_score(Y_test,Pred_test))
print(" Train Precision Score := ",metrics.precision_score(Y_train,Pred_train))
print(" Test Precision Score := ",metrics.precision_score(Y_test,Pred_test))
print(" Train AUC-ROC Score := ",metrics.roc_auc_score(Y_train,Pred_train))
print(" Test AUC-ROC Score := ",metrics.roc_auc_score(Y_test,Pred_test))
print(" Train F1 Score := ",metrics.f1_score(Y_train,Pred_train))
print(" Test F1 Score := ",metrics.f1_score(Y_test,Pred_test))

print("Train Confusion Matrix:\n",metrics.confusion_matrix(Y_train,Pred_train))
print("Test Confusion Matrix:\n",metrics.confusion_matrix(Y_test,Pred_test))

 Train Recall Score :=  0.7025438670136849
 Test Recall Score :=  0.6338169084542271
 Train Precision Score :=  0.655234515699632
 Test Precision Score :=  0.6079654510556622
 Train AUC-ROC Score :=  0.8287301315961905
 Test AUC-ROC Score :=  0.8020160007090676
 Train F1 Score :=  0.6780649866299328
 Test F1 Score :=  0.6206220915993143
Train Confusion Matrix:
 [[93260  4403]
 [ 3543  8368]]
Test Confusion Matrix:
 [[26613   817]
 [  732  1267]]


In [40]:
(X_test[(Pred_test == 1)&(Y_test == 1)]['AMOUNT_USD'].sum()/X_test[Y_test == 1]['AMOUNT_USD'].sum())*100

65.54353570471905

### **Ensembled Model**

In [41]:
votingCPre = VotingClassifier(estimators=[('lgbm', LGB_Best),
                                          ('lr',lr)
                                         ], voting='soft', n_jobs=4)

votingCPre = votingCPre.fit(X_train_scaled,Y_train)
Pred_train=votingCPre.predict(X_train_scaled)
Pred_test=votingCPre.predict(X_test_scaled)

print(" Train Recall Score := ",metrics.recall_score(Y_train,Pred_train))
print(" Test Recall Score := ",metrics.recall_score(Y_test,Pred_test))
print(" Train Precision Score := ",metrics.precision_score(Y_train,Pred_train))
print(" Test Precision Score := ",metrics.precision_score(Y_test,Pred_test))
print(" Train AUC-ROC Score := ",metrics.roc_auc_score(Y_train,Pred_train))
print(" Test AUC-ROC Score := ",metrics.roc_auc_score(Y_test,Pred_test))
print(" Train F1 Score := ",metrics.f1_score(Y_train,Pred_train))
print(" Test F1 Score := ",metrics.f1_score(Y_test,Pred_test))

print("Train Confusion Matrix:\n",metrics.confusion_matrix(Y_train,Pred_train))
print("Test Confusion Matrix:\n",metrics.confusion_matrix(Y_test,Pred_test))

 Train Recall Score :=  0.7945596507430107
 Test Recall Score :=  0.6198099049524762
 Train Precision Score :=  0.8529968454258675
 Test Precision Score :=  0.7861675126903553
 Train AUC-ROC Score :=  0.8889296825333782
 Test AUC-ROC Score :=  0.8037620432527601
 Train F1 Score :=  0.822741893419108
 Test F1 Score :=  0.6931468531468532
Train Confusion Matrix:
 [[96032  1631]
 [ 2447  9464]]
Test Confusion Matrix:
 [[27093   337]
 [  760  1239]]


In [42]:
(X_test[(Pred_test == 1)&(Y_test == 1)]['AMOUNT_USD'].sum()/X_test[Y_test == 1]['AMOUNT_USD'].sum())*100

59.715727239011116