In [134]:
import datetime as dt
import gc as gc
import time
import warnings

import catboost as ctb
import imblearn
import numpy as np
import pandas as pd
from hyperopt import fmin, hp, space_eval, tpe
from sklearn import Pipeline
from sklearn.ensemble import (GradientBoostingClassifier,
                              RandomForestClassifier, StackingClassifier,
                              VotingClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, f1_score,
                             make_scorer, precision_score, recall_score)
from sklearn.model_selection import (KFold, StratifiedKFold, TimeSeriesSplit,
                                     cross_val_score, train_test_split)
from sklearn.preprocessing import StandardScaler

import xgboost as xgb
from xgboost import plot_importance

warnings.filterwarnings("ignore")

## Data loading 

In [135]:
data = pd.read_csv('hist_trx.csv')

## Data preporation


In [136]:
data.head()

Unnamed: 0.1,Unnamed: 0,event_id,date_time,user_id,sub_channel,event_type,sub_type,atm_mcc,mcc_group,atm_merchant_name,amount
0,0,22c9f1ac686a43e18cdb798489193238,2018-12-06 09:33:21,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921.0,R,YUG 426,280.0
1,1,62dac13fac68416d9bc340c51ddcb977,2018-12-06 09:36:08,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331.0,R,MAGAZIN RODINA,376.9
2,2,9b666b8c9d4d4faea78e2b28a5468794,2018-12-06 12:21:02,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921.0,R,YUG 426,143.0
3,3,d4d805fc3d5f4c91aa9b0389333b780c,2018-12-06 07:46:58,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331.0,R,MAGAZIN RODINA,162.0
4,4,1d162351117a4418949dcf94111b9964,2018-12-06 12:21:27,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331.0,R,MAGAZIN RODINA,56.0


In [137]:
#we don't need event_id because it is unique
data = data.drop(data.columns[:2],axis=1)
data.head()

Unnamed: 0,date_time,user_id,sub_channel,event_type,sub_type,atm_mcc,mcc_group,atm_merchant_name,amount
0,2018-12-06 09:33:21,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921.0,R,YUG 426,280.0
1,2018-12-06 09:36:08,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331.0,R,MAGAZIN RODINA,376.9
2,2018-12-06 12:21:02,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921.0,R,YUG 426,143.0
3,2018-12-06 07:46:58,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331.0,R,MAGAZIN RODINA,162.0
4,2018-12-06 12:21:27,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331.0,R,MAGAZIN RODINA,56.0


In [138]:
data.shape

(562490, 9)

In [139]:
#convert column types  for faster evaluation
def reduce_mem_usage(props):
    start_mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage of properties dataframe is :",start_mem_usg," MB")
    NAlist = [] # Keeps track of columns that have missing values filled in. 
    for col in props.columns:
        if props[col].dtype != object:  # Exclude strings
            
            # Print current column type
            print("******************************")
            print("Column: ",col)
            print("dtype before: ",props[col].dtype)
            
            # make variables for Int, max and min
            IsInt = False
            mx = props[col].max()
            mn = props[col].min()
            
            # Integer does not support NA, therefore, NA needs to be filled
            if not np.isfinite(props[col]).all(): 
                NAlist.append(col)
                props[col].fillna(mn-1,inplace=True)  
                   
            # test if column can be converted to an integer
            asint = props[col].fillna(0).astype(np.int64)
            result = (props[col] - asint)
            result = result.sum()
            if result > -0.01 and result < 0.01:
                IsInt = True

            
            # Make Integer/unsigned Integer datatypes
            if IsInt:
                if mn >= 0:
                    if mx < 255:
                        props[col] = props[col].astype(np.uint8)
                    elif mx < 65535:
                        props[col] = props[col].astype(np.uint16)
                    elif mx < 4294967295:
                        props[col] = props[col].astype(np.uint32)
                    else:
                        props[col] = props[col].astype(np.uint64)
                else:
                    if mn > np.iinfo(np.int8).min and mx < np.iinfo(np.int8).max:
                        props[col] = props[col].astype(np.int8)
                    elif mn > np.iinfo(np.int16).min and mx < np.iinfo(np.int16).max:
                        props[col] = props[col].astype(np.int16)
                    elif mn > np.iinfo(np.int32).min and mx < np.iinfo(np.int32).max:
                        props[col] = props[col].astype(np.int32)
                    elif mn > np.iinfo(np.int64).min and mx < np.iinfo(np.int64).max:
                        props[col] = props[col].astype(np.int64)    
            
            # Make float datatypes 32 bit
            else:
                props[col] = props[col].astype(np.float32)
            
            # Print new column type
            print("dtype after: ",props[col].dtype)
            print("******************************")
    # Print final result
    print("___MEMORY USAGE AFTER COMPLETION:___")
    mem_usg = props.memory_usage().sum() / 1024**2 
    print("Memory usage is: ",mem_usg," MB")
    print("This is ",100*mem_usg/start_mem_usg,"% of the initial size")
    return props, NAlist

In [140]:
props, NAlist = reduce_mem_usage(data)

Memory usage of properties dataframe is : 38.62324523925781  MB
******************************
Column:  atm_mcc
dtype before:  float64
dtype after:  uint16
******************************
******************************
Column:  amount
dtype before:  float64
dtype after:  float32
******************************
___MEMORY USAGE AFTER COMPLETION:___
Memory usage is:  33.2589225769043  MB
This is  86.11115500750036 % of the initial size


In [141]:
#We can't generate num features before spitting due to memory leak
# Let's work with data feature and delete 12.07.2018
data['date_time'] = pd.to_datetime(data['date_time'],format ='%Y-%m-%d %H:%M:%S',errors='coerce')
data = data[data['date_time']<'2018-12-07']
data['data_day'] = data['date_time'].dt.day 
data['data_hour'] = data['date_time'].dt.hour
data['data_minute'] = data['date_time'].dt.minute
data['data_dayofweek'] = data['date_time'].dt.dayofweek

In [142]:
#Let's check our data
data.head()

Unnamed: 0,date_time,user_id,sub_channel,event_type,sub_type,atm_mcc,mcc_group,atm_merchant_name,amount,data_day,data_hour,data_minute,data_dayofweek
0,2018-12-06 09:33:21,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921,R,YUG 426,280.0,6,9,33,3
1,2018-12-06 09:36:08,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331,R,MAGAZIN RODINA,376.899994,6,9,36,3
2,2018-12-06 12:21:02,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921,R,YUG 426,143.0,6,12,21,3
3,2018-12-06 07:46:58,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331,R,MAGAZIN RODINA,162.0,6,7,46,3
4,2018-12-06 12:21:27,102050167,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331,R,MAGAZIN RODINA,56.0,6,12,21,3


## Fraud data preporation 

In [143]:
fraud_data = pd.read_csv('hits0712.csv')

In [144]:
fraud_data.head()

Unnamed: 0.1,Unnamed: 0,event_id,user_id,rec_user_id,date_time,resolution,sub_channel
0,5,0fcea7fec4ff479cac8cf37b4555c817,75301207,49913000,2018-12-07 07:56:49,G,ATMAPI
1,6,c29a4e64d27b435b9b55aa3e62ce54d4,1200695,12797310,2018-12-07 07:14:41,G,MOBILEAPI
2,8,3107b2b1afcb490ab0a31135eb4b386c,45657780,23814017,2018-12-07 14:02:30,G,MOBILEAPI
3,10,d9bc95d991144d53b950d8084fa846bf,21683486,9865131,2018-12-07 10:32:41,G,WEBAPI
4,22,9fc959da4cd144ea8bf5bae2dc041a5e,4531894,VSP15472255,2018-12-07 18:22:52,G,MOBILEAPI


In [145]:
#same dropping
fraud_data = fraud_data.drop(fraud_data.columns[:2],axis=1)
fraud_data.head()

Unnamed: 0,user_id,rec_user_id,date_time,resolution,sub_channel
0,75301207,49913000,2018-12-07 07:56:49,G,ATMAPI
1,1200695,12797310,2018-12-07 07:14:41,G,MOBILEAPI
2,45657780,23814017,2018-12-07 14:02:30,G,MOBILEAPI
3,21683486,9865131,2018-12-07 10:32:41,G,WEBAPI
4,4531894,VSP15472255,2018-12-07 18:22:52,G,MOBILEAPI


In [146]:
#creating target
dict_repl = {'G': 0, 'F': 1, 'S': 1}
fraud_data = fraud_data.replace({'resolution':dict_repl}).rename({'resolution':'target'},axis='columns')

In [147]:
#date engineering 
fraud_data['date_time'] = pd.to_datetime(fraud_data['date_time'],format ='%Y-%m-%d %H:%M:%S',errors='coerce')
# fraud_data['fraud_data_day'] = fraud_data['date_time'].dt.day 
# fraud_data['fraud_data_hour'] = fraud_data['date_time'].dt.hour
# fraud_data['fraud_data_minute'] = fraud_data['date_time'].dt.minute
# fraud_data['fraud_data_dayofweek'] = fraud_data['date_time'].dt.dayofweek
fraud_data.head()

Unnamed: 0,user_id,rec_user_id,date_time,target,sub_channel
0,75301207,49913000,2018-12-07 07:56:49,0,ATMAPI
1,1200695,12797310,2018-12-07 07:14:41,0,MOBILEAPI
2,45657780,23814017,2018-12-07 14:02:30,0,MOBILEAPI
3,21683486,9865131,2018-12-07 10:32:41,0,WEBAPI
4,4531894,VSP15472255,2018-12-07 18:22:52,0,MOBILEAPI


##  Create a train data

In [148]:
left = data[['sub_channel', 'event_type', 'sub_type', 'atm_mcc','mcc_group', 'atm_merchant_name','amount', 'data_day','data_hour','data_minute','data_dayofweek',
                     'user_id']]
right = fraud_data[['rec_user_id', 'sub_channel', 'target']].drop_duplicates()
train_data = left.merge(right, left_on='user_id', right_on='rec_user_id', how='inner')

In [149]:
train_data = train_data.drop(columns=['rec_user_id'],axis=1)

In [150]:
#Check NaNs
train_data.isna().sum()

sub_channel_x          0
event_type             0
sub_type               0
atm_mcc                0
mcc_group            663
atm_merchant_name    662
amount                 0
data_day               0
data_hour              0
data_minute            0
data_dayofweek         0
user_id                0
sub_channel_y          0
target                 0
dtype: int64

In [151]:
#fill Nan with most frequent value
train_data = train_data.fillna(train_data['mcc_group'].value_counts().index[0]) 
train_data = train_data.fillna(train_data['atm_merchant_name'].value_counts().index[0]) 
train_data.isna().sum()

sub_channel_x        0
event_type           0
sub_type             0
atm_mcc              0
mcc_group            0
atm_merchant_name    0
amount               0
data_day             0
data_hour            0
data_minute          0
data_dayofweek       0
user_id              0
sub_channel_y        0
target               0
dtype: int64

In [152]:
for column in train_data.columns:
    if ('data' not in column) and (train_data[column].dtype == 'object'):
        print(f'------------------------------------------\n {train_data[column].value_counts()} \n {train_data[column].unique()} ')

------------------------------------------
 ISSUER_ACQUIRER    250568
ISSUER              45447
Name: sub_channel_x, dtype: int64 
 ['ISSUER_ACQUIRER' 'ISSUER'] 
------------------------------------------
 PAYMENT            151155
DEPOSIT            123260
WITHDRAW            15733
VIEW_STATEMENT       4019
CLIENT_DEFINED       1254
UPDATE_USER           374
CARD_PIN_CHANGE       220
Name: event_type, dtype: int64 
 ['PAYMENT' 'DEPOSIT' 'WITHDRAW' 'VIEW_STATEMENT' 'CLIENT_DEFINED'
 'UPDATE_USER' 'CARD_PIN_CHANGE'] 
------------------------------------------
 ATM_P2P_CREDIT          110007
POS_PURCHASE             59776
ATM_P2P_DEBIT            57807
ATM_WITHDRAW             15614
EPOS_PURCHASE            13642
ATM_PAYMENT              12754
ATM_CASH_CHECK            6831
EPOS_PRE_PURCHASE         6822
ATM_CASH                  6346
ATM_BALANCE               3606
TOKENIZATION_REQUEST      1207
ATM_SERVICE_ON             374
POS_PRE_PURCHASE           349
ATM_RECEIPT                318


## Feature engineering 

In [153]:
#Lets look on our data
train_data.head()

Unnamed: 0,sub_channel_x,event_type,sub_type,atm_mcc,mcc_group,atm_merchant_name,amount,data_day,data_hour,data_minute,data_dayofweek,user_id,sub_channel_y,target
0,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921,R,YUG 426,280.0,6,9,33,3,102050167,MOBILEAPI,0
1,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331,R,MAGAZIN RODINA,376.899994,6,9,36,3,102050167,MOBILEAPI,0
2,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5921,R,YUG 426,143.0,6,12,21,3,102050167,MOBILEAPI,0
3,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331,R,MAGAZIN RODINA,162.0,6,7,46,3,102050167,MOBILEAPI,0
4,ISSUER_ACQUIRER,PAYMENT,POS_PURCHASE,5331,R,MAGAZIN RODINA,56.0,6,12,21,3,102050167,MOBILEAPI,0


In [154]:
dict_repl_channel = {'ISSUER_ACQUIRER': 0, 'ISSUER': 1}
train_data = train_data.replace({'sub_channel':dict_repl_channel})

In [155]:
#OHE
#In real life with lot's of features better use mean encoding or label encoding 
train_data = pd.get_dummies(train_data,columns=['sub_channel_x','event_type','sub_type','mcc_group','sub_channel_y'])

In [156]:
train_data.head()

Unnamed: 0,atm_mcc,atm_merchant_name,amount,data_day,data_hour,data_minute,data_dayofweek,user_id,target,sub_channel_x_ISSUER,...,mcc_group_O,mcc_group_Q,mcc_group_R,mcc_group_T,mcc_group_U,mcc_group_X,mcc_group_Z,sub_channel_y_ATMAPI,sub_channel_y_MOBILEAPI,sub_channel_y_WEBAPI
0,5921,YUG 426,280.0,6,9,33,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
1,5331,MAGAZIN RODINA,376.899994,6,9,36,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
2,5921,YUG 426,143.0,6,12,21,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
3,5331,MAGAZIN RODINA,162.0,6,7,46,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
4,5331,MAGAZIN RODINA,56.0,6,12,21,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0


In [157]:
#encoding atm_mcc
conditions = [
    (train_data['atm_mcc'] <= 1799),
    (train_data['atm_mcc'] > 1799) & (train_data['atm_mcc'] <= 2842),
    (train_data['atm_mcc'] >= 3000) & (train_data['atm_mcc'] <= 3299),
    (train_data['atm_mcc'] >= 3351) & (train_data['atm_mcc'] <= 3441),
    (train_data['atm_mcc'] >= 3501) & (train_data['atm_mcc'] <= 3799),
    (train_data['atm_mcc'] >= 4011) & (train_data['atm_mcc'] <= 4789),
    (train_data['atm_mcc'] >= 4812) & (train_data['atm_mcc'] <= 4900),
    (train_data['atm_mcc'] >= 5013) & (train_data['atm_mcc'] <= 5199),
    (train_data['atm_mcc'] >= 5200) & (train_data['atm_mcc'] <= 5499),
    (train_data['atm_mcc'] >= 5511) & (train_data['atm_mcc'] <= 5599),
    (train_data['atm_mcc'] >= 5611) & (train_data['atm_mcc'] <= 5699),
    (train_data['atm_mcc'] >= 5712) & (train_data['atm_mcc'] <= 5999),
    (train_data['atm_mcc'] > 5999)
    ]

# create a list of the values we want to assign for each condition
values = ['contracts', 'optoviki', 'avia', 'rent','rent','transport','communal','optoviki','shops','auto','clothes','diff_shops','services']

# create a new column and use np.select to assign values to it using our lists as arguments
train_data['atm_mcc'] = np.select(conditions, values)

# display updated DataFrame
train_data.head()

Unnamed: 0,atm_mcc,atm_merchant_name,amount,data_day,data_hour,data_minute,data_dayofweek,user_id,target,sub_channel_x_ISSUER,...,mcc_group_O,mcc_group_Q,mcc_group_R,mcc_group_T,mcc_group_U,mcc_group_X,mcc_group_Z,sub_channel_y_ATMAPI,sub_channel_y_MOBILEAPI,sub_channel_y_WEBAPI
0,diff_shops,YUG 426,280.0,6,9,33,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
1,shops,MAGAZIN RODINA,376.899994,6,9,36,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
2,diff_shops,YUG 426,143.0,6,12,21,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
3,shops,MAGAZIN RODINA,162.0,6,7,46,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0
4,shops,MAGAZIN RODINA,56.0,6,12,21,3,102050167,0,0,...,0,0,1,0,0,0,0,0,1,0


In [158]:
train_data.atm_mcc.value_counts()

services      157585
communal       69363
shops          29616
diff_shops     22363
transport       9001
auto            5477
clothes         1651
contracts        690
avia             127
optoviki          98
rent              43
0                  1
Name: atm_mcc, dtype: int64

In [159]:
#let's find this row
train_data[train_data.atm_mcc == '0']

Unnamed: 0,atm_mcc,atm_merchant_name,amount,data_day,data_hour,data_minute,data_dayofweek,user_id,target,sub_channel_x_ISSUER,...,mcc_group_O,mcc_group_Q,mcc_group_R,mcc_group_T,mcc_group_U,mcc_group_X,mcc_group_Z,sub_channel_y_ATMAPI,sub_channel_y_MOBILEAPI,sub_channel_y_WEBAPI
256276,0,WIZZ AIR HUNGARY KFT.,257.959991,29,10,55,3,73343569,0,1,...,0,0,0,0,0,0,0,1,0,0


In [160]:
#change this string
train_data.loc[train_data.atm_mcc == '0','atm_mcc'] = 'avia'

In [161]:
#OHE atm_mcc
train_data = pd.get_dummies(train_data,columns=['atm_mcc'])

In [162]:
#Check data
train_data.head()

Unnamed: 0,atm_merchant_name,amount,data_day,data_hour,data_minute,data_dayofweek,user_id,target,sub_channel_x_ISSUER,sub_channel_x_ISSUER_ACQUIRER,...,atm_mcc_avia,atm_mcc_clothes,atm_mcc_communal,atm_mcc_contracts,atm_mcc_diff_shops,atm_mcc_optoviki,atm_mcc_rent,atm_mcc_services,atm_mcc_shops,atm_mcc_transport
0,YUG 426,280.0,6,9,33,3,102050167,0,0,1,...,0,0,0,0,1,0,0,0,0,0
1,MAGAZIN RODINA,376.899994,6,9,36,3,102050167,0,0,1,...,0,0,0,0,0,0,0,0,1,0
2,YUG 426,143.0,6,12,21,3,102050167,0,0,1,...,0,0,0,0,1,0,0,0,0,0
3,MAGAZIN RODINA,162.0,6,7,46,3,102050167,0,0,1,...,0,0,0,0,0,0,0,0,1,0
4,MAGAZIN RODINA,56.0,6,12,21,3,102050167,0,0,1,...,0,0,0,0,0,0,0,0,1,0


In [163]:
#reduce memory usage
props, NAlist = reduce_mem_usage(train_data)

Memory usage of properties dataframe is : 36.1346435546875  MB
******************************
Column:  amount
dtype before:  float32
dtype after:  float32
******************************
******************************
Column:  data_day
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  data_hour
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  data_minute
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  data_dayofweek
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  target
dtype before:  int64
dtype after:  uint8
******************************
******************************
Column:  sub_channel_x_ISSUER
dtype before:  uint8
dtype after:  uint8
******************************
******************************
Column:  sub_channel_x_ISSUER_ACQUIRE

In [164]:
#delete first column, same info in second
train_data = train_data.drop(columns = ['atm_merchant_name'])

## Feature generating 
Due to avoid data leak we should firstly split our data  
As the data is imbalanced using special split


In [221]:
out_ouf_targ = train_data.loc[:, train_data.columns != 'target']
target_data = train_data['target']
X_train, X_test, y_train, y_test = train_test_split(out_ouf_targ,target_data,test_size =0.2,stratify = target_data)

### Now we can generate features and maybe use sampling techniques


In [222]:
#all amount of money and number of transactions
X_train = X_train.merge(X_train.groupby('user_id')['amount'].agg(total_amount='sum', amount_count='count',mean_sum = 'mean'),left_on='user_id',right_index=True)
X_test = X_test.merge(X_test.groupby('user_id')['amount'].agg(total_amount='sum', amount_count='count',mean_sum='mean'),left_on='user_id',right_index=True)

In [223]:
#transaction/sum all
X_train['trans_div_sum']= (X_train['amount']/X_train['total_amount'])
X_test['trans_div_sum']= (X_test['amount']/X_test['total_amount'])
X_train = X_train.fillna('0')
X_test = X_test.fillna('0')
X_train['trans_div_sum']=X_train['trans_div_sum'].astype('float')
X_test['trans_div_sum'] = X_test['trans_div_sum'].astype('float')

## Let's create model maybe we need sampling techniques

In [224]:
#we don't need this feature and we dom't need to normalize data since we use XGBoost
X_train = X_train.loc[:,X_train.columns != 'user_id']
X_test = X_test.loc[:,X_test.columns != 'user_id']

### We will use XGBoost and HYPERopt


In [36]:
#simple baseline
model = xgb.XGBClassifier(scale_pos_weight=105)
model.fit(X_train,y_train)
preds = model.predict(X_test)



In [37]:
print(classification_report(y_test,preds))

              precision    recall  f1-score   support

           0       0.99      0.87      0.93     58645
           1       0.01      0.13      0.02       558

    accuracy                           0.86     59203
   macro avg       0.50      0.50      0.47     59203
weighted avg       0.98      0.86      0.92     59203



## Adding hyperopt

In [38]:
space = {
    # The maximum depth of a tree, same as GBM.
    # Used to control over-fitting as higher depth will allow model 
    # to learn relations very specific to a particular sample.
    # Should be tuned using CV.
    # Typical values: 3-10
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
    # approach can be more useful in tree-models where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # eta: Analogous to learning rate in GBM
    # Makes the model more robust by shrinking the weights on each step
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
        # colsample_bytree: Similar to max_features in GBM. Denotes the 
    # fraction of columns to be randomly samples for each tree.
    # Typical values: 0.5-1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    
    # A node is split only when the resulting split gives a positive
    # reduction in the loss function. Gamma specifies the 
    # minimum loss reduction required to make a split.
    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # more increases accuracy, but may lead to overfitting.
    # num_leaves: the number of leaf nodes to use. Having a large number 
    # of leaves will improve accuracy, but will also lead to overfitting.
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
        # specifies the minimum samples per leaf node.
    # the minimum number of samples (data) to group into a leaf. 
    # The parameter can greatly assist with overfitting: larger sample
    # sizes per leaf will reduce overfitting (but may lead to under-fitting).
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: represents a fraction of the rows (observations) to be 
    # considered when building each subtree. Tianqi Chen and Carlos Guestrin
    # in their paper A Scalable Tree Boosting System recommend 
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    
    # randomly select a fraction of the features.
    # feature_fraction: controls the subsampling of features used
    # for training (as opposed to subsampling the actual training data in 
    # the case of bagging). Smaller fractions reduce overfitting.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
    # randomly bag or subsample training data.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    # bagging_fraction and bagging_freq: enables bagging (subsampling) 
    # of the training data. Both values need to be set for bagging to be used.
    # The frequency controls how often (iteration) bagging is used. Smaller
    # fractions and frequencies reduce overfitting.
}

In [39]:
def objective(params):
    time1 = time.time()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'num_leaves': '{:.3f}'.format(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'min_child_samples': '{:.3f}'.format(params['min_child_samples']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 7
    count=1
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)
    tss = TimeSeriesSplit(n_splits=FOLDS)
    score_mean = 0
    for tr_idx, val_idx in tss.split(X_train, y_train):
        clf = xgb.XGBClassifier(
            n_estimators=600, random_state=4, verbose=True, verbosity = 0, 
            tree_method='gpu_hist', scale_pos_weight = 105,
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        #y_pred_train = clf.predict_proba(X_vl)[:,1]
        #print(y_pred_train)
        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)
        # plt.show()
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean RC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)

In [116]:
##NOT RUN !! 1.2 hour
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=15)
#Spelling mistake MEAN RC = MEAN ROC_AUC

                                                                                                                       
############## New Run ################
params = {'max_depth': 16, 'gamma': '0.387', 'subsample': '0.20', 'reg_alpha': '0.172', 'reg_lambda': '0.166', 'learning_rate': '0.077', 'num_leaves': '40.000', 'colsample_bytree': '0.881', 'min_child_samples': '140.000', 'feature_fraction': '0.792', 'bagging_fraction': '0.788'}
1 CV - score: 0.5276                                                                                                   
2 CV - score: 0.4947                                                                                                   
3 CV - score: 0.4665                                                                                                   
4 CV - score: 0.4925                                                                                                   
5 CV - score: 0.4912                                                            

6 CV - score: 0.4956                                                                                                   
7 CV - score: 0.4849                                                                                                   
Total Time Run: 4.36                                                                                                   
Mean RC: 0.48539951165842127                                                                                           
                                                                                                                       
############## New Run ################
params = {'max_depth': 7, 'gamma': '0.279', 'subsample': '0.60', 'reg_alpha': '0.218', 'reg_lambda': '0.227', 'learning_rate': '0.196', 'num_leaves': '100.000', 'colsample_bytree': '0.337', 'min_child_samples': '240.000', 'feature_fraction': '0.500', 'bagging_fraction': '0.672'}
1 CV - score: 0.5185                                                            

### Best params


In [120]:
best_params = space_eval(space, best)
print("Best params: ", best_params)
best_params['max_depth'] = int(best_params['max_depth'])

Best params:  {'bagging_fraction': 0.7876629930605514, 'colsample_bytree': 0.8805040393547248, 'feature_fraction': 0.7922494512670488, 'gamma': 0.38715632991228244, 'learning_rate': 0.0774410435034502, 'max_depth': 16.0, 'min_child_samples': 140, 'num_leaves': 40, 'reg_alpha': 0.17158277254946894, 'reg_lambda': 0.16586527906881776, 'scale_pos_weight': 102.67222127585362, 'subsample': 0.2}


### Training with best params and early stopping 

In [259]:
#WORKS IF RUN HYPEROPTIMIZATION
clf = xgb.XGBClassifier(
    n_estimators=300,
    **best_params,
    tree_method='gpu_hist'
)
eval_set = [(X_test, y_test)]
clf.fit(X_train, y_train,early_stopping_rounds=10,eval_metric="auc", eval_set=eval_set, verbose=True)
y_preds = clf.predict(X_test)

NameError: name 'best_params' is not defined

### Top 20 features

In [138]:
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

# Top features
data.head(10)

Unnamed: 0,score
amount,1106
data_minute,921
amount_count,763
data_hour,732
trans_div_sum,697
total_amount,658
mean_sum,656
data_day,567
data_dayofweek,269
sub_channel_y_ATMAPI,111


In [139]:
confusion_matrix(y_test, y_preds)

array([[57418,  1227],
       [  550,     8]], dtype=int64)

In [142]:
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98     58645
           1       0.01      0.01      0.01       558

    accuracy                           0.97     59203
   macro avg       0.50      0.50      0.50     59203
weighted avg       0.98      0.97      0.98     59203



## Lets try: Scale data, use oversampling+undersampling (Tomek+SMOTE) 
Actually we don't need scalling in non-linear models but SMOTE uses knn 


In [225]:
#don't scale ohe
scale_columns = []
other_columns = []
for column in X_train.columns:
    if (column.count('_') < 2) or (column == 'trans_div_sum'):
        scale_columns.append(column)
    else:
        other_columns.append(column)

In [226]:
scaler = StandardScaler().fit(X_train[scale_columns])
scaled_train_columns = scaler.transform(X_train[scale_columns])
X_train_scaled = np.concatenate([scaled_train_columns, X_train[other_columns]], axis=1)
scaler = StandardScaler().fit(X_test[scale_columns])
scaled_test_columns = scaler.transform(X_test[scale_columns])
X_test_scaled = np.concatenate([scaled_test_columns, X_test[other_columns]], axis=1)

## SMOTE sampling

In [239]:
eval_set = [(X_test_scaled, y_test)]
oversample = SMOTE(sampling_strategy=0.1,n_jobs=-1)
X_train_scaled_smote,y_train_smote = oversample.fit_resample(X_train_scaled,y_train)

[0]	validation_0-auc:0.51156
[1]	validation_0-auc:0.50439
[2]	validation_0-auc:0.50228
[3]	validation_0-auc:0.50934
[4]	validation_0-auc:0.50856
[5]	validation_0-auc:0.51339
[6]	validation_0-auc:0.51241
[7]	validation_0-auc:0.51131
[8]	validation_0-auc:0.50597
[9]	validation_0-auc:0.50207
[10]	validation_0-auc:0.49640
[11]	validation_0-auc:0.50126
[12]	validation_0-auc:0.49981
[13]	validation_0-auc:0.49725
[14]	validation_0-auc:0.49476


In [253]:
model_smote = xgb.XGBClassifier()
model_smote.fit(X_train_scaled_smote,y_train_smote,early_stopping_rounds=10,eval_metric="auc", eval_set=eval_set, verbose=True)
preds = model_smote.predict(X_test_scaled)

[0]	validation_0-auc:0.51156
[1]	validation_0-auc:0.50439
[2]	validation_0-auc:0.50228
[3]	validation_0-auc:0.50934
[4]	validation_0-auc:0.50856
[5]	validation_0-auc:0.51339
[6]	validation_0-auc:0.51241
[7]	validation_0-auc:0.51131
[8]	validation_0-auc:0.50597
[9]	validation_0-auc:0.50207
[10]	validation_0-auc:0.49640
[11]	validation_0-auc:0.50126
[12]	validation_0-auc:0.49981
[13]	validation_0-auc:0.49725
[14]	validation_0-auc:0.49476


In [240]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.99      0.72      0.83     58645
           1       0.01      0.29      0.02       558

    accuracy                           0.71     59203
   macro avg       0.50      0.50      0.42     59203
weighted avg       0.98      0.71      0.82     59203



## SMOTE + TOMEK

In [251]:
smt = SMOTETomek(sampling_strategy=0.2)
eval_set = [(X_test_scaled, y_test)]
X_train_scaled_smote_mek,y_train_smote_mek = oversample.fit_resample(X_train_scaled,y_train)

In [252]:
model_smote_mek = xgb.XGBClassifier()
model_smote_mek.fit(X_train_scaled_smote_mek,y_train_smote_mek,early_stopping_rounds=10,eval_metric="auc", eval_set=eval_set, verbose=True)
preds = model_smote_mek.predict(X_test_scaled)

[0]	validation_0-auc:0.47689
[1]	validation_0-auc:0.50446
[2]	validation_0-auc:0.50306
[3]	validation_0-auc:0.49931
[4]	validation_0-auc:0.50118
[5]	validation_0-auc:0.50128
[6]	validation_0-auc:0.50888
[7]	validation_0-auc:0.50938
[8]	validation_0-auc:0.50033
[9]	validation_0-auc:0.50285
[10]	validation_0-auc:0.50165
[11]	validation_0-auc:0.49856
[12]	validation_0-auc:0.49715
[13]	validation_0-auc:0.49581
[14]	validation_0-auc:0.49470
[15]	validation_0-auc:0.50084
[16]	validation_0-auc:0.50347
[17]	validation_0-auc:0.49969


In [249]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

           0       0.99      0.76      0.86     58645
           1       0.01      0.25      0.02       558

    accuracy                           0.75     59203
   macro avg       0.50      0.50      0.44     59203
weighted avg       0.98      0.75      0.85     59203



## Decided to use only SMOTE 

### Training with auto params and early stopping . Tired for Hyperopt

In [266]:
params = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5],
        }

model_smote_final = xgb.XGBClassifier(
    n_estimators=300,
    scale_pos_weight =10,
    tree_method= 'gpu_hist'
)
eval_set = [(X_test_scaled, y_test)]
model_smote_final.fit(X_train_scaled_smote,y_train_smote,early_stopping_rounds=20,eval_metric="auc", eval_set=eval_set, verbose=True)
preds_final = model_smote.predict(X_test_scaled)

[0]	validation_0-auc:0.50998
[1]	validation_0-auc:0.50257
[2]	validation_0-auc:0.50962
[3]	validation_0-auc:0.51760
[4]	validation_0-auc:0.51559
[5]	validation_0-auc:0.51388
[6]	validation_0-auc:0.51825
[7]	validation_0-auc:0.51836
[8]	validation_0-auc:0.52166
[9]	validation_0-auc:0.51710
[10]	validation_0-auc:0.51726
[11]	validation_0-auc:0.51481
[12]	validation_0-auc:0.51967
[13]	validation_0-auc:0.52010
[14]	validation_0-auc:0.51974
[15]	validation_0-auc:0.52157
[16]	validation_0-auc:0.52123
[17]	validation_0-auc:0.51997
[18]	validation_0-auc:0.52140
[19]	validation_0-auc:0.52072
[20]	validation_0-auc:0.51947
[21]	validation_0-auc:0.51893
[22]	validation_0-auc:0.52195
[23]	validation_0-auc:0.52302
[24]	validation_0-auc:0.52096
[25]	validation_0-auc:0.52016
[26]	validation_0-auc:0.51727
[27]	validation_0-auc:0.51847
[28]	validation_0-auc:0.51836
[29]	validation_0-auc:0.51951
[30]	validation_0-auc:0.51850
[31]	validation_0-auc:0.51915
[32]	validation_0-auc:0.51984
[33]	validation_0-au

In [267]:
print(classification_report(y_test, preds_final))

              precision    recall  f1-score   support

           0       0.99      0.72      0.83     58645
           1       0.01      0.29      0.02       558

    accuracy                           0.71     59203
   macro avg       0.50      0.50      0.42     59203
weighted avg       0.98      0.71      0.82     59203

