In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
import catboost as cb
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold,TimeSeriesSplit,KFold,GroupKFold
from sklearn.metrics import roc_auc_score
import sqlite3
import xgboost as xgb
import datetime
from sklearn.linear_model import LogisticRegression
from scipy.stats import pearsonr
import gc
from sklearn.model_selection import TimeSeriesSplit
import hashlib

  from numpy.core.umath_tests import inner1d


In [2]:
# Load the raw train / test frames from local pickle files.
# NOTE(review): unpickling can execute arbitrary code - only load files from a trusted source.
train, test = (pd.read_pickle(path) for path in ('train_raw.pkl', 'test_raw.pkl'))

In [3]:
# Turn the TransactionDT second offsets into timestamps (relative to START_DATE)
# and derive day / week / month indices counted from the beginning of 2017.
START_DATE = '2017-11-30'
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
for df in [train, test]:
    # Vectorised equivalent of adding datetime.timedelta(seconds=x) row by row.
    df['TransactionDT'] = pd.to_timedelta(df['TransactionDT'], unit='s') + startdate
    dt = df['TransactionDT'].dt
    years_since_2017 = dt.year - 2017
    # `.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0;
    # `.dt.isocalendar().week` yields the same ISO week number where available.
    if hasattr(dt, 'isocalendar'):
        week = dt.isocalendar().week.astype(np.int64)
    else:
        week = dt.weekofyear
    df['DT_D'] = (years_since_2017*365 + dt.dayofyear).astype(np.int16)
    # NOTE(review): ISO weeks can straddle a year boundary (e.g. 2018-12-31 is ISO week 1),
    # so such dates share a DT_W value with early January of the same calendar year - confirm intended.
    df['DT_W'] = years_since_2017*52 + week
    df['DT_M'] = years_since_2017*12 + dt.month

In [4]:
#### R
# ProductCD_R_Day: number of rows sharing the same (ProductCD, DT_D) pair, kept only on
# rows whose ProductCD is 'R'; every other row receives the sentinel -999.
# The count is taken over non-null isFraud (train) / TransactionAmt (test) values.
# transform('count') returns a Series aligned on the frame's own index, so the result
# does not depend on the frame having a default RangeIndex (the merge-based version did).
for df, counted in ((train, 'isFraud'), (test, 'TransactionAmt')):
    df['ProductCD_R_Day'] = df.groupby(['ProductCD', 'DT_D'])[counted].transform('count')
    df.loc[df.ProductCD != 'R', 'ProductCD_R_Day'] = -999

In [5]:
#### H
# ProductCD_H_Day: number of rows sharing the same (ProductCD, DT_D) pair, kept only on
# rows whose ProductCD is 'H'; every other row receives the sentinel -999.
# The count is taken over non-null isFraud (train) / TransactionAmt (test) values.
# transform('count') returns a Series aligned on the frame's own index, so the result
# does not depend on the frame having a default RangeIndex (the merge-based version did).
for df, counted in ((train, 'isFraud'), (test, 'TransactionAmt')):
    df['ProductCD_H_Day'] = df.groupby(['ProductCD', 'DT_D'])[counted].transform('count')
    df.loc[df.ProductCD != 'H', 'ProductCD_H_Day'] = -999

In [6]:
#### C
# ProductCD_C_Day: number of rows sharing the same (ProductCD, DT_D) pair, kept only on
# rows whose ProductCD is 'C'; every other row receives the sentinel 999999.
# NOTE(review): the sibling R/H/W/S features use -999 as the sentinel; 999999 is kept
# here unchanged because it may be deliberate - confirm.
# transform('count') returns a Series aligned on the frame's own index, so the result
# does not depend on the frame having a default RangeIndex (the merge-based version did).
for df, counted in ((train, 'isFraud'), (test, 'TransactionAmt')):
    df['ProductCD_C_Day'] = df.groupby(['ProductCD', 'DT_D'])[counted].transform('count')
    df.loc[df.ProductCD != 'C', 'ProductCD_C_Day'] = 999999

In [7]:
#### W
# ProductCD_W_Day: number of rows sharing the same (ProductCD, DT_D) pair, kept only on
# rows whose ProductCD is 'W'; every other row receives the sentinel -999.
# The count is taken over non-null isFraud (train) / TransactionAmt (test) values.
# transform('count') returns a Series aligned on the frame's own index, so the result
# does not depend on the frame having a default RangeIndex (the merge-based version did).
for df, counted in ((train, 'isFraud'), (test, 'TransactionAmt')):
    df['ProductCD_W_Day'] = df.groupby(['ProductCD', 'DT_D'])[counted].transform('count')
    df.loc[df.ProductCD != 'W', 'ProductCD_W_Day'] = -999

In [8]:
#### S
# ProductCD_S_Day: number of rows sharing the same (ProductCD, DT_D) pair, kept only on
# rows whose ProductCD is 'S'; every other row receives the sentinel -999.
# The count is taken over non-null isFraud (train) / TransactionAmt (test) values.
# transform('count') returns a Series aligned on the frame's own index, so the result
# does not depend on the frame having a default RangeIndex (the merge-based version did).
for df, counted in ((train, 'isFraud'), (test, 'TransactionAmt')):
    df['ProductCD_S_Day'] = df.groupby(['ProductCD', 'DT_D'])[counted].transform('count')
    df.loc[df.ProductCD != 'S', 'ProductCD_S_Day'] = -999

In [9]:
# Day index minus the D1 / D2 offsets.
# NOTE(review): the names suggest "day the card was opened" / "day of first transaction";
# that reading depends on what D1 and D2 mean - confirm against the data dictionary.
for frame in (train, test):
    frame['open_card'] = frame['DT_D'] - frame['D1']
    frame['first_tran'] = frame['DT_D'] - frame['D2']

In [10]:
# uid1: the card1..card6, addr1, addr2 and open_card values rendered as strings and
# joined with single spaces (presumably a per-cardholder key - verify).
uid1_parts = ['card1', 'card2', 'card3', 'card4', 'card5', 'card6', 'addr1', 'addr2', 'open_card']
for frame in (train, test):
    uid = frame[uid1_parts[0]].astype(str)
    for part in uid1_parts[1:]:
        uid = uid + ' ' + frame[part].astype(str)
    frame['uid1'] = uid


In [11]:
def device_hash(x):
    """Return a short fingerprint of the device-related fields of one row.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must expose the keys 'id_30', 'id_31', 'id_32', 'id_33',
        'DeviceType' and 'DeviceInfo'.

    Returns
    -------
    str
        The first 15 hexadecimal characters of the SHA-256 digest of the six
        values converted with ``str`` and concatenated without a separator
        (so a float NaN contributes the literal text 'nan').
    """
    fields = ('id_30', 'id_31', 'id_32', 'id_33', 'DeviceType', 'DeviceInfo')
    fingerprint_source = ''.join(str(x[name]) for name in fields)
    digest = hashlib.sha256(fingerprint_source.encode('utf-8')).hexdigest()
    return digest[0:15]

In [12]:
# Fingerprint every row's device fields (row-wise apply; device_hash reads six columns).
for frame in (train, test):
    frame['device_hash'] = frame.apply(device_hash, axis=1)

In [13]:
# Stack the (uid1, device_hash) pairs of train and test, then count the distinct
# device hashes seen for each uid1. `tmp` is a one-column frame named 'nunique'.
pair_cols = ['uid1', 'device_hash']
concat_df = pd.concat([frame[pair_cols] for frame in (train, test)])
tmp = concat_df.groupby('uid1')['device_hash'].nunique().to_frame('nunique')

In [14]:
# Map each uid1 to the value stored for it in tmp['nunique'].
# NOTE(review): `tmp` is reassigned by a later cell, so this cell is only correct when run in order.
uid_lookup = tmp['nunique'].to_dict()
for frame in (train, test):
    frame['uid_device_nunique'] = frame['uid1'].map(uid_lookup)

In [15]:
# Reverse direction: number of distinct uid1 values observed per device_hash.
# Reuses `concat_df` (stacked uid1 / device_hash pairs) and overwrites `tmp`.
tmp = concat_df.groupby('device_hash')['uid1'].nunique().to_frame('nunique')

device_lookup = tmp['nunique'].to_dict()
for frame in (train, test):
    frame['device_uid_nunique'] = frame['device_hash'].map(device_lookup)

In [16]:
# train['uid1'] = train.card1.astype(str) +' '+ train.card2.astype(str)+' '+ train.card3.astype(str)+' '+train.card4.astype(str)+' '+ train.card5.astype(str)+' '+ train.card6.astype(str) +' '+ train.addr1.astype(str)+' '+train.addr2.astype(str)+' '+train.open_card.astype(str)
# test['uid1'] = test.card1.astype(str) +' '+ test.card2.astype(str)+' '+ test.card3.astype(str)+' '+ test.card4.astype(str)+' '+ test.card5.astype(str)+' '+ test.card6.astype(str) +' '+ test.addr1.astype(str)+' '+test.addr2.astype(str)+' '+test.open_card.astype(str)

###train['uid2'] = train.card1.astype(str) +' '+ train.card2.astype(str)+' '+ train.card3.astype(str)+' '+train.card4.astype(str)+' '+ train.card5.astype(str)+' '+ train.card6.astype(str) +' '+ train.addr1.astype(str)+' '+train.addr2.astype(str)+' '+train.open_card.astype(str)+' '+train.first_tran.astype(str)
###test['uid2'] = test.card1.astype(str) +' '+ test.card2.astype(str)+' '+ test.card3.astype(str)+' '+ test.card4.astype(str)+' '+ test.card5.astype(str)+' '+ test.card6.astype(str) +' '+ test.addr1.astype(str)+' '+test.addr2.astype(str)+' '+test.open_card.astype(str)+' '+test.first_tran.astype(str)

# train['uid3'] = train.card1.astype(str) +' '+ train.card2.astype(str)+' '+ train.card3.astype(str)+' '+train.card4.astype(str)+' '+ train.card5.astype(str)+' '+ train.card6.astype(str) +' '+ train.addr1.astype(str)+' '+train.addr2.astype(str)+' '+train.open_card.astype(str)+' '+train.first_tran.astype(str)+' '+train.P_emaildomain.astype(str)
# test['uid3'] = test.card1.astype(str) +' '+ test.card2.astype(str)+' '+ test.card3.astype(str)+' '+ test.card4.astype(str)+' '+ test.card5.astype(str)+' '+ test.card6.astype(str) +' '+ test.addr1.astype(str)+' '+test.addr2.astype(str)+' '+test.open_card.astype(str)+' '+test.first_tran.astype(str)+' '+test.P_emaildomain.astype(str)

def change(hoge):
    """Count the significant decimal digits of an amount, up to three.

    The value is rounded to three decimals and scaled to an integer number of
    thousandths; one digit is subtracted for every trailing zero of that
    integer, never going below 0.

    Parameters
    ----------
    hoge : float or int
        The amount to inspect.

    Returns
    -------
    int
        A value in 0..3 (e.g. 1.5 -> 1, 1.25 -> 2, 1.234 -> 3, 100 -> 0).
        An amount that rounds to zero returns 0.

    Raises
    ------
    ValueError
        If ``hoge`` is NaN (it cannot be converted to an integer).
    """
    thousandths = int(np.round(np.round(hoge, 3) * 1000))
    num = 3
    # Bounding the loop by `num` is what makes a zero amount terminate:
    # 0 % 10 == 0 holds forever, so an unbounded loop would never exit.
    while num > 0 and thousandths % 10 == 0:
        num -= 1
        thousandths //= 10
    return num
  
# Number of significant decimal digits of each transaction amount (see `change`).
for frame in (train, test):
    frame['decimal_digit'] = frame['TransactionAmt'].map(change)
gc.collect()
# Missing had_id values become 0.
for frame in (train, test):
    frame['had_id'] = frame['had_id'].fillna(0)
# Columns that are label-encoded further down in this cell.
cat_columns = (
    ['uid1']
    + ['id_{}'.format(n) for n in range(12, 39)]
    + ['DeviceType', 'DeviceInfo', 'ProductCD', 'card4', 'card6', 'M4', 'P_emaildomain',
       'R_emaildomain', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
    + ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']
    + ['hour', 'dow', 'device_name', 'device_version', 'OS_id_30', 'browser_id_31']
)
# Columns that receive a `<name>_count_full` frequency feature further down in this cell.
count_columns = [
    'uid1',
    'id_13', 'id_14', 'id_17', 'id_18', 'id_19', 'id_20', 'id_21',
    'id_22', 'id_24', 'id_25', 'id_26', 'id_30', 'id_31', 'id_33',
    'DeviceInfo', 'card6', 'P_emaildomain', 'R_emaildomain', 'card1',
    'card2', 'card3', 'card5', 'addr1', 'addr2',
    'hour', 'device_version', 'OS_id_30', 'browser_id_31',
]
### scale
# Each listed D column is divided by the maximum it reaches inside the same period
# (week index DT_W or month index DT_M), computed separately for train and test.
scale_plan = [
    ('DT_W', ['D15', 'D2', 'D1', 'D4', 'D6', 'D10', 'D11', 'D12']),
    ('DT_M', ['D3', 'D5', 'D7', 'D8', 'D13']),
    ('DT_W', ['D14']),
]
for period_col, d_cols in scale_plan:
    for d_col in d_cols:
        for frame in (train, test):
            period_max = frame.groupby(period_col)[d_col].transform('max')
            frame[d_col + '_revised'] = frame[d_col] / period_max
# NOTE(review): hand-tuned correction for test week 78 only; the constants 900 and 530
# are not explained anywhere in the notebook - confirm their origin. NaN stays NaN.
in_week_78 = test.DT_W == 78
test.loc[in_week_78, 'D14_revised'] = test.loc[in_week_78, 'D14_revised'] / 900 * 530
###
# Day-of-week / hour-of-day from the transaction timestamp, a 0/1 flag telling whether
# the purchaser and recipient e-mail domains are equal, and removal of D9.
for frame in (train, test):
    stamp = frame['TransactionDT'].dt
    frame['dow'] = stamp.dayofweek
    frame['hour'] = stamp.hour
    same_domain = frame['P_emaildomain'].values == frame['R_emaildomain'].values
    frame['email_domain_comp'] = same_domain.astype(int)
    frame.drop(columns=['D9'], inplace=True)
# X_train = train.drop(['TransactionID','TransactionDT'],axis=1)
# X_test = test.drop(['TransactionID','TransactionDT'],axis=1)
# Label-encode every categorical column with one encoder fitted on the string form of
# the train + test values, so both frames share the same integer codes.
for f in cat_columns:
    train_labels = train[f].astype(str).tolist()
    test_labels = test[f].astype(str).tolist()
    lbl = preprocessing.LabelEncoder().fit(train_labels + test_labels)
    train[f] = lbl.transform(train_labels)
    test[f] = lbl.transform(test_labels)
# Remaining missing values get the sentinel -999.
for frame in (train, test):
    frame.fillna(-999, inplace=True)
# Frequency encoding: how often each value occurs in train + test combined.
for col in count_columns:
    combined_counts = pd.concat([train[col], test[col]], ignore_index=True).value_counts(dropna=False)
    for frame in (train, test):
        frame[col + '_count_full'] = frame[col].map(combined_counts)
#train['decimal_digit_count_full'] = train['decimal_digit'].map(pd.concat([train['decimal_digit'], test['decimal_digit']], ignore_index=True).value_counts(dropna=False))
#test['decimal_digit_count_full'] = test['decimal_digit'].map(pd.concat([train['decimal_digit'], test['decimal_digit']], ignore_index=True).value_counts(dropna=False))
# day_count / hour_count: number of transactions (non-null TransactionAmt) that fall on
# the same calendar day / the same calendar day and hour, over train + test combined.
n_train = len(train)  # split point of the stacked frame (replaces the literal 590540)
activity_cols = ['TransactionDT', 'TransactionAmt']
# Only the two columns used here are stacked; the name is kept because a later
# statement in this cell deletes `train_test_all`.
train_test_all = pd.concat([train[activity_cols], test[activity_cols]], ignore_index=True, sort=False)
stamp = train_test_all['TransactionDT'].dt
day_key = stamp.date.rename('day')
hour_key = stamp.hour.rename('hour')
amounts = train_test_all['TransactionAmt']
train_test_all['day_count'] = amounts.groupby(day_key).transform('count')
# (date, hour) identifies the same groups as the first 13 characters of the timestamp text.
train_test_all['hour_count'] = amounts.groupby([day_key, hour_key]).transform('count')
for count_col in ('day_count', 'hour_count'):
    stacked = train_test_all[count_col].values
    train[count_col] = stacked[:n_train]
    test[count_col] = stacked[n_train:]
# a= mean_encode(train,test,columns=cat_columns,target_col='isFraud',
#                reg_method = 'k_fold',folds=5,alpha =5 )
# for i in ['id_15', 'id_16', 'M4', 'card5','DeviceInfo']:
#     train['mean_'+i] = a['mean_isFraud_'+i][:590540].tolist()
#     test['mean_'+i] = a['mean_isFraud_'+i][590540:].tolist()
# Separate the target from the model inputs; TransactionID and the raw timestamp
# are excluded from both feature matrices.
non_feature_cols = ['TransactionID', 'TransactionDT']
y_train = train['isFraud'].copy()
X_train = train.drop(columns=['TransactionID', 'isFraud', 'TransactionDT'])
X_test = test.drop(columns=non_feature_cols)
del train_test_all
### add new
# ProductID: label-encoded "<TransactionAmt>_<ProductCD>" string, using one encoder
# fitted on train + test, followed by its frequency over train + test.
train_pairs = (X_train['TransactionAmt'].astype(str) + '_' + X_train['ProductCD'].astype(str)).tolist()
test_pairs = (X_test['TransactionAmt'].astype(str) + '_' + X_test['ProductCD'].astype(str)).tolist()
le = LabelEncoder().fit(train_pairs + test_pairs)
X_train['ProductID'] = le.transform(train_pairs)
X_test['ProductID'] = le.transform(test_pairs)
product_counts = pd.concat([X_train['ProductID'], X_test['ProductID']], ignore_index=True).value_counts(dropna=False)
X_train['ProductID_count_full'] = X_train['ProductID'].map(product_counts)
X_test['ProductID_count_full'] = X_test['ProductID'].map(product_counts)
###
# Pairwise interaction features, named "<left>__<right>".
temp = [
    'DeviceInfo__P_emaildomain',
    'card1__card5',
    'card2__id_20',
    'card5__P_emaildomain',
    'addr1__card1',
    'addr1__addr2',
    'card1__card2',
    'card2__addr1',
    'card1__P_emaildomain',
    'card2__P_emaildomain',
    'addr1__P_emaildomain',
    'DeviceInfo__id_31',
    'DeviceInfo__id_20',
    'DeviceType__id_31',
    'DeviceType__id_20',
    'DeviceType__P_emaildomain',
    'card1__M4',
    'card2__M4',
    'addr1__M4',
    'P_emaildomain__M4',
    'uid1__ProductID',
    'uid1__DeviceInfo',
]
# Pass 1: build "<left value>_<right value>" strings and label-encode them with an
# encoder fitted on train + test.
for feature in temp:
    left, right = feature.split('__')
    train_combo = (X_train[left].astype(str) + '_' + X_train[right].astype(str)).tolist()
    test_combo = (X_test[left].astype(str) + '_' + X_test[right].astype(str)).tolist()
    le = LabelEncoder().fit(train_combo + test_combo)
    X_train[feature] = le.transform(train_combo)
    X_test[feature] = le.transform(test_combo)

# Pass 2: frequency of every encoded pair over train + test. Kept as a separate pass so
# the *_count_full columns are appended after all encoded columns (column order unchanged).
for feature in temp:
    pair_counts = pd.concat([X_train[feature], X_test[feature]], ignore_index=True).value_counts(dropna=False)
    X_train[feature + '_count_full'] = X_train[feature].map(pair_counts)
    X_test[feature + '_count_full'] = X_test[feature].map(pair_counts)
# For every (continuous, categorical) pair, add the mean and standard deviation of the
# continuous column within each category value, computed over train + test combined.
con_fea = ['V258','C1','C14','C13','TransactionAmt','D15_revised','D2_revised','id_02','dist1','V294','C11']
cat_fea = ['card1','card2','addr1','card4','R_emaildomain','P_emaildomain','ProductID','uid1']
n_train = len(X_train)  # split point of the stacked frame (replaces the literal 590540)
train_test = pd.concat([X_train,X_test],ignore_index=True,sort=False)
for cont in con_fea:
    # -999 is the missing-value sentinel; turn it back into NaN so it is ignored by
    # mean/std. Done once per continuous column instead of once per statistic.
    observed = train_test[cont].where(train_test[cont] != -999)
    for group_col in cat_fea:
        grouped = observed.groupby(train_test[group_col])
        for stat in ('mean', 'std'):
            per_row = grouped.transform(stat).values
            X_train[cont+'_'+group_col+'_'+stat] = per_row[:n_train]
            X_test[cont+'_'+group_col+'_'+stat] = per_row[n_train:]
# Groups without a defined statistic (e.g. std of a single observation) yield NaN;
# those become the sentinel again.
X_train.fillna(-999,inplace=True)
X_test.fillna(-999,inplace=True)
# Remove the helper / raw columns from both feature matrices, then display the shape.
cols_to_drop = [
    'DeviceInfo', 'device_version', 'DT_D', 'DT_W', 'DT_M',
    'D15', 'D2', 'D1', 'D4', 'D6', 'D10', 'D11', 'D12',
    'D3', 'D5', 'D7', 'D8', 'D13', 'D14',
    'TransactionAmt_ProductID_mean',
]
for frame in (X_train, X_test):
    frame.drop(columns=cols_to_drop, inplace=True)
X_train.shape

(590540, 700)

In [17]:
# Drop every feature listed from row label 350 onward in importance.csv
# (`.loc[350:]` is label-based; the names are read from the 'Unnamed: 0' column).
# NOTE(review): presumably the file is sorted by decreasing importance - verify.
orders = pd.read_csv('importance.csv')
drop = orders.loc[350:, 'Unnamed: 0'].tolist()
for frame in (X_train, X_test):
    frame.drop(columns=drop, inplace=True)

In [18]:
# Display the (rows, columns) shape of the training feature matrix at this point.
X_train.shape

(590540, 359)

In [19]:
# Names of the columns handed to CatBoost as categorical features
# (before the entries that were dropped from the matrices are removed).
cat = (
    ['uid1']
    + ['id_{}'.format(n) for n in range(12, 39)]
    + ['DeviceType', 'ProductCD', 'card4', 'card6', 'M4', 'P_emaildomain',
       'R_emaildomain', 'card1', 'card2', 'card3', 'card5', 'addr1', 'addr2']
    + ['M1', 'M2', 'M3', 'M5', 'M6', 'M7', 'M8', 'M9']
    + ['hour', 'dow', 'device_name', 'OS_id_30', 'browser_id_31', 'ProductID']
    + [
        'DeviceInfo__P_emaildomain',
        'card1__card5',
        'card2__id_20',
        'card5__P_emaildomain',
        'addr1__card1',
        'addr1__addr2',
        'card1__card2',
        'card2__addr1',
        'card1__P_emaildomain',
        'card2__P_emaildomain',
        'addr1__P_emaildomain',
        'DeviceInfo__id_31',
        'DeviceInfo__id_20',
        'DeviceType__id_31',
        'DeviceType__id_20',
        'DeviceType__P_emaildomain',
        'card1__M4',
        'card2__M4',
        'addr1__M4',
        'P_emaildomain__M4',
        'uid1__ProductID',
        'uid1__DeviceInfo',
    ]
)

In [20]:
# Remove from the categorical list every feature that was dropped from the matrices.
for dropped_name in drop:
    try:
        cat.remove(dropped_name)
    except ValueError:
        # The dropped feature was not a categorical one - nothing to remove.
        pass

In [31]:
# For each categorical column, report which fraction of its distinct train / test values
# also occurs in the other frame, then replace values seen in only one frame by -999.
for column in cat:
    train_set, test_set = set(X_train[column]), set(X_test[column])
    tt = train_set & test_set
    print('----------------------------------------')
    print(column)
    print('train:','{:.2f}'.format(len(tt)/len(train_set)))
    print('test:','{:.2f}'.format(len(tt)/len(test_set)))

    def _keep_shared(value, _shared=tt):
        # Values present in both frames pass through; anything else becomes the sentinel.
        return value if value in _shared else -999

    X_train[column] = X_train[column].map(_keep_shared)
    X_test[column] = X_test[column].map(_keep_shared)

----------------------------------------
uid1
train: 0.12
test: 0.14
----------------------------------------
id_13
train: 0.49
test: 0.96
----------------------------------------
id_14
train: 0.96
test: 0.89
----------------------------------------
id_18
train: 0.89
test: 0.94
----------------------------------------
id_19
train: 0.87
test: 0.91
----------------------------------------
id_20
train: 0.72
test: 0.65
----------------------------------------
id_30
train: 0.99
test: 0.86
----------------------------------------
id_31
train: 0.72
test: 0.69
----------------------------------------
id_33
train: 0.73
test: 0.49
----------------------------------------
card6
train: 0.80
test: 1.00
----------------------------------------
M4
train: 1.00
test: 1.00
----------------------------------------
P_emaildomain
train: 1.00
test: 0.98
----------------------------------------
R_emaildomain
train: 1.00
test: 1.00
----------------------------------------
card1
train: 0.72
test: 0.73
--------

In [21]:
# device_hash was only needed to build the uid/device nunique features; remove it
# from both feature matrices.
for frame in (X_train, X_test):
    frame.drop(columns=['device_hash'], inplace=True)

In [22]:
# Display the (rows, columns) shape of the training feature matrix at this point.
X_train.shape

(590540, 358)

In [30]:
### go LB
kf = KFold(n_splits = 5, shuffle =False)
stack_train = np.zeros([X_train.shape[0],])
resu1 = 0
#impor1 = 0
for train_index, test_index in kf.split(X_train, y_train):
    X_train1= X_train.iloc[train_index,:]
    y_train1= y_train.iloc[train_index]
    X_test1= X_train.iloc[test_index,:]
    y_test1= y_train.iloc[test_index]
    clf = cb.CatBoostClassifier(n_estimators=10000, random_state=0, learning_rate= 0.05,depth=10,cat_features = cat,
                               early_stopping_rounds = 400,eval_metric='AUC',task_type = 'GPU',border_count = 254,l2_leaf_reg=2)
    clf.fit(X_train1,y_train1,eval_set = (X_test1,y_test1),verbose=100)
    resu1 += clf.predict_proba(X_test)[:,1]/5
    temp_predict = clf.predict_proba(X_test1)[:,1]
    stack_train[test_index] = temp_predict
    print('done')
    gc.collect()
resu = pd.read_csv('../../sample_submission.csv')
resu['isFraud'] = resu1
resu.to_csv('10_4_catboost.csv',index=False)

0:	learn: 0.9259948	test: 0.7661170	best: 0.7661170 (0)	total: 226ms	remaining: 37m 42s
100:	learn: 0.9768629	test: 0.9178556	best: 0.9178556 (100)	total: 22.5s	remaining: 36m 42s
200:	learn: 0.9804655	test: 0.9296348	best: 0.9296348 (200)	total: 44.2s	remaining: 35m 52s
300:	learn: 0.9822619	test: 0.9356003	best: 0.9356225 (299)	total: 1m 6s	remaining: 35m 53s
400:	learn: 0.9837284	test: 0.9387641	best: 0.9387736 (395)	total: 1m 30s	remaining: 36m 9s
500:	learn: 0.9849352	test: 0.9405177	best: 0.9405177 (500)	total: 1m 55s	remaining: 36m 22s
600:	learn: 0.9860211	test: 0.9426259	best: 0.9426323 (599)	total: 2m 19s	remaining: 36m 26s
700:	learn: 0.9868497	test: 0.9442004	best: 0.9442406 (698)	total: 2m 44s	remaining: 36m 21s
800:	learn: 0.9878129	test: 0.9453971	best: 0.9454157 (788)	total: 3m 9s	remaining: 36m 14s
900:	learn: 0.9886663	test: 0.9467204	best: 0.9467204 (900)	total: 3m 34s	remaining: 36m 5s
1000:	learn: 0.9893571	test: 0.9476216	best: 0.9476229 (998)	total: 3m 59s	remain

200:	learn: 0.9784948	test: 0.9642524	best: 0.9642524 (200)	total: 44.8s	remaining: 36m 23s
300:	learn: 0.9806073	test: 0.9663358	best: 0.9663426 (299)	total: 1m 9s	remaining: 37m 18s
400:	learn: 0.9821578	test: 0.9679272	best: 0.9679272 (400)	total: 1m 34s	remaining: 37m 37s
500:	learn: 0.9833518	test: 0.9686972	best: 0.9687101 (497)	total: 1m 59s	remaining: 37m 39s
600:	learn: 0.9843065	test: 0.9692003	best: 0.9692003 (600)	total: 2m 24s	remaining: 37m 37s
700:	learn: 0.9852154	test: 0.9698293	best: 0.9698293 (700)	total: 2m 49s	remaining: 37m 24s
800:	learn: 0.9860694	test: 0.9702656	best: 0.9702656 (800)	total: 3m 14s	remaining: 37m 12s
900:	learn: 0.9868647	test: 0.9705834	best: 0.9705834 (900)	total: 3m 39s	remaining: 36m 54s
1000:	learn: 0.9876322	test: 0.9709226	best: 0.9709226 (1000)	total: 4m 4s	remaining: 36m 40s
1100:	learn: 0.9884220	test: 0.9712241	best: 0.9712277 (1098)	total: 4m 30s	remaining: 36m 25s
1200:	learn: 0.9892052	test: 0.9713671	best: 0.9713671 (1200)	total: 

In [32]:
# XGBoost cross-validation: unshuffled KFold (consecutive row blocks), one model per fold
# with early stopping on the held-out block. Accumulates the fold-averaged test
# prediction (y_pred), validation AUC (resu1) and feature importances (impor1), then
# writes the submission file.
kf=KFold(n_splits = 5)
# Fold count taken from the splitter so the averages stay correct if n_splits changes.
n_folds = kf.get_n_splits()
resu1 = 0
impor1 = 0
y_pred = 0
# Out-of-fold predictions for the training rows.
stack_train = np.zeros([X_train.shape[0],])
for train_index, test_index in kf.split(X_train, y_train):
    X_train2= X_train.iloc[train_index,:]
    y_train2= y_train.iloc[train_index]
    X_test2= X_train.iloc[test_index,:]
    y_test2= y_train.iloc[test_index]
    clf = xgb.XGBClassifier(n_estimators=10000, max_depth=11, learning_rate=0.01,random_state=0, subsample=0.8,
                                 colsample_bytree=0.6,min_child_weight = 3,reg_alpha=1,reg_lambda = 0.01,n_jobs=-1,tree_method='gpu_hist')

    # NOTE(review): passing eval_metric / early_stopping_rounds to fit() is only accepted by
    # older xgboost releases; newer ones expect them on the estimator - confirm the pinned version.
    clf.fit(X_train2,y_train2,eval_set = [(X_train2,y_train2),(X_test2,y_test2)], eval_metric = 'auc',early_stopping_rounds=500,verbose=30)
    temp_predict = clf.predict_proba(X_test2)[:,1]
    stack_train[test_index] = temp_predict
    y_pred += clf.predict_proba(X_test)[:,1]/n_folds
    roc = roc_auc_score(y_test2, temp_predict)
    print(roc)
    resu1 += roc/n_folds
    impor1 += clf.feature_importances_/n_folds
    gc.collect()
print('End:',resu1)
resu = pd.read_csv('../../sample_submission.csv')
resu['isFraud'] = y_pred
resu.to_csv('10_4_xgb.csv',index=False)


[0]	validation_0-auc:0.872459	validation_1-auc:0.828763
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 500 rounds.
[30]	validation_0-auc:0.906986	validation_1-auc:0.87227
[60]	validation_0-auc:0.912122	validation_1-auc:0.875567
[90]	validation_0-auc:0.919767	validation_1-auc:0.879905
[120]	validation_0-auc:0.928479	validation_1-auc:0.885434
[150]	validation_0-auc:0.938242	validation_1-auc:0.892142
[180]	validation_0-auc:0.946396	validation_1-auc:0.896892
[210]	validation_0-auc:0.953889	validation_1-auc:0.902484
[240]	validation_0-auc:0.962615	validation_1-auc:0.906767
[270]	validation_0-auc:0.967871	validation_1-auc:0.909721
[300]	validation_0-auc:0.972039	validation_1-auc:0.912217
[330]	validation_0-auc:0.975806	validation_1-auc:0.91465
[360]	validation_0-auc:0.978932	validation_1-auc:0.916841
[390]	validation_0-auc:0.981693	validation_1-auc:0.918938
[420]	validation_0-auc:0.984036	valid

[1290]	validation_0-auc:0.998226	validation_1-auc:0.95621
[1320]	validation_0-auc:0.998342	validation_1-auc:0.956293
[1350]	validation_0-auc:0.998445	validation_1-auc:0.956305
[1380]	validation_0-auc:0.998533	validation_1-auc:0.95639
[1410]	validation_0-auc:0.998621	validation_1-auc:0.956389
[1440]	validation_0-auc:0.9987	validation_1-auc:0.956474
[1470]	validation_0-auc:0.998784	validation_1-auc:0.956465
[1500]	validation_0-auc:0.99886	validation_1-auc:0.956443
[1530]	validation_0-auc:0.998928	validation_1-auc:0.956479
[1560]	validation_0-auc:0.998996	validation_1-auc:0.95659
[1590]	validation_0-auc:0.999048	validation_1-auc:0.95659
[1620]	validation_0-auc:0.999115	validation_1-auc:0.956635
[1650]	validation_0-auc:0.99918	validation_1-auc:0.956664
[1680]	validation_0-auc:0.999233	validation_1-auc:0.95669
[1710]	validation_0-auc:0.999277	validation_1-auc:0.956719
[1740]	validation_0-auc:0.999325	validation_1-auc:0.956742
[1770]	validation_0-auc:0.999363	validation_1-auc:0.956728
[1800]

[0]	validation_0-auc:0.854853	validation_1-auc:0.849849
Multiple eval metrics have been passed: 'validation_1-auc' will be used for early stopping.

Will train until validation_1-auc hasn't improved in 500 rounds.
[30]	validation_0-auc:0.901716	validation_1-auc:0.900937
[60]	validation_0-auc:0.907622	validation_1-auc:0.904855
[90]	validation_0-auc:0.914202	validation_1-auc:0.909249
[120]	validation_0-auc:0.920519	validation_1-auc:0.913541
[150]	validation_0-auc:0.928756	validation_1-auc:0.919551
[180]	validation_0-auc:0.935744	validation_1-auc:0.924609
[210]	validation_0-auc:0.944472	validation_1-auc:0.930786
[240]	validation_0-auc:0.952445	validation_1-auc:0.935932
[270]	validation_0-auc:0.959698	validation_1-auc:0.940312
[300]	validation_0-auc:0.966824	validation_1-auc:0.944583
[330]	validation_0-auc:0.971734	validation_1-auc:0.947644
[360]	validation_0-auc:0.975782	validation_1-auc:0.950514
[390]	validation_0-auc:0.979126	validation_1-auc:0.952491
[420]	validation_0-auc:0.982117	val

[1230]	validation_0-auc:0.998202	validation_1-auc:0.943253
[1260]	validation_0-auc:0.998306	validation_1-auc:0.943373
[1290]	validation_0-auc:0.998413	validation_1-auc:0.943487
[1320]	validation_0-auc:0.998513	validation_1-auc:0.943578
[1350]	validation_0-auc:0.998615	validation_1-auc:0.943654
[1380]	validation_0-auc:0.99871	validation_1-auc:0.943888
[1410]	validation_0-auc:0.998786	validation_1-auc:0.944026
[1440]	validation_0-auc:0.998861	validation_1-auc:0.944171
[1470]	validation_0-auc:0.998929	validation_1-auc:0.944308
[1500]	validation_0-auc:0.999002	validation_1-auc:0.944381
[1530]	validation_0-auc:0.999062	validation_1-auc:0.944386
[1560]	validation_0-auc:0.999125	validation_1-auc:0.944471
[1590]	validation_0-auc:0.999189	validation_1-auc:0.944585
[1620]	validation_0-auc:0.99924	validation_1-auc:0.944639
[1650]	validation_0-auc:0.999292	validation_1-auc:0.94467
[1680]	validation_0-auc:0.999339	validation_1-auc:0.944745
[1710]	validation_0-auc:0.999376	validation_1-auc:0.944857
