<a href="https://colab.research.google.com/github/AlessandroVol23/ieee_cis_fraud_detection_kaggle/blob/master/notebooks/0_4_AV_model_pca_features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Model with PCA

- Same baseline model just with some PCA on features

## Imports

In [0]:
import os

import numpy as np
import pandas as pd
from sklearn import preprocessing
import xgboost as xgb

from sklearn.preprocessing import minmax_scale
from sklearn.decomposition import PCA


## Colab Kaggle Init

In [2]:
import os.path

if not os.path.exists('~/.kaggle/'):
  print("Kaggle Folder doesn't exist yet")
  from google.colab import files
  print("Please click on button an upload your kaggle.json api file")
  files.upload()
  
  !mkdir -p ~/.kaggle
  !cp kaggle.json ~/.kaggle/
  !chmod 600 ~/.kaggle/kaggle.json
  !ls ~/.kaggle
  
  !pip install -q kaggle
  !pip install -q kaggle-cli
  
  !kaggle competitions download -c ieee-fraud-detection
  
  !unzip \*.zip
  
  from IPython.display import clear_output
  clear_output()
  
  print("DONE!")
  
else:
  print("Data already exists")

DONE!


## Functions

In [0]:
## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

## Data

In [4]:
# Read in datasets 
df_train_ident = pd.read_csv('train_identity.csv', index_col='TransactionID')
df_test_ident = pd.read_csv('test_identity.csv', index_col='TransactionID')

df_train_trans = pd.read_csv('train_transaction.csv', index_col='TransactionID')
df_test_trans = pd.read_csv('test_transaction.csv', index_col='TransactionID')

df_sample_submission = pd.read_csv('sample_submission.csv', index_col='TransactionID')

# Merge datasets
df_train = df_train_trans.merge(df_train_ident, how='left', left_index=True, right_index=True)
df_test = df_test_trans.merge(df_test_ident, how='left', left_index=True, right_index=True)

# Print shapes
print(df_train.shape)
print(df_test.shape)

(590540, 433)
(506691, 432)


## Preprocessing

In [5]:
df_train = reduce_mem_usage(df_train)
df_test = reduce_mem_usage(df_test)

Mem. usage decreased to 668.22 Mb (66.2% reduction)
Mem. usage decreased to 583.43 Mb (65.6% reduction)


In [0]:
emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 
          'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft',
          'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo',
          'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 
          'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink',
          'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other',
          'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 
          'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 
          'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo',
          'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other',
          'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft',
          'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 
          'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 
          'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 
          'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 
          'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 
          'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 
          'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other',
          'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}

us_emails = ['gmail', 'net', 'edu']

# https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
for c in ['P_emaildomain', 'R_emaildomain']:
    df_train[c + '_bin'] = df_train[c].map(emails)
    df_test[c + '_bin'] = df_test[c].map(emails)
    
    df_train[c + '_suffix'] = df_train[c].map(lambda x: str(x).split('.')[-1])
    df_test[c + '_suffix'] = df_test[c].map(lambda x: str(x).split('.')[-1])
    
    df_train[c + '_suffix'] = df_train[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')
    df_test[c + '_suffix'] = df_test[c + '_suffix'].map(lambda x: x if str(x) not in us_emails else 'us')

In [7]:
df_train.head(10)

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,id_05,id_06,id_07,id_08,id_09,id_10,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,credit,315.0,87.0,19.0,,,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,credit,325.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,google,com,,
2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,debit,330.0,87.0,287.0,,outlook.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,microsoft,com,,
2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,debit,476.0,87.0,,,yahoo.com,,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yahoo,com,,
2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,credit,420.0,87.0,,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,...,,,,,,,100.0,NotFound,,-480.0,New,NotFound,166.0,,542.0,144.0,,,,,,,,New,NotFound,Android 7.0,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M,google,com,,
2987005,0,86510,49.0,W,5937,555.0,150.0,visa,226.0,debit,272.0,87.0,36.0,,gmail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,google,com,,
2987006,0,86522,159.0,W,12308,360.0,150.0,visa,166.0,debit,126.0,87.0,0.0,,yahoo.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yahoo,com,,
2987007,0,86529,422.5,W,12695,490.0,150.0,visa,226.0,debit,325.0,87.0,,,mail.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,other,com,,
2987008,0,86535,15.0,H,2803,100.0,150.0,visa,226.0,debit,337.0,87.0,,,anonymous.com,,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,...,0.0,-5.0,,,,,100.0,NotFound,49.0,-300.0,New,NotFound,166.0,,621.0,500.0,,,,,,,,New,NotFound,iOS 11.1.2,mobile safari 11.0,32.0,1334x750,match_status:1,T,F,F,T,mobile,iOS Device,other,com,,
2987009,0,86536,117.0,W,17399,111.0,150.0,mastercard,224.0,debit,204.0,87.0,19.0,,yahoo.com,,2.0,2.0,0.0,0.0,0.0,3.0,0.0,0.0,3.0,0.0,1.0,0.0,12.0,2.0,61.0,61.0,30.0,318.0,30.0,,,,,40.0,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,yahoo,com,,


In [0]:
# Encode all categorical features
for f in df_train.drop('isFraud', axis=1).columns:
    if df_train[f].dtype=='object' or df_test[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df_train[f].values) + list(df_test[f].values))
        df_train[f] = lbl.transform(list(df_train[f].values))
        df_test[f] = lbl.transform(list(df_test[f].values))   

In [0]:
# Some feature engineering

df_train['Trans_min_mean'] = df_train['TransactionAmt'] - df_train['TransactionAmt'].mean()
df_train['Trans_min_std'] = df_train['Trans_min_mean'] / df_train['TransactionAmt'].std()
df_test['Trans_min_mean'] = df_test['TransactionAmt'] - df_test['TransactionAmt'].mean()
df_test['Trans_min_std'] = df_test['Trans_min_mean'] / df_test['TransactionAmt'].std()

df_train['TransactionAmt_to_mean_card1'] = df_train['TransactionAmt'] / df_train.groupby(['card1'])['TransactionAmt'].transform('mean')
df_train['TransactionAmt_to_mean_card4'] = df_train['TransactionAmt'] / df_train.groupby(['card4'])['TransactionAmt'].transform('mean')
df_train['TransactionAmt_to_std_card1'] = df_train['TransactionAmt'] / df_train.groupby(['card1'])['TransactionAmt'].transform('std')
df_train['TransactionAmt_to_std_card4'] = df_train['TransactionAmt'] / df_train.groupby(['card4'])['TransactionAmt'].transform('std')

df_test['TransactionAmt_to_mean_card1'] = df_test['TransactionAmt'] / df_test.groupby(['card1'])['TransactionAmt'].transform('mean')
df_test['TransactionAmt_to_mean_card4'] = df_test['TransactionAmt'] / df_test.groupby(['card4'])['TransactionAmt'].transform('mean')
df_test['TransactionAmt_to_std_card1'] = df_test['TransactionAmt'] / df_test.groupby(['card1'])['TransactionAmt'].transform('std')
df_test['TransactionAmt_to_std_card4'] = df_test['TransactionAmt'] / df_test.groupby(['card4'])['TransactionAmt'].transform('std')

In [10]:
df_train.head()

Unnamed: 0_level_0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,...,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix,Trans_min_mean,Trans_min_std,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4
TransactionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2987000,0,86400,68.5,4,13926,,150.0,1,142.0,1,315.0,87.0,19.0,,32,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,13.0,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,6,6,6,6,,,0.19458,0.257812,0.0,0.0
2987001,0,86401,29.0,4,2755,404.0,150.0,2,102.0,1,325.0,87.0,,,16,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,4,0,6,6,,,0.123779,0.219116,0.0,0.114258
2987002,0,86469,59.0,4,4663,490.0,150.0,4,166.0,2,330.0,87.0,287.0,,36,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,0.0,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,5,0,6,6,,,0.608398,0.443115,0.589355,0.258545
2987003,0,86499,50.0,4,18132,567.0,150.0,2,117.0,2,476.0,87.0,,,55,32,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,84.0,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,9,0,6,6,,,0.405029,0.377686,0.259521,0.196899
2987004,0,86506,50.0,1,4497,514.0,150.0,2,102.0,1,420.0,87.0,,,16,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,,...,100.0,1,,-480.0,1,1,166.0,,542.0,144.0,,,3,,,,2,1,1,7,162,32.0,268,3,1,0,1,1,1,1565,4,0,6,6,,,0.515625,0.377686,0.882812,0.196899


In [0]:
df_train['TransactionAmt'] = np.log(df_train['TransactionAmt'])
df_test['TransactionAmt'] = np.log(df_test['TransactionAmt'])

## PCA

In [12]:
df_test['isFraud'] = 'test'
df = pd.concat([df_train, df_test], axis = 0, sort=False)
df = df.reset_index()
#df = df.drop('index', axis=1)
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,id_11,id_12,id_13,id_14,id_15,id_16,id_17,id_18,id_19,id_20,id_21,id_22,id_23,id_24,id_25,id_26,id_27,id_28,id_29,id_30,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix,Trans_min_mean,Trans_min_std,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4
0,2987000,0,86400,4.226562,4,13926,,150.0,1,142.0,1,315.0,87.0,19.0,,32,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,6,6,6,6,,,0.19458,0.257812,0.0,0.0
1,2987001,0,86401,3.367188,4,2755,404.0,150.0,2,102.0,1,325.0,87.0,,,16,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,4,0,6,6,,,0.123779,0.219116,0.0,0.114258
2,2987002,0,86469,4.078125,4,4663,490.0,150.0,4,166.0,2,330.0,87.0,287.0,,36,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,5,0,6,6,,,0.608398,0.443115,0.589355,0.258545
3,2987003,0,86499,3.912109,4,18132,567.0,150.0,2,117.0,2,476.0,87.0,,,55,32,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,...,,2,,,3,2,,,,,,,3,,,,2,2,2,86,136,,461,4,2,2,2,2,2,2740,9,0,6,6,,,0.405029,0.377686,0.259521,0.196899
4,2987004,0,86506,3.912109,1,4497,514.0,150.0,2,102.0,1,420.0,87.0,,,16,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,...,100.0,1,,-480.0,1,1,166.0,,542.0,144.0,,,3,,,,2,1,1,7,162,32.0,268,3,1,0,1,1,1,1565,4,0,6,6,,,0.515625,0.377686,0.882812,0.196899


In [0]:
def PCA_change(df, cols, n_components, prefix='PCA_', rand_seed=4):
  from sklearn.preprocessing import minmax_scale
  from sklearn.decomposition import PCA
  
  pca = PCA(n_components=n_components, random_state=rand_seed)

  principalComponents = pca.fit_transform(df[cols])

  principalDf = pd.DataFrame(principalComponents)

  df.drop(cols, axis=1, inplace=True)

  principalDf.rename(columns=lambda x: str(prefix)+str(x), inplace=True)

  df = pd.concat([df, principalDf], axis=1)

  return df

In [0]:
# Get list with all V columns
mas_v = list(filter(lambda x: str.startswith(x, 'V'), list(df_train.columns)))

In [15]:
len(mas_v)

339

In [0]:
for col in mas_v:
  # Fill nas with minimum - 2
  df[col] = df[col].fillna((df[col].min() - 2))
  
  # Scale feature
  df[col] = (minmax_scale(df[col], feature_range=(0,1)))
 

In [0]:
df = PCA_change(df, mas_v, prefix='PCA_V_', n_components=30)

In [18]:
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,...,P_emaildomain_bin,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix,Trans_min_mean,Trans_min_std,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,PCA_V_0,PCA_V_1,PCA_V_2,PCA_V_3,PCA_V_4,PCA_V_5,PCA_V_6,PCA_V_7,PCA_V_8,PCA_V_9,PCA_V_10,PCA_V_11,PCA_V_12,PCA_V_13,PCA_V_14,PCA_V_15,PCA_V_16,PCA_V_17,PCA_V_18,PCA_V_19,PCA_V_20,PCA_V_21,PCA_V_22,PCA_V_23,PCA_V_24,PCA_V_25,PCA_V_26,PCA_V_27,PCA_V_28,PCA_V_29
0,2987000,0,86400,4.226562,4,13926,,150.0,1,142.0,1,315.0,87.0,19.0,,32,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,2.0,0.0,1.0,1.0,14.0,,13.0,,,,,,,...,6,6,6,6,,,0.19458,0.257812,0.0,0.0,-0.008168,0.896768,-0.772604,0.207604,-0.01201,0.039182,-0.165671,0.005578,0.015828,-0.023394,-0.021164,0.011667,-0.011619,0.078595,0.03517,0.012806,-0.011725,-0.008065,-0.003894,-0.001232,0.014847,-0.013241,0.004725,-0.04894,-0.036978,-0.04339,-0.006306,-0.063382,0.007336,-0.051456
1,2987001,0,86401,3.367188,4,2755,404.0,150.0,2,102.0,1,325.0,87.0,,,16,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,4,0,6,6,,,0.123779,0.219116,0.0,0.114258,0.14497,-0.600663,-0.115998,-0.527758,0.007039,-0.039916,-0.110942,-0.029017,-0.009271,0.105701,-0.113208,-0.045032,-0.01523,0.019584,0.004589,0.016261,-0.011488,0.002073,-0.033657,0.002014,0.011012,-0.004015,-0.001425,0.017559,0.034023,0.037497,0.021133,-0.03526,0.004477,0.004268
2,2987002,0,86469,4.078125,4,4663,490.0,150.0,4,166.0,2,330.0,87.0,287.0,,36,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,,,0.0,,,,,,...,5,0,6,6,,,0.608398,0.443115,0.589355,0.258545,-0.604634,0.213996,0.216993,0.013979,-0.020224,0.101989,-0.156353,0.048773,0.008887,-0.029055,-0.003555,0.009281,-0.042361,0.020306,-0.009878,-7.2e-05,-0.016698,7.8e-05,-0.008966,0.003851,-0.005289,-0.002104,-0.00604,-0.001916,-0.000795,-0.000388,-0.00741,-0.000199,-0.015841,-0.000666
3,2987003,0,86499,3.912109,4,18132,567.0,150.0,2,117.0,2,476.0,87.0,,,55,32,2.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,1.0,0.0,25.0,1.0,112.0,112.0,0.0,94.0,0.0,,,,,...,9,0,6,6,,,0.405029,0.377686,0.259521,0.196899,0.013352,-0.615523,-0.100615,-0.563702,0.029317,0.024159,-0.203748,0.149432,0.055324,-0.061442,0.136283,-0.04527,-0.015744,-0.019649,-0.027313,0.028196,0.135071,-0.050881,0.070268,0.024874,0.014941,0.024241,-0.001054,0.033764,0.056825,0.032423,0.009071,-0.015629,0.049825,0.004073
4,2987004,0,86506,3.912109,1,4497,514.0,150.0,2,102.0,1,420.0,87.0,,,16,32,1.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,,,,,,,,,...,4,0,6,6,,,0.515625,0.377686,0.882812,0.196899,2.709322,0.645603,0.619546,0.116302,1.136,-0.001207,-0.173497,-0.300247,-0.084976,-0.038396,0.080889,0.014135,-0.000949,0.00559,-0.001372,-0.021714,0.006357,-0.011552,0.014059,-0.011929,0.013197,0.069722,0.024154,0.016609,-0.045449,-0.010758,-0.00741,0.00723,-0.030608,0.010876


In [19]:
df.dtypes

TransactionID       int64
isFraud            object
TransactionDT       int32
TransactionAmt    float16
ProductCD           int64
card1               int16
card2             float16
card3             float16
card4               int64
card5             float16
card6               int64
addr1             float16
addr2             float16
dist1             float16
dist2             float16
P_emaildomain       int64
R_emaildomain       int64
C1                float16
C2                float16
C3                float16
C4                float16
C5                float16
C6                float16
C7                float16
C8                float16
C9                float16
C10               float16
C11               float16
C12               float16
C13               float16
                   ...   
PCA_V_0           float64
PCA_V_1           float64
PCA_V_2           float64
PCA_V_3           float64
PCA_V_4           float64
PCA_V_5           float64
PCA_V_6           float64
PCA_V_7     

In [0]:
# Encode all categorical features
for f in df.drop('isFraud', axis=1).columns:
    if df[f].dtype=='object': 
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(df[f].values))
        df[f] = lbl.transform(list(df[f].values))

In [0]:
df_train, df_test = df[df['isFraud'] != 'test'], df[df['isFraud'] == 'test'].drop('isFraud', axis=1)

In [22]:
df_train.shape

(590540, 135)

In [0]:
X_train = df_train.sort_values('TransactionDT').drop(['isFraud', 
                                                      'TransactionDT', 
                                                      #'Card_ID'
                                                     ],
                                                     axis=1)
y_train = df_train.sort_values('TransactionDT')['isFraud'].astype(bool)

X_test = df_test.sort_values('TransactionDT').drop(['TransactionDT',
                                                    #'Card_ID'
                                                   ], 
                                                   axis=1)
del df_train
df_test = df_test[["TransactionDT"]]

In [24]:
X_train.dtypes

TransactionID       int64
TransactionAmt    float16
ProductCD           int64
card1               int16
card2             float16
card3             float16
card4               int64
card5             float16
card6               int64
addr1             float16
addr2             float16
dist1             float16
dist2             float16
P_emaildomain       int64
R_emaildomain       int64
C1                float16
C2                float16
C3                float16
C4                float16
C5                float16
C6                float16
C7                float16
C8                float16
C9                float16
C10               float16
C11               float16
C12               float16
C13               float16
C14               float16
D1                float16
                   ...   
PCA_V_0           float64
PCA_V_1           float64
PCA_V_2           float64
PCA_V_3           float64
PCA_V_4           float64
PCA_V_5           float64
PCA_V_6           float64
PCA_V_7     

In [25]:
% time
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
EPOCHS = 3
kf = KFold(n_splits = EPOCHS, shuffle = True)
y_preds = np.zeros(df_sample_submission.shape[0])
y_oof = np.zeros(X_train.shape[0])
i = 0
for tr_idx, val_idx in kf.split(X_train, y_train):
    i += 1
    print("Split {}".format(i))
    clf = xgb.XGBClassifier(
        n_estimators=500,
        max_depth=9,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        tree_method='gpu_hist'
    )
    
    X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
    y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
    clf.fit(X_tr, y_tr)
    y_pred_train = clf.predict_proba(X_vl)[:,1]
    y_oof[val_idx] = y_pred_train
    print('ROC AUC {}'.format(roc_auc_score(y_vl, y_pred_train)))
    
    y_preds+= clf.predict_proba(X_test)[:,1] / EPOCHS

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 4.05 µs
Split 1
ROC AUC 0.9588900769809261
Split 2
ROC AUC 0.9633408886378165
Split 3
ROC AUC 0.9616856273054544


In [0]:
X_test_preds = X_test.assign(isFraud = y_preds)

In [27]:
X_test_preds.head()

Unnamed: 0,TransactionID,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,addr1,addr2,dist1,dist2,P_emaildomain,R_emaildomain,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,C11,C12,C13,C14,D1,D2,D3,D4,D5,D6,D7,D8,D9,D10,D11,...,P_emaildomain_suffix,R_emaildomain_bin,R_emaildomain_suffix,Trans_min_mean,Trans_min_std,TransactionAmt_to_mean_card1,TransactionAmt_to_mean_card4,TransactionAmt_to_std_card1,TransactionAmt_to_std_card4,PCA_V_0,PCA_V_1,PCA_V_2,PCA_V_3,PCA_V_4,PCA_V_5,PCA_V_6,PCA_V_7,PCA_V_8,PCA_V_9,PCA_V_10,PCA_V_11,PCA_V_12,PCA_V_13,PCA_V_14,PCA_V_15,PCA_V_16,PCA_V_17,PCA_V_18,PCA_V_19,PCA_V_20,PCA_V_21,PCA_V_22,PCA_V_23,PCA_V_24,PCA_V_25,PCA_V_26,PCA_V_27,PCA_V_28,PCA_V_29,isFraud
590540,3663549,3.464844,4,10409,111.0,150.0,4,226.0,2,170.0,87.0,1.0,,16,32,6.0,6.0,0.0,0.0,3.0,4.0,0.0,0.0,6.0,0.0,5.0,1.0,115.0,6.0,419.0,419.0,27.0,398.0,27.0,,,,,418.0,203.0,...,0,6,6,,,0.339355,0.237305,0.260254,0.129883,-0.554728,0.246604,0.337474,0.033179,-0.023737,-0.092791,0.075775,-0.083842,-0.063282,0.132838,-0.078747,-0.016921,-0.061115,-0.096361,-0.115193,-0.04251,-0.012989,0.003192,-0.029831,0.000572,-0.029706,0.002824,-0.007199,0.022592,-0.082517,0.104346,0.075076,0.003298,0.065885,0.081385,0.000915
590541,3663550,3.892578,4,4272,111.0,150.0,4,226.0,2,299.0,87.0,4.0,,2,32,3.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,2.0,0.0,1.0,1.0,12.0,2.0,149.0,149.0,7.0,634.0,7.0,,,,,231.0,634.0,...,0,6,6,,,0.333496,0.364014,0.0,0.199219,-0.577638,0.210843,0.2326,0.020214,-0.025376,0.079533,-0.146796,0.039055,-0.032317,0.035633,-0.023607,-0.011849,-0.036534,0.010093,-0.057892,-0.085813,0.084974,-0.002993,-0.086699,0.001286,0.039429,-0.019123,-0.000441,-0.003241,0.018732,-0.010039,-0.009841,0.000648,-0.000283,-0.011197,0.000707
590542,3663551,5.140625,4,4476,574.0,150.0,4,226.0,2,472.0,87.0,2636.0,,19,32,2.0,2.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,0.0,2.0,0.0,22.0,2.0,137.0,137.0,10.0,97.0,10.0,,,,,136.0,136.0,...,0,6,6,,,1.485352,1.270508,1.896484,0.695312,-0.602006,0.201474,0.223318,0.025145,-0.021464,0.058603,-0.172129,0.061025,-0.025983,0.024071,-0.007835,-0.011112,-0.027154,0.070439,-0.069631,-0.087398,0.060579,0.009353,-0.093457,-0.013678,0.090672,-0.038708,0.00184,0.007517,0.00578,0.010685,-0.007389,0.032826,0.006103,0.005565,0.002293
590543,3663552,5.652344,4,10989,360.0,150.0,4,166.0,2,205.0,87.0,17.0,,16,32,5.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,2.0,0.0,2.0,0.0,7.0,4.0,42.0,42.0,41.0,242.0,41.0,,,,,242.0,242.0,...,0,6,6,,,2.970703,2.117188,1.914062,1.158203,-0.782738,0.181128,0.320019,-0.030468,0.02829,-0.046644,0.127985,0.061618,0.023541,-0.122069,0.125268,0.021588,0.058194,-0.04955,-0.123944,-0.065231,-0.060485,0.022989,-0.018981,0.006132,-0.051986,0.023717,0.000269,0.040894,0.003274,-0.007878,0.016107,-0.001832,0.021598,-0.009249,0.001794
590544,3663553,4.21875,4,18018,452.0,150.0,2,117.0,2,264.0,87.0,6.0,,16,32,6.0,6.0,0.0,0.0,2.0,5.0,0.0,0.0,5.0,0.0,6.0,0.0,14.0,6.0,22.0,22.0,0.0,22.0,0.0,,,,,22.0,22.0,...,0,6,6,,,0.567383,0.517578,0.310059,0.277344,-0.727566,0.226788,0.223545,-0.00269,0.027237,-0.064508,0.14923,0.028365,0.014843,-0.062912,0.093837,0.000499,0.097024,-0.036177,-0.012419,-0.016145,0.031031,-0.021404,0.013396,0.01172,0.01179,0.002117,0.00833,0.00817,-0.00061,0.000101,0.012466,-0.023077,-0.037113,0.042061,0.002064


In [0]:
X_test_preds = X_test_preds[['TransactionID', 'isFraud']]

In [29]:
X_test_preds.head()

Unnamed: 0,TransactionID,isFraud
590540,3663549,0.000915
590541,3663550,0.000707
590542,3663551,0.002293
590543,3663552,0.001794
590544,3663553,0.002064


In [0]:
X_test_preds.set_index('TransactionID', inplace=True)

In [31]:
X_test_preds.head()

Unnamed: 0_level_0,isFraud
TransactionID,Unnamed: 1_level_1
3663549,0.000915
3663550,0.000707
3663551,0.002293
3663552,0.001794
3663553,0.002064


In [0]:
X_test_preds.to_csv('preds.csv')

In [33]:
pd.read_csv('preds.csv').head()

Unnamed: 0,TransactionID,isFraud
0,3663549,0.000915
1,3663550,0.000707
2,3663551,0.002293
3,3663552,0.001794
4,3663553,0.002064


In [0]:
#!kaggle competitions submit -c ieee-fraud-detection -f preds.csv -m "PCA"

## Hyperparam search

In [0]:
from sklearn.model_selection import KFold,TimeSeriesSplit, StratifiedKFold
from sklearn.metrics import roc_auc_score
from xgboost import plot_importance
from sklearn.metrics import make_scorer
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

import time
def objective(params):
    time1 = time.time()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'num_leaves': '{:.3f}'.format(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'min_child_samples': '{:.3f}'.format(params['min_child_samples']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 7
    count=1
    skf = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=42)

    tss = TimeSeriesSplit(n_splits=FOLDS)
    y_preds = np.zeros(df_sample_submission.shape[0])
    y_oof = np.zeros(X_train.shape[0])
    score_mean = 0
    for tr_idx, val_idx in tss.split(X_train, y_train):
        clf = xgb.XGBClassifier(
            n_estimators=600, random_state=4, verbose=True, 
            tree_method='gpu_hist', 
            **params
        )

        X_tr, X_vl = X_train.iloc[tr_idx, :], X_train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        #y_pred_train = clf.predict_proba(X_vl)[:,1]
        #print(y_pred_train)
        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)
        # plt.show()
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    return -(score_mean / FOLDS)


space = {
    # The maximum depth of a tree, same as GBM.
    # Used to control over-fitting as higher depth will allow model 
    # to learn relations very specific to a particular sample.
    # Should be tuned using CV.
    # Typical values: 3-10
    'max_depth': hp.quniform('max_depth', 7, 23, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
    # approach can be more useful in tree-models where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # eta: Analogous to learning rate in GBM
    # Makes the model more robust by shrinking the weights on each step
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.2),
    
    # colsample_bytree: Similar to max_features in GBM. Denotes the 
    # fraction of columns to be randomly samples for each tree.
    # Typical values: 0.5-1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, .9),
    
    # A node is split only when the resulting split gives a positive
    # reduction in the loss function. Gamma specifies the 
    # minimum loss reduction required to make a split.
    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # more increases accuracy, but may lead to overfitting.
    # num_leaves: the number of leaf nodes to use. Having a large number 
    # of leaves will improve accuracy, but will also lead to overfitting.
    'num_leaves': hp.choice('num_leaves', list(range(20, 250, 10))),
    
    # specifies the minimum samples per leaf node.
    # the minimum number of samples (data) to group into a leaf. 
    # The parameter can greatly assist with overfitting: larger sample
    # sizes per leaf will reduce overfitting (but may lead to under-fitting).
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: represents a fraction of the rows (observations) to be 
    # considered when building each subtree. Tianqi Chen and Carlos Guestrin
    # in their paper A Scalable Tree Boosting System recommend 
    'subsample': hp.choice('subsample', [0.2, 0.4, 0.5, 0.6, 0.7, .8, .9]),
    
    # randomly select a fraction of the features.
    # feature_fraction: controls the subsampling of features used
    # for training (as opposed to subsampling the actual training data in 
    # the case of bagging). Smaller fractions reduce overfitting.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
    # randomly bag or subsample training data.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    
    # bagging_fraction and bagging_freq: enables bagging (subsampling) 
    # of the training data. Both values need to be set for bagging to be used.
    # The frequency controls how often (iteration) bagging is used. Smaller
    # fractions and frequencies reduce overfitting.
}

In [36]:
# Set algoritm parameters
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=27)

# Print best parameters
best_params = space_eval(space, best)


############## New Run ################
params = {'max_depth': 12, 'gamma': '0.457', 'subsample': '0.20', 'reg_alpha': '0.157', 'reg_lambda': '0.066', 'learning_rate': '0.128', 'num_leaves': '90.000', 'colsample_bytree': '0.827', 'min_child_samples': '230.000', 'feature_fraction': '0.701', 'bagging_fraction': '0.885'}
1 CV - score: 0.8794
2 CV - score: 0.8715
3 CV - score: 0.8966
4 CV - score: 0.8681
5 CV - score: 0.8992
6 CV - score: 0.8879
7 CV - score: 0.8912
Total Time Run: 3.16
  0%|          | 0/27 [03:09<?, ?it/s, best loss: ?]


NameError: ignored