In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,TimeSeriesSplit
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sin,log,pow
import lightgbm as lgb
import datetime
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder,StandardScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import multiprocessing as mp
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from functools import partial
pd.options.mode.chained_assignment = None
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier, Pool, cv

def importer(path,verbose=True):
    """Read the CSV at ``path`` into a DataFrame.

    When ``verbose`` is true the elapsed read time is printed. A garbage
    collection pass is triggered before the frame is returned.
    """
    began_at = datetime.datetime.now()
    frame = pd.read_csv(path)
    if verbose:
        elapsed = datetime.datetime.now() - began_at
        print('it took: ', elapsed)
    gc.collect()
    return frame

def pow2(x):
    """Return ``x`` raised to the power 2 via the imported ``pow``."""
    squared = pow(x, 2)
    return squared
def log_corr(x):
    """Return the logarithm of ``1 + |x|`` (sign-insensitive log transform)."""
    magnitude = abs(x)
    return log(1 + magnitude)
def inverse(x):
    """Return ``1 / (1 + |x|)``."""
    denominator = 1 + abs(x)
    return 1 / denominator
import gc

def add_noise(series, noise_level):
    """Multiply every element of ``series`` by ``1 + noise_level * z``.

    ``z`` is one standard-normal draw per element taken from NumPy's global
    random state (``np.random.randn``).
    """
    jitter = np.random.randn(len(series))
    factor = 1 + noise_level * jitter
    return series * factor

def clean_inf_nan(df):
    """Return ``df`` with positive and negative infinity replaced by NaN (input is not modified)."""
    infinities = [np.inf, -np.inf]
    return df.replace(infinities, np.nan)
  
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype whose range holds them.

    Columns whose dtype is not listed in ``numerics`` are left alone. Integer
    columns are tried against int8/int16/int32/int64 and float columns against
    float16/float32 (falling back to float64). A candidate is accepted only
    when the column's min and max lie strictly inside the candidate's limits.
    The frame is modified column by column and also returned.

    When ``verbose`` is true the final memory footprint and the percentage
    saved are printed.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in tqdm_notebook(df.columns):
        gc.collect()
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type).startswith('int'):
            # First integer width that strictly contains [c_min, c_max] wins;
            # if none does, the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if c_min > limits.min and c_max < limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Neither half nor single precision covers the range.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):
    """Smoothed target (mean) encoding of a categorical series.

    For every category seen in ``trn_series`` the encoded value is a blend of
    the category's target mean and the global target mean (the prior). The
    blend weight is a sigmoid of ``(count - min_samples_leaf) / smoothing``,
    so categories with larger counts lean more on their own mean. Categories
    without a match in the training statistics receive the prior.

    Parameters
    ----------
    trn_series, tst_series : pd.Series
        Train and test categorical series; they must share the same name.
    target : pd.Series
        Target values, same length as ``trn_series``.
    min_samples_leaf, smoothing : numeric
        Midpoint and steepness of the sigmoid blend weight.
    noise_level : float
        Passed to ``add_noise`` for both encoded outputs.

    Returns
    -------
    tuple of pd.Series
        Encoded train and test series, both named ``<name>_mean`` and carrying
        the index of their respective inputs.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    key = trn_series.name
    encoded_name = key + '_mean'

    # Per-category target mean and support.
    stats = pd.concat([trn_series, target], axis=1).groupby(by=key)[target.name].agg(["mean", "count"])
    # Sigmoid weight: close to 1 for well-supported categories, close to 0 for rare ones.
    weight = 1 / (1 + np.exp(-(stats["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()
    stats[target.name] = prior * (1 - weight) + stats["mean"] * weight
    stats = stats.drop(["mean", "count"], axis=1)
    lookup = stats.reset_index().rename(columns={'index': target.name, target.name: 'average'})

    def _encode(series):
        # Left-join the lookup table; unmatched rows fall back to the prior.
        joined = pd.merge(series.to_frame(series.name), lookup, on=series.name, how='left')
        encoded = joined['average'].rename(encoded_name).fillna(prior)
        # pd.merge discards the original index, so put it back.
        encoded.index = series.index
        return encoded

    ft_trn_series = _encode(trn_series)
    ft_tst_series = _encode(tst_series)
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [2]:
def sd(col, max_loss_limit=0.001, avg_loss_limit=0.001, na_loss_limit=0, n_uniq_loss_limit=0, fillna=0):
    """Downcast a numeric Series to the first smaller dtype that loses (almost) nothing.

    Candidate dtypes are tried in order and the first one that satisfies all
    four limits is returned; if none qualifies the original series is returned
    unchanged.

    Parameters
    ----------
    col : pd.Series
        Numeric series to shrink.
    max_loss_limit : float
        Largest absolute difference allowed between any converted value and the
        original. See
        https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations_on_decimal_values_in_[0,_1]
    avg_loss_limit : float
        Same, but applied to the mean absolute difference over the series.
    na_loss_limit : int
        Allowed change in the number of missing values. Integer candidates are
        only attempted when the series has at most this many missing values.
    n_uniq_loss_limit : int
        Allowed change in the number of distinct values (NaN counted). For a
        very high-cardinality float field, something like ``n_records * 0.01``
        relaxes the check.
    fillna : numeric
        Value substituted for NaN before a float column is rounded and cast to
        an integer candidate.

    Returns
    -------
    pd.Series
        The converted series, or ``col`` itself when no candidate qualifies.
    """
    is_float = str(col.dtypes)[:5] == 'float'
    na_count = col.isna().sum()
    n_uniq = col.nunique(dropna=False)

    if na_count <= na_loss_limit:
        try_types = ['int8', 'int16', 'float16', 'int32', 'float32']
    else:
        # Integer dtypes cannot represent NaN, so only floats are candidates.
        try_types = ['float16', 'float32']

    for candidate in try_types:
        col_tmp = col

        # float to int conversion => try to round to minimize casting error
        if is_float and candidate.startswith('int'):
            col_tmp = col_tmp.fillna(fillna).round()

        try:
            col_tmp = col_tmp.astype(candidate)
        except (ValueError, OverflowError):
            # e.g. +/-inf cannot be cast to an integer dtype: this candidate
            # simply does not fit, so move on instead of aborting the downcast.
            continue

        loss = (col_tmp - col).abs()
        max_loss = loss.max()
        avg_loss = loss.mean()
        na_loss = np.abs(na_count - col_tmp.isna().sum())
        n_uniq_loss = np.abs(n_uniq - col_tmp.nunique(dropna=False))

        if max_loss <= max_loss_limit and avg_loss <= avg_loss_limit and na_loss <= na_loss_limit and n_uniq_loss <= n_uniq_loss_limit:
            return col_tmp

    # field can't be converted
    return col


def reduce_mem_usage_sd(df, deep=True, verbose=False, obj_to_cat=False):
    """Shrink ``df`` column by column using ``sd`` and report the memory saved.

    Numeric columns (dtypes listed in ``numerics``) are passed through ``sd``.
    Object columns are converted to ``category`` when ``obj_to_cat`` is true.
    For every column the NaN count and the number of distinct values are
    compared before and after, and a warning is printed on any change.
    ``deep`` is forwarded to ``DataFrame.memory_usage``; ``verbose`` prints a
    per-column summary. The overall reduction is always printed. The frame is
    modified in place and returned.
    """
    numerics = ['int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=deep).sum() / 1024 ** 2

    for col in tqdm_notebook(df.columns):
        col_type = df[col].dtypes

        # Statistics of the untouched column, used for the loss warnings below.
        na_before = df[col].isna().sum()
        uniq_before = df[col].nunique(dropna=False)

        if col_type in numerics:
            df[col] = sd(df[col])
        elif obj_to_cat and (col_type == 'object'):
            df[col] = df[col].astype('category')

        if verbose:
            print(f'Column {col}: {col_type} -> {df[col].dtypes}, na_count={na_before}, n_uniq={uniq_before}')

        na_after = df[col].isna().sum()
        if na_before != na_after:
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost na values. Before: {na_before}, after: {na_after}')

        uniq_after = df[col].nunique(dropna=False)
        if uniq_before != uniq_after:
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost unique values. Before: {uniq_before}, after: {uniq_after}')

    end_mem = df.memory_usage(deep=deep).sum() / 1024 ** 2
    percent = 100 * (start_mem - end_mem) / start_mem
    print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df


In [3]:
def change(hoge):
    """Return how many decimal digits (0 to 3) ``hoge`` uses at 3-decimal resolution.

    The value is scaled to thousandths and one digit is subtracted from 3 for
    every trailing zero, e.g. ``1.5 -> 1``, ``1.25 -> 2``, ``1.125 -> 3``,
    ``2.0 -> 0``.
    """
    # round() rather than int(): binary floats such as 1.001 * 1000 evaluate to
    # 1000.9999999999999, which truncation would turn into 1000.
    scaled = int(round(hoge * 1000))
    if scaled == 0:
        # 0 % 10 == 0 holds forever, so zero must be handled before the loop.
        return 0
    num = 3
    while scaled % 10 == 0:
        num -= 1
        scaled //= 10
    return max(num, 0)

In [4]:
def id_split(dataframe,label=True):
    """Derive device / OS / browser / screen fields from the raw identity columns.

    New columns written to ``dataframe`` (which is modified in place):
    ``device_name``/``device_version`` (``DeviceInfo`` split on '/'),
    ``OS_id_30``/``version_id_30`` (``id_30`` split on ' '),
    ``browser_id_31``/``version_id_31`` (``id_31`` split on ' ') and
    ``screen_width``/``screen_height`` (``id_33`` split on 'x').
    ``id_34`` and ``id_23`` are overwritten with the part after the first ':'.
    ``device_name`` is then mapped to a brand by substring rules applied in
    order, and names occurring fewer than 200 times become "Others".

    Parameters
    ----------
    dataframe : pd.DataFrame
        Must contain ``DeviceInfo``, ``id_30``, ``id_31``, ``id_33``, ``id_34``
        and ``id_23`` as string columns.
    label : bool
        When exactly ``True`` the list of derived column names is returned too.

    Returns
    -------
    ``(dataframe, colname)`` when ``label is True``, otherwise ``dataframe``.
    """
    def _first_two_parts(series, sep):
        # Split once and always expose pieces 0 and 1. When no row contains
        # the separator the split has a single column, so reindex supplies an
        # all-NaN second piece instead of a KeyError.
        return series.str.split(sep, expand=True).reindex(columns=[0, 1])

    split_specs = [
        ('DeviceInfo', '/', 'device_name', 'device_version'),
        ('id_30', ' ', 'OS_id_30', 'version_id_30'),
        ('id_31', ' ', 'browser_id_31', 'version_id_31'),
        ('id_33', 'x', 'screen_width', 'screen_height'),
    ]
    for source, sep, head_name, tail_name in split_specs:
        parts = _first_two_parts(dataframe[source], sep)
        dataframe[head_name] = parts[0]
        dataframe[tail_name] = parts[1]

    for source in ('id_34', 'id_23'):
        dataframe[source] = _first_two_parts(dataframe[source], ':')[1]

    # Order matters: each rule sees the names already rewritten by earlier rules.
    brand_rules = [
        ('SM', 'Samsung'),
        ('SAMSUNG', 'Samsung'),
        ('GT-', 'Samsung'),
        ('Moto G', 'Motorola'),
        ('Moto', 'Motorola'),
        ('moto', 'Motorola'),
        ('LG-', 'LG'),
        ('rv:', 'RV'),
        ('HUAWEI', 'Huawei'),
        ('ALE-', 'Huawei'),
        ('-L', 'Huawei'),
        ('Blade', 'ZTE'),
        ('BLADE', 'ZTE'),
        ('Linux', 'Linux'),
        ('XT', 'Sony'),
        ('HTC', 'HTC'),
        ('ASUS', 'Asus'),
    ]
    for pattern, brand in brand_rules:
        dataframe.loc[dataframe['device_name'].str.contains(pattern, na=False), 'device_name'] = brand

    # Collapse rare device names into a single bucket.
    name_counts = dataframe.device_name.value_counts()
    rare_names = name_counts[name_counts < 200].index
    dataframe.loc[dataframe.device_name.isin(rare_names), 'device_name'] = "Others"
    gc.collect()
    if label is True:
        colname=['device_name','device_version','OS_id_30','version_id_30','browser_id_31','version_id_31','screen_width','screen_height','id_34','id_23']
        return(dataframe,colname)
    else:
        return(dataframe)


In [5]:
def transformer():
    """Load the fraud-detection CSVs and build the train/test feature matrices.

    Steps, in order:
      1. read the transaction and identity tables and left-join them on
         ``TransactionID``;
      2. downcast dtypes with ``reduce_mem_usage_sd``;
      3. keep only the features named in ``feature_selection.csv`` plus a fixed
         list, dropping single-valued columns and a fixed drop list;
      4. derive device/OS/browser/screen fields with ``id_split``;
      5. add decimal-part, day-of-week/hour, e-mail-domain-part and pairwise
         interaction features;
      6. add count encodings for every categorical column, once over
         train+test and once per data set;
      7. add ``log_corr`` transforms and ratio-to-group-mean features for the
         aggregated numeric columns;
      8. replace +/-inf and NaN with -9999.

    Relies on the module-level helpers ``reduce_mem_usage_sd``, ``id_split``,
    ``change`` and ``log_corr``.

    Returns
    -------
    tuple
        ``(X, X_test, y, Cat)``: train features (rows sorted by
        ``TransactionDT``), test features (rows sorted by ``TransactionID``),
        the ``isFraud`` target and the list of categorical column names.
    """
    path='../input/ieee-fraud-detection/'
    print('Train readed\n')
    train_identity = pd.read_csv(f'{path}train_identity.csv')
    train_transaction = pd.read_csv(f'{path}train_transaction.csv')
    print('Test readed\n')
    test_identity = pd.read_csv(f'{path}test_identity.csv')
    test_transaction = pd.read_csv(f'{path}test_transaction.csv')
    print('Merging\n')
    train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
    test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
    del train_identity,train_transaction,test_identity,test_transaction
    gc.collect()
    print('Reducing Memory\n')

    Name=pd.read_csv('../input/feat-sel-3/feature_selection.csv')
    useful_features=Name['Name'].tolist()
    useful_features+=['M1','V107', 'V108', 'V110', 'V111', 'V112', 'V113', 'V117', 'V118',
             'V119', 'V120', 'V121', 'V122', 'V284', 'V286', 'V305', 'id_07',
             'id_08','id_12', 'id_14','id_23','id_34', 'id_27']
    # De-duplicate while preserving order: a plain set() would make the column
    # order (and therefore the fitted model) depend on string hashing.
    column_to_keep = list(dict.fromkeys(['TransactionID','TransactionDT','isFraud']+useful_features))

    train = reduce_mem_usage_sd(train, obj_to_cat=False)
    test = reduce_mem_usage_sd(test, obj_to_cat=False)

    gc.collect()
    print('First Checkpoint - Shape : {}\n'.format(train.shape))
    one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
    one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

    fixed_drops = ['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']
    cols_to_drop = fixed_drops+[col for col in train.columns if (col not in column_to_keep)]

    dum = set(fixed_drops+one_value_cols+one_value_cols_test)
    cols_to_drop = list(set(one_value_cols+ one_value_cols_test+cols_to_drop))
    column_to_keep = [col for col in column_to_keep if col not in dum]

    train = train.sort_values('TransactionDT').drop(cols_to_drop, axis=1)
    test = test.drop(cols_to_drop, axis=1)

    train = train[column_to_keep]
    test = test[[col for col in column_to_keep if col!='isFraud']]

    print('Second Checkpoint - Shape : {}\n'.format(train.shape))

    train,new_cat =  id_split(train,label=True)
    gc.collect()
    test =  id_split(test,label=False)
    gc.collect()

    Cat= ['ProductCD']+['card'+str(x) for x in range(1,7)]+['addr1','addr2','P_emaildomain','R_emaildomain']+['M'+str(x) for x in range(1,10)]+['id_'+str(x) for x in range(12,39)]+['DeviceType','DeviceInfo']
    Cat = Cat+ new_cat
    Cat=[col for col in Cat if (col not in cols_to_drop)]
    ToDel = ['isFraud', 'TransactionDT', 'TransactionID']
    # Numeric columns are whatever is left after categoricals, bookkeeping
    # columns and dropped columns are excluded.
    not_numeric = set(Cat+ToDel+cols_to_drop)
    Num=[item for item in train.columns.values.tolist() if (item not in not_numeric)]

    print('decimal feature \n')
    train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
    test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)

    train["TransactionAmt_decimal_number"] = train["TransactionAmt"].map(change)
    test["TransactionAmt_decimal_number"] = test["TransactionAmt"].map(change)

    Num=Num+['TransactionAmt_decimal','TransactionAmt_decimal_number']

    print('Time Feature\n')

    train['Transaction_day_of_week'] = (np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)).astype(str)
    test['Transaction_day_of_week'] = (np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)).astype(str)

    train['Transaction_hour'] = (np.floor(train['TransactionDT'] / 3600) % 24).astype(str)
    test['Transaction_hour'] = (np.floor(test['TransactionDT'] / 3600) % 24).astype(str)
    Cat = Cat+['Transaction_day_of_week','Transaction_hour']

    print('splitter feature\n')
    train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
    train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
    test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
    test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)

    Cat=Cat+['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3','R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
    print('Third Checkpoint - Shape : {}\n'.format(train.shape))

    print('Feature Interaction\n')
    interactions = ['ProductCD__DeviceType','ProductCD__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__card2',
                    'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']
    for feature in interactions:
        f1, f2 = feature.split('__')
        # Rows where either parent is missing must stay missing in the
        # interaction instead of becoming a string such as 'nan_visa'.
        mis_pos = (train[f1].isnull() | train[f2].isnull()).tolist()
        mis_pos_test =  (test[f1].isnull() | test[f2].isnull()).tolist()
        train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
        test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)
        feat_mask = [cats==feature for cats in train.columns]
        feat_mask_test = [cats==feature for cats in test.columns]
        train.iloc[mis_pos,feat_mask] = None
        test.iloc[mis_pos_test,feat_mask_test] = None
    Cat=Cat+interactions

    print('Forth Checkpoint - Shape : {}\n'.format(train.shape))

    print('Fifth Checkpoint - Shape : {}\n'.format(train.shape))
    print('Inizio count encoding\n')

    # Count encoding over train and test together.
    for feature in tqdm_notebook(Cat):
        gc.collect()
        full_counts = pd.concat([train[feature], test[feature]], ignore_index=True).value_counts(dropna=False)
        train[feature + '_count_full'] = train[feature].map(full_counts)
        test[feature + '_count_full'] = test[feature].map(full_counts)

    # Encoding - count encoding separately for train and test
    for feature in tqdm_notebook(Cat):
        gc.collect()
        train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
        test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))

    print('Inizio Trasformate\n')
    gc.collect()

    To_Aggr=[col for col in Num if col[0] not in ['V','D']]+[x for x in ['V107', 'V108', 'V110', 'V111', 'V112', 'V113', 'V117', 'V118',
         'V119', 'V120', 'V121', 'V122', 'V284', 'V286', 'V305', 'id_07',
         'id_08'] if x in train.columns]
    for numeric_col in tqdm_notebook(To_Aggr):
        gc.collect()
        train[numeric_col+'_log']=train[numeric_col].transform(log_corr)
        gc.collect()
        test[numeric_col+'_log']=test[numeric_col].transform(log_corr)
    print('Sixth Checkpoint - Shape : {}\n'.format(train.shape))

    print('\nInizio Medie per Giorno, Mese\n')
    group_cols = [x for x in ['M1','card4','id_12', 'id_14', 'id_27'] if x in train.columns]
    for numeric_col in tqdm_notebook(To_Aggr):
        gc.collect()
        for categorical_col in group_cols:
            ratio_name = numeric_col+'_mean_'+categorical_col
            train[ratio_name]=train[numeric_col]/train.groupby(categorical_col)[numeric_col].transform('mean')
            # The test ratio must use the test values as numerator; dividing
            # the train column here would index-align train rows onto test.
            test[ratio_name]=test[numeric_col]/test.groupby(categorical_col)[numeric_col].transform('mean')
    print('Seventh Checkpoint - Shape : {}\n'.format(train.shape))

    gc.collect()
    X = train.drop(ToDel, axis=1).copy()

    gc.collect()
    y = train['isFraud'].copy()
    del train
    X_test = test.sort_values('TransactionID').drop(['TransactionDT', 'TransactionID'], axis=1).copy()
    del test

    gc.collect()

    # Assign the result back instead of calling replace(inplace=True) on a
    # column selection, which may operate on a temporary copy.
    sentinel_targets = [np.inf, -np.inf, np.nan]
    print('Replace Infinite Train\n')
    for col in tqdm_notebook(X.columns):
        X[col] = X[col].replace(sentinel_targets, -9999)

    gc.collect()
    print('Replace Infinite Test\n')
    for col in tqdm_notebook(X_test.columns):
        X_test[col] = X_test[col].replace(sentinel_targets, -9999)
    return(X,X_test,y,Cat)

In [6]:
# Run the whole feature pipeline: train matrix, test matrix, target and the
# list of categorical column names.
X,X_test,y,Cat = transformer()

Train readed

Test readed

Merging

Reducing Memory



HBox(children=(IntProgress(value=0, max=434), HTML(value='')))


Mem. usage decreased from 2598.36 Mb to 1327.81 Mb (48.9% reduction)


HBox(children=(IntProgress(value=0, max=433), HTML(value='')))


Mem. usage decreased from 2243.87 Mb to 1149.38 Mb (48.8% reduction)
First Checkpoint - Shape : (590540, 434)

Second Checkpoint - Shape : (590540, 185)

decimal feature 

Time Feature

splitter feature

Third Checkpoint - Shape : (590540, 203)

Feature Interaction

Forth Checkpoint - Shape : (590540, 211)

Fifth Checkpoint - Shape : (590540, 211)

Inizio count encoding



HBox(children=(IntProgress(value=0, max=57), HTML(value='')))




HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Inizio Trasformate



HBox(children=(IntProgress(value=0, max=37), HTML(value='')))


Sixth Checkpoint - Shape : (590540, 358)


Inizio Medie per Giorno, Mese



HBox(children=(IntProgress(value=0, max=37), HTML(value='')))


Seventh Checkpoint - Shape : (590540, 506)

Replace Infinite Train



HBox(children=(IntProgress(value=0, max=503), HTML(value='')))


Replace Infinite Test



HBox(children=(IntProgress(value=0, max=503), HTML(value='')))




In [7]:
# Optional second memory-reduction pass (object columns -> category); currently disabled.
# X = reduce_mem_usage_sd(X, obj_to_cat=True)
# X_test = reduce_mem_usage_sd(X_test, obj_to_cat=True)
# Force a garbage collection pass.
gc.collect()

3

In [8]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
seed = 1

gc.collect()

# Notes from earlier runs: dropped variables with >0.9 NaN, added transformed
# shift-invariant variables and RFECV -> 0.919413
# Hold-out split by row position: the first 400000 rows train, rows from
# 450000 onward validate, and the rows in between are left out.
idxs = np.arange(X.shape[0])
trn_idx = idxs[:400000]
val_idx = idxs[450000:]

train_x = X.iloc[trn_idx, :]
train_y = y.iloc[trn_idx]
valid_x = X.iloc[val_idx, :]
valid_y = y.iloc[val_idx]

# cv 0.9123506482
cat_params = {
    'iterations': 100000,
    'learning_rate': .07,
    'depth': 8,
    'random_strength': 0.5,
    'loss_function': 'CrossEntropy',
    'eval_metric': 'AUC',
    'verbose': 50,
    'random_state': 0,
    'task_type': 'CPU',
    'early_stopping_rounds': 100,
}

# Fit with early stopping on the hold-out set to find the best iteration.
model = CatBoostClassifier(**cat_params)
model.fit(train_x, train_y, cat_features=Cat, eval_set=(valid_x, valid_y))

del train_x, train_y, valid_x, valid_y
gc.collect()

0:	test: 0.7144244	best: 0.7144244 (0)	total: 4.64s	remaining: 5d 8h 54m 15s
50:	test: 0.8894498	best: 0.8894498 (50)	total: 3m 30s	remaining: 4d 18h 45m 23s
100:	test: 0.9005708	best: 0.9005708 (100)	total: 6m 56s	remaining: 4d 18h 24m 23s
150:	test: 0.9051511	best: 0.9051511 (150)	total: 10m 19s	remaining: 4d 17h 46m 13s
200:	test: 0.9071249	best: 0.9071354 (198)	total: 13m 40s	remaining: 4d 17h 6m 37s
250:	test: 0.9085620	best: 0.9085620 (250)	total: 17m 5s	remaining: 4d 17h 11m 9s
300:	test: 0.9099526	best: 0.9099526 (300)	total: 20m 40s	remaining: 4d 18h 8m 38s
350:	test: 0.9104880	best: 0.9105023 (348)	total: 24m 11s	remaining: 4d 18h 27m 56s
400:	test: 0.9112792	best: 0.9113190 (396)	total: 27m 44s	remaining: 4d 18h 48m 39s
450:	test: 0.9111995	best: 0.9115407 (424)	total: 31m 22s	remaining: 4d 19h 24m 1s
500:	test: 0.9118226	best: 0.9118226 (500)	total: 34m 53s	remaining: 4d 19h 29m 35s
550:	test: 0.9127877	best: 0.9129783 (545)	total: 38m 28s	remaining: 4d 19h 43m 10s
600:	tes

0

In [9]:
# get_best_iteration() returns the zero-based index of the best iteration, so
# the number of trees needed to reproduce that model is the index plus one.
num_boost = model.get_best_iteration() + 1

# Same hyper-parameters as the validation run, without an eval set or early
# stopping: the tree count is fixed to the value found above.
cat_params = {
    'iterations':num_boost,
    'learning_rate': 0.07,
    'depth': 8,
    'random_strength': 0.5,
    'loss_function':'CrossEntropy',
    'random_state':0,
    'task_type' : 'CPU',
    'verbose':False,
}

# Refit on the full training data.
model = CatBoostClassifier(**cat_params)
model.fit(
    X, y,
    cat_features=Cat)

print('Finish final model')
# Probability of the positive class, written in the sample-submission layout.
sub['isFraud']=model.predict_proba(X_test)[:,1]
sub.to_csv('submission_ensemble_catboost.csv', index=False)

Finish final model
