In [1]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold,TimeSeriesSplit
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sin,log,pow
import lightgbm as lgb
import datetime
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import LabelEncoder,StandardScaler
from lightgbm import LGBMClassifier
from sklearn.metrics import roc_auc_score
import multiprocessing as mp
import multiprocessing as mp
from multiprocessing import Pool
from tqdm import tqdm
from functools import partial
pd.options.mode.chained_assignment = None
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
from sklearn.decomposition import PCA

def importer(path,verbose=True):
  start = datetime.datetime.now()
  df=pd.read_csv(path)
  if verbose:
     print('it took: ', datetime.datetime.now()-start)
  gc.collect()
  return(df)

def pow2(x):
  return pow(x,2)
def log_corr(x):
  return log(1+ abs(x))
def inverse(x):
  return 1/(1+abs(x))
import gc

def add_noise(series, noise_level):
    return series * (1 + noise_level * np.random.randn(len(series)))

def clean_inf_nan(df):
    return df.replace([np.inf, -np.inf], np.nan)
  
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in tqdm_notebook(df.columns):
        gc.collect()
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

def target_encode(trn_series=None, 
                  tst_series=None, 
                  target=None, 
                  min_samples_leaf=1, 
                  smoothing=1,
                  noise_level=0):#,missing_correction=True
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    
#     if missing_correction is True:
#         trn_series.astype(object)
#         trn_series.fillna('missing')
#         tst_series.astype(object)
#         tst_series.fillna('missing')

    temp = pd.concat([trn_series, target], axis=1)
    # Compute target mean 
    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])
    # Compute smoothing
    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    # Apply average function to all target data
    prior = target.mean()
    # The bigger the count the less full_avg is taken into account
    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)
    # Apply averages to trn and tst series
    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_trn_series.index = trn_series.index 
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)


In [2]:
def sd(col, max_loss_limit=0.001, avg_loss_limit=0.001, na_loss_limit=0, n_uniq_loss_limit=0, fillna=0):
    """
    max_loss_limit - don't allow any float to lose precision more than this value. Any values are ok for GBT algorithms as long as you don't unique values.
                     See https://en.wikipedia.org/wiki/Half-precision_floating-point_format#Precision_limitations_on_decimal_values_in_[0,_1]
    avg_loss_limit - same but calculates avg throughout the series.
    na_loss_limit - not really useful.
    n_uniq_loss_limit - very important parameter. If you have a float field with very high cardinality you can set this value to something like n_records * 0.01 in order to allow some field relaxing.
    """
    is_float = str(col.dtypes)[:5] == 'float'
    na_count = col.isna().sum()
    n_uniq = col.nunique(dropna=False)
    try_types = ['float16', 'float32']

    if na_count <= na_loss_limit:
        try_types = ['int8', 'int16', 'float16', 'int32', 'float32']

    for type in try_types:
        col_tmp = col

        # float to int conversion => try to round to minimize casting error
        if is_float and (str(type)[:3] == 'int'):
            col_tmp = col_tmp.copy().fillna(fillna).round()

        col_tmp = col_tmp.astype(type)
        max_loss = (col_tmp - col).abs().max()
        avg_loss = (col_tmp - col).abs().mean()
        na_loss = np.abs(na_count - col_tmp.isna().sum())
        n_uniq_loss = np.abs(n_uniq - col_tmp.nunique(dropna=False))

        if max_loss <= max_loss_limit and avg_loss <= avg_loss_limit and na_loss <= na_loss_limit and n_uniq_loss <= n_uniq_loss_limit:
            return col_tmp

    # field can't be converted
    return col


def reduce_mem_usage_sd(df, deep=True, verbose=False, obj_to_cat=False):
    numerics = ['int16', 'uint16', 'int32', 'uint32', 'int64', 'uint64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage(deep=deep).sum() / 1024 ** 2
    for col in tqdm_notebook(df.columns):
        col_type = df[col].dtypes

        # collect stats
        na_count = df[col].isna().sum()
        n_uniq = df[col].nunique(dropna=False)
        
        # numerics
        if col_type in numerics:
            df[col] = sd(df[col])

        # strings
        if (col_type == 'object') and obj_to_cat:
            df[col] = df[col].astype('category')
        
        if verbose:
            print(f'Column {col}: {col_type} -> {df[col].dtypes}, na_count={na_count}, n_uniq={n_uniq}')
        new_na_count = df[col].isna().sum()
        if (na_count != new_na_count):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost na values. Before: {na_count}, after: {new_na_count}')
        new_n_uniq = df[col].nunique(dropna=False)
        if (n_uniq != new_n_uniq):
            print(f'Warning: column {col}, {col_type} -> {df[col].dtypes} lost unique values. Before: {n_uniq}, after: {new_n_uniq}')

    end_mem = df.memory_usage(deep=deep).sum() / 1024 ** 2
    percent = 100 * (start_mem - end_mem) / start_mem
    print('Mem. usage decreased from {:5.2f} Mb to {:5.2f} Mb ({:.1f}% reduction)'.format(start_mem, end_mem, percent))
    return df


In [3]:
def change(hoge):
    num = 3
    hoge = int(hoge*1000)
    while(hoge % 10 ==0):
        num = num-1
        hoge = hoge /10
    if num<0:
        num = 0
    return num

In [4]:
def id_split(dataframe,label=True):
    dataframe['device_name'] = dataframe['DeviceInfo'].str.split('/', expand=True)[0]
    dataframe['device_version'] = dataframe['DeviceInfo'].str.split('/', expand=True)[1]

    dataframe['OS_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[0]
    dataframe['version_id_30'] = dataframe['id_30'].str.split(' ', expand=True)[1]

    dataframe['browser_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[0]
    dataframe['version_id_31'] = dataframe['id_31'].str.split(' ', expand=True)[1]

    dataframe['screen_width'] = dataframe['id_33'].str.split('x', expand=True)[0]
    dataframe['screen_height'] = dataframe['id_33'].str.split('x', expand=True)[1]

    dataframe['id_34'] = dataframe['id_34'].str.split(':', expand=True)[1]
    dataframe['id_23'] = dataframe['id_23'].str.split(':', expand=True)[1]

    dataframe.loc[dataframe['device_name'].str.contains('SM', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('SAMSUNG', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('GT-', na=False), 'device_name'] = 'Samsung'
    dataframe.loc[dataframe['device_name'].str.contains('Moto G', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('Moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('moto', na=False), 'device_name'] = 'Motorola'
    dataframe.loc[dataframe['device_name'].str.contains('LG-', na=False), 'device_name'] = 'LG'
    dataframe.loc[dataframe['device_name'].str.contains('rv:', na=False), 'device_name'] = 'RV'
    dataframe.loc[dataframe['device_name'].str.contains('HUAWEI', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('ALE-', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('-L', na=False), 'device_name'] = 'Huawei'
    dataframe.loc[dataframe['device_name'].str.contains('Blade', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('BLADE', na=False), 'device_name'] = 'ZTE'
    dataframe.loc[dataframe['device_name'].str.contains('Linux', na=False), 'device_name'] = 'Linux'
    dataframe.loc[dataframe['device_name'].str.contains('XT', na=False), 'device_name'] = 'Sony'
    dataframe.loc[dataframe['device_name'].str.contains('HTC', na=False), 'device_name'] = 'HTC'
    dataframe.loc[dataframe['device_name'].str.contains('ASUS', na=False), 'device_name'] = 'Asus'

    dataframe.loc[dataframe.device_name.isin(dataframe.device_name.value_counts()[dataframe.device_name.value_counts() < 200].index), 'device_name'] = "Others"
    gc.collect()
    if label is True:
        colname=['device_name','device_version','OS_id_30','version_id_30','browser_id_31','version_id_31','screen_width','screen_height','id_34','id_23']
        return(dataframe,colname)
    else:
        return(dataframe)


In [5]:
# path='../input/ieee-fraud-detection/'
# print('Train readed\n')
# train_identity = pd.read_csv(f'{path}train_identity.csv')
# train_transaction = pd.read_csv(f'{path}train_transaction.csv')
# print('Test readed\n')
# test_identity = pd.read_csv(f'{path}test_identity.csv')
# test_transaction = pd.read_csv(f'{path}test_transaction.csv')
# sub = pd.read_csv(f'{path}sample_submission.csv')
# print('Merging\n')
# train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
# test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
# del train_identity,train_transaction,test_identity,test_transaction
# gc.collect()
# print('Reducing Memory\n')


# Name=pd.read_csv('../input/feat-sel-3/feature_selection.csv')
# useful_features=Name['Name'].tolist()
# useful_features+=['M1','V107', 'V108', 'V110', 'V111', 'V112', 'V113', 'V117', 'V118',
#        'V119', 'V120', 'V121', 'V122', 'V284', 'V286', 'V305', 'id_07',
#        'id_08','id_12', 'id_14','id_23','id_34', 'id_27']
# column_to_keep = list(set(['TransactionID','TransactionDT','isFraud']+useful_features))

# train = reduce_mem_usage_sd(train, obj_to_cat=False)
# test = reduce_mem_usage_sd(test, obj_to_cat=False)

# gc.collect()
# print('First Checkpoint - Shape : {}\n'.format(train.shape))
# one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
# one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

# cols_to_drop =['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']+[col for col in train.columns if (col not in column_to_keep)]

# dum =list(set(['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']+one_value_cols+one_value_cols_test))
# cols_to_drop = list(set(one_value_cols+ one_value_cols_test+cols_to_drop))
# column_to_keep = [col for col in column_to_keep if col not in dum]
# #['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']
# print('Cols Dropped: {}'.format(cols_to_drop))
# train = train.sort_values('TransactionDT').drop(cols_to_drop, axis=1)
# test = test.drop(cols_to_drop, axis=1)

# train = train[column_to_keep]
# test = test[[col for col in column_to_keep if col!='isFraud']]

# print('Second Checkpoint - Shape : {}\n'.format(train.shape))

# train,new_cat =  id_split(train,label=True)
# gc.collect()
# test =  id_split(test,label=False)
# gc.collect()

# Cat= ['ProductCD']+['card'+str(x) for x in range(1,7)]+['addr1','addr2','P_emaildomain','R_emaildomain']+['M'+str(x) for x in range(1,10)]+['id_'+str(x) for x in range(12,39)]+['DeviceType','DeviceInfo']
# Cat = Cat+ new_cat
# Cat=[col for col in Cat if (col not in cols_to_drop)]
# ToDel = ['isFraud', 'TransactionDT', 'TransactionID']
# Num=[item for item in train.columns.values.tolist() if (item not in Cat+ToDel+cols_to_drop)]

# Target=train['isFraud']

# Num=Num#+['card_null','addr_null','email_null','M_null','id_null','tot_null']

# print('decimal feature \n')
# train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
# test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)

# train["TransactionAmt_decimal_number"] = train["TransactionAmt"].map(change)
# test["TransactionAmt_decimal_number"] = test["TransactionAmt"].map(change)

# Num=Num+['TransactionAmt_decimal','TransactionAmt_decimal_number']

# print('Time Feature\n')

# train['Transaction_day_of_week'] = (np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)).astype(str)
# test['Transaction_day_of_week'] = (np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)).astype(str)

# train['Transaction_hour'] = (np.floor(train['TransactionDT'] / 3600) % 24).astype(str)
# test['Transaction_hour'] = (np.floor(test['TransactionDT'] / 3600) % 24).astype(str)
# Cat = Cat+['Transaction_day_of_week','Transaction_hour']

# print('splitter feature\n')
# train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
# train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
# test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
# test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)

# Cat=Cat+['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3','R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
# print('Third Checkpoint - Shape : {}\n'.format(train.shape))

# #transform to string and fill na for str

# print('Feature Interaction\n')
# for feature in ['ProductCD__DeviceType','ProductCD__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__card2',
#           'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:
#   f1, f2 = feature.split('__')
#   mis_pos = (train[f1].isnull() | train[f2].isnull()).tolist()
#   mis_pos_test =  (test[f1].isnull() | test[f2].isnull()).tolist()
#   train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
#   test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)
#   feat_mask = [cats==feature for cats in [x for x in train.columns]]
#   feat_mask_test = [cats==feature for cats in [x for x in test.columns]]
#   train.iloc[mis_pos,feat_mask] = None
#   test.iloc[mis_pos_test,feat_mask_test] = None
# Cat=Cat+['ProductCD__DeviceType','ProductCD__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__card2',
#           'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']

# print('Forth Checkpoint - Shape : {}\n'.format(train.shape))

# for f in tqdm_notebook(Cat):
#   le = LabelEncoder()
#   mis_pos = train[f].isnull().tolist()
#   mis_pos_test =  test[f].isnull().tolist()
#   le.fit(list(train[f].astype(str).values)+list(test[f].astype(str).values))
#   train[f+'_label'] = le.transform(list(train[f].astype(str).values))
#   test[f+'_label'] = le.transform(list(test[f].astype(str).values))
#   feat_mask = [cats==(f+'label') for cats in [x for x in train.columns]]
#   feat_mask_test = [cats==(f+'label') for cats in [x for x in test.columns]]
#   train.iloc[mis_pos,feat_mask] = np.nan
#   test.iloc[mis_pos_test,feat_mask_test] = np.nan


In [6]:
def transformer():
  path='../input/ieee-fraud-detection/'
  print('Train readed\n')
  train_identity = pd.read_csv(f'{path}train_identity.csv')
  train_transaction = pd.read_csv(f'{path}train_transaction.csv')
  print('Test readed\n')
  test_identity = pd.read_csv(f'{path}test_identity.csv')
  test_transaction = pd.read_csv(f'{path}test_transaction.csv')
  sub = pd.read_csv(f'{path}sample_submission.csv')
  print('Merging\n')
  train = pd.merge(train_transaction, train_identity, on='TransactionID', how='left')
  test = pd.merge(test_transaction, test_identity, on='TransactionID', how='left')
  del train_identity,train_transaction,test_identity,test_transaction
  gc.collect()
  print('Reducing Memory\n')

    
  Name=pd.read_csv('../input/feat-sel-3/feature_selection.csv')
  useful_features=Name['Name'].tolist()
  useful_features+=['M1','V107', 'V108', 'V110', 'V111', 'V112', 'V113', 'V117', 'V118',
           'V119', 'V120', 'V121', 'V122', 'V284', 'V286', 'V305', 'id_07',
           'id_08','id_12', 'id_14','id_23','id_34', 'id_27']
  column_to_keep = list(set(['TransactionID','TransactionDT','isFraud']+useful_features))

  train = reduce_mem_usage_sd(train, obj_to_cat=False)
  test = reduce_mem_usage_sd(test, obj_to_cat=False)
    
  gc.collect()
  print('First Checkpoint - Shape : {}\n'.format(train.shape))
  one_value_cols = [col for col in train.columns if train[col].nunique() <= 1]
  one_value_cols_test = [col for col in test.columns if test[col].nunique() <= 1]

  cols_to_drop =['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']+[col for col in train.columns if (col not in column_to_keep)]

  dum =list(set(['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']+one_value_cols+one_value_cols_test))
  cols_to_drop = list(set(one_value_cols+ one_value_cols_test+cols_to_drop))
  column_to_keep = [col for col in column_to_keep if col not in dum]
  #['id_22', 'id_27', 'id_08', 'dist2', 'id_07', 'id_21', 'id_24', 'id_25', 'id_18', 'id_26', 'D7']
  print('Cols Dropped: {}'.format(cols_to_drop))
  train = train.sort_values('TransactionDT').drop(cols_to_drop, axis=1)
  test = test.drop(cols_to_drop, axis=1)
  
  train = train[column_to_keep]
  test = test[[col for col in column_to_keep if col!='isFraud']]

  print('Second Checkpoint - Shape : {}\n'.format(train.shape))

  train,new_cat =  id_split(train,label=True)
  gc.collect()
  test =  id_split(test,label=False)
  gc.collect()

  Cat= ['ProductCD']+['card'+str(x) for x in range(1,7)]+['addr1','addr2','P_emaildomain','R_emaildomain']+['M'+str(x) for x in range(1,10)]+['id_'+str(x) for x in range(12,39)]+['DeviceType','DeviceInfo']
  Cat = Cat+ new_cat
  Cat=[col for col in Cat if (col not in cols_to_drop)]
  ToDel = ['isFraud', 'TransactionDT', 'TransactionID']
  Num=[item for item in train.columns.values.tolist() if (item not in Cat+ToDel+cols_to_drop)]
    
  Target=train['isFraud']

  Num=Num#+['card_null','addr_null','email_null','M_null','id_null','tot_null']
    
  print('decimal feature \n')
  train['TransactionAmt_decimal'] = ((train['TransactionAmt'] - train['TransactionAmt'].astype(int)) * 1000).astype(int)
  test['TransactionAmt_decimal'] = ((test['TransactionAmt'] - test['TransactionAmt'].astype(int)) * 1000).astype(int)
  
  train["TransactionAmt_decimal_number"] = train["TransactionAmt"].map(change)
  test["TransactionAmt_decimal_number"] = test["TransactionAmt"].map(change)
  
  Num=Num+['TransactionAmt_decimal','TransactionAmt_decimal_number']
  
  print('Time Feature\n')

  train['Transaction_day_of_week'] = (np.floor((train['TransactionDT'] / (3600 * 24) - 1) % 7)).astype(str)
  test['Transaction_day_of_week'] = (np.floor((test['TransactionDT'] / (3600 * 24) - 1) % 7)).astype(str)
    
  train['Transaction_hour'] = (np.floor(train['TransactionDT'] / 3600) % 24).astype(str)
  test['Transaction_hour'] = (np.floor(test['TransactionDT'] / 3600) % 24).astype(str)
  Cat = Cat+['Transaction_day_of_week','Transaction_hour']
    
  print('splitter feature\n')
  train[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = train['P_emaildomain'].str.split('.', expand=True)
  train[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = train['R_emaildomain'].str.split('.', expand=True)
  test[['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3']] = test['P_emaildomain'].str.split('.', expand=True)
  test[['R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']] = test['R_emaildomain'].str.split('.', expand=True)
  
  Cat=Cat+['P_emaildomain_1', 'P_emaildomain_2', 'P_emaildomain_3','R_emaildomain_1', 'R_emaildomain_2', 'R_emaildomain_3']
  print('Third Checkpoint - Shape : {}\n'.format(train.shape))

  #transform to string and fill na for str

  print('Feature Interaction\n')
  for feature in ['ProductCD__DeviceType','ProductCD__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__card2',
              'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']:
      f1, f2 = feature.split('__')
      mis_pos = (train[f1].isnull() | train[f2].isnull()).tolist()
      mis_pos_test =  (test[f1].isnull() | test[f2].isnull()).tolist()
      train[feature] = train[f1].astype(str) + '_' + train[f2].astype(str)
      test[feature] = test[f1].astype(str) + '_' + test[f2].astype(str)
      feat_mask = [cats==feature for cats in [x for x in train.columns]]
      feat_mask_test = [cats==feature for cats in [x for x in test.columns]]
      train.iloc[mis_pos,feat_mask] = None
      test.iloc[mis_pos_test,feat_mask_test] = None
  Cat=Cat+['ProductCD__DeviceType','ProductCD__DeviceInfo', 'DeviceInfo__P_emaildomain', 'P_emaildomain__card2',
              'card1__card5', 'card2__id_20', 'card5__P_emaildomain', 'addr1__card1']
    
  print('Forth Checkpoint - Shape : {}\n'.format(train.shape))
  
  for f in tqdm_notebook(Cat):
      le = LabelEncoder()
      mis_pos = train[f].isnull().tolist()
      mis_pos_test =  test[f].isnull().tolist()
      le.fit(list(train[f].astype(str).values)+list(test[f].astype(str).values))
      train[f+'_label'] = le.transform(list(train[f].astype(str).values))
      test[f+'_label'] = le.transform(list(test[f].astype(str).values))
      feat_mask = [cats==(f+'label') for cats in [x for x in train.columns]]
      feat_mask_test = [cats==(f+'label') for cats in [x for x in test.columns]]
      train.iloc[mis_pos,feat_mask] = np.nan
      test.iloc[mis_pos_test,feat_mask_test] = np.nan
        
  print('Fifth Checkpoint - Shape : {}\n'.format(train.shape))
  print('Inizio count encoding\n')
    
  for feature in tqdm_notebook(Cat):
      gc.collect()
      train[feature + '_count_full'] = train[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts()) #dropna=False
      test[feature + '_count_full'] = test[feature].map(pd.concat([train[feature], test[feature]], ignore_index=True).value_counts()) #dropna=False

# Encoding - count encoding separately for train and test
  for feature in tqdm_notebook(Cat):
      gc.collect()
      train[feature + '_count_dist'] = train[feature].map(train[feature].value_counts(dropna=False))
      test[feature + '_count_dist'] = test[feature].map(test[feature].value_counts(dropna=False))


  print('Inizio Trasformate\n')
  gc.collect()

  To_Aggr=[col for col in Num if col[0] not in ['V','D']]+[x for x in ['V107', 'V108', 'V110', 'V111', 'V112', 'V113', 'V117', 'V118',
       'V119', 'V120', 'V121', 'V122', 'V284', 'V286', 'V305', 'id_07',
       'id_08'] if x in train.columns] #['M1','id_12', 'id_14', 'id_27']
  for numeric_col in tqdm_notebook(To_Aggr):
    gc.collect()
    train[numeric_col+'_log']=train[numeric_col].transform(log_corr)
    gc.collect()
    test[numeric_col+'_log']=test[numeric_col].transform(log_corr)
  print('Sixth Checkpoint - Shape : {}\n'.format(train.shape))

  print('\nInizio Medie per Giorno, Mese\n')
  for numeric_col in tqdm_notebook(To_Aggr):
    gc.collect()
    for categorical_col in [x for x in ['M1','card4','id_12', 'id_14', 'id_27'] if x in train.columns]: #['Transaction_day_of_week','Transaction_hour','card1', 'card4']
      train[numeric_col+'_mean_'+categorical_col]=train[numeric_col]/train.groupby(categorical_col)[numeric_col].transform('mean')

      test[numeric_col+'_mean_'+categorical_col]=train[numeric_col]/test.groupby(categorical_col)[numeric_col].transform('mean')
  print('Seventh Checkpoint - Shape : {}\n'.format(train.shape))

#   Position = np.max(index_to_ret)+1
  gc.collect()
  X = train.drop(ToDel+Cat, axis=1).copy()
  
#   X = X.iloc[Position:,:]
#   X.reset_index(drop=True,inplace=True)
  gc.collect()
  y = train['isFraud'].copy()
  del train
#   y = y.iloc[Position:]
#   y = y.reset_index(drop=True)
  X_test = test.sort_values('TransactionID').drop(['TransactionDT', 'TransactionID']+Cat, axis=1).copy()
  del test

  gc.collect()

  print('Replace Infinite Train\n')
  for col in tqdm_notebook(X.columns):
    X[col].replace([np.inf, -np.inf], np.nan,inplace=True)

  gc.collect()
  print('Replace Infinite Test\n')
  for col in tqdm_notebook(X_test.columns):
    X_test[col].replace([np.inf, -np.inf], np.nan,inplace=True)
  return(X,X_test,y)

In [7]:
X,X_test,y = transformer()

Train readed

Test readed

Merging

Reducing Memory



HBox(children=(IntProgress(value=0, max=434), HTML(value='')))


Mem. usage decreased from 2598.36 Mb to 1327.81 Mb (48.9% reduction)


HBox(children=(IntProgress(value=0, max=433), HTML(value='')))


Mem. usage decreased from 2243.87 Mb to 1149.38 Mb (48.8% reduction)
First Checkpoint - Shape : (590540, 434)

Cols Dropped: ['V237', 'V4', 'V146', 'V153', 'V26', 'V224', 'V276', 'V59', 'V339', 'V41', 'V144', 'V263', 'V184', 'V301', 'V199', 'V180', 'V236', 'V49', 'V2', 'id_15', 'V23', 'V194', 'V248', 'V6', 'V46', 'V221', 'V192', 'V51', 'V214', 'V336', 'V10', 'V327', 'V98', 'V338', 'V104', 'V106', 'V213', 'V240', 'id_25', 'id_22', 'V126', 'V255', 'V300', 'V226', 'V250', 'V249', 'V243', 'V50', 'V175', 'V334', 'V245', 'V97', 'V190', 'V299', 'V60', 'V148', 'V154', 'V275', 'V34', 'V322', 'V7', 'V100', 'V84', 'V140', 'V204', 'V241', 'V80', 'V32', 'V260', 'V293', 'V259', 'V18', 'V158', 'V176', 'V330', 'V43', 'V57', 'V188', 'V230', 'V114', 'id_04', 'V115', 'V211', 'V30', 'V181', 'V139', 'V103', 'V233', 'V173', 'V228', 'V182', 'V72', 'V239', 'V247', 'V193', 'V269', 'V171', 'V138', 'V71', 'V256', 'V27', 'V11', 'V290', 'id_28', 'V16', 'V220', 'id_10', 'V331', 'V155', 'V265', 'M9', 'V231', 'V216'

HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Fifth Checkpoint - Shape : (590540, 266)

Inizio count encoding



HBox(children=(IntProgress(value=0, max=57), HTML(value='')))




HBox(children=(IntProgress(value=0, max=57), HTML(value='')))


Inizio Trasformate



HBox(children=(IntProgress(value=0, max=37), HTML(value='')))


Sixth Checkpoint - Shape : (590540, 413)


Inizio Medie per Giorno, Mese



HBox(children=(IntProgress(value=0, max=37), HTML(value='')))


Seventh Checkpoint - Shape : (590540, 561)

Replace Infinite Train



HBox(children=(IntProgress(value=0, max=503), HTML(value='')))


Replace Infinite Test



HBox(children=(IntProgress(value=0, max=503), HTML(value='')))




In [8]:
# X = reduce_mem_usage_sd(X, obj_to_cat=True)
# X_test = reduce_mem_usage_sd(X_test, obj_to_cat=True)
gc.collect()

3

In [9]:
seed=1

lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'feature_fraction': 0.44301599784064954,
                    'lambda_l1': 0.7185712774952702,
                    'lambda_l2': 0.8036657945008269,
                    'learning_rate': 0.006820638087926107,
                    'min_data_in_leaf': int(122.18518093103775),
                    'min_gain_to_split': 0.8732382864345388,
                    'min_sum_hessian_in_leaf': 0.009332742523926576,
                    'num_leaves': int(274.4907722765963),
                    'max_depth': int(30.889651140632285),
                    'tree_learner':'serial',
                    'max_bin':255,
                    'seed': seed,
                    'early_stopping_rounds':500, 
                }

gc.collect()

0

In [10]:
sub = pd.read_csv('../input/ieee-fraud-detection/sample_submission.csv')
gc.collect()

#693 1169.2692
#Count encoding :892  auc : 0.894752532172477
#Count encoding tolgo i log alle V variable: 983  auc :0.896614759148957  ; 
#Count encoding tolgo i log alle V,i variable - aggiungo card,1,card,4,addr1 per mean by: 983  auc :0.90005  ;
#Count encodint tolgo i .... :1118, 0.901521

######FINO A QUI TUTTI SBAGLIATI
#tutte variabili nuove auc : 1346; 0.921537

#con tutte 0.9179521995090667
#senza quelle con shift forte 0.913782
#tolgo variabili con >.9 nan e i log 0.917742
#tolgo variabili con >.9 nan e non fillo con miss i nan categorici 

#tolgo variabili con >.9 nan e somma per riga e categorie dei nan .9173
#tolgo variabili con >.9 nan e somma per riga dei nan 0.9179111023919577


#tolgo variabili con >.9 nan 0.918369
#tolgo variabili con >.9 nan e aggiungo trasformate variabili shift invariant 0.919684
#tolgo variabili con >.9 nan , aggiungo trasformate variabili shift invariant e rfecv 0.920157
#tolgo variabili con >.9 nan ,aggiungo trasformate variabili shift invariant e rfecv 0.9207036212788067  
idxs = np.arange(X.shape[0])
trn_idx,val_idx = [(idxs[:400000], idxs[450000:])][0]

train_x, train_y = X.iloc[trn_idx,:], y.iloc[trn_idx]
valid_x, valid_y = X.iloc[val_idx,:], y.iloc[val_idx]

model = lgb.train(lgb_params,lgb.Dataset(train_x, label=train_y),100000,lgb.Dataset(valid_x, label=valid_y),verbose_eval=50)
num_boost = model.best_iteration
Pred=model.predict(valid_x)
print('AUC-ROC : {}  ; Iteration : {}\n'.format(roc_auc_score(valid_y,Pred),model.best_iteration))
del train_x,train_y,valid_x,valid_y
gc.collect()
del model, Pred
gc.collect()

# num_boost *= X.shape[0]/len(trn_idx)
num_boost = int(num_boost)
gc.collect()
################################################
# Final_Boost=num_boost
# print('AUC-ROC CV: {}  ; Iteration Finale : {} \n'.format(Score,Final_Boost))




Training until validation scores don't improve for 500 rounds.
[50]	valid_0's auc: 0.877104
[100]	valid_0's auc: 0.885155
[150]	valid_0's auc: 0.889653
[200]	valid_0's auc: 0.893151
[250]	valid_0's auc: 0.89671
[300]	valid_0's auc: 0.899941
[350]	valid_0's auc: 0.903322
[400]	valid_0's auc: 0.90631
[450]	valid_0's auc: 0.909517
[500]	valid_0's auc: 0.912373
[550]	valid_0's auc: 0.914562
[600]	valid_0's auc: 0.916461
[650]	valid_0's auc: 0.917701
[700]	valid_0's auc: 0.918621
[750]	valid_0's auc: 0.919302
[800]	valid_0's auc: 0.919852
[850]	valid_0's auc: 0.920149
[900]	valid_0's auc: 0.920257
[950]	valid_0's auc: 0.920382
[1000]	valid_0's auc: 0.920347
[1050]	valid_0's auc: 0.920233
[1100]	valid_0's auc: 0.920068
[1150]	valid_0's auc: 0.91992
[1200]	valid_0's auc: 0.919664
[1250]	valid_0's auc: 0.919315
[1300]	valid_0's auc: 0.918979
[1350]	valid_0's auc: 0.918691
[1400]	valid_0's auc: 0.918526
Early stopping, best iteration is:
[946]	valid_0's auc: 0.920414
AUC-ROC : 0.920413707556920

0

In [11]:
lgb_params = {
                    'objective':'binary',
                    'boosting_type':'gbdt',
                    'metric':'auc',
                    'n_jobs':-1,
                    'feature_fraction': 0.44301599784064954,
                    'lambda_l1': 0.7185712774952702,
                    'lambda_l2': 0.8036657945008269,
                    'learning_rate': 0.006820638087926107,
                    'min_data_in_leaf': int(122.18518093103775),
                    'min_gain_to_split': 0.8732382864345388,
                    'min_sum_hessian_in_leaf': 0.009332742523926576,
                    'num_leaves': int(274.4907722765963),
                    'max_depth': int(30.889651140632285),
                    'tree_learner':'serial',
                    'max_bin':255,
                    'seed': seed,
                }

model = lgb.train(lgb_params,lgb.Dataset(X, label=y),num_boost)
print('Finish final model')
sub['isFraud']=model.predict(X_test)
sub.to_csv('submission_ensemble.csv', index=False)

Finish final model
