# Categorical Features CV Encoding

### Explanation: https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b

In [0]:
# Mount Google Drive at /content/gdrive; every CSV read or written by this
# notebook lives under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
!pip install category_encoders

In [0]:
# General imports
import numpy as np
import pandas as pd
import os, sys, gc, warnings, random, datetime, time, multiprocessing
import pickle

from sklearn import metrics
from sklearn.model_selection import train_test_split, GroupKFold
from sklearn.preprocessing import LabelEncoder
import category_encoders as ce

from tqdm import tqdm

import math
# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Widen the pandas display limits so wide frames are shown without truncation.
pd.set_option('display.max_columns', 1000)
pd.set_option('display.max_rows', 500)

In [0]:
# Seeder
def seed_everything(seed=0):
    """Seed the random generators used by this notebook.

    Seeds the stdlib ``random`` module and NumPy's global generator with
    ``seed`` and stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment
    variable.

    :seed to make all processes deterministic     # type: int
    """
    seed_text = str(seed)
    os.environ['PYTHONHASHSEED'] = seed_text
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)
    
# Memory Reducer
def memory_usage_mb(df, *args, **kwargs):
    """Return the summed memory usage of ``df`` in MB (1 MB = 1024**2 bytes).

    Any extra positional or keyword arguments are forwarded unchanged to
    ``df.memory_usage``.
    """
    bytes_per_mb = 1024 ** 2
    usage_in_bytes = df.memory_usage(*args, **kwargs)
    return usage_in_bytes.sum() / bytes_per_mb

def reduce_mem_usage(df, deep=True, verbose=False, categories=False):
    """Downcast the columns of ``df`` to lighter dtypes and print the saving.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are downcast.
    deep : bool
        Forwarded to ``memory_usage_mb`` for the before/after measurements.
    verbose : bool
        When True, print one line per column whose dtype changed.
    categories : bool
        When True, ``object`` columns are additionally converted to
        ``category``. Numeric columns are downcast regardless of this flag.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after conversion.
    """
    # Numeric dtypes worth shrinking. int8 and float16 are not listed because
    # they cannot be reduced further; float32 is not listed because the only
    # lighter float, float16, has too little precision.
    numeric2reduce = ["int16", "int32", "int64", "float64"]
    start_mem = memory_usage_mb(df, deep=deep)

    # `.items()` instead of `.iteritems()`: the latter was removed in pandas 2.0.
    for col, col_type in df.dtypes.items():
        best_type = None
        # The two branches are siblings on purpose: asking for categories must
        # not switch off the numeric downcast.
        if categories and col_type == "object":
            df[col] = df[col].astype("category")
            best_type = "category"
        elif col_type in numeric2reduce:
            downcast = "integer" if "int" in str(col_type) else "float"
            df[col] = pd.to_numeric(df[col], downcast=downcast)
            best_type = df[col].dtype.name
        # Log the conversion performed.
        if verbose and best_type is not None and best_type != str(col_type):
            print(f"Column '{col}' converted from {col_type} to {best_type}")

    end_mem = memory_usage_mb(df, deep=deep)
    diff_mem = start_mem - end_mem
    # A frame that measures 0 bytes has nothing to reduce; report 0% instead
    # of dividing by zero.
    percent_mem = 100 * diff_mem / start_mem if start_mem else 0.0
    print(f"Memory usage decreased from"
          f" {start_mem:.2f}MB to {end_mem:.2f}MB"
          f" ({diff_mem:.2f}MB, {percent_mem:.2f}% reduction)")

    return df

# Check memory
def show_mem_usage():
    '''Displays memory usage from inspection
    of global variables in this notebook.

    Walks the global namespace of the caller, measures every object once
    (objects bound to several names are keyed by ``id``), prints one line per
    object larger than 1e6 bytes with the names bound to it, then prints the
    grand total. Sizes are reported in units of 1e6 bytes.
    '''
    caller_globals = sys._getframe(1).f_globals
    # pandas.Panel no longer exists in current pandas; only apply the
    # workaround when the running pandas still provides the class.
    panel_cls = getattr(pd, 'Panel', None)
    sizes_by_id = {}
    for name, obj in list(caller_globals.items()):
        # for pandas dataframes (instances only: a class such as DataFrame
        # also exposes `memory_usage`, but it cannot be called unbound)
        if hasattr(obj, 'memory_usage') and not isinstance(obj, type):
            mem = obj.memory_usage(deep=True)
            if not np.isscalar(mem):
                mem = mem.sum()
            sizes_by_id.setdefault(id(obj), [mem]).append(name)
            # Already recorded; falling through would append the name twice.
            continue
        # work around for a bug
        if panel_cls is not None and isinstance(obj, panel_cls):
            obj = obj.values
        sizes_by_id.setdefault(id(obj), [sys.getsizeof(obj)]).append(name)
    total = 0
    for value, *names in sizes_by_id.values():
        if value > 1e6:
            print(names, "%.3fMB" % (value / 1e6))
        total += value
    print("%.3fMB" % (total / 1e6))

In [0]:
# DATA LOAD

print('Load Data')
train_df = pd.read_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/train.csv')

# Give the test frame a constant isFraud column of zeros so it can be
# column-selected with the same list as train (it is dropped again later).
test_df = pd.read_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/test.csv').assign(isFraud=0)

print('Shape control:\nTrain:', train_df.shape, '\nTest:',test_df.shape)

Load Data
Shape control:
Train: (590540, 357) 
Test: (506691, 356)


In [0]:
# Turn +/-inf into NaN first, then fill every missing value with the -10000
# sentinel, for both frames.
train_df = train_df.replace([np.inf, -np.inf], np.nan).fillna(-10000)
test_df = test_df.replace([np.inf, -np.inf], np.nan).fillna(-10000)

In [0]:
# Full list
# Columns handed to cv_encoding, grouped by family (order is preserved).
categorical_features = [
    # M flags and M2/M3 combinations
    'M2',
    'M2__M3_0', 'M2__M3_1', 'M2__M3_2', 'M2__M3_3',
    'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9',
    # device
    'DeviceInfo_version', 'DeviceInfo_device', 'DeviceInfo',
    # product
    'ProductCD', 'product_type',
    # cards
    'card1', 'card2', 'card3', 'card4', 'card5', 'card6',
    'card2__dist1', 'card1__card5',
    # addresses
    'addr1', 'addr2',
    'addr1__card1',
    # e-mail domains
    'R_emaildomain', 'P_emaildomain',
    'P_emaildomain__C2', 'DeviceInfo__P_emaildomain',
    'card5__P_emaildomain',
    # remaining interactions
    'card2__id_20', 'D11__DeviceInfo',
    'D8__D9', 'M2__M3',
]

In [0]:
# Keep only the id, the target and the categorical columns to encode.
tokeep = ['TransactionID', 'isFraud'] + list(categorical_features)
print(len(tokeep))

train_df = train_df.loc[:, tokeep]
# The constant isFraud column on the test side was only needed for the shared
# column selection; remove it again.
test_df = test_df.loc[:, tokeep].drop(columns=['isFraud'])
print('Shape control:\nTrain:', train_df.shape, '\nTest:',test_df.shape)

35
Shape control:
Train: (590540, 35) 
Test: (506691, 34)


In [0]:
# Fold assignment table, merged into train_df on TransactionID in the next cell.
# NOTE(review): presumably one row per TransactionID with a `fold` column
# (cv_encoding reads train_df['fold']) — confirm against the file.
groups = pd.read_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/groupkfolds_5.csv')

In [0]:
# Attach the fold assignment to the training rows; the left join keeps every
# row of train_df, and the shape print lets the row count be checked.
train_df = train_df.merge(groups, how='left',on='TransactionID')
print('Shape control:\nTrain:', train_df.shape, '\nTest:',test_df.shape)

Shape control:
Train: (590540, 36) 
Test: (506691, 34)


In [0]:
# Preview the first rows of the training frame passed to cv_encoding.
train_df.head()

Unnamed: 0,TransactionID,isFraud,M2,M2__M3_0,M2__M3_1,M2__M3_2,M2__M3_3,M3,M4,M5,M6,M7,M8,M9,DeviceInfo_version,DeviceInfo_device,DeviceInfo,product_type,card1,card2,card3,card5,card2__dist1,card1__card5,addr1,addr2,addr1__card1,R_emaildomain,P_emaildomain,P_emaildomain__C2,DeviceInfo__P_emaildomain,card5__P_emaildomain,card2__id_20,D11__DeviceInfo,D8__D9,fold
0,2987000,0,1.0,0,0,1,0,1.0,122947.0,0.0,1.0,-10000.0,-10000.0,-10000.0,1445,1310,2565,10645,13930.0,327.0,150.0,142.0,14264,2369,315.0,87.0,24866,49,49,4112,3969,487,3892,35,0,0
1,2987001,0,-10000.0,0,0,0,1,-10000.0,357789.0,1.0,1.0,-10000.0,-10000.0,-10000.0,1445,1310,2565,8545,2756.0,404.0,150.0,102.0,18027,5539,325.0,87.0,27328,49,16,1522,3953,43,5250,2241,0,0
2,2987002,0,1.0,0,0,1,0,1.0,357789.0,0.0,0.0,0.0,0.0,0.0,1445,1310,2565,10361,4664.0,490.0,150.0,166.0,22898,6527,330.0,87.0,30056,49,35,4555,3973,716,6774,241,0,0
3,2987003,0,-10000.0,0,0,0,1,-10000.0,357789.0,1.0,0.0,-10000.0,-10000.0,-10000.0,1445,1310,2565,10027,18130.0,567.0,150.0,117.0,29152,4894,476.0,87.0,40465,49,55,5820,3990,160,8614,2241,0,0
4,2987004,0,-10000.0,0,0,0,1,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,-10000.0,1251,882,1663,5631,4496.0,514.0,150.0,102.0,25396,6449,420.0,87.0,34391,49,16,1522,5079,43,7148,2544,0,0


In [0]:
# Preview the first rows of the test frame passed to cv_encoding.
test_df.head()

Unnamed: 0,TransactionID,M2,M2__M3_0,M2__M3_1,M2__M3_2,M2__M3_3,M3,M4,M5,M6,M7,M8,M9,DeviceInfo_version,DeviceInfo_device,DeviceInfo,product_type,card1,card2,card3,card5,card2__dist1,card1__card5,addr1,addr2,addr1__card1,R_emaildomain,P_emaildomain,P_emaildomain__C2,DeviceInfo__P_emaildomain,card5__P_emaildomain,card2__id_20,D11__DeviceInfo,D8__D9
0,3663549,1.0,0,1,0,0,0.0,-10000.0,-10000.0,0.0,1.0,1.0,1.0,1445,1310,2565,8769,10410.0,111.0,150.0,226.0,778,265,170.0,87.0,4507,49,16,2088,3953,1272,372,117,0
1,3663550,0.0,1,0,0,0,0.0,357789.0,-10000.0,0.0,-10000.0,-10000.0,-10000.0,1445,1310,2565,9962,4270.0,111.0,150.0,226.0,1586,6337,299.0,87.0,22663,49,2,700,3939,1258,372,595,0
2,3663551,1.0,0,1,0,0,0.0,357789.0,0.0,0.0,0.0,0.0,0.0,1445,1310,2565,6996,4476.0,574.0,150.0,226.0,29334,6436,472.0,87.0,39671,49,19,2669,3956,1275,8681,42,0
3,3663552,1.0,0,0,1,0,1.0,-10000.0,-10000.0,1.0,-10000.0,-10000.0,-10000.0,1445,1310,2565,8502,10990.0,360.0,150.0,166.0,15502,625,205.0,87.0,12166,49,16,1839,3953,696,4373,160,0
4,3663553,1.0,0,0,1,0,1.0,-10000.0,-10000.0,0.0,0.0,1.0,1.0,1445,1310,2565,10619,18020.0,452.0,150.0,117.0,19837,4827,264.0,87.0,17543,49,16,2088,3953,132,6007,135,0


In [0]:
def cv_encoding(train_df, test_df, categorical_features, target='isFraud', id_col='TransactionID', encoder='CatBoostEncoder', return_fold_col=False):
  '''
  This function encodes using CV Encoding technique link: https://medium.com/@pouryaayria/k-fold-target-encoding-dfe9a594874b.

  The train rows of each fold are transformed by an encoder fitted on the rows
  of all the other folds. Each test row receives, per feature, the mean of the
  out-of-fold encoded train values that share its category value.

  Input:
  - train_df: pandas df with just categorical columns, target column, id column and fold column. All numerical.
  - test_df: pandas df with just categorical columns and id column. All numerical.
  - categorical_features: list containing the categorical features to be encoded
  - target: str, specify the target column of train_df
  - id_col: str, specify the ID column
  - encoder: https://contrib.scikit-learn.org/categorical-encoding/index.html
    Encoding options:
    'BackwardDifferenceEncoder','BaseNEncoder','BinaryEncoder','CatBoostEncoder',
    'HashingEncoder','HelmertEncoder','JamesSteinEncoder','LeaveOneOutEncoder',
    'MEstimateEncoder','OneHotEncoder','OrdinalEncoder','SumEncoder',
    'PolynomialEncoder','TargetEncoder','WOEEncoder'
    The encoder must return exactly one column per feature, under the
    feature's original name; otherwise a KeyError is raised.
  - return_fold_col: bool, return or not the fold col in train_df

  Output:
  - new_train: id column (plus 'fold' when return_fold_col is True) followed by
    one '<feature>_<encoder name in lower case>' column per feature; rows are
    ordered fold by fold.
  - new_test: id column followed by the same encoded column names, one row per
    row of test_df. Category values that never occur in train_df stay NaN.

  Raises:
  - ValueError: unknown encoder name, or train_df has no fold to encode.
  - KeyError: the encoder output lacks one of the original feature columns.

  Prerequisite:
  - pandas
  - category_encoders (pip install category_encoders)
  '''
  import gc
  import time
  import pandas as pd
  import category_encoders as ce

  supported_encoders = (
    'BackwardDifferenceEncoder', 'BaseNEncoder', 'BinaryEncoder',
    'CatBoostEncoder', 'HashingEncoder', 'HelmertEncoder',
    'JamesSteinEncoder', 'LeaveOneOutEncoder', 'MEstimateEncoder',
    'OneHotEncoder', 'OrdinalEncoder', 'SumEncoder', 'PolynomialEncoder',
    'TargetEncoder', 'WOEEncoder',
  )
  # Fail early with a clear message instead of hitting an unbound `enc` later.
  if encoder not in supported_encoders:
    raise ValueError(
      'Unknown encoder %r; expected one of: %s' % (encoder, ', '.join(supported_encoders)))
  encoder_cls = getattr(ce, encoder)
  # CatBoostEncoder is the only encoder built with a non-default argument.
  encoder_kwargs = {'sigma': 2.0} if encoder == 'CatBoostEncoder' else {}
  suffix = '_' + str(encoder.lower())

  categorical_features = list(categorical_features)

  print('Starting DF Shapes:\nTrain:', train_df.shape, '\nTest:',test_df.shape)
  start = time.time()
  new_columns = [id_col, 'fold'] + categorical_features

  print('\nCV Encoding - Train set:')

  encoded_parts = []   # out-of-fold encoded rows, one frame per fold
  original_parts = []  # raw category values of the same rows, same order
  for i in sorted(train_df['fold'].unique()):
    #------------
    print('Processing Fold:', i)
    in_fold = train_df['fold'] == i
    fit_rows = train_df[~in_fold]
    fold_rows = train_df[in_fold]

    # A fresh encoder per fold: fit on every other fold, transform this one.
    enc = encoder_cls(cols=categorical_features, **encoder_kwargs)
    enc.fit(fit_rows[categorical_features], fit_rows[target])
    trans_fold = enc.transform(fold_rows[categorical_features]).reset_index(drop=True)

    missing = [col for col in categorical_features if col not in trans_fold.columns]
    if missing:
      raise KeyError(
        '%s did not return a column named like the input for: %s' % (encoder, missing))

    trans_fold[id_col] = fold_rows[id_col].reset_index(drop=True)
    trans_fold['fold'] = fold_rows['fold'].reset_index(drop=True)
    encoded_parts.append(trans_fold[new_columns])
    original_parts.append(fold_rows[categorical_features].reset_index(drop=True))

    del enc, trans_fold
    gc.collect()
    #------------

  if not encoded_parts:
    raise ValueError('train_df contains no fold to encode')

  # One concat instead of DataFrame.append in the loop (removed in pandas 2.0).
  new_train = pd.concat(encoded_parts)
  originals = pd.concat(original_parts, ignore_index=True)

  print('\nCV Encoding - Test set:')
  new_test = test_df[[id_col]].reset_index(drop=True)
  for col in categorical_features:
    #------------
    print('Column:', col)
    # Mean out-of-fold encoding per raw category value seen in train.
    encoded_values = pd.Series(new_train[col].to_numpy())
    category_means = encoded_values.groupby(originals[col].to_numpy()).mean()
    # Positional assignment: new_test has a fresh index, test_df may not.
    new_test[col + suffix] = test_df[col].map(category_means).to_numpy()
    #------------

  if not return_fold_col:
    new_train = new_train.drop(columns=['fold'])

  # Rename only the encoded features; the id (and 'fold', when kept) keep
  # their names.
  new_train = new_train.rename(columns={col: col + suffix for col in categorical_features})

  print('\nNew DF Shapes:\nNew Train:', new_train.shape, '\nNew Test:',new_test.shape)
  print('Processing time (min):', round((time.time() - start)/60,2))

  return new_train, new_test

In [0]:
# Encode with the function defaults: CatBoostEncoder, target 'isFraud',
# id column 'TransactionID', fold column not returned.
new_train, new_test = cv_encoding(train_df, test_df, categorical_features)

Starting DF Shapes:
Train: (590540, 36) 
Test: (506691, 34)

CV Encoding - Train set:
Processing Fold: 0
Processing Fold: 1
Processing Fold: 2
Processing Fold: 3
Processing Fold: 4

CV Encoding - Test set:
Column: M2
Column: M2__M3_0
Column: M2__M3_1
Column: M2__M3_2
Column: M2__M3_3
Column: M3
Column: M4
Column: M5
Column: M6
Column: M7
Column: M8
Column: M9
Column: DeviceInfo_version
Column: DeviceInfo_device
Column: DeviceInfo
Column: product_type
Column: card1
Column: card2
Column: card3
Column: card5
Column: card2__dist1
Column: card1__card5
Column: addr1
Column: addr2
Column: addr1__card1
Column: R_emaildomain
Column: P_emaildomain
Column: P_emaildomain__C2
Column: DeviceInfo__P_emaildomain
Column: card5__P_emaildomain
Column: card2__id_20
Column: D11__DeviceInfo
Column: D8__D9

New DF Shapes:
New Train: (590540, 34) 
New Test: (506691, 34)
Processing time (min): 2.25


In [0]:
# Preview the encoded training columns returned by cv_encoding.
new_train.head()

Unnamed: 0,TransactionID,M2_catboostencoder,M2__M3_0_catboostencoder,M2__M3_1_catboostencoder,M2__M3_2_catboostencoder,M2__M3_3_catboostencoder,M3_catboostencoder,M4_catboostencoder,M5_catboostencoder,M6_catboostencoder,M7_catboostencoder,M8_catboostencoder,M9_catboostencoder,DeviceInfo_version_catboostencoder,DeviceInfo_device_catboostencoder,DeviceInfo_catboostencoder,product_type_catboostencoder,card1_catboostencoder,card2_catboostencoder,card3_catboostencoder,card5_catboostencoder,card2__dist1_catboostencoder,card1__card5_catboostencoder,addr1_catboostencoder,addr2_catboostencoder,addr1__card1_catboostencoder,R_emaildomain_catboostencoder,P_emaildomain_catboostencoder,P_emaildomain__C2_catboostencoder,DeviceInfo__P_emaildomain_catboostencoder,card5__P_emaildomain_catboostencoder,card2__id_20_catboostencoder,D11__DeviceInfo_catboostencoder,D8__D9_catboostencoder
0,2987000,0.018476,0.037977,0.038605,0.017581,0.020214,0.017581,0.123302,0.028021,0.017838,0.054597,0.054601,0.054601,0.021137,0.021137,0.021137,0.033632,0.170132,0.102516,0.026553,0.049134,0.002517,0.286793,0.017173,0.026051,0.037759,0.021453,0.031198,0.025014,0.023066,0.025944,0.088982,0.029009,0.02608
1,2987001,0.06532,0.037977,0.038605,0.05653,0.06532,0.06532,0.036719,0.036532,0.017838,0.054597,0.054601,0.054601,0.021137,0.021137,0.021137,0.019807,0.049501,0.072488,0.026553,0.070733,0.105904,0.050363,0.025573,0.026051,0.000858,0.021453,0.0461,0.03065,0.020548,0.102892,0.034366,0.024443,0.02608
2,2987002,0.018476,0.037977,0.038605,0.017581,0.020214,0.017581,0.036719,0.028021,0.023485,0.019846,0.022342,0.031089,0.021137,0.021137,0.021137,0.021443,0.008443,0.025151,0.026553,0.011705,0.00944,0.008699,0.036423,0.026051,0.001452,0.021453,0.099258,0.039378,0.023181,0.00733,0.021178,0.009612,0.02608
3,2987003,0.06532,0.037977,0.038605,0.05653,0.06532,0.06532,0.036719,0.036532,0.023485,0.054597,0.054601,0.054601,0.021137,0.021137,0.021137,0.008497,0.014495,0.018538,0.026553,0.013112,0.018019,0.014491,0.034404,0.026051,0.044164,0.021453,0.025,0.046227,0.021409,0.005834,0.01915,0.024443,0.02608
4,2987004,0.06532,0.037977,0.038605,0.05653,0.06532,0.06532,0.021404,0.042347,0.093737,0.054597,0.054601,0.054601,0.00944,0.146737,0.00944,0.059379,0.037063,0.073028,0.026553,0.070733,0.088091,0.061045,0.032833,0.026051,0.037759,0.021453,0.0461,0.03065,0.037759,0.102892,0.002221,0.00944,0.02608


In [0]:
# Preview the encoded test columns returned by cv_encoding.
new_test.head()

Unnamed: 0,TransactionID,M2_catboostencoder,M2__M3_0_catboostencoder,M2__M3_1_catboostencoder,M2__M3_2_catboostencoder,M2__M3_3_catboostencoder,M3_catboostencoder,M4_catboostencoder,M5_catboostencoder,M6_catboostencoder,M7_catboostencoder,M8_catboostencoder,M9_catboostencoder,DeviceInfo_version_catboostencoder,DeviceInfo_device_catboostencoder,DeviceInfo_catboostencoder,product_type_catboostencoder,card1_catboostencoder,card2_catboostencoder,card3_catboostencoder,card5_catboostencoder,card2__dist1_catboostencoder,card1__card5_catboostencoder,addr1_catboostencoder,addr2_catboostencoder,addr1__card1_catboostencoder,R_emaildomain_catboostencoder,P_emaildomain_catboostencoder,P_emaildomain__C2_catboostencoder,DeviceInfo__P_emaildomain_catboostencoder,card5__P_emaildomain_catboostencoder,card2__id_20_catboostencoder,D11__DeviceInfo_catboostencoder,D8__D9_catboostencoder
0,3663549,0.017951,0.035159,0.025638,0.049593,0.019736,0.030205,0.018917,0.037932,0.023639,0.021777,0.016051,0.017619,0.020885,0.020885,0.020885,0.004978,0.000382,0.021629,0.024549,0.029567,0.01615,0.000587,0.035477,0.024135,0.000735,0.020765,0.043732,0.076602,0.02119,0.034338,0.019427,0.039921,0.024875
1,3663550,0.034732,0.034732,0.0357,0.049593,0.019736,0.030205,0.036583,0.037932,0.023639,0.047579,0.047582,0.047582,0.020885,0.020885,0.020885,0.029458,0.025026,0.021629,0.024549,0.029567,0.016156,0.024411,0.021138,0.024135,0.024651,0.020765,0.021573,0.026121,0.021774,0.020547,0.019427,0.008332,0.024875
2,3663551,0.017951,0.035159,0.025638,0.049593,0.019736,0.030205,0.036583,0.02634,0.023639,0.01913,0.021458,0.029436,0.020885,0.020885,0.020885,0.035442,0.012661,0.000878,0.024549,0.029567,0.007323,0.001034,0.025965,0.024135,0.001034,0.020765,0.053701,0.033779,0.018259,0.034606,0.000931,0.016647,0.024875
3,3663552,0.017951,0.035159,0.0357,0.016915,0.019736,0.016915,0.018917,0.037932,0.017017,0.047579,0.047582,0.047582,0.020885,0.020885,0.020885,0.084685,0.005922,0.010466,0.024549,0.010958,0.00059,0.002858,0.011298,0.024135,0.003066,0.020765,0.043732,0.036751,0.02119,0.010198,0.00933,0.003108,0.024875
4,3663553,0.017951,0.035159,0.0357,0.016915,0.019736,0.016915,0.018917,0.037932,0.023639,0.01913,0.016051,0.017619,0.020885,0.020885,0.020885,0.003281,0.010847,0.009401,0.024549,0.013481,0.003204,0.010114,0.018126,0.024135,0.011878,0.020765,0.043732,0.076602,0.02119,0.013171,0.008125,0.02202,0.024875


In [0]:
# LOAD LATEST VERSION
# Rebinds train_df / test_df to the *_all_feat tables, replacing the
# column-selected frames that were used for the encoding above.

print('Load Data')

train_df = pd.read_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/train_all_feat.csv')

test_df = pd.read_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/test_all_feat.csv')

Load Data


In [0]:
# Join the encoded columns on TransactionID, then remove the raw categorical
# columns they replace.
train_df = (
    train_df
    .merge(new_train, how='left', on='TransactionID')
    .drop(columns=categorical_features)
)
test_df = (
    test_df
    .merge(new_test, how='left', on='TransactionID')
    .drop(columns=categorical_features)
)

print('Final DF Shapes:\nFinal Train:', train_df.shape, '\nFinal Test:',test_df.shape)

In [0]:
# Persist the final frames to Drive; index=False leaves the row index out of
# the CSV files.
train_df.to_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/train_full_catencode.csv',index=False)
test_df.to_csv('../content/gdrive/My Drive/IEEE fraud Kaggle 2019/test_full_catencode.csv',index=False)