In [1]:
import pandas as pd
import numpy as np
import os
import gc
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
import pickle

from utilities import (
    RANDOM_STATE, TARGET_COL, CAT_TRESHOLD, N_FOLD, FOLD_STRAT_NAME, REDUCED_FOLD_NAME,
    reduce_mem_usage_sd
)
    
# Number of bins each numeric feature is discretised into; it sizes both the
# edge list and the label list handed to pd.cut in the feature-engineering cell.
MAX_BINS = 256
# Directory that train.csv and test.csv are read from.
INPUT_PATH = '../input/tabular-playground-series-oct-2021'

In [2]:
# Read the training table from INPUT_PATH and discard the 'id' column.
train = pd.read_csv(os.path.join(INPUT_PATH, 'train.csv')).drop(columns = 'id')

In [3]:
# Read the test table from INPUT_PATH and discard the 'id' column.
test = pd.read_csv(os.path.join(INPUT_PATH, 'test.csv')).drop(columns = 'id')

# Feature Engineering

In [4]:
# Every column except the target is a predictor.
FEATURE = [column for column in train.columns if column != TARGET_COL]

# Partition the predictors by cardinality in a single pass: a column with at
# most CAT_TRESHOLD distinct values is treated as categorical, everything
# else as numeric. Column order from FEATURE is preserved in both lists.
CAT_COL, NUMERIC_COL = [], []
for column in FEATURE:
    if train[column].nunique() <= CAT_TRESHOLD:
        CAT_COL.append(column)
    else:
        NUMERIC_COL.append(column)

gc.collect()

0

In [5]:
%%time

n = 0
bins_list = []

bins_list.append(-np.inf)

for i in range(1,MAX_BINS):
    n += 1./MAX_BINS
    bins_list.append(n)

bins_list.append(np.inf)

labels = [i for i in range(MAX_BINS)]

for col in tqdm(NUMERIC_COL):
    train[col] = pd.cut(train[col], bins=bins_list, labels=labels).values
    test[col] = pd.cut(test[col], bins=bins_list, labels=labels).values
    
train.head()


  0%|          | 0/240 [00:00<?, ?it/s]

CPU times: user 57.8 s, sys: 58.2 s, total: 1min 55s
Wall time: 1min 57s


Unnamed: 0,f0,f1,f2,f3,f4,f5,f6,f7,f8,f9,...,f276,f277,f278,f279,f280,f281,f282,f283,f284,target
0,52,105,45,57,108,121,105,156,136,37,...,0,1,0,0,0,0,0,0,0,1
1,46,121,3,54,158,113,58,175,72,61,...,0,1,0,0,0,0,0,0,0,1
2,46,78,83,53,155,79,126,192,137,73,...,0,0,0,1,1,0,0,0,0,1
3,46,126,2,57,194,112,110,198,123,66,...,0,0,0,0,1,0,0,0,0,1
4,45,126,3,140,160,143,29,143,19,40,...,0,1,1,0,1,0,0,1,0,1


In [6]:
# Cast every feature column to int32 and validate the result.
for col in FEATURE:
    # Missing values must be detected *before* the cast: an int32 column can
    # never hold NaN, so a NaN check placed after the cast could not fail.
    assert not train[col].isna().any(), f'{col}: missing values in train'
    assert not test[col].isna().any(), f'{col}: missing values in test'

    train[col] = train[col].astype(np.int32)
    test[col] = test[col].astype(np.int32)

    #every integer
    assert np.issubdtype(train[col].dtype, np.integer), f'{col}: train is not integer typed'
    assert np.issubdtype(test[col].dtype, np.integer), f'{col}: test is not integer typed'

gc.collect()

0

# Fold

In [7]:
#DEFINE FOLD
# Assign each training row the id (0 .. N_FOLD - 1) of the stratified fold in
# which it is held out.
strat_fold = StratifiedKFold(n_splits = N_FOLD, random_state = RANDOM_STATE, shuffle = True)

# split() yields *positional* row indices, so they are written into a plain
# numpy array rather than through label-based .loc, which is only equivalent
# when the frame carries a default RangeIndex.
fold_id = np.full(len(train), -1, dtype = np.int64)

# Only y drives the stratification; X is needed for its length alone, so a
# zero placeholder avoids materialising a copy of the whole feature frame.
placeholder_X = np.zeros(len(train))

for i, (train_index, test_index) in enumerate(strat_fold.split(placeholder_X, train[TARGET_COL])):
    fold_id[test_index] = i

train[FOLD_STRAT_NAME] = fold_id

  This is separate from the ipykernel package so we can avoid doing imports until


# Reduced Fold

In [8]:
#DEFINE FOLD
# Build a reduced split from a finer stratified partition (N_FOLD * 4 folds):
#   held-out rows of folds 0 and 1 -> label 0  (marked "train" by the author)
#   held-out rows of fold 2        -> label 1  (marked "test" by the author)
#   every other row                -> label -1 (unused)
strat_fold = StratifiedKFold(n_splits = N_FOLD * 4, random_state = RANDOM_STATE, shuffle = True)

# split() yields *positional* row indices, so they are written into a plain
# numpy array rather than through label-based .loc, which is only equivalent
# when the frame carries a default RangeIndex.
reduced_fold_id = np.full(len(train), -1, dtype = np.int64)

# Only y drives the stratification; X is needed for its length alone, so a
# zero placeholder avoids materialising a copy of the whole feature frame.
placeholder_X = np.zeros(len(train))

for i, (train_index, test_index) in enumerate(strat_fold.split(placeholder_X, train[TARGET_COL])):

    #train
    if i <= 1:
        reduced_fold_id[test_index] = 0

    #test
    elif i == 2:
        reduced_fold_id[test_index] = 1

    # Folds beyond the third are never used; stop instead of iterating them.
    else:
        break

train[REDUCED_FOLD_NAME] = reduced_fold_id

# Save unscaled data as pickle

In [9]:
# Persist both processed frames to the working directory, train first.
for frame, pickle_name in ((train, 'train_unscaled.pkl'), (test, 'test_unscaled.pkl')):
    frame.to_pickle(pickle_name)

gc.collect()

0

In [10]:
# Bundle the three column lists under fixed keys and pickle them next to the
# data frames.
feature_dic = dict(
    feature = FEATURE,
    categorical = CAT_COL,
    numerical = NUMERIC_COL,
)

with open('feature_dic.pkl', 'wb') as handle:
    pickle.dump(feature_dic, handle)