In [1]:
import pandas as pd
import numpy as np
import os
import gc
from tqdm.notebook import tqdm
from sklearn.model_selection import StratifiedKFold
import pickle

from utilities import (
    RANDOM_STATE, TARGET_COL, CAT_TRESHOLD, N_FOLD, FOLD_STRAT_NAME, REDUCED_FOLD_NAME,
    reduce_mem_usage_sd
)
    

# Directory that holds the competition CSV files; 'train.csv' and 'test.csv'
# are read from it with os.path.join in the following cells.
INPUT_PATH = '../input/tabular-playground-series-oct-2021'

In [2]:
# Read the training CSV and discard its 'id' column.
train_csv_path = os.path.join(INPUT_PATH, 'train.csv')
train = pd.read_csv(train_csv_path).drop(columns='id')

In [3]:
# Read the test CSV and discard its 'id' column.
test_csv_path = os.path.join(INPUT_PATH, 'test.csv')
test = pd.read_csv(test_csv_path).drop(columns='id')

# Feature Engineering

In [4]:
# Every column of train except the target is treated as a feature.
FEATURE = [name for name in train.columns if name != TARGET_COL]

# Partition the features: a column with at most CAT_TRESHOLD distinct values
# goes to CAT_COL, every other column goes to NUMERIC_COL.
CAT_COL = []
NUMERIC_COL = []
for col in FEATURE:
    bucket = CAT_COL if train[col].nunique() <= CAT_TRESHOLD else NUMERIC_COL
    bucket.append(col)

gc.collect()

0

In [5]:
# Add per-row summary statistics (std / mean / median) over three column
# groups: all features, the numeric features and the categorical features.
# FEATURE, NUMERIC_COL and CAT_COL are the lists built before these columns
# are added, so the new columns never feed into one another.
row_stat_groups = [
    ('all', FEATURE),
    ('numeric', NUMERIC_COL),
    ('cat', CAT_COL),
]

for dataset in [train, test]:
    for prefix, group_cols in row_stat_groups:
        dataset[prefix + '_std'] = dataset[group_cols].std(axis=1).astype(np.float32)
        dataset[prefix + '_mean'] = dataset[group_cols].mean(axis=1).astype(np.float32)
        dataset[prefix + '_median'] = dataset[group_cols].median(axis=1).astype(np.float32)

        # Release the temporary column slices before handling the next group.
        gc.collect()

# Names of the derived columns; they must never be classified as categorical.
not_cat_col = [
    'all_std', 'all_mean', 'all_median',
    'numeric_std', 'numeric_mean', 'numeric_median',
    'cat_std', 'cat_mean', 'cat_median',
]

# Drop derived columns that could not be computed (they contain NaN).
# Writing `dataset = dataset.dropna(axis=1)` inside the loop would only rebind
# the loop variable and leave train/test untouched, so the check is done here
# and the same columns are removed from BOTH frames to keep them aligned.
nan_stat_cols = [
    col for col in not_cat_col
    if train[col].isna().any() or test[col].isna().any()
]
if nan_stat_cols:
    train = train.drop(columns=nan_stat_cols)
    test = test.drop(columns=nan_stat_cols)

gc.collect()

In [6]:
# Rebuild the feature lists from the current columns of train.
FEATURE = [name for name in train.columns if name != TARGET_COL]

# A feature is categorical when it is not one of the names in not_cat_col
# and has at most CAT_TRESHOLD distinct values.
excluded_names = set(not_cat_col)
CAT_COL = [
    name for name in FEATURE
    if name not in excluded_names and train[name].nunique() <= CAT_TRESHOLD
]

# Everything that is not categorical is numeric.
categorical_names = set(CAT_COL)
NUMERIC_COL = [name for name in FEATURE if name not in categorical_names]

gc.collect()

0

In [7]:
# Sanity checks on every categorical column.
for col in CAT_COL:
    # np.unique already returns its values sorted, so no extra sort is needed.
    train_un = np.unique(train[col])
    test_un = np.unique(test[col])

    # Train and test must expose exactly the same set of values.
    # (np.any on the element-wise comparison would pass as soon as a single
    # value matched, and is ill-defined when the two arrays differ in length.)
    assert np.array_equal(train_un, test_un), (
        f'{col}: train and test do not share the same category values'
    )

    # Every category value must be an integer (Python int or any NumPy integer).
    assert all(isinstance(x, (int, np.integer)) for x in train_un), (
        f'{col}: non-integer category value in train'
    )
    assert all(isinstance(x, (int, np.integer)) for x in test_un), (
        f'{col}: non-integer category value in test'
    )

gc.collect()

0

In [8]:
# Pass train through reduce_mem_usage_sd (helper imported from utilities),
# then cast test to the dtypes train ends up with.
train = reduce_mem_usage_sd(train)

# Keep only the dtypes of feature columns (the target is absent from FEATURE).
feature_names = set(FEATURE)
dtype_mapping = {
    name: dtype
    for name, dtype in train.dtypes.items()
    if name in feature_names
}

gc.collect()

test = test.astype(dtype_mapping)

  0%|          | 0/295 [00:00<?, ?it/s]

Mem. usage decreased from 2216.34 Mb to 987.05 Mb (55.5% reduction)


# Fold

In [9]:
# Assign every training row to one of N_FOLD stratified folds.
strat_fold = StratifiedKFold(n_splits = N_FOLD, random_state = RANDOM_STATE, shuffle = True)

# StratifiedKFold builds its splits from the labels alone, so a zero
# placeholder stands in for X instead of copying the feature matrix.
target_values = train[TARGET_COL].to_numpy()
placeholder_x = np.zeros(len(target_values))

# split() yields POSITIONAL indices. Collecting them in a plain array (rather
# than writing through the label-based .loc) stays correct whatever index
# `train` carries. Rows keep -1 until a fold claims them.
fold_id = np.full(len(target_values), -1, dtype=np.int64)

for i, (train_index, test_index) in enumerate(strat_fold.split(placeholder_x, target_values)):
    fold_id[test_index] = i

train[FOLD_STRAT_NAME] = fold_id

  This is separate from the ipykernel package so we can avoid doing imports until


# Reduced fold

In [10]:
# Build a reduced train/validation split from a finer stratified partition
# (N_FOLD * 4 parts): parts 0 and 1 form the reduced training set (value 0),
# part 2 forms the reduced test set (value 1), and every other row keeps -1.
strat_fold = StratifiedKFold(n_splits = N_FOLD * 4, random_state = RANDOM_STATE, shuffle = True)

# StratifiedKFold builds its splits from the labels alone, so a zero
# placeholder stands in for X instead of copying the feature matrix.
target_values = train[TARGET_COL].to_numpy()
placeholder_x = np.zeros(len(target_values))

# split() yields POSITIONAL indices, so they are written into a plain array
# rather than through the label-based .loc.
reduced_fold_id = np.full(len(target_values), -1, dtype=np.int64)

for i, (train_index, test_index) in enumerate(strat_fold.split(placeholder_x, target_values)):

    if i <= 1:
        # train
        reduced_fold_id[test_index] = 0
    elif i == 2:
        # test
        reduced_fold_id[test_index] = 1
    else:
        # The remaining parts are never used; stop generating splits.
        break

train[REDUCED_FOLD_NAME] = reduced_fold_id

# Save unscaled data as pickle

In [11]:
# Write both frames to disk before any scaling is applied.
train_unscaled_path = 'train_unscaled.pkl'
test_unscaled_path = 'test_unscaled.pkl'

train.to_pickle(train_unscaled_path)
test.to_pickle(test_unscaled_path)

gc.collect()

0

# Scale train and test, then save

In [12]:
# Standardise every feature with the mean / standard deviation measured on
# train, and apply the same statistics to test.
# Columns are processed one at a time to limit peak memory.

for col in tqdm(FEATURE):
    train[col] = train[col].astype(np.float32)
    test[col] = test[col].astype(np.float32)

    mean_col, std_col = train[col].mean(), train[col].std()

    # A constant column has zero standard deviation (and a degenerate one may
    # give NaN); dividing by it would fill the column with NaN/inf. Fall back
    # to a unit scale so such a column is only centred.
    if not std_col > 0:
        std_col = np.float32(1.0)

    train[col] = ((train[col] - mean_col)/(std_col)).astype(np.float32)
    test[col] = ((test[col] - mean_col)/(std_col)).astype(np.float32)

    gc.collect()

  0%|          | 0/294 [00:00<?, ?it/s]

In [13]:
# Write the scaled train and test frames to disk.
train_scaled_path = 'train_scaled.pkl'
test_scaled_path = 'test_scaled.pkl'

train.to_pickle(train_scaled_path)
test.to_pickle(test_scaled_path)

gc.collect()

0

# Save feature list

In [14]:
# Bundle the three feature-name lists under the keys 'feature',
# 'categorical' and 'numerical', then pickle the bundle to 'feature_dic.pkl'.
feature_dic = dict(
    feature=FEATURE,
    categorical=CAT_COL,
    numerical=NUMERIC_COL,
)

with open('feature_dic.pkl', 'wb') as pickle_file:
    pickle.dump(feature_dic, pickle_file)