In [1]:
import numpy as np
import pandas as pd
import datatable as dt
from tqdm.auto import tqdm

import tensorflow as tf
from tensorflow.keras import layers

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import QuantileTransformer

# Read the train and test datasets

In [2]:
# Let's take only the first 100,000 samples from the training data and the first 50,000 from the test data.
train_df = dt.fread('../input/tabular-playground-series-oct-2021/train.csv').to_pandas().iloc[:100000, 1:]
test_df = dt.fread('../input/tabular-playground-series-oct-2021/test.csv').to_pandas().iloc[:50000, 1:]

In [3]:
# Detect the numerical and categorical features.

cat_feats = [col for col in train_df.columns if train_df[col].nunique() <= 20 and col != 'target']
num_feats = [col for col in train_df.columns if col not in cat_feats + ['target']]
        
print(f'Number of categorical features: {len(cat_feats)}')
print(f'Number of numerical features: {len(num_feats)}')

Number of categorical features: 45
Number of numerical features: 240


# Denoising Autoencoder (DAE) for Feature Engineering

#### **DAE** is only used with numerical features. There are other techniques to tackle categorical features.

This technique can make use of unlabeled data to train in an unsupervised manner. Hence we'll concatenate the training and test datasets.

In [4]:
# First standardize the numerical features in the dataset with the quanitle transformer.
# The quantile transformer has proven much better performance compare to standard and max-min scaler.

all_data = np.concatenate([train_df[num_feats].values, test_df[num_feats]])

scaler = QuantileTransformer(output_distribution='normal', random_state=123)
scaled_num_all_data = scaler.fit_transform(all_data)

In [5]:
# split the dataset
scaled_train_arr, scaled_valid_arr = train_test_split(scaled_num_all_data, random_state=123, shuffle=True, test_size=0.2)

## TensorFlow Preprocessing Pipeline

## Method 1: DAE with Masking

In [7]:
# x will be used for both the input and the label
def preprocess(x):
    x = tf.cast(x, dtype=tf.float32)
    return (x, x)

scaled_num_train_tf = tf.data.Dataset.from_tensor_slices(scaled_train_arr).map(preprocess).\
                      shuffle(512).batch(1024).prefetch(-1)

scaled_num_valid_tf = tf.data.Dataset.from_tensor_slices(scaled_valid_arr).map(preprocess).\
                      batch(1024).prefetch(-1)

#### For this method, use the dropout layer to randomly set some feature values to zero. Make sure mask a sufficient proportion of the dataset to make the **DAE**'s task harder and force it to learn better representation.

In [10]:
def dae_model(num_hidden_layers, num_hidden_units, activation='relu', input_dropout_rate=0.5, hidden_dropout_rate=0.3):
    inputs = layers.Input(shape=scaled_train_arr.shape[-1])
    x = layers.Dropout(input_dropout_rate)(inputs)
    
    for i in range(num_hidden_layers):
        x = layers.Dense(num_hidden_units[i], activation=activation)(x)
        if i < (num_hidden_layers - 2):
            x = layers.Dropout(hidden_dropout_rate)(x)
    
    output = layers.Dense(scaled_train_arr.shape[-1], activation='linear')(x)
    # consturct the model
    model = tf.keras.Model(inputs=inputs, outputs=output)
    
    # configure the model
    model.compile(loss='mse',
                  optimizer='adam')
    return model

tf.keras.backend.clear_session()
model = dae_model(3, [512, 128, 512], 'relu', 0.6, 0.3)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 240)]             0         
_________________________________________________________________
dropout (Dropout)            (None, 240)               0         
_________________________________________________________________
dense (Dense)                (None, 512)               123392    
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)               65664     
_________________________________________________________________
dense_2 (Dense)              (None, 512)               66048     
_________________________________________________________________
dense_3 (Dense)              (None, 240)               123120

## Train the model

In [12]:
tf.keras.backend.clear_session()
model = dae_model(3, [512, 128, 512], 'relu', 0.6, 0.3)

model_checkpoint = tf.keras.callbacks.ModelCheckpoint(
                                                'best_weights.h5',
                                                monitor="val_loss",
                                                verbose=1,
                                                save_best_only=True,
                                                save_weights_only=True,
                                                mode="min")

model.fit(scaled_num_train_tf,
          validation_data=scaled_num_train_tf,
          epochs=20,
          callbacks=[model_checkpoint])

Epoch 1/20

Epoch 00001: val_loss improved from inf to 0.93759, saving model to best_weights.h5
Epoch 2/20

Epoch 00002: val_loss improved from 0.93759 to 0.83201, saving model to best_weights.h5
Epoch 3/20

Epoch 00003: val_loss improved from 0.83201 to 0.78073, saving model to best_weights.h5
Epoch 4/20

Epoch 00004: val_loss improved from 0.78073 to 0.76007, saving model to best_weights.h5
Epoch 5/20

Epoch 00005: val_loss improved from 0.76007 to 0.74967, saving model to best_weights.h5
Epoch 6/20

Epoch 00006: val_loss improved from 0.74967 to 0.74311, saving model to best_weights.h5
Epoch 7/20

Epoch 00007: val_loss improved from 0.74311 to 0.73932, saving model to best_weights.h5
Epoch 8/20

Epoch 00008: val_loss improved from 0.73932 to 0.73689, saving model to best_weights.h5
Epoch 9/20

Epoch 00009: val_loss improved from 0.73689 to 0.73487, saving model to best_weights.h5
Epoch 10/20

Epoch 00010: val_loss improved from 0.73487 to 0.73394, saving model to best_weights.h5
Epo

<keras.callbacks.History at 0x7fc4003a2690>

# Method 2: by Swapping Feature Values

In [14]:
def swapping(input_arr, p):
    arr = input_arr.copy()
    rows, cols = arr.shape
    idx = range(rows)
    swap_rows = round(rows*p)
    for i in tqdm(range(cols)):
        col_vals = np.random.permutation(arr[:, i])
        swap_idx = np.random.choice(idx, size = swap_rows)
        arr[swap_idx, i] = np.random.choice(col_vals, size = swap_rows) 
    return arr

In [15]:
num_epochs = 20
best_loss = np.inf

tf.keras.backend.clear_session()
model = dae_model(3, [512, 128, 512], 'relu', 0.0, 0.3)

for t in range(num_epochs):
    print(f'Epoch {t+1}')
    print('------------')
    # swap every epoch to make the task harder and use a fraction of 0.15
    corrupted_train_data = swapping(scaled_train_arr, 0.15)
    corrupted_valid_data = swapping(scaled_valid_arr, 0.15)
    
    scaled_num_train_tf = tf.data.Dataset.from_tensor_slices((corrupted_train_data, scaled_train_arr)).\
                      shuffle(512).batch(1024).prefetch(-1)

    scaled_num_valid_tf = tf.data.Dataset.from_tensor_slices((corrupted_valid_data, scaled_valid_arr)).\
                          batch(1024).prefetch(-1)
    
    
    model.fit(scaled_num_train_tf, epochs=1)
    
    valid_loss = model.evaluate(scaled_num_valid_tf)
    print(f'Valid Loss: {valid_loss:>.7f}')
    if valid_loss < best_loss:
        model.save_weights('best_weights_mothod_2.h5')
        print('Saving Best Weights')

Epoch 1
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.9815397
Saving Best Weights
Epoch 2
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.9138972
Saving Best Weights
Epoch 3
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8607188
Saving Best Weights
Epoch 4
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8409727
Saving Best Weights
Epoch 5
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8299596
Saving Best Weights
Epoch 6
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8239250
Saving Best Weights
Epoch 7
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8201698
Saving Best Weights
Epoch 8
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8165287
Saving Best Weights
Epoch 9
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8143964
Saving Best Weights
Epoch 10
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8139697
Saving Best Weights
Epoch 11
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8116671
Saving Best Weights
Epoch 12
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8119423
Saving Best Weights
Epoch 13
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8116335
Saving Best Weights
Epoch 14
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8107632
Saving Best Weights
Epoch 15
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8098555
Saving Best Weights
Epoch 16
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8101956
Saving Best Weights
Epoch 17
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8101475
Saving Best Weights
Epoch 18
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8097773
Saving Best Weights
Epoch 19
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8101377
Saving Best Weights
Epoch 20
------------


  0%|          | 0/240 [00:00<?, ?it/s]

  0%|          | 0/240 [00:00<?, ?it/s]

Valid Loss: 0.8099627
Saving Best Weights
