## Import datasets to workspace

1. Competition dataset. 
2. Memory optimized parquet dataset.

The competition dataset is large (18.55 GB), so we will use a memory-optimized copy (3.63 GB) in Parquet format instead. Credit to @Rob Mulla for sharing the optimized dataset.

In [None]:
## Import Libraries
import pandas as pd
import os
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
import ubiquant
from sklearn.model_selection import KFold
from scipy.stats import pearsonr

In [None]:
## Display floats with two fixed decimals rather than scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
## Explore working environment: list the entries of the competition input directory
os.listdir('../input/ubiquant-market-prediction')


In [None]:
%%time
## Import Dataset
## Read the memory-optimized training set from parquet into `df` and preview the first rows.
df = pd.read_parquet('../input/ubiquant-parquet/train_low_mem.parquet')
df.head()

In [None]:
## Dataframe shape and size: print the frame summary (index range, dtypes, memory usage)
df.info()

In [None]:
## Features: show every column label of the training frame
df.columns

In [None]:
## Unique rows: number of distinct row_id values
df.row_id.nunique()

In [None]:
## Missing Values: total number of null cells over all columns
df.isnull().sum().sum()

In [None]:
## Feature columns: every column label except the identifiers and the target
f_col = df.columns.drop([
    'row_id',
    'time_id',
    'investment_id',
    'target',
])
print(len(f_col))
f_col

## Investment ID

The "f_#" columns are already distributed roughly like a standard normal, so the "investment_id" column is standardized as well to put it on a comparable scale for training.

To apply exactly the same transformation to the test dataset, the scaler is fitted once here and reused later.

In [None]:
## Summary statistics of the raw investment_id column (nothing is scaled in this cell)
print(df['investment_id'].describe())

In [None]:
## Summary statistics of how many rows each investment_id has
df['investment_id'].value_counts().describe()

In [None]:
## StandardScaler centres a feature on its mean and then divides by its
## standard deviation, which leaves the feature with unit variance.
## Fit it on the training investment ids and keep the fitted object in `scaler`.
scaler = StandardScaler().fit(df[['investment_id']])
scaler

## Data Pre-processing Function

In [None]:
def make_dataset(df):
    """
    Build the model input frame from a raw competition frame.

    Fxn to:
    1. Scale investment ID with the already fitted global `scaler`
    2. Concatenate scaled Investment IDs with anonymized features (`f_col`)

    The frame passed in is not modified: the scaled ids are built as a
    separate column instead of being written back into `df`, so calling
    this function twice on the same frame gives the same result.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'investment_id' and every column listed in `f_col`.

    Returns
    -------
    pandas.DataFrame
        The scaled 'investment_id' column followed by the `f_col` columns,
        carrying the same index as `df`.
    """
    # Pass a one-column frame so the column name matches what the scaler was fitted on.
    scaled_investment_id = scaler.transform(df[['investment_id']])
    # transform returns one value per row in a single column; flatten it and
    # attach df's index so concat lines the rows up with the features.
    investment_id = pd.Series(
        np.ravel(scaled_investment_id), index=df.index, name='investment_id'
    )
    data_x = pd.concat([investment_id, df[f_col]], axis=1)
    return data_x

### Change the data type
The notebook's memory limit is too small to work with the data in its raw types.

So, we convert the data to "float16".

We then split the dataset into input and output variables.

In [None]:
## Convert data types and pre process
## Cast every column of the frame to float16, then build the model inputs with make_dataset.
## NOTE(review): float16 represents integers exactly only up to 2048, and this cast is applied
## to investment_id (and all other non-feature columns) before scaling -- confirm that the id
## columns survive the cast without losing information.
df=df.astype('float16')
df_x = make_dataset(df)
df_x

In [None]:
## Glimpse of new scaled investment ID: summary statistics of the column after scaling
df_x['investment_id'].describe()

In [None]:
## Target variable: keep the 'target' column as a one-column DataFrame
df_y = pd.DataFrame(df['target'])
df_y

In [None]:
## Target feature distribution: summary statistics of the target
df_y.describe()

In [None]:
## Target feature plot: line plot of the target against the row index
df_y.plot()

In [None]:
## Delete raw data: drop the `df` name; the cells below only use df_x and df_y
del df

## Modeling

We will use a simple deep neural network.

The brief descriptions are as follows:

1. Use LeakyReLU activation.
An improved version of the ReLU activation function that keeps a small, non-zero slope for negative inputs.

2. Use BatchNormalization.
A method that makes neural networks faster and more stable to train by re-centering and re-scaling the inputs of each layer. Here it is placed before each activation layer.

3. Use Dropout.
To prevent overfitting.

4. Use kernel_initializer with 'he_normal'.
Initializers define how the initial random weights of Keras layers are set.
The 'he_normal' initializer works well with ReLU and its variants.
It draws samples from a truncated normal distribution centered on 0 with stddev = sqrt(2 / fan_in), where fan_in is the number of input units in the weight tensor.

5. Use ExponentialDecay scheduling.
When training a model, it is often useful to lower the learning rate as the training progresses. This schedule applies an exponential decay function to an optimizer step, given a provided initial learning rate. It can help improve performance.

6. Use ModelCheckpoint.
To keep the best-performing model, we pass ModelCheckpoint through the callbacks parameter.

In [None]:
def tm_model(n_features = None):

    """
    Model architecture definition.

    Builds and compiles a fully connected regression network made of seven
    Dense -> BatchNormalization -> LeakyReLU(0.1) blocks with
    64, 128, 256, 512, 256, 128 and 8 units, Dropout(0.4) after the fifth
    and the seventh block, and a single-unit Dense output.

    Parameters
    ----------
    n_features : int, optional
        Width of the input layer. When omitted, the number of columns of the
        global `df_x` is used, which matches the zero-argument call.

    Returns
    -------
    tf.keras.Model
        Compiled with 'mse' loss, a RootMeanSquaredError metric and an Adam
        optimizer whose learning rate follows an ExponentialDecay schedule.
    """
    if n_features is None:
        n_features = df_x.shape[1]

    # (units, dropout rate or None) for each hidden block, in forward order.
    hidden_blocks = [
        (64, None),
        (128, None),
        (256, None),
        (512, None),
        (256, 0.4),
        (128, None),
        (8, 0.4),
    ]

    inputs_ = tf.keras.Input(shape = (n_features,))

    x = inputs_
    for units, dropout_rate in hidden_blocks:
        x = tf.keras.layers.Dense(units, kernel_initializer = 'he_normal')(x)
        # Normalization sits between the linear layer and its activation.
        x = tf.keras.layers.BatchNormalization()(x)
        x = tf.keras.layers.LeakyReLU(0.1)(x)
        if dropout_rate is not None:
            x = tf.keras.layers.Dropout(dropout_rate)(x)

    outputs_ = tf.keras.layers.Dense(1)(x)

    model = tf.keras.Model(inputs = inputs_, outputs = outputs_)

    rmse = tf.keras.metrics.RootMeanSquaredError()

    # Learning rate starts at 0.003 and decays by a factor of 0.98 per 9700 optimizer steps.
    learning_sch = tf.keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate = 0.003,
        decay_steps = 9700,
        decay_rate = 0.98)
    adam = tf.keras.optimizers.Adam(learning_rate = learning_sch)

    # `metrics` is documented as a list of metrics, so wrap the single metric.
    model.compile(loss = 'mse', metrics = [rmse], optimizer = adam)
    return model

tm_model().summary()

In [None]:
## Plot Model: draw the layer graph with tensor shapes (a fresh model is built just for the plot)
tf.keras.utils.plot_model(tm_model(),show_shapes=True,expand_nested=True)

## KFold Strategy

In [None]:
## Split data into 5 folds with shuffling (random_state fixed at 2022 so the folds are repeatable)
## NOTE(review): the split is made per row and does not look at time_id or investment_id;
## if rows are time-ordered, every validation fold shares periods with its training fold -- confirm this is intended.
kfold_generator = KFold(n_splits =5, shuffle=True, random_state = 2022)
kfold_generator

## Model Fitting

In [None]:
%%time
callbacks = tf.keras.callbacks.ModelCheckpoint('tm_model.h5', save_best_only = True)
for train_index, val_index in kfold_generator.split(df_x, df_y):
    # Split training dataset.
    train_x, train_y = df_x.iloc[train_index], df_y.iloc[train_index]
    # Split validation dataset.
    val_x, val_y = df_x.iloc[val_index], df_y.iloc[val_index]
    # Make dataset.
    tf_train = tf.data.Dataset.from_tensor_slices((train_x, train_y)).shuffle(2022).batch(1024, drop_remainder=True).prefetch(1)
    tf_val = tf.data.Dataset.from_tensor_slices((val_x, val_y)).shuffle(2022).batch(1024, drop_remainder=True).prefetch(1)
    
    # Load model
    model = tm_model()
    
    # Model fitting
    
    ## Initial run with 5 epochs (epochs should be increased)
    
    model.fit(tf_train, callbacks = callbacks, epochs = 20, #### change the epochs into more numbers.
             validation_data = (tf_val), shuffle=True)
    # Delete tensor dataset and model for avoiding memory exploring.
    del tf_train
    del tf_val
    del model

In [None]:
# Reload the checkpoint written during training and answer the competition test iterator.
best_model = tf.keras.models.load_model('tm_model.h5')
env = ubiquant.make_env()
iter_test = env.iter_test()
for (test_df, sample_prediction_df) in iter_test:
    # Same preprocessing as for training: scaled investment_id + feature columns.
    test_x = make_dataset(test_df)
    # The model has a single output unit, so predict gives one column per row;
    # flatten it to one value per row before storing it in the 'target' column.
    # verbose=0 keeps the progress bar from being printed on every iteration.
    sample_prediction_df['target'] = best_model.predict(test_x, verbose=0).ravel()
    env.predict(sample_prediction_df)