## Import Packages

In [None]:
import os ## fxns for interacting with the OS
import pandas as pd ## data manipulation
import numpy as np ## mathematical fxns
import gc ## automatically releases memory when an object is no longer used
import matplotlib.pyplot as plt ## plotting
import tensorflow as tf ## deep learning
from tensorflow.keras import layers ## deep learning
from tensorflow import keras ## deep learning
from scipy import stats ## scientific computing and technical computing

In [None]:
## Show floats with two fixed decimals instead of scientific notation
pd.set_option('display.float_format', '{:.2f}'.format)

## Import Dataset

This competition's dataset (18.55 GB) is too large to load directly. Instead, we will use a copy of the training data that was converted to half precision and saved in pickle format so that it uses less memory.

In [None]:
%%time
## Number of feature columns and their names: f_0 ... f_299
n_features = 300
features = [f'f_{i}' for i in range(n_features)]
## NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a source you trust.
## The path is relative to the notebook's working directory (Kaggle input layout).
train = pd.read_pickle('../input/ubiquant-market-prediction-half-precision-pickle/train.pkl')
## Preview the first rows
train.head()

### Investment ID

In [None]:
## Pull the investment_id column out of the dataframe
## (pop() also removes the column from `train` in place)

investment_id = train.pop("investment_id")

print(f"Unique Investment IDs : {investment_id.nunique()}")



In [None]:
## First rows followed by summary statistics of the investment ids
print(investment_id.head(), investment_id.describe(), sep="\n")

In [None]:
## Drop Time Id from dataframe
## `del` removes the column in place without binding the popped Series to `_`:
## that binding kept the column alive in memory and shadowed IPython's
## last-output variable. The membership check makes the cell safe to re-run.
if "time_id" in train.columns:
    del train["time_id"]

In [None]:
## Select Target feature from dataframe
## pop() removes the "target" column from `train` in place and returns it as a Series,
## so `train` no longer contains the target after this cell runs.
y = train.pop("target")
## Preview the first target values
y.head()

## Create an IntegerLookup layer for investment_id input


An integer lookup layer is a preprocessing layer that maps integer features to contiguous ranges. It turns integer categorical values into an encoded representation that can be read by an Embedding layer or a Dense layer.

The integer lookup layer will be one of the two input branches for the multi-input keras model.

In [None]:
%%time
## Distinct investment ids present in the training data (the lookup vocabulary)
investment_ids = list(investment_id.unique())
## One more slot than the number of known ids.
## NOTE(review): presumably reserved for the out-of-vocabulary index that IntegerLookup
## adds by default -- confirm against the installed Keras version.
investment_id_size = len(investment_ids) + 1
investment_id_lookup_layer = layers.IntegerLookup(max_tokens=investment_id_size)
## Learn the id -> index mapping from the unique ids
investment_id_lookup_layer.adapt(pd.DataFrame({"investment_ids":investment_ids}))

## Make Tensorflow dataset

Define functions to create a dataset from input features and preprocess the input data.

In [None]:
import tensorflow as tf
def preprocess(X, y):
    """
    Per-element hook applied by ``make_dataset``.

    Currently a pass-through: returns the model inputs ``X`` and the
    target ``y`` unchanged as an ``(X, y)`` pair.
    """
    sample = (X, y)
    return sample
def make_dataset(feature, investment_id, y, batch_size=1024, mode="train"):
    """
    Build a batched ``tf.data.Dataset`` yielding ``((investment_id, feature), y)``.

    1. Create a source dataset from the in-memory input data.
    2. Apply ``preprocess`` to every element.
    3. In ``"train"`` mode, shuffle with a 4096-element buffer.
    4. Batch and prefetch.

    Args:
        feature: feature values, sliceable along the first axis.
        investment_id: investment ids aligned row-by-row with ``feature``.
        y: targets aligned row-by-row with ``feature``.
        batch_size: number of consecutive elements combined into one batch.
        mode: ``"train"`` enables shuffling; any other value keeps the
            original order and caches the batches.

    Returns:
        The prepared ``tf.data.Dataset``.
    """
    ds = tf.data.Dataset.from_tensor_slices(((investment_id, feature), y)) ## read elements from memory
    ds = ds.map(preprocess)

    if mode == "train":
        ## No cache() after shuffle(): a cache placed downstream of the shuffle
        ## freezes the first epoch's order and replays it on every later epoch.
        ## Without it the shuffle buffer is re-drawn on each pass over the data.
        ds = ds.shuffle(4096).batch(batch_size)
    else:
        ## The order is fixed here, so caching the batches is safe and saves
        ## re-slicing the data every time the dataset is iterated.
        ds = ds.batch(batch_size).cache()

    ## Allow later batches to be prepared while the current one is being processed
    return ds.prefetch(tf.data.experimental.AUTOTUNE)

## Modeling

The model architecture is a multi-input Keras network with two input branches. The first branch handles the investment IDs, while the second branch handles the remaining 300 anonymized features.

In [None]:
def get_model():
    """
    Build and compile the two-branch Keras regression model.

    Branch 1: investment id -> integer lookup -> 32-d embedding -> 3 x Dense(64).
    Branch 2: 300 features -> 3 x Dense(256).
    Both branches are concatenated and passed through an L2-regularised
    Dense head (512 -> 128 -> 32) that ends in a single linear output.

    Returns the model compiled with Adam(0.001), MSE loss and the
    mse / mae / mape / rmse metrics.
    """
    id_input = tf.keras.Input((1, ), dtype=tf.uint16)
    feat_input = tf.keras.Input((300, ), dtype=tf.float16)

    ## Investment-id branch: raw id -> contiguous index -> dense vector
    id_branch = investment_id_lookup_layer(id_input)
    id_branch = layers.Embedding(investment_id_size, 32, input_length=1)(id_branch)
    id_branch = layers.Reshape((-1, ))(id_branch)
    for _ in range(3):
        id_branch = layers.Dense(64, activation='swish')(id_branch)

    ## Feature branch
    feat_branch = feat_input
    for _ in range(3):
        feat_branch = layers.Dense(256, activation='swish')(feat_branch)

    ## Join both branches along the feature axis, then apply the regularised head
    merged = layers.Concatenate(axis=1)([id_branch, feat_branch])
    for units in (512, 128, 32):
        merged = layers.Dense(units, activation='swish', kernel_regularizer="l2")(merged)

    prediction = layers.Dense(1)(merged)

    model = tf.keras.Model(inputs=[id_input, feat_input], outputs=[prediction])
    model.compile(
        optimizer=tf.optimizers.Adam(0.001),
        loss='mse',
        metrics=['mse', "mae", "mape", keras.metrics.RootMeanSquaredError(name="rmse")],
    )

    return model

In [None]:
## Model summary and visualizing the layout
## This instance is built only for inspection; the cross-validation loop builds its own model.

model = get_model()
model.summary()
## Draw the layer graph, annotated with tensor shapes
keras.utils.plot_model(model, show_shapes=True)

## Cross Validation 

In [None]:
%%time
## Stratifying on investment_id keeps each investment's share of rows similar across folds.
from sklearn.model_selection import StratifiedKFold
## Create 5 folds
kfold = StratifiedKFold(5, shuffle=True, random_state=42)
models = []
for index, (train_indices, valid_indices) in enumerate(kfold.split(train, investment_id)):
    ## Split dataset.
    ## kfold.split() yields positional indices, so every object is sliced with .iloc
    ## (plain [] on a Series is label-based and only matches on a default RangeIndex).
    X_train, X_val = train.iloc[train_indices], train.iloc[valid_indices]
    investment_id_train = investment_id.iloc[train_indices]
    investment_id_val = investment_id.iloc[valid_indices]
    y_train, y_val = y.iloc[train_indices], y.iloc[valid_indices]
    train_ds = make_dataset(X_train, investment_id_train, y_train)
    valid_ds = make_dataset(X_val, investment_id_val, y_val, mode="valid")

    ## Call model
    model = get_model()

    ## Callbacks: keep only the best checkpoint on disk and stop training early
    ## if model performance has not improved for 10 epochs
    checkpoint = keras.callbacks.ModelCheckpoint(f"model_{index}", save_best_only=True)
    early_stop = keras.callbacks.EarlyStopping(patience=10)

    ## Fit model
    history = model.fit(train_ds, epochs=30, validation_data=valid_ds, callbacks=[checkpoint, early_stop])

    ## Reload the best checkpoint: this is the model used at inference time, so it is
    ## also the one that must be scored. `model` holds the last-epoch weights, which
    ## can be worse than the checkpoint.
    best_model = keras.models.load_model(f"model_{index}")
    models.append(best_model)

    ## Make predictions for validation set and get pearson correlation coefficient
    pearson_score = stats.pearsonr(best_model.predict(valid_ds).ravel(), y_val.values)[0]
    print('Pearson:', pearson_score)

    ## Plot the train/validation learning curve of each tracked metric
    for title, columns in (
        ("MSE", ["mse", "val_mse"]),
        ("MAE", ["mae", "val_mae"]),
        ("RMSE", ["rmse", "val_rmse"]),
    ):
        pd.DataFrame(history.history, columns=columns).plot()
        plt.title(title)
        plt.show()

    ## Remove unnecessary objects from memory
    del investment_id_train
    del investment_id_val
    del X_train
    del X_val
    del y_train
    del y_val
    del train_ds
    del valid_ds
    gc.collect()
    ## Only the first fold is trained; remove this `break` to train all 5 folds.
    break

## Submission

In [None]:
## Functions to pre-process test set

def preprocess_test(investment_id, feature):
    """
    Shape one test element like a training element.

    Returns ``((investment_id, feature), 0)``: the model inputs paired with a
    constant ``0`` in the target position.
    """
    model_inputs = (investment_id, feature)
    placeholder_target = 0
    return model_inputs, placeholder_target

def make_test_dataset(feature, investment_id, batch_size=1024):
    """
    Build the batched inference dataset for one test frame.

    Elements are sliced from ``(investment_id, feature)``, passed through
    ``preprocess_test``, batched, cached and prefetched.
    """
    return (
        tf.data.Dataset.from_tensor_slices((investment_id, feature))
        .map(preprocess_test)
        .batch(batch_size)
        .cache()
        .prefetch(tf.data.experimental.AUTOTUNE)
    )

def inference(models, ds):
    """
    Average the predictions of several models on one dataset.

    Calls ``predict(ds)`` on every model in ``models`` and returns the
    element-wise mean of the resulting prediction arrays.
    """
    per_model_preds = [member.predict(ds) for member in models]
    return np.mean(per_model_preds, axis=0)

In [None]:
## Call Kaggle API to make predictions
## NOTE(review): `ubiquant` is not imported anywhere else in this notebook; presumably it is
## provided by the competition runtime -- confirm it is available before running elsewhere.
import ubiquant
env = ubiquant.make_env()
iter_test = env.iter_test() 
## Each iteration yields a test frame and the matching prediction frame to fill in
for (test_df, sample_prediction_df) in iter_test:
    ds = make_test_dataset(test_df[features], test_df["investment_id"])
    ## Mean prediction over all models collected in `models`
    sample_prediction_df['target'] = inference(models, ds)
    ## Hand the filled-in frame back to the API
    env.predict(sample_prediction_df)