In [1]:
import pandas as pd
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import re
from utils import *
from layers import *
from AdvASLTM import *
import keras_tuner
from sklearn.preprocessing import RobustScaler, QuantileTransformer, PowerTransformer

2023-03-16 09:33:13.953588: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-16 09:33:16.687260: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/clement/miniconda3/envs/tf/lib/
2023-03-16 09:33:16.687488: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer_plugin.so.7'; dlerror: libnvinfer_plugin.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: :/home/clement/miniconda3/envs/tf/lib/


In [2]:
nrows = None
seq_len = 30
train_ratio = 0.7

x_df = pd.read_csv("input_training.csv", index_col="ID", nrows= nrows)
y_df = pd.read_csv("output_training_gmEd6Zt.csv", index_col="ID", nrows= nrows)
x_test_df = pd.read_csv("input_test.csv", index_col="ID")

x_df.sort_values(by="day", inplace= True)
x_test_df.sort_values(by="day", inplace= True)

y_df["reod"] = y_df["reod"] + 1


train_limit = int(train_ratio*len(x_df))

train_df = x_df.iloc[:train_limit].join(y_df.iloc[:train_limit]).copy(deep=True)
validation_df = x_df.iloc[train_limit:].join(y_df.iloc[train_limit:]).copy(deep=True)

In [3]:
r_filter = re.compile("r[0-9]+")
features_columns = list(filter(r_filter.match, train_df.columns))[-seq_len:]
preprocessed_features_columns = [f"preprocessed_{col}" for col in features_columns]

# Preprocess

In [4]:
# Deal with NaN
train_df.replace(np.nan, 0, inplace = True)
validation_df.replace(np.nan, 0, inplace = True)
x_test_df.replace(np.nan, 0, inplace = True)

# Scale
scaler = RobustScaler(unit_variance = True, with_centering= True).fit(train_df[features_columns])
train_df[preprocessed_features_columns] = scaler.transform(train_df[features_columns])
validation_df[preprocessed_features_columns] = scaler.transform(validation_df[features_columns])
x_test_df[preprocessed_features_columns] = scaler.transform(x_test_df[features_columns])

In [5]:
# Generate sequences
def generate_sequences(df, nb_sequences, nb_stocks, only_x = False):

    if not only_x: df_group_by_day = df[["day", "reod"] + preprocessed_features_columns].groupby("day").apply(lambda x : np.array(x))
    else:df_group_by_day = df[["day"] + preprocessed_features_columns].groupby("day").apply(lambda x : np.array(x))

    #df_group_by_day_y = df[["day","reod"]].groupby("day").apply(lambda x : np.array(x))
    sequences = []
    for i in range(nb_sequences):
        iday = i % len(df_group_by_day)
        try:
            picked_stocks = np.random.choice(df_group_by_day.iat[iday].shape[0], size = nb_stocks, replace = False)
            if not only_x:
                sequences.append([
                    df_group_by_day.iat[iday][picked_stocks, 2:],
                    df_group_by_day.iat[iday][picked_stocks, 1]
                ])
            else:sequences.append(
                    df_group_by_day.iat[iday][picked_stocks, 1:],)
        except KeyError:
            pass
    if not only_x:
        X, y = zip(*sequences)
        return np.array(X)[..., np.newaxis], np.array(y)
    return np.array(sequences)[..., np.newaxis]

X_train, y_train = generate_sequences(train_df, 50000, 10)
X_validation, y_validation = generate_sequences(validation_df, 30000, 10)

X_test = generate_sequences(x_test_df, 50000, 10, only_x= True)

: 

: 

In [None]:
X_train.shape, y_train.shape

((50000, 10, 30, 1), (50000, 10))

# Test best models

In [None]:
dropout = 0.3

inputs = tf.keras.layers.Input(X_train.shape[1:])

lstm_layer = tf.keras.layers.LSTM(64, dropout = dropout, return_sequences= False)
distributed_lstm_layer = tf.keras.layers.TimeDistributed(lstm_layer)
flatten_layer = tf.keras.layers.Flatten()

global_dense1 = tf.keras.layers.Dense(128, activation= "relu")
global_dense2 = tf.keras.layers.Dense(64, activation= "relu")
global_dense3 = tf.keras.layers.Dense(32, activation= "relu")
global_dim_reshaper = tf.keras.layers.Lambda(lambda x : tf.tile(tf.expand_dims(x, axis = 1), multiples= [1 ,X_train.shape[1], 1]))

local_dense1 = tf.keras.layers.Dense(32, activation= "relu")

concatenate_layer = tf.keras.layers.Concatenate(axis=-1)
dense_output = tf.keras.layers.Dense(3, activation= "softmax")



h_s = distributed_lstm_layer(inputs)

x = flatten_layer(h_s)
x = global_dense1(x)
x = global_dense2(x)
x = global_dense3(x)
global_latent_space = global_dim_reshaper(x)

local_latent_spaces = local_dense1(h_s)

combined_latent_spaces = concatenate_layer([local_latent_spaces, global_latent_space])
dense_output = dense_output(combined_latent_spaces)


model = tf.keras.models.Model(inputs = inputs, outputs = dense_output)


In [None]:
model.compile(
    optimizer = tf.keras.optimizers.Adam(learning_rate = 1E-3),
    loss= "sparse_categorical_crossentropy",
    metrics =["sparse_categorical_accuracy"]
)


# model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
#     filepath="model_weights/Model_{epoch:02d}-{val_loss:.4f}-{val_acc:0.4f}.hdf5",
#     save_weights_only=False,
#     monitor='val_acc',
#     mode='max',
#     save_best_only=True)

# early_stop = tf.keras.callbacks.EarlyStopping(
#     monitor="val_loss",
#     patience=12,
#     verbose=0,
#     mode="auto",
#     baseline=None,
#     restore_best_weights=True,
#     start_from_epoch=0,
# )

model.fit(
    X_train, y_train,
    validation_data = (X_validation, y_validation),
    batch_size= 64,
    epochs = 2,
    #callbacks=[model_checkpoint_callback, early_stop]
)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7efabfecbb50>

In [None]:
y_pred = model.predict(X_validation)
    



In [None]:
tf.argmax(y_pred[2000], axis = -1)

<tf.Tensor: shape=(10,), dtype=int64, numpy=array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1])>

In [None]:
y_validation[2002]

array([0., 0., 0., 0., 0., 2., 1., 0., 0., 0.])