In [1]:
import os
import pickle
from random import shuffle
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, BatchNormalization, Dropout, LSTM, Activation
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import RootMeanSquaredError
import gc

In [2]:
# Raise the threshold of TensorFlow's native (C++) logging; '2' is documented to hide
# INFO and WARNING messages.
# NOTE(review): tensorflow is already imported in the cell above; this variable is usually
# set *before* the import — confirm that it still takes effect here.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [3]:
# Directory names of the three data splits (relative paths).
dir_train, dir_valid, dir_test = 'train', 'valid', 'test'

Define a function that takes a directory name, collects the X and y file names found in it, shuffles the (X, y) pairs together, and returns the two lists of file names.

In [4]:
def get_data_shuffled(filepath):
    """List the X/y files in a directory and shuffle them as aligned pairs.

    File names containing 'X' are treated as feature files; the remaining names
    containing 'y' are treated as target files. Both groups are sorted by name,
    paired position by position, and the pairs are shuffled together so that
    every X file keeps its y partner.

    Args:
        filepath: Directory to scan (passed to ``os.listdir``).

    Returns:
        Two lists ``(X_names, y_names)`` of equal length containing bare file
        names (not full paths). Both lists are empty when the directory holds
        no complete pair.

    Note:
        Pairing relies on ``zip``: if the two groups differ in size, the
        surplus names of the longer group are dropped silently.
    """
    names = sorted(os.listdir(filepath))

    # Same classification as an if/elif chain: 'X' wins when a name has both letters.
    X = [name for name in names if 'X' in name]
    y = [name for name in names if 'X' not in name and 'y' in name]

    pairs = list(zip(X, y))  # keep each X next to its y while shuffling
    if not pairs:
        # zip(*[]) yields nothing, so unpacking it into two names would raise ValueError.
        return [], []

    shuffle(pairs)
    X_shuffled, y_shuffled = zip(*pairs)  # unpack again

    return list(X_shuffled), list(y_shuffled)

In [5]:
# List and shuffle the (X, y) file-name pairs of every split, in train -> valid -> test order.
(
    (X_train_list, y_train_list),
    (X_valid_list, y_valid_list),
    (X_test_list, y_test_list),
) = [get_data_shuffled(filepath=split_dir) for split_dir in (dir_train, dir_valid, dir_test)]

# Building a model and fitting hyperparameters

Data is too large to use RandomSearch with a large search space -> use TensorBoard and tune manually

In [6]:
# Stacked-LSTM regressor: three LSTM blocks (256 -> 128 -> 64 units) followed by a small
# dense head that outputs a single value. Every block is followed by batch normalisation
# and 25% dropout.
model_1 = Sequential([
    # Only the first layer declares the input shape (128, 300); the later layers
    # infer their input from the previous layer's output.
    LSTM(256, input_shape=(128, 300), return_sequences=True),
    BatchNormalization(),
    Dropout(0.25),

    # return_sequences=True keeps the full sequence so the next LSTM can consume it.
    LSTM(128, return_sequences=True),
    BatchNormalization(),
    Dropout(0.25),

    # Last recurrent layer: return only the final output for the dense head.
    LSTM(64, return_sequences=False),
    BatchNormalization(),
    Dropout(0.25),

    # Dense -> BatchNorm -> ReLU: the activation is applied after normalisation.
    Dense(32),
    BatchNormalization(),
    Activation('relu'),
    Dropout(0.25),

    # Single linear output unit (regression target).
    Dense(1)
])

In [7]:
# Train with Adam (learning rate 0.003) on mean squared error; RMSE is tracked as a metric.
model_1.compile(
    optimizer=Adam(learning_rate=0.003),
    loss='mse',
    metrics=[RootMeanSquaredError()],
)

In [8]:
# TensorBoard callback for this model; its log directory is logs/model_1.
tb = TensorBoard(log_dir='logs/model_1')

In [9]:
# This callback is needed, otherwise RAM is overloaded (>32GB).
class MyCustomCallback(tf.keras.callbacks.Callback):
    """Keras callback that tries to release memory at the end of every epoch.

    Used as a workaround for memory growing across the many consecutive
    ``fit`` calls made in this notebook.
    """
    def on_epoch_end(self, epoch, logs=None):
        """Run the garbage collector and reset Keras/TensorFlow global state.

        ``epoch`` and ``logs`` are accepted to match the Keras callback
        signature; neither is used.
        """
        gc.collect()
        tf.keras.backend.clear_session()
        # NOTE(review): clear_session() may already reset the default graph, which would
        # make the next call redundant — confirm before removing it.
        tf.compat.v1.reset_default_graph()

In [None]:
# Training: stream the shuffled training chunks from disk and call fit() once per chunk,
# so only one chunk has to be held in memory at a time.
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
for step, (X_file, y_file) in enumerate(zip(X_train_list, y_train_list)):
    # Use the configured split directory instead of repeating the literal 'train'.
    X_filepath = os.path.join(dir_train, X_file)
    y_filepath = os.path.join(dir_train, y_file)

    with open(X_filepath, 'rb') as X_data:
        X = pickle.load(X_data)
    with open(y_filepath, 'rb') as y_data:
        y = pickle.load(y_data)

    # initial_epoch=step / epochs=step + 1 still performs exactly one pass over this chunk,
    # but gives every chunk its own epoch index. With a plain epochs=1 every fit() call is
    # logged by TensorBoard as epoch 0, so the curves collapse onto a single step.
    model_1.fit(X, y, initial_epoch=step, epochs=step + 1, batch_size=128,
                callbacks=[tb, MyCustomCallback()])

In [None]:
# Persist the trained model to model_1.h5 in the working directory.
model_1.save('model_1.h5')

In [None]:
# Disabled (kept as a string literal so it is never executed): running this validation
# loop kills the kernel. It would evaluate model_1 on every validation chunk and collect
# the scores.
'''scores = []
for X_file, y_file in zip(X_valid_list, y_valid_list):
    X_filepath = os.path.join('valid', X_file)
    y_filepath = os.path.join('valid', y_file)
    
    with open(X_filepath, 'rb') as X_data:
        X = pickle.load(X_data)
    with open(y_filepath, 'rb') as y_data:
        y = pickle.load(y_data)
    
    score = model_1.evaluate(X, y, batch_size=128, callbacks=[tb, MyCustomCallback()])
    
    scores.append(score)'''

Evaluation/prediction is still not possible in this notebook because of a memory leak (https://github.com/keras-team/keras/issues/13118)

-> Train the model with all data and submit model for evaluation

In [10]:
%%time

# Train with all the data (train + valid + test splits), saving a checkpoint after every
# full pass so that any epoch's model can be submitted later.
# NOTE(review): pickle.load can execute arbitrary code — only load files you created yourself.
dirs = [dir_train, dir_valid, dir_test]
files = [[X_train_list, y_train_list], [X_valid_list, y_valid_list], [X_test_list, y_test_list]]
fit_step = 0  # counts fit() calls across all passes; used as the logged epoch index
for epoch in range(1, 11):  # run for 10 epochs
    for dir_, (X_files, y_files) in zip(dirs, files):
        for X_file, y_file in zip(X_files, y_files):
            X_filepath = os.path.join(dir_, X_file)
            y_filepath = os.path.join(dir_, y_file)

            with open(X_filepath, 'rb') as X_data:
                X = pickle.load(X_data)
            with open(y_filepath, 'rb') as y_data:
                y = pickle.load(y_data)

            # Still exactly one pass over this chunk, but with a unique epoch index:
            # with a plain epochs=1 every fit() call is logged by TensorBoard as epoch 0.
            model_1.fit(X, y, initial_epoch=fit_step, epochs=fit_step + 1, batch_size=128,
                        callbacks=[tb, MyCustomCallback()])
            fit_step += 1

    model_1.save(f'model_1_epoch_{epoch}.h5')











CPU times: user 2h 27min 40s, sys: 1h 2min 49s, total: 3h 30min 29s
Wall time: 2h 58min 48s


In [None]:
'''
Predictions must be made in a Kaggle notebook; the following code was used.
Each time_id DataFrame is fed in and evaluated. The LSTM model takes input of shape (n, 128, 300).
If fewer than 128 samples have been collected so far, another model (a plain DNN) predicts the values until the queue is full.


scaler = joblib.load('../input/previous-model/scaler_only_stand.joblib')
model_prev = load_model('../input/previous-model/model_only_stand.h5')

model_lstm = load_model('../input/model-1-epoch-6/model_1_epoch_6.h5')

last_values = deque(maxlen=128)

env = ubiquant.make_env()   
iter_test = env.iter_test()    

for (test_df, sample_prediction_df) in iter_test:
    test_df.drop(['investment_id', 'row_id'], axis=1, inplace=True)
    test_df = test_df.values
    if len(test_df) == 1:  # only 1 value in test_df
        last_values.append(test_df)  # add to the queue
        if len(last_values) == 128:  # make pred with lstm if queue is full
            value_to_predict = np.array(last_values)  # make array of queue
            value_to_predict = value_to_predict.reshape(-1, 128, 300)  # shape (128, 300) -> (1, 128, 300)
            prediction = model_lstm.predict(value_to_predict)  # pred with lstm model
            prediction = prediction.reshape(1)  # reduce dim
            sample_prediction_df.loc[0, 'target'] = prediction  # place prediction in the right spot (index 0 cause len==1)
        else:  # make pred with dnn if queue isn't full yet
            value_to_predict = np.array(test_df)
            value_to_predict = value_to_predict.reshape(-1, 300)  # shape (300) -> (1, 300)
            value_to_predict_scaled = scaler.transform(value_to_predict)
            prediction = model_prev.predict(value_to_predict_scaled)  # predict only current value with dnn (not the queue)
            prediction = prediction.reshape(1)
            sample_prediction_df.loc[0, 'target'] = prediction
    else:  # more then 1 value in test_df -> iterate over and make prediction for each
        for row, value in zip(range(len(test_df)), test_df):  # iterate parallel over index (0, 1, 2, ..) and values
            last_values.append(value)
            if len(last_values) == 128:  # make pred with lstm
                value_to_predict = np.array(last_values)  # make array for pred if queue full (128, 300)
                value_to_predict = value_to_predict.reshape(-1, 128, 300)  # shape (128, 300) -> (1, 128, 300)
                prediction = model_lstm.predict(value_to_predict)  # pred with lstm model
                prediction = prediction.reshape(1)
                sample_prediction_df.loc[row, 'target'] = prediction
            else:  # predict only current value with dnn
                value_to_predict = value.reshape(-1, 300)  # shape (300) -> (1, 300)
                value_to_predict_scaled = scaler.transform(value_to_predict)
                prediction = model_prev.predict(value_to_predict_scaled)
                prediction = prediction.reshape(1)
                sample_prediction_df.loc[row, 'target'] = prediction
           
    env.predict(sample_prediction_df)'''