In [None]:
### Import libraries

# Fix randomness and hide warnings
seed = 42

import os
# Environment variables are set here, before tensorflow and matplotlib are imported further down
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up; setting it here presumably only affects child processes - confirm
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Warning is the base class of FutureWarning, so this second filter silences every warning category
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
np.random.seed(seed)

import logging

import random
random.seed(seed)

# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Restrict TensorFlow's own logging to errors
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow's random number generators with the same seed as numpy and random
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Array holding the category of each time series. Shape 48000x1
categories = np.load("categories.npy")
# Array holding the values of the time series. Shape 48000x2776
training_data = np.load("training_data.npy")
# Array holding the indices that delimit the valid interval of each time series. Shape 48000x2
valid_periods = np.load("valid_periods.npy")

# Display the three shapes as a quick sanity check
categories.shape, training_data.shape, valid_periods.shape


In [None]:
def clean_data(training_data, garbage_threshold = 600, categories = None, valid_periods = None):
    """Keep only the time series whose valid period is long enough.

    A row is kept when ``valid_periods[i][1] - valid_periods[i][0]`` is at
    least ``garbage_threshold``.

    Parameters
    ----------
    training_data : np.ndarray
        Array with one time series per row.
    garbage_threshold : int, default 600
        Minimum length of the valid period for a series to be kept.
    categories : array-like, optional
        Category of each row of ``training_data``. When omitted, the
        module-level ``categories`` array is used (the original behaviour).
    valid_periods : array-like, optional
        ``(start, end)`` pair for each row of ``training_data``. When omitted,
        the module-level ``valid_periods`` array is used (the original
        behaviour).

    Returns
    -------
    tuple of np.ndarray
        ``(cleaned_data, cleaned_categories, cleaned_valid_periods)``, all
        filtered with the same row mask so they stay aligned.

    Raises
    ------
    ValueError
        If the three inputs do not have the same number of rows.
    """
    # Fall back to the notebook-level arrays so existing calls keep working.
    if categories is None:
        categories = globals()["categories"]
    if valid_periods is None:
        valid_periods = globals()["valid_periods"]

    training_data = np.asarray(training_data)
    categories = np.asarray(categories)
    valid_periods = np.asarray(valid_periods)

    # A length mismatch would pair rows with the wrong period/category.
    if not (len(training_data) == len(categories) == len(valid_periods)):
        raise ValueError(
            "training_data, categories and valid_periods must have the same number of rows"
        )

    period_lengths = valid_periods[:, 1] - valid_periods[:, 0]
    keep = period_lengths >= garbage_threshold
    return training_data[keep], categories[keep], valid_periods[keep]

In [None]:
def split_for_categories(training_data, categories):
    """Split the rows of ``training_data`` into one array per category.

    Parameters
    ----------
    training_data : np.ndarray
        Array with one sample per row.
    categories : np.ndarray
        Category label of each row; must be usable as a row mask once
        compared against a label.

    Returns
    -------
    tuple of np.ndarray
        One array per distinct label, ordered as ``np.unique(categories)``
        orders the labels. With exactly six labels this unpacks into six
        arrays as before; other label counts no longer raise or get truncated.
    """
    training_data = np.asarray(training_data)
    categories = np.asarray(categories)
    # Boolean indexing already yields an independent copy for each category.
    return tuple(training_data[categories == label] for label in np.unique(categories))

In [None]:
# Drop the series whose valid period is shorter than clean_data's default threshold
cleaned_data, cleaned_categories, cleaned_valid_periods = clean_data(training_data)

# Display the shapes of the filtered arrays
cleaned_categories.shape, cleaned_data.shape, cleaned_valid_periods.shape

In [None]:
# One array per category, in the order produced by np.unique on the category labels
data_A, data_B, data_C, data_D, data_E, data_F = split_for_categories(cleaned_data, cleaned_categories)

# Display how many series fall in each category
data_A.shape, data_B.shape, data_C.shape, data_D.shape, data_E.shape, data_F.shape

In [None]:
def train_test_split(data, train_ratio = 0.8):
    """Cut a 2-D array into a leading and a trailing block of rows.

    The first ``int(len(data) * train_ratio)`` rows form the training set and
    the remaining rows form the test set. Rows are taken in their existing
    order; nothing is shuffled.

    Returns
    -------
    tuple
        ``(training_set, test_set)``.
    """
    cut = int(len(data) * train_ratio)
    head, tail = data[:cut, :], data[cut:, :]
    return head, tail

In [None]:
# First split off the test rows, then split the remainder again into train and validation
# (both calls use train_test_split's default ratio of 0.8)
train_validation_D, test_D = train_test_split(data_D)
train_D, validation_D = train_test_split(train_validation_D)
# Display the number of series in each split
len(train_D), len(validation_D), len(test_D)

In [None]:
def build_sequences(df, window=200, stride=1, telescope=9):
    """Slice a 1-D series into input windows and the values that follow them.

    The series is left-padded with zeros until its length is a multiple of
    ``window``. Windows of ``window`` values are then taken every ``stride``
    positions; each is paired with the next ``telescope`` values as its label.

    Parameters
    ----------
    df : np.ndarray
        One-dimensional series of values.
    window : int, default 200
        Number of values in each input window.
    stride : int, default 1
        Step between the start of consecutive windows; must divide ``window``.
    telescope : int, default 9
        Number of values following each window used as its label.

    Returns
    -------
    tuple of list
        ``(dataset, labels)``: two lists of equal length holding the windows
        and their labels. Both are empty when the padded series is shorter
        than ``window + telescope``.
    """
    # Sanity check to avoid runtime errors
    assert window % stride == 0
    dataset = []
    labels = []
    temp_df = df.copy()

    remainder = df.size % window
    if remainder != 0:
        # Zeros go in front so the most recent values stay at the end.
        padding = np.zeros(window - remainder, dtype='float32')
        temp_df = np.concatenate((padding, df))
        assert temp_df.size % window == 0

    # The last usable start is the one whose label ends exactly at the final
    # sample; the "+ 1" keeps it inside the half-open range.
    last_start = temp_df.size - window - telescope
    for idx in range(0, last_start + 1, stride):
        dataset.append(temp_df[idx:idx + window])
        labels.append(temp_df[idx + window:idx + window + telescope])

    return dataset, labels

In [None]:
# data is an indexable collection with one time series per row (either training, validation or test)
def THE_SEQUENCER(data, periods=None):
    """Build windows and labels for every time series in ``data``.

    Each row is trimmed to its valid period and passed to ``build_sequences``
    with that function's default arguments. Rows that yield no window are
    skipped.

    Parameters
    ----------
    data : sequence
        One time series per row.
    periods : sequence, optional
        ``(start, end)`` pair for each row of ``data``, used to slice
        ``data[i][start:end]``. When omitted, the module-level
        ``valid_periods`` is indexed by row position (the original
        behaviour). That is only correct if ``data`` has the same row order
        as ``valid_periods``; for filtered or split subsets, pass the
        matching periods explicitly.

    Returns
    -------
    tuple of list
        ``(dataset, labels)``: one entry per retained series, each entry
        being the list of windows (respectively labels) of that series.
    """
    if periods is None:
        periods = valid_periods

    dataset = []
    labels = []
    for i in range(len(data)):
        start, end = periods[i][0], periods[i][1]
        time_series = data[i][start:end]
        dset, labs = build_sequences(time_series)
        if len(dset) == 0:
            # Series too short to produce a single window.
            continue
        dataset.append(dset)
        labels.append(labs)

    return dataset, labels

In [None]:
# Build windows and labels separately for each split of category D.
# Validation and test must come from their own splits; building them from the
# training rows (or from all rows) would leak training data into evaluation.
# NOTE(review): THE_SEQUENCER looks up valid_periods by row position - confirm
# these subsets are aligned with it.
train_sequences_D, train_labels_D = THE_SEQUENCER(train_D)
validation_sequences_D, validation_labels_D = THE_SEQUENCER(validation_D)
test_sequences_D, test_labels_D = THE_SEQUENCER(test_D)

In [None]:
def build_CONV_LSTM_model(input_shape, output_shape, num_neurons):
    """Build and compile the bidirectional-LSTM + Conv1D forecasting model.

    Parameters
    ----------
    input_shape : tuple
        Shape of one input sample; ``input_shape[0]`` is the number of input
        time steps.
    output_shape : tuple
        Shape of one output sample; ``output_shape[0]`` is the number of
        output time steps and ``output_shape[1]`` the number of filters of
        the final convolution.
    num_neurons : int
        Number of units of the LSTM wrapped by the bidirectional layer.

    Returns
    -------
    tf.keras.Model
        Model compiled with mean squared error loss and the Adam optimizer.
    """
    in_steps, out_steps = input_shape[0], output_shape[0]
    # The output is obtained by cropping, so it cannot be longer than the input
    assert in_steps >= out_steps, "For this exercise we want input time steps to be >= of output time steps"

    input_layer = tfkl.Input(shape=input_shape, name='input_layer')

    # Bidirectional LSTM returning the full sequence
    recurrent = tfkl.LSTM(num_neurons, return_sequences=True, name='lstm')
    features = tfkl.Bidirectional(recurrent, name='bidirectional_lstm')(input_layer)

    # 1D convolution with 128 filters and kernel size 3
    features = tfkl.Conv1D(128, 3, padding='same', activation='relu', name='conv')(features)

    # Final convolution producing output_shape[1] channels per time step
    projected = tfkl.Conv1D(output_shape[1], 3, padding='same', name='output_layer')(features)

    # Remove the trailing time steps so that only out_steps remain
    surplus = projected.shape[1] - out_steps
    output_layer = tfkl.Cropping1D((0, surplus), name='cropping')(projected)

    model = tf.keras.Model(inputs=input_layer, outputs=output_layer, name='CONV_LSTM_model')

    mse_loss = tf.keras.losses.MeanSquaredError()
    adam = tf.keras.optimizers.Adam()
    model.compile(loss=mse_loss, optimizer=adam)
    return model

In [None]:
# Flatten the per-series lists of windows and labels into one training array each
X_train, y_train = [], []
for series_windows, series_targets in zip(train_sequences_D, train_labels_D):
    for window_values, target_values in zip(series_windows, series_targets):
        X_train.append(window_values)
        y_train.append(target_values)
X_train = np.array(X_train)
y_train = np.array(y_train)

In [None]:
# Flatten the per-series lists of windows and labels into one validation array each
X_valid, y_valid = [], []
for series_windows, series_targets in zip(validation_sequences_D, validation_labels_D):
    for window_values, target_values in zip(series_windows, series_targets):
        X_valid.append(window_values)
        y_valid.append(target_values)
X_valid = np.array(X_valid)
y_valid = np.array(y_valid)

In [None]:
pip install optuna

In [None]:
import optuna
# 200 input time steps with one feature; 200 is also build_sequences' default window
input_shape = (200,1)
# 9 output time steps with one feature; 9 is also build_sequences' default telescope
output_shape = (9,1)

def objective_function(optuna_trial):
    """Optuna objective: train one model and return its validation loss.

    The trial suggests ``num_neurons`` (10 to 128) and ``batch_size``
    (32 to 128). A model is built with ``build_CONV_LSTM_model``, fitted on
    ``X_train`` / ``y_train`` with early stopping and learning-rate reduction
    monitored on ``val_loss``, and evaluated on ``X_valid`` / ``y_valid``.

    Parameters
    ----------
    optuna_trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    The value returned by ``model.evaluate`` on the validation data, i.e. the
    loss the model was compiled with (mean squared error).
    """
    # Release the Keras state left behind by the previous trial; otherwise the
    # models of all trials accumulate over the course of the study.
    tf.keras.backend.clear_session()

    # Sample the hyperparameters of this trial.
    num_neurons = optuna_trial.suggest_int("num_neurons", 10, 128)
    batch_size = optuna_trial.suggest_int("batch_size", 32, 128)

    print(f"Current trial parameters: num_neurons={num_neurons}, batch_size={batch_size}")

    model = build_CONV_LSTM_model(input_shape, output_shape, num_neurons)

    # Define callbacks for early stopping and learning rate reduction
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', mode='min', patience=12, restore_best_weights=True
    )
    reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
        monitor='val_loss', mode='min', patience=10, factor=0.1, min_lr=1e-5
    )

    # Fit the model on the training data; 300 epochs is an upper bound because
    # early stopping halts training once val_loss stops improving.
    model.fit(
        x = X_train,
        y = y_train,
        batch_size = batch_size,
        epochs = 300,
        validation_data=(X_valid, y_valid),
        callbacks = [early_stopping, reduce_lr]
    )

    # Evaluate the model loss on the validation set (lower is better).
    return model.evaluate(X_valid, y_valid, verbose=0)

In [None]:
# NOTE(review): the following cell creates optuna_study again before optimizing, so this cell looks redundant
optuna_study = optuna.create_study(direction="minimize")

In [None]:
# Create a fresh study so re-running this cell does not append to earlier trials.
# The sampler is seeded with the notebook-wide seed so the suggested
# hyperparameters do not change from one run to the next.
optuna_study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=seed),
)
optuna_study.optimize(objective_function,
                    n_trials = 50)

In [None]:
# Hyperparameters of the best trial of the study
print(optuna_study.best_trial.params)
# Objective value of the best trial, i.e. what objective_function returned for it
print(optuna_study.best_trial.value)