### Connect to Drive

In [19]:
# Mount Google Drive inside the Colab runtime at /gdrive.
from google.colab import drive
drive.mount('/gdrive')
# Move into the homework folder: later cells open their files (e.g. the .npy
# arrays and the saved models) through relative paths.
%cd /gdrive/My Drive/[2023-2024] AN2DL/Homework2

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive/My Drive/[2023-2024] AN2DL/Homework2


### Import Libraries

In [20]:
# Single seed shared by every random number generator configured below.
seed = 31

import os
# Environment variables are set before TensorFlow/matplotlib are imported.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd() + '/configs/'

import warnings
warnings.simplefilter(action = 'ignore', category = FutureWarning)
# NOTE(review): `Warning` is the base class of every warning category, so this
# line silences ALL warnings (deprecations included), not only FutureWarning.
warnings.simplefilter(action = 'ignore', category = Warning)

import numpy as np
from collections import Counter
np.random.seed(seed)

import logging

import random
random.seed(seed)


import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow's Python-side logging to errors only.
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

#tf.keras.mixed_precision.set_global_policy('mixed_float16')

import keras
from keras.backend import sigmoid
from keras.layers import Activation
from keras.utils import get_custom_objects

import pandas as pd
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler

2.15.0


### Import Data from the ZIP

In [21]:
# Toggle: set to True only when the archive still has to be extracted into the
# current working directory; leave it False once the files are in place.
unzip = False

if unzip:

    !unzip training_dataset.zip

In [22]:
#data loading: the three arrays are read from the current working directory
data, categories, validPeriods = (
    np.load(fileName)
    for fileName in ('training_data.npy', 'categories.npy', 'valid_periods.npy')
)

In [23]:
# Shapes of the three loaded arrays, displayed as the cell output.
tuple(array.shape for array in (data, categories, validPeriods))

((48000, 2776), (48000,), (48000, 2))

### Preprocessing Data

In [24]:
# Set a = True to remove the zeros.
# Saves the starting position of each sequence in "startingIndexes", then drops
# everything before that position and stores the result in "dataNoPadding".
# NOTE(review): only element[0] (the start) of each valid period is used; the
# second entry is ignored - presumably every series runs to the last column,
# confirm against the dataset description.

a = True

if a:

  startingIndexes = [element[0] for element in validPeriods]

  # The trimmed series have different lengths. Filling a pre-allocated object
  # array keeps the result a 1-D array of 1-D arrays on every NumPy version
  # (np.array on ragged input raises on recent releases) and regardless of
  # whether some lengths happen to coincide.
  dataNoPadding = np.empty(len(data), dtype = object)

  for row, (series, index) in enumerate(zip(data, startingIndexes)):

    dataNoPadding[row] = series[index:]

In [25]:
#extracts the specified category out of the dataset
def categoryExtractor (data, category):
  """Return the entries of `data` whose label equals `category`.

  The labels are read from the module-level `categories` array at call time,
  so `data` must be aligned with it position by position.

  NOTE(review): a later cell rebinds `categories` to a plain Python list; the
  element-wise comparison below only works while `categories` is still the
  array loaded from disk, so call this function before that cell runs.
  """
  return data[np.where(categories == category)]

In [26]:
# One array of trimmed series per category label.
A, B, C, D, E, F = (
    categoryExtractor(dataNoPadding, label)
    for label in ('A', 'B', 'C', 'D', 'E', 'F')
)

print(A.shape, B.shape, C.shape, D.shape, E.shape, F.shape)

(5728,) (10987,) (10017,) (10016,) (10975,) (277,)


In [27]:
#returns all the sequence lengths contained in data, sorted in ascending order
def countLen (data):
  """Return a sorted NumPy array holding `len(seq)` for every `seq` in `data`."""
  lengths = np.array(list(map(len, data)))
  lengths.sort()

  return lengths

In [28]:
# NOTE(review): this rebinds `categories` (until now the per-series label array
# loaded from disk) to the list of distinct labels.
categories = ['A', 'B', 'C', 'D', 'E', 'F']
keys = []

#for each category, save the sorted sequence lengths for further inspection
for el, array in zip(categories, (A, B, C, D, E, F)):

  keys.append(countLen(array))


In [29]:
#inspects how the sequence lengths are distributed, one summary table per category
# `el` stays bound to the last category after the loop, exactly as before; keep
# the name stable in case later cells read it.
for el, lengths in zip(categories, keys):

  df = pd.DataFrame({el: lengths})
  print(df.describe())

                 A
count  5728.000000
mean    278.180342
std     109.290380
min      46.000000
25%     184.000000
50%     288.000000
75%     316.000000
max    1943.000000
                  B
count  10987.000000
mean     165.942842
std      116.141928
min       42.000000
25%       56.000000
50%      157.000000
75%      219.000000
max     1484.000000
                  C
count  10017.000000
mean     208.146251
std      146.289417
min       42.000000
25%       97.000000
50%      204.000000
75%      272.000000
max     2708.000000
                  D
count  10016.000000
mean     216.990915
std      149.173953
min       42.000000
25%       52.000000
50%      238.000000
75%      288.000000
max     2641.000000
                  E
count  10975.000000
mean     163.046014
std      127.992337
min       42.000000
25%       51.000000
50%      119.000000
75%      288.000000
max     2776.000000
                 F
count   277.000000
mean    194.830325
std     153.410846
min      24.000000
25%      89.00

In [30]:
#removes short sequences using a custom filter
def removeShortSequences(data, i):
  """Drop the sequences of `data` that are too short to be useful.

  Parameters
    data : iterable of 1-D sequences belonging to one category.
    i    : position of that category inside the module-level `keys` list,
           which holds the sequence lengths of each category.

  The minimum length depends on the first quartile of the category's lengths:
  above 100 only sequences with at least 80 samples are kept, otherwise every
  sequence longer than 40 samples is kept.

  Returns
    1-D object array containing the surviving sequences.
  """

  # Series.quantile(0.25) is the value reported as '25%' by describe().
  # Computing it directly removes the dependency on a leaked loop variable
  # (`el`) and on a hard-coded 'F' column name that the lookup used to need.
  firstQuartile = pd.Series(keys[i]).quantile(0.25)

  if firstQuartile > 100:
    minLength = 80
  else:
    minLength = 41                                       # i.e. len > 40, related to category 'F' (see above its description)

  kept = [sequence for sequence in data if len(sequence) >= minLength]

  # The sequences are ragged: fill an object array element by element so the
  # result is always a 1-D array of arrays (np.array on ragged input raises on
  # recent NumPy and would silently become 2-D if all lengths were equal).
  newData = np.empty(len(kept), dtype = object)

  for position, sequence in enumerate(kept):
    newData[position] = sequence

  return newData

In [31]:
#save the new sequence categories, with the short sequences removed
# The position of each category matches its index in `keys`.
newA, newB, newC, newD, newE, newF = (
    removeShortSequences(series, position)
    for position, series in enumerate((A, B, C, D, E, F))
)

print(newA.shape, newB.shape, newC.shape, newD.shape, newE.shape, newF.shape)

(5682,) (10987,) (10017,) (10016,) (10975,) (259,)


In [32]:
# All filtered series in category order (A first, F last).
# NOTE(review): `a` was the padding-removal toggle earlier and is rebound again
# further down, so re-running cells out of order changes what it refers to.
a = np.concatenate([newA, newB, newC, newD, newE, newF])

### Define the hyperparameters

In [33]:
#hyperparameters
# window    : number of time steps in every input sequence
# stride    : step between the starts of two consecutive windows
# telescope : number of future time steps used as target
window, stride, telescope = 200, 5, 9

In [34]:
# Training callbacks, both driven by the validation loss: stop after 12 epochs
# without improvement (restoring the best weights) and cut the learning rate by
# a factor of 10 after 8 such epochs, never going below 1e-5.
calls = [
    tfk.callbacks.EarlyStopping(
        monitor = 'val_loss',
        mode = 'min',
        patience = 12,
        restore_best_weights = True,
    ),
    tfk.callbacks.ReduceLROnPlateau(
        monitor = 'val_loss',
        mode = 'min',
        factor = 0.1,
        patience = 8,
        min_lr = 1e-5,
    ),
]

### Load Data

In [41]:
# creates sequences and performs padding

flag = False                                              # flag = False for zero padding, flag = True for forward padding

def createSeq (data, window, stride, telescope):
  """Cut one series into (input window, target) pairs.

  Parameters
    data      : pandas DataFrame, one row per time step, one column per feature.
    window    : number of time steps in every input sequence.
    stride    : step between the starts of consecutive windows; must divide `window`.
    telescope : number of time steps following the window used as target.

  The series is first padded so that its length is a multiple of `window`.
  The padding mode is read from the module-level `flag` at call time:
  zeros are placed BEFORE the series when `flag` is False, copies of the last
  row are placed AFTER the series when `flag` is True.

  Returns
    (x, y) with shapes (n, window, features) and (n, telescope, features).
    When no window fits (the range bound `len - window - telescope` is not
    positive, e.g. a series no longer than `window`), n is 0 and the arrays
    still carry the trailing dimensions.
  """

  assert window % stride == 0

  values = data.copy().values
  numFeatures = values.shape[1]
  padded = values

  #checks if the data is perfectly divided by the window
  remainder = len(values) % window

  if remainder != 0:
    #pads the sequence such that the window can be applied to the data an integer amount of times
    paddingLen = window - remainder

    if flag: #forward padding

      padding = np.full((paddingLen, numFeatures), values[-1], dtype = 'float32')
      padded = np.concatenate((values, padding))

    else: #zero padding

      padding = np.zeros((paddingLen, numFeatures), dtype = 'float32')
      padded = np.concatenate((padding, values))

    assert len(padded) % window == 0 #sanity check

  x = []
  y = []

  for index in np.arange(0, len(padded) - window - telescope, stride):

    x.append(padded[index: index + window])
    y.append(padded[index + window: index + window + telescope])

  if not x:
    # Keep the trailing dimensions so callers can concatenate or inspect the
    # result without special-casing a bare empty array.
    return (np.empty((0, window, numFeatures), dtype = padded.dtype),
            np.empty((0, telescope, numFeatures), dtype = padded.dtype))

  return np.array(x), np.array(y)


# Padding variants tried during development, kept here for reference.
# (They used to be bare string literals, one of them unreachable after the
# return statement above.)
#
#   ZERO PADDING     - implemented above (flag = False):
#       padding = np.zeros((paddingLen, numFeatures), dtype = 'float32')
#       padded = np.concatenate((padding, values))
#
#   FORWARD PADDING  - implemented above (flag = True):
#       padding = np.full((paddingLen, numFeatures), values[-1], dtype = 'float32')
#       padded = np.concatenate((values, padding))
#
#   BACKWARD PADDING - repeat the first value in front of the series:
#       padding = np.full((paddingLen, numFeatures), values[0], dtype = 'float32')
#       padded = np.concatenate((padding, values))
#
#   MEAN PADDING     - put the mean of the series in front of it:
#       padding = np.full((paddingLen, numFeatures), np.mean(data.values), dtype = 'float32')
#       padded = np.concatenate((padding, values))

"\nFORWARD PADDING\n\n    lastValue = tempX[-1]\n\n    padding = np.full((paddingLen, tempX.shape[1]), lastValue, dtype='float32')\n\n    tempX = np.concatenate((tempX, padding))\n    tempY = np.concatenate((tempY, padding))\n\n"

In [67]:
#loads sequences one by one and inputs them in createSeq (block 1, see below)
def createSequences (data, window, stride, telescope):
  """Build the windowed dataset for a whole collection of series.

  Every element of `data` is wrapped in a single-column DataFrame and passed
  to createSeq; the per-series results are stacked along the first axis.
  Series that yield no window are skipped.

  Returns
    (x, y) with shapes (n, window, 1) and (n, telescope, 1); n is 0 when no
    series is long enough to produce a window.
  """

  x = []
  y = []

  for el in data:

    #loads sequence
    d = pd.DataFrame({'A': el})
    xTemp, yTemp = createSeq(d, window, stride, telescope)

    # Skip series that produced no window. (The test used to be written as
    # len(xTemp != 0), i.e. with the parenthesis in the wrong place.)
    if len(xTemp) != 0:

      x.append(xTemp)
      y.append(yTemp)

  if not x:
    # np.concatenate refuses an empty list; return empty, correctly shaped arrays.
    return np.empty((0, window, 1)), np.empty((0, telescope, 1))

  x = np.concatenate( x, axis=0 )
  y = np.concatenate( y, axis=0 )

  return x, y


In [68]:
#standardization of values using robust scaler: this technique considers how
#distant each data point is from the input's median and,
#specifically, it computes the distance by means of the Interquartile Range (IQR)

def applyRobustScaler (data):
  """Scale every series of `data` independently with its own RobustScaler.

  Parameters
    data : iterable of 1-D numeric sequences (lengths may differ).

  Returns
    1-D object array whose k-th element is the scaled version of the k-th
    input series (same length, 1-D).

  Every split that is later compared with another one has to go through this
  function, otherwise the two live on different scales.
  """

  elements = np.empty(len(data), dtype = object)    # ragged result: one array per series

  for position, el in enumerate(data):

    scaler = RobustScaler()
    reshaped = np.asarray(el).reshape(-1, 1)          # scikit-learn expects a 2-D column
    scaled = scaler.fit_transform(reshaped)

    # Keep the SCALED values. The previous code flattened `reshaped`, which
    # returned the input untouched and made the whole function a no-op.
    elements[position] = scaled.flatten()

  return elements

In [69]:
# creates train and test set (block 2, see below)

def createTrainTest (data):
  """Split `data` into a test part (first 10% of the series) and a training
  part (the rest) and turn both into windowed datasets.

  Both parts go through applyRobustScaler before windowing, so the model is
  always evaluated (or retrained) on data prepared exactly like its training
  data. Previously the evaluation branch skipped the scaler for the test part.

  `window`, `stride` and `telescope` are read from module scope.

  NOTE(review): the split is positional, not shuffled. The caller passes the
  series concatenated in category order, so the test part is made of the
  leading categories only - confirm this is intended.

  Returns
    xTrain, yTrain, xTest, yTest
  """

  flag = True                                           # flag = False for retraining with test set

  testSize = round(len(data) * 0.1)

  xTest, yTest = createSequences(applyRobustScaler(data[:testSize]), window, stride, telescope)
  xTrain, yTrain = createSequences(applyRobustScaler(data[testSize:]), window, stride, telescope)

  if flag:

    # Sanity output: requested test size and number of series actually taken.
    print(testSize, len(data[:testSize]))

  return xTrain, yTrain, xTest, yTest

In [70]:
# Windowed train/test arrays built from the concatenated series in `a`.
xTrainA, yTrainA, xTestA, yTestA = createTrainTest(data = a)

tuple(split.shape for split in (xTrainA, yTrainA, xTestA, yTestA))

4794 4794


((885907, 200, 1), (885907, 9, 1), (168722, 200, 1), (168722, 9, 1))

### Define the Model

To train the model, set the following flags in the 'Load Data' section before running the cells below:

  - Block 1:
             flag = True  when training the model with forward padding
             flag = False when training the model with zero padding

  - Block 2:
  
             flag = True

In [None]:
# Model input shape is one window (time steps, features); the output size is
# the number of forecast steps in the targets.
inputShape, outputShape = xTrainA.shape[1:], yTrainA.shape[1]
batchSize, epochs = 128, 200

In [None]:
#model architecture

def buildModel(input_shape, output_shape):
    """Build and compile the LSTM + 1-D CNN forecasting network.

    Parameters
      input_shape  : shape of one input sample, passed to the Input layer.
      output_shape : number of units of the final linear Dense layer.

    Layout: LSTM (sequence output) -> three Conv1D stages (128, 256, 512
    filters; max pooling after the first two, global average pooling after the
    last) -> Dropout/Dense head (512 and 128 units, mish activation, max-norm
    constraint) -> linear output. The dropout layers use the module-level
    `seed`.

    Returns
      The compiled Keras model (MSE loss, Nadam optimizer, MAE metric).
    """

    inputs = tfkl.Input(shape = input_shape, name = 'Input')

    features = tfkl.LSTM(64, return_sequences = True, name = 'lstm')(inputs)

    # Convolutional stages that are followed by max pooling.
    for filters in (128, 256):
        features = tfkl.Conv1D(filters, 3, padding = 'same', activation = 'relu')(features)
        features = tfkl.MaxPooling1D()(features)

    # Last convolutional stage, collapsed over time by global average pooling.
    features = tfkl.Conv1D(512, 3, padding = 'same', activation = 'relu')(features)
    features = tfkl.GlobalAveragePooling1D()(features)

    # Fully connected head.
    features = tfkl.Dropout(.25, seed = seed)(features)
    features = tfkl.Dense(
        512,
        activation = tf.keras.activations.mish,
        kernel_constraint = tfk.constraints.MaxNorm(1.5),
    )(features)
    features = tfkl.Dropout(.1, seed = seed)(features)
    features = tfkl.Dense(
        128,
        activation = tf.keras.activations.mish,
        kernel_constraint = tfk.constraints.MaxNorm(1.5),
    )(features)

    outputs = tfkl.Dense(output_shape, activation = 'linear')(features)

    model = tf.keras.Model(inputs = inputs, outputs = outputs, name = 'LSTMCNN_Model')

    optimizer = tfk.optimizers.Nadam(
        learning_rate = 0.001,
        weight_decay = 0.004,
        beta_1 = 0.9,
        beta_2 = 0.999,
        epsilon = 1e-07,
        ema_momentum = 0.99,
        name = "Nadam",
    )

    model.compile(
        loss = tf.keras.losses.MeanSquaredError(),
        optimizer = optimizer,
        metrics = tf.keras.metrics.MeanAbsoluteError(),
    )

    return model

In [None]:
# Instantiate the network for the current window/horizon and show its layers.
model = buildModel(input_shape = inputShape, output_shape = outputShape)
model.summary()

In [None]:
#model training
# The last 10% of the training windows is held out for validation; only the
# per-epoch metrics dictionary is kept.
history = model.fit(
    xTrainA,
    yTrainA,
    epochs = epochs,
    batch_size = batchSize,
    callbacks = calls,
    validation_split = .1,
).history

In [None]:
#plotting results

# Epoch with the lowest validation loss, marked in both figures.
best_epoch = np.argmin(history['val_loss'])

# Figure 1: training and validation loss per epoch.
plt.figure(figsize = (17, 4))
for historyKey, curveLabel, curveAlpha, curveColor in (
    ('loss', 'Training loss', .8, '#ff7f0e'),
    ('val_loss', 'Validation loss', .9, '#5a9aa5'),
):
    plt.plot(history[historyKey], label = curveLabel, alpha = curveAlpha, color = curveColor)
plt.axvline(x = best_epoch, label = 'Best epoch', alpha = .3, ls = '--', color = '#5a9aa5')
plt.title('Mean Squared Error')
plt.legend()
plt.grid(alpha = .3)
plt.show()

# Figure 2: learning rate per epoch (logged under 'lr').
plt.figure(figsize = (18, 3))
plt.plot(history['lr'], label = 'Learning Rate', alpha = .8, color = '#ff7f0e')
plt.axvline(x = best_epoch, label = 'Best epoch', alpha = .3, ls = '--', color = '#5a9aa5')
plt.legend()
plt.grid(alpha = .3)
plt.show()

In [None]:
# Run this cell only after training with zero padding (block 1: flag = False).
model.save(filepath = 'ZeroPadding')

In [None]:
# Run this cell only after training with forward padding (block 1: flag = True).
model.save(filepath = 'ForwardPadding')

### Retrain with Test Set

To retrain the model on the test set as well, set the following flags in the 'Load Data' section and re-run that section first:

  - Block 1:
             flag = True  when retraining the model with forward padding
             flag = False when retraining the model with zero padding

  - Block 2:
  
             flag = False

In [None]:
#loads zero padding model (run either this cell or the forward padding one)
model = tfk.models.load_model(filepath = 'ZeroPadding')

In [None]:
#loads forward padding model (run either this cell or the zero padding one)
model = tfk.models.load_model(filepath = 'ForwardPadding')

In [None]:
# Stack the training and test windows into one dataset for the final retraining.
x, y = (
    np.concatenate(parts, axis = 0)
    for parts in ((xTrainA, xTestA), (yTrainA, yTestA))
)

print(x.shape, y.shape)

In [None]:
# Same settings as the first training run, now derived from the merged dataset.
inputShape, outputShape = x.shape[1:], y.shape[1]
batchSize, epochs = 128, 200

In [None]:
#training with the whole dataset
# NOTE(review): the number of epochs is fixed to 5 here; the `epochs` variable
# defined in the previous cell is not used by this call.
history = model.fit(
    x,
    y,
    epochs = 5,
    batch_size = batchSize,
    callbacks = calls,
    validation_split = .1,
).history

In [None]:
# Run this cell only when the retrained model is the zero padding one.
model.save(filepath = 'ZeroPaddingWithTest')

In [None]:
# Run this cell only when the retrained model is the forward padding one.
model.save(filepath = 'ForwardPaddingWithTest')

### Predict samples

In [None]:
#predictions and evaluations
predictions = model.predict(x = xTestA, verbose = 0)

print(f"Predictions shape: {predictions.shape} ")

# Targets and forecasts are flattened to 1-D so that each metric is a single
# number computed over every forecast step of every window.
mean_squared_error = tfk.metrics.mean_squared_error(
    yTestA.reshape(-1), predictions.reshape(-1)
).numpy()
print(f"Mean Squared Error: {mean_squared_error}")

mean_absolute_error = tfk.metrics.mean_absolute_error(
    yTestA.reshape(-1), predictions.reshape(-1)
).numpy()
print(f"Mean Absolute Error: {mean_absolute_error}")

### Ensemble

In [None]:
#ensemble of zero padding and forward padding models
models = []

# NOTE(review): `a` is rebound here; from now on it no longer holds the
# concatenated series used to build the train/test split.
a, b = (
    tfk.models.load_model(savedPath)
    for savedPath in ('ZeroPaddingWithTest', 'ForwardPaddingWithTest')
)


In [None]:
# Give the two sub-models distinct names before they are placed in one graph,
# then register them in ensemble order (zero padding first).
a._name = "ZeroPaddingWithTest"
b._name = "ForwardPaddingWithTest"

models.extend([a, b])

In [None]:
#ensemble approach involving two models: one trained on the Zero Padding Dataset and another one trained on the
#Forward Padding Dataset. We implemented a weighted average strategy, assigning a weight of 0.7 to the first
#model and 0.3 to the second

class WeightedSum(tfkl.Layer):
    """Convex combination of two tensors: a * first + (1 - a) * second.

    Parameters
      a : weight given to the first input; the second one receives 1 - a.
    """

    def __init__(self, a, **kwargs):
        # Initialise the base layer first, then attach the layer's own state.
        super(WeightedSum, self).__init__(**kwargs)
        self.a = a

    def call(self, model_outputs):
        """`model_outputs` is a sequence whose first two items are combined."""
        return self.a * model_outputs[0] + (1 - self.a) * model_outputs[1]

    def compute_output_shape(self, input_shape):
        """The output has the shape of the first input."""
        return input_shape[0]

    def get_config(self):
        """Expose the constructor argument so the layer can be serialized and
        rebuilt from its configuration."""
        config = super(WeightedSum, self).get_config()
        config.update({'a': self.a})
        return config

In [None]:
def ensembleModels(models, model_input):
    """Combine two models into a single weighted-average ensemble.

    Parameters
      models      : sequence of exactly two Keras models sharing the same
                    input/output shapes; the first one gets weight 0.7, the
                    second 0.3.
      model_input : Keras input tensor fed to both models.

    Returns
      A Keras model named 'ensemble' mapping `model_input` to the weighted sum.

    Raises
      ValueError if `models` does not contain exactly two models.
    """

    if len(models) != 2:
        raise ValueError(f"ensembleModels expects exactly 2 models, got {len(models)}")

    # collect outputs of models in a list
    yModels = [model(model_input) for model in models]

    # weighted average of the outputs. The models passed in are the ones that
    # get combined (the function used to ignore them and call the module-level
    # `a` and `b` a second time instead).
    yAvg = WeightedSum(0.7)(yModels)

    # build model from same input and avg output
    ensembleModel = tfk.Model(inputs = model_input, outputs = yAvg, name = 'ensemble')

    return ensembleModel

In [None]:
# Shared input tensor for both sub-models, shaped like one window.
inputs = tfk.Input(shape = inputShape)
modelEns = ensembleModels(models = models, model_input = inputs)
modelEns.summary()

Model: "ensemble"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 200, 1)]             0         []                            
                                                                                                  
 ZeroPaddingWithTest (Funct  (None, 18)                   864530    ['input_1[0][0]']             
 ional)                                                                                           
                                                                                                  
 ForwardPaddingWithTest (Fu  (None, 18)                   864530    ['input_1[0][0]']             
 nctional)                                                                                        
                                                                                           

In [None]:
# Persist the ensemble so the forecasting section can reload it.
modelEns.save(filepath = 'FinalEnsemble')

### Forecasting with 9 Steps Model (AutoRegressive)



In [None]:
# Reload the saved ensemble; `model` now refers to it for the cells below.
model = tfk.models.load_model(filepath = 'FinalEnsemble')

In [None]:
#calculate predictions for the first forecast block
firstPredictions = model.predict(x = xTestA, verbose = 0)

In [None]:
#deletes the oldest samples of every window and appends our predictions
# The number of dropped samples equals the number of predicted steps, so each
# window keeps its original length whatever horizon the model was trained for
# (the value used to be hard-coded to 9).
horizon = firstPredictions.shape[1]

newX = np.concatenate(
    (
        xTestA[:, horizon:].reshape(len(xTestA), -1),              # window without its oldest `horizon` steps
        firstPredictions.reshape(len(firstPredictions), -1),       # model forecasts appended at the end
    ),
    axis = 1,
)

In [None]:
# Restore the trailing feature axis expected by the model. The rank check makes
# the cell safe to re-run: without it every execution would add another axis.
if newX.ndim == 2:
  newX = np.expand_dims(newX, axis=-1)

In [None]:
#computes the second set of predictions from the windows extended with the first forecasts
SecondPredictions = model.predict(x = newX, verbose = 0)

In [None]:
#merges the two predictions: for every window, first forecast block followed by the second one
pred = np.array([
    np.append(firstBlock, secondBlock)
    for firstBlock, secondBlock in zip(firstPredictions, SecondPredictions)
])

In [None]:
print(f"Predictions shape: {pred.shape} ")

# NOTE(review): `pred` holds two concatenated forecast blocks per window, while
# `yTestA` holds `telescope` targets per window. The flattened arrays only have
# the same size if `yTestA` was rebuilt with a telescope covering both blocks -
# confirm before running this cell.
mean_squared_error = tfk.metrics.mean_squared_error(
    yTestA.reshape(-1), pred.reshape(-1)
).numpy()
print(f"Mean Squared Error: {mean_squared_error}")

mean_absolute_error = tfk.metrics.mean_absolute_error(
    yTestA.reshape(-1), pred.reshape(-1)
).numpy()
print(f"Mean Absolute Error: {mean_absolute_error}")