In [None]:
#-----------------------------------* FIRST ATTEMPT *---------------------------------------
# Problem description: develop a forecasting model that is able to predict several uncorrelated time series

# DATA STRUCTURE: 
# Single folder containing the following files:
# -> 'training_data.npy': it contains a numpy array of shape (48000, 2776). 48000 time series of length 2776.
# -> 'valid_periods.npy': it contains a numpy array of shape (48000, 2) containing, for each of the time series, the start and end index of the actual series, i.e. the part without padding.
# -> 'categories.npy': it contains a numpy array of shape (48000,), containing for each of the time series the code of its category. The possible categories are in {'A', 'B', 'C', 'D', 'E', 'F'}.
# IMPORTANT: This is a dataset consisting of monovariate time series, i.e. composed of a single feature, belonging to six different domains. The time series of each domain are not to be understood as closely related to each other, but only as collected from similar data sources.
# What is required of you is therefore to build a model that is capable of generalising sufficiently to predict the future samples of the 60 time series of the test set.

In [None]:
# Import libraries and connect to drive personal folder

from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/homework_2/Edoardo

# Fix randomness and hide warnings
# A single seed value is reused for every random number generator seeded below.
seed = 42

import os
# These environment variables are set before TensorFlow and matplotlib are imported further down.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): PYTHONHASHSEED is set after the interpreter has started; presumably it only
# affects child processes and not hashing in this kernel -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed)
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# Warning is the base class of all warning categories, so this filter also covers FutureWarning.
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seeds NumPy's global generator (used later by np.random.randint).
np.random.seed(seed)

import logging

import random
random.seed(seed)

# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl
# Reduce TensorFlow log output to errors only.
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
# Seed TensorFlow through both the current and the compat.v1 API.
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)


# Import other support libraries

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
plt.rc('font', size=16)
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler


In [None]:
# Define function for data import:
# The function takes as input the path of the zip archive containing the data and returns the training data, the valid periods and the categories
# Before doing so, it optionally unzips the archive if the zip flag is set to True

def import_data(path, zip=False):
    """Load the three dataset arrays from the current working directory.

    Parameters
    ----------
    path : str
        Path of the zip archive that holds the ``.npy`` files. It is only
        read when ``zip`` is True.
    zip : bool, default False
        When True, the archive at ``path`` is extracted into the current
        working directory before the arrays are loaded. The name shadows the
        builtin ``zip`` inside this function; it is kept so existing calls
        keep working.

    Returns
    -------
    tuple of np.ndarray
        ``(training_data, valid_periods, categories)`` as stored in
        'training_data.npy', 'valid_periods.npy' and 'categories.npy'.

    Raises
    ------
    FileNotFoundError
        If the archive (when requested) or one of the ``.npy`` files is missing.
    """
    if zip:
        # Pure-Python extraction instead of the `!unzip` shell escape: works
        # outside IPython, does not need the `unzip` binary and never blocks
        # on an interactive "overwrite?" prompt.
        import zipfile
        with zipfile.ZipFile(path) as archive:
            archive.extractall()
    training_data = np.load('training_data.npy')
    valid_periods = np.load('valid_periods.npy')
    categories = np.load('categories.npy')
    return training_data, valid_periods, categories

# Call the function to import the data
# With zip=False the archive path is not read: import_data only uses it inside its `if zip:` branch,
# so the three .npy files must already be in the current working directory.
training_data, valid_periods, categories = import_data('training_dataset.zip', zip=False)

# Check the shape of the data
# According to the dataset description at the top of the notebook these should be
# (48000, 2776), (48000, 2) and (48000,).
print(training_data.shape)
print(valid_periods.shape)
print(categories.shape)


In [None]:
# -----------------------------------* DATA ANALYSIS *---------------------------------------

# Note that all the values are stored as float64, so we can convert them to float32 to save memory
# The dtype guard makes the cell safe to re-run: the cast only happens while the array is still float64.
if training_data.dtype == np.float64:
    training_data = training_data.astype(np.float32)

# -<-<-<-<-<-<-<-<-<-< FIRST PLOTS AND PADDING REMOVAL >->->->->->->->->->->-

# Plot the first 10 time series (full padded rows) on a 2x5 grid

fig, axs = plt.subplots(2, 5, figsize=(20, 10))
# Flatten the 2x5 grid of axes so it can be indexed with a single counter.
axs = axs.ravel()
for i in range(10):
    axs[i].plot(training_data[i])
    axs[i].set_title('Category: {}'.format(categories[i]), fontsize=10)
    axs[i].set_xlabel('Time', fontsize=10)
    axs[i].set_ylabel('Value', fontsize=10)

fig.suptitle('First 10 time series', fontsize=30)

# Define a function to remove the padding from the time series

def remove_padding(data, valid_periods):
    """Strip the padding from every row of ``data``.

    Parameters
    ----------
    data : np.ndarray
        2-D array with one padded time series per row.
    valid_periods : np.ndarray
        Array indexed as ``valid_periods[i, 0]`` / ``valid_periods[i, 1]``
        giving the start (inclusive) and end (exclusive) index of the
        un-padded part of row ``i``.

    Returns
    -------
    np.ndarray
        1-D array of dtype ``object`` with ``data.shape[0]`` entries; entry
        ``i`` is the 1-D slice ``data[i, start:end]`` (a view on ``data``).
    """
    n_series = data.shape[0]
    # The series have different lengths, so they cannot form a rectangular
    # array. Pre-allocating an object array avoids np.array() on a ragged
    # list (an error on recent NumPy) and keeps the return type identical
    # whether or not the lengths happen to coincide.
    data_no_pad = np.empty(n_series, dtype=object)
    for i in range(n_series):
        start, end = valid_periods[i, 0], valid_periods[i, 1]
        data_no_pad[i] = data[i, start:end]
    return data_no_pad

# Remove the padding from the time series
training_data_no_pad = remove_padding(training_data, valid_periods)

# Plot the first 10 time series without padding, keeping the information about the original temporal location
fig, axs = plt.subplots(2, 5, figsize=(20, 10))
# Flatten the 2x5 grid of axes so it can be indexed with a single counter.
axs = axs.ravel()
for i in range(10):
    # The x values are the original column indices of the valid period, so each series stays at its position in the padded row.
    axs[i].plot(range(valid_periods[i, 0], valid_periods[i, 1]), training_data_no_pad[i])
    axs[i].set_title('Category: {}'.format(categories[i]), fontsize=10)
    axs[i].set_xlabel('Time', fontsize=10)
    axs[i].set_ylabel('Value', fontsize=10)

fig.suptitle('First 10 time series without padding', fontsize=30)

# All these samples belong to category D, and are "to be understood as not closely related to each other, but only as collected from similar data sources"
# This means that the time series within the same category are to be considered uncorrelated, but shouldn't there be a correlation between the categories?


In [None]:

# -<-<-<-<-<-<-<-<-<-< DATA DISTRIBUTION ANALYSIS >->->->->->->->->->->-


# GROUP DATA BY CATEGORY:

# Transform the categories into numbers
categorical_to_numerical = {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5}

# Elements of an array returned by np.load are NumPy scalars (np.str_ or
# np.bytes_), never the builtin `str`, so an exact `type(...) == str` test
# would silently skip the encoding. Checking the array dtype instead also
# keeps the cell idempotent: once the codes are integers nothing is redone.
if not np.issubdtype(categories.dtype, np.integer):
    # Build a new int32 array instead of writing integers into a string array in place.
    categories = np.array(
        [categorical_to_numerical[c.decode() if isinstance(c, bytes) else str(c)]
         for c in categories],
        dtype=np.int32,
    )

# Now plot the distribution of the time series over the different categories
fig, ax = plt.subplots(figsize=(20, 10))
# One bin per integer code, centred on the code so the bars line up with the
# x ticks; rwidth < 1 leaves a gap between neighbouring bars.
ax.hist(categories, bins=np.arange(7) - 0.5, rwidth=0.9)
ax.set_title('Distribution of the time series over the different categories', fontsize=20)
ax.set_xlabel('Category')
ax.set_ylabel('Number of time series')
# use a different color for each bar
for i in range(6):
    ax.patches[i].set_color('C{}'.format(i))
plt.xticks(np.arange(6))

# NOTE: a visible imbalance between the categories is present, with category 5 being extremely underrepresented;
# category 1 is just slightly underrepresented, while the other categories are more or less equally represented.


# -<-<-<-<-<-<-<-<-<-<-<-< DATA NORMALIZATION >->->->->->->->->->->->->-
# AS THE DATA HAS ALREADY BEEN NORMALIZED (EACH ROW HAS BEEN NORMALIZED INDEPENDENTLY)
# WE DEEMED THIS STEP NOT NECESSARY


# Define min_max normalization function
def min_max_norm(data, min, max):
    """Rescale ``data`` linearly so that ``min`` maps to 0 and ``max`` to 1.

    Parameters
    ----------
    data : array-like or scalar
        Values to rescale.
    min, max : scalar or array-like
        Lower and upper reference values (broadcast against ``data``). The
        names shadow the builtins inside this function; they are kept so
        existing keyword calls keep working.

    Returns
    -------
    Same shape as ``data - min``: ``(data - min) / (max - min)``. Where
    ``max == min`` the range is treated as 1, so the result is ``data - min``
    (0 for constant data) instead of NaN/inf or a ZeroDivisionError.
    """
    span = np.asarray(max - min)
    # Replace a zero range by 1 (same dtype as the range) to avoid dividing by zero.
    safe_span = np.where(span == 0, span.dtype.type(1), span)
    return (data - min) / safe_span

# Apply the function over the categories singularly
def min_max_norm_by_category(data, categories):
    """Min-max normalise ``data`` separately for each category.

    For every distinct value in ``categories`` the minimum and maximum over
    all entries of ``data`` selected by that category are computed, and those
    entries are rescaled with ``min_max_norm``.

    ``data`` is modified in place and also returned.
    """
    for label in np.unique(categories):
        # Boolean mask selecting the entries of the current category
        in_category = categories == label
        selected = data[in_category]
        lowest, highest = np.min(selected), np.max(selected)
        # Write the rescaled values back into the caller's array
        data[in_category] = min_max_norm(selected, lowest, highest)
    return data


# -<-<-<-<-<-<-< PLOT OF FIRST TIME SERIES FOR DIFFERENT CATEGORIES >->->->->->-

# One panel per category on a 2x3 grid, each showing the first un-padded series of that category.
fig, axs = plt.subplots(2, 3, figsize=(20, 10))
# Flatten the 2x3 grid of axes so it can be indexed with a single counter.
axs = axs.ravel()
for i in range(6):
    # `categories == i` relies on the categories being the integer codes 0..5
    # defined by categorical_to_numerical; [0] picks the first matching series.
    axs[i].plot(training_data_no_pad[categories == i][0])
    axs[i].set_title('Category: {}'.format(i), fontsize=10)
    axs[i].set_xlabel('Time', fontsize=10)
    axs[i].set_ylabel('Value', fontsize=10)

fig.suptitle('First time series for each category', fontsize=30)

# -<-<-<-<-<-<-< PLOT OF TIME SERIES BY LENGHT AND DOMAIN  >->->->->->-

# Average un-padded length (end index minus start index) of the series in each category, shown as a bar chart
mean_lenght = [
    np.mean(valid_periods[categories == category_id, 1] - valid_periods[categories == category_id, 0])
    for category_id in range(6)
]
fig, ax = plt.subplots(figsize=(20, 10))
ax.bar(np.arange(6), mean_lenght)
ax.set_title('Mean lenght of the time series for each category', fontsize=20)
ax.set(xlabel='Category', ylabel='Mean lenght')
plt.xticks(np.arange(6))
# One color per bar, following the default color cycle
for i in range(6):
    ax.patches[i].set_color(f'C{i}')

# OSS: a fairly similar mean lenght for all the categories




In [None]:
# Define some hyperparameters for the forecasting model
WINDOW_SIZE = 200        # number of past samples fed to the model
BATCH_SIZE = 32
STRIDE = 10              # step between two consecutive windows of the same series
EPOCHS = 100
VALIDATION_SPLIT = 0.2   # fraction of the time series held out for validation
TELESCOPE = 18           # number of future samples to predict

# In this case, given the nature of the problem, as well as the fact that the time series are not correlated
# (so introducing some of them in the training phase should not bias the test), we decide
# to make use of the great variety of time series available, and split the data among the time series themselves:

# This way, a certain percentage of the time series is used for training (80% initial partition), while the remaining ones are used for validation (20%).
# In both cases, the number of samples to be predicted is TELESCOPE, and the number of samples used for the prediction is WINDOW_SIZE.

# For the split, we respect the imbalance among the different domains by stratifying on the categories.
# random_state=seed makes the split identical every time this cell is run; without it the result
# depends on how much of NumPy's global random stream has already been consumed.

X_train_raw, X_val_raw, y_train_raw, y_val_raw = train_test_split(
    training_data_no_pad,
    categories,
    test_size=VALIDATION_SPLIT,
    stratify=categories,
    random_state=seed,
)


In [None]:
# -----------------------------------* DATA PREPROCESSING *---------------------------------------
# Build the sequences for the forecasting model: in this case, we will try to take into account the 
# imbalance between the categories usign a weighted loss function

# The function lets us build the sequences used to train the forecasting model:
# given a certain domain (category), it builds the training sequences
# by extracting temporal windows out of each time series.
# Please note that some of the time series yield a single window, since they are shorter than the window size.
# In the other cases, the time series are partitioned into windows of the same size, with zeros prepended where needed so that the length is a multiple of the window size.
# In any case, for the sequences to be created correctly, the dataset passed to the function must be the one without padding.
def build_sequences_on_domain(data=None, domain=0, window=200, stride=20, telescope=100, data_categories=None):
    """Build (input window, target) pairs from the series of one domain.

    Every selected series is left-padded with zeros to a multiple of
    ``window``. A series that is still shorter than ``window + telescope`` is
    left-padded to exactly that length and yields a single pair; any other
    series is scanned with a sliding window of step ``stride``. Empty series
    are skipped. The input ``data`` is never modified.

    Parameters
    ----------
    data : sequence of 1-D arrays, optional
        Un-padded time series. Defaults to the notebook-level
        ``training_data_no_pad``, looked up when the function is called.
    domain : int
        Category code of the series to use.
    window : int
        Length of each input sequence.
    stride : int
        Step between consecutive windows; must divide ``window``.
    telescope : int
        Number of future samples used as target.
    data_categories : array-like, optional
        Category code of each entry of ``data`` (same length as ``data``).
        Defaults to the notebook-level ``categories``; pass the matching
        labels explicitly when ``data`` is a subset such as a train split.

    Returns
    -------
    dataset : np.ndarray of shape (n_sequences, window)
    labels : np.ndarray of shape (n_sequences, telescope)
        Both have zero rows (but the right number of columns) when the
        domain contains no usable series.

    Raises
    ------
    ValueError
        If ``data_categories`` and ``data`` have different lengths.
    """
    # Sanity check to avoid runtime errors
    assert window % stride == 0
    # Resolve the notebook-level defaults at call time, so the function always
    # sees the current arrays rather than whatever existed when it was defined.
    if data is None:
        data = training_data_no_pad
    if data_categories is None:
        data_categories = categories
    data_categories = np.asarray(data_categories)
    if len(data_categories) != len(data):
        raise ValueError(
            'data has {} series but {} category labels were given; pass the labels '
            'matching `data` through `data_categories`'.format(len(data), len(data_categories))
        )

    dataset = []
    labels = []
    for series_idx in np.flatnonzero(data_categories == domain):
        series = np.asarray(data[series_idx])
        if len(series) == 0:
            # Nothing to learn from an empty series
            continue
        # Left-pad to a multiple of the window; (-n) % window is 0 when n is
        # already a multiple, so no extra all-zero window is added.
        series = _left_pad_with_zeros(series, (-len(series)) % window)
        # Too short for even one (window, telescope) pair: pad up to exactly one.
        series = _left_pad_with_zeros(series, window + telescope - len(series))
        # Inputs and targets are slices of the same padded series. The "+ 1"
        # keeps the last valid start, i.e. the most recent window.
        last_start = len(series) - window - telescope
        for start in range(0, last_start + 1, stride):
            dataset.append(series[start:start + window])
            labels.append(series[start + window:start + window + telescope])

    if not dataset:
        # Keep a 2-D shape so the result can be concatenated with other domains
        return np.empty((0, window), dtype='float32'), np.empty((0, telescope), dtype='float32')
    return np.array(dataset), np.array(labels)


def _left_pad_with_zeros(series, n_zeros):
    """Return ``series`` with ``n_zeros`` float32 zeros prepended (unchanged when ``n_zeros <= 0``)."""
    if n_zeros <= 0:
        return series
    return np.concatenate((np.zeros(n_zeros, dtype='float32'), series))

def inspect_multivariate(X, y, columns, telescope, idx=None):
    """Plot up to three consecutive input sequences together with their targets.

    Parameters
    ----------
    X : sequence of 1-D sequences
        Input windows.
    y : sequence of 1-D sequences
        Targets; ``y[i]`` must contain ``telescope`` values.
    columns : any
        Not used; kept so existing calls keep working.
    telescope : int
        Number of target samples drawn after each input window.
    idx : int, optional
        Index of the first sequence to plot. When omitted, a random index is
        drawn such that three sequences are available whenever ``X`` holds
        at least three.

    The figure always has three panels sharing the x axis; panels without a
    matching sequence (``idx`` too close to the end of ``X``) stay empty.
    """
    n_panels = 3
    if idx is None:
        idx = np.random.randint(0, max(1, len(X) - n_panels + 1))
    # Do not read past the end of X
    last = min(idx + n_panels, len(X))

    # Plot the sequences chosen based on idx
    figs, axs = plt.subplots(n_panels, 1, sharex=True, figsize=(17,17))
    # Pair each panel (0, 1, 2) with its sequence index (idx, idx+1, ...):
    # the sequence index must not be used to index the three axes.
    for ax, i in zip(axs, range(idx, last)):
        ax.plot(np.arange(len(X[i])), X[i])
        ax.scatter(np.arange(len(X[i]), len(X[i])+telescope), y[i], color='orange')
        ax.set_title('Sequence {}'.format(i))
        ax.set_ylim(0,1)
    plt.show()

In [None]:
# Collect the per-domain arrays in lists and concatenate once at the end:
# concatenating an empty 1-D list with a 2-D array fails on the first iteration.
X_train = []
y_train = []
X_val = []
y_val = []
# NOTE(review): build_sequences_on_domain selects the rows of `data` with the notebook-level
# `categories` array (one label per original series), while X_train_raw / X_val_raw are subsets
# of the series; the labels that match these subsets are y_train_raw / y_val_raw -- confirm the
# selection is aligned.
for i in range(6):
    X_train_temp, y_train_temp = build_sequences_on_domain(data=X_train_raw, domain=i, window=WINDOW_SIZE, stride=STRIDE, telescope=TELESCOPE)
    X_val_temp, y_val_temp = build_sequences_on_domain(data=X_val_raw, domain=i, window=WINDOW_SIZE, stride=STRIDE, telescope=TELESCOPE)
    # Keep the sequences of this domain together with those of the previous ones
    X_train.append(X_train_temp)
    y_train.append(y_train_temp)
    X_val.append(X_val_temp)
    y_val.append(y_val_temp)

# Domains that produced no sequence are dropped so that all the parts have the same rank
X_train = np.concatenate([part for part in X_train if len(part)])
y_train = np.concatenate([part for part in y_train if len(part)])
X_val = np.concatenate([part for part in X_val if len(part)])
y_val = np.concatenate([part for part in y_val if len(part)])

# Every input window must have exactly one target
assert len(X_train) == len(y_train)
assert len(X_val) == len(y_val)
