In [1]:
# Single seed reused by every random number generator configured in this notebook
seed = 42

import os
# Environment variables are set before TensorFlow is imported in the next cell
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so setting it here
# presumably only affects child processes, not this kernel -- confirm if hash order matters
os.environ['PYTHONHASHSEED'] = str(seed)
# Point matplotlib's configuration directory to ./configs/ under the current working directory
os.environ['MPLCONFIGDIR'] = os.getcwd()+'/configs/'

import warnings
# Silence warnings; the second filter targets the base `Warning` class, which already
# includes `FutureWarning`
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=Warning)

import numpy as np
# Seed NumPy's legacy global generator
np.random.seed(seed)

import logging

import random
# Seed Python's built-in generator
random.seed(seed)

In [2]:
# Import tensorflow
import tensorflow as tf
from tensorflow import keras as tfk
from tensorflow.keras import layers as tfkl

# Enable on-demand GPU memory allocation on every visible GPU.
# Looping (instead of indexing element 0) keeps the cell runnable on a CPU-only
# runtime, where the device list is empty, and applies the same setting to all GPUs.
physical_devices = tf.config.list_physical_devices('GPU')
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, enable=True)
if not physical_devices:
    print("No GPU detected: running on CPU")

# Reduce TensorFlow / AutoGraph log output to errors only
tf.autograph.set_verbosity(0)
tf.get_logger().setLevel(logging.ERROR)
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)

# Seed TensorFlow's random number generation with the notebook-wide seed
tf.random.set_seed(seed)
tf.compat.v1.set_random_seed(seed)
print(tf.__version__)

2.15.0


In [3]:
import pandas as pd
from pandas import Series
import seaborn as sns
from datetime import datetime
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
plt.rc('font', size=16)
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [4]:
# Mount Google Drive in the Colab runtime and switch to the homework folder,
# so that the relative .npy paths loaded by the next cell resolve.

from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive/My Drive/HW2

Mounted at /gdrive
/gdrive/My Drive/HW2


In [5]:
# Load the three arrays of the task from the working directory:
#   training_data -> one row per time series, cast to float32
#   categories    -> one label per row
#   valid_periods -> one (start, end) index pair per row
training_data = np.load("training_data.npy").astype(np.float32)
categories = np.load("categories.npy")
valid_periods = np.load("valid_periods.npy")

# Distinct category labels; the per-category loops below iterate over this array
categories_unique = np.unique(categories)

# Quick shape check, one array per line
print(training_data.shape, valid_periods.shape, categories.shape, sep="\n")

(48000, 2776)
(48000, 2)
(48000,)


In [6]:
def build_dataset(category, window_length=200, stride=25):
    """Cut fixed-length sliding windows out of every series of one category.

    Reads the module-level arrays `categories`, `valid_periods` and
    `training_data`. For each series labelled `category`, only the span
    `valid_periods[i, 0]:valid_periods[i, 1]` is used: windows start at the
    beginning of that span and advance by `stride`. Series whose span is
    shorter than `window_length` are skipped; the number of skipped series is
    printed.

    Args:
        category: Label selecting the series to use (compared with `categories`).
        window_length: Number of time steps in every window.
        stride: Offset between the starts of two consecutive windows.

    Returns:
        Array of shape `(n_windows, window_length, 1)`. If no window can be
        built (every series is too short, or no series carries the label) an
        empty array of shape `(0, window_length, 1)` is returned.
    """
    sample_map = categories == category
    periods_for_category = valid_periods[sample_map]
    series_for_category = training_data[sample_map]

    windows = []
    skipped_samples = 0

    for series, (from_time, to_time) in zip(series_for_category, periods_for_category):
        duration = to_time - from_time

        if duration < window_length:
            skipped_samples += 1
            continue

        # Number of complete windows that fit inside the valid span
        n_samples = 1 + (duration - window_length) // stride

        for j in range(n_samples):
            start = from_time + j * stride
            windows.append(series[start:start + window_length])

    print("Skipped samples: ", skipped_samples)

    # np.array([]) is one-dimensional, so the reshape below would fail on an
    # empty list; return a correctly shaped empty array instead.
    if not windows:
        return np.empty((0, window_length, 1), dtype=training_data.dtype)

    dataset = np.array(windows)
    return dataset.reshape((dataset.shape[0], dataset.shape[1], 1))

def build_dataset_zero_padding(category, window=200, stride=50):
    """Build sliding windows for one category, left-padding each series with zeros.

    Reads the module-level arrays `categories`, `valid_periods` and
    `training_data`. The valid span of every series of `category` is padded at
    the front with zeros up to the next multiple of `window`, then cut into
    windows of length `window` whose starts are `stride` apart.

    A span whose length is already a multiple of `window` receives no padding,
    so no window made only of padding is produced; an empty span therefore
    contributes no window.

    Args:
        category: Label selecting the series to use (compared with `categories`).
        window: Number of time steps in every window.
        stride: Offset between consecutive window starts; must divide `window`.

    Returns:
        Array of shape `(n_windows, window, 1)`; shape `(0, window, 1)` when no
        window could be built.

    Raises:
        AssertionError: If `window` is not a multiple of `stride`.
    """
    sample_map = categories == category

    periods = valid_periods[sample_map]
    data = training_data[sample_map]

    # Sanity check to avoid runtime errors
    assert window % stride == 0, "window must be a multiple of stride"

    dataset = []
    for series, (start, end) in zip(data, periods):
        # Take only meaningful features
        temp = series[start:end]
        # Padding needed to reach the next multiple of `window` (0 if already aligned)
        padding_len = (-len(temp)) % window
        # Create padding and concatenate it
        padding = np.zeros(padding_len, dtype='float32')
        temp = np.concatenate((padding, temp))
        # Slide the window over the padded series
        for idx in range(0, len(temp) - window + 1, stride):
            dataset.append(temp[idx:idx + window])

    # An empty list cannot be reshaped to three dimensions through shape[1]
    if not dataset:
        return np.empty((0, window, 1), dtype=np.result_type(np.float32, data.dtype))

    dataset = np.array(dataset)
    return dataset.reshape((dataset.shape[0], dataset.shape[1], 1))

def build_dataset_mean_padding(category, window=200, stride=50):
    """Build sliding windows for one category, left-padding each series with its mean.

    Reads the module-level arrays `categories`, `valid_periods` and
    `training_data`. The valid span of every series of `category` is padded at
    the front with the mean of that span up to the next multiple of `window`,
    then cut into windows of length `window` whose starts are `stride` apart.

    A span whose length is already a multiple of `window` receives no padding,
    so no window made only of padding is produced. Empty spans are skipped
    (their mean is undefined).

    Args:
        category: Label selecting the series to use (compared with `categories`).
        window: Number of time steps in every window.
        stride: Offset between consecutive window starts; must divide `window`.

    Returns:
        Array of shape `(n_windows, window, 1)`; shape `(0, window, 1)` when no
        window could be built.

    Raises:
        AssertionError: If `window` is not a multiple of `stride`.
    """
    sample_map = categories == category

    periods = valid_periods[sample_map]
    data = training_data[sample_map]

    # Sanity check to avoid runtime errors
    assert window % stride == 0, "window must be a multiple of stride"

    dataset = []
    for series, (start, end) in zip(data, periods):
        # Take only meaningful features
        temp = series[start:end]
        if len(temp) == 0:
            # Nothing to window, and np.mean of an empty array would be NaN
            continue
        # Padding needed to reach the next multiple of `window` (0 if already aligned)
        padding_len = (-len(temp)) % window
        # Create padding filled with the series mean and concatenate it
        padding = np.full(padding_len, np.mean(temp), dtype='float32')
        temp = np.concatenate((padding, temp))
        # Slide the window over the padded series
        for idx in range(0, len(temp) - window + 1, stride):
            dataset.append(temp[idx:idx + window])

    # An empty list cannot be reshaped to three dimensions through shape[1]
    if not dataset:
        return np.empty((0, window, 1), dtype=np.result_type(np.float32, data.dtype))

    dataset = np.array(dataset)
    return dataset.reshape((dataset.shape[0], dataset.shape[1], 1))

def build_dataset_value_padding(category, window=200, stride=50, value=None):
    """Build sliding windows for one category, left-padding each series with a constant.

    Reads the module-level arrays `categories`, `valid_periods` and
    `training_data`. The valid span of every series of `category` is padded at
    the front up to the next multiple of `window`, then cut into windows of
    length `window` whose starts are `stride` apart.

    A span whose length is already a multiple of `window` receives no padding,
    so no window made only of padding is produced. Empty spans are skipped
    (they have no first value to pad with).

    Args:
        category: Label selecting the series to use (compared with `categories`).
        window: Number of time steps in every window.
        stride: Offset between consecutive window starts; must divide `window`.
        value: Constant used for the padding. When None, the first value of
            each series' valid span is used.

    Returns:
        Array of shape `(n_windows, window, 1)`; shape `(0, window, 1)` when no
        window could be built.

    Raises:
        AssertionError: If `window` is not a multiple of `stride`.
    """
    sample_map = categories == category

    periods = valid_periods[sample_map]
    data = training_data[sample_map]

    # Sanity check to avoid runtime errors
    assert window % stride == 0, "window must be a multiple of stride"

    dataset = []
    for series, (start, end) in zip(data, periods):
        # Take only meaningful features
        temp = series[start:end]
        if len(temp) == 0:
            # Nothing to window, and temp[0] would raise IndexError
            continue
        # Padding needed to reach the next multiple of `window` (0 if already aligned)
        padding_len = (-len(temp)) % window
        # Choose the padding value: explicit constant, or the series' first value
        padding_value = temp[0] if value is None else value
        # Create padding and concatenate it
        padding = np.full(padding_len, padding_value, dtype='float32')
        temp = np.concatenate((padding, temp))
        # Slide the window over the padded series
        for idx in range(0, len(temp) - window + 1, stride):
            dataset.append(temp[idx:idx + window])

    # An empty list cannot be reshaped to three dimensions through shape[1]
    if not dataset:
        return np.empty((0, window, 1), dtype=np.result_type(np.float32, data.dtype))

    dataset = np.array(dataset)
    return dataset.reshape((dataset.shape[0], dataset.shape[1], 1))

def build_dataset_repeat_padding(category, window=200, stride=50):
    """Build sliding windows for one category, left-padding each series with its own start.

    Reads the module-level arrays `categories`, `valid_periods` and
    `training_data`. The valid span of every series of `category` is padded at
    the front, up to the next multiple of `window`, with a copy of its own
    leading values, then cut into windows of length `window` whose starts are
    `stride` apart.

    When the span is shorter than the required padding, its values are repeated
    cyclically until the padding is full, so short series still yield a window
    instead of being dropped. A span whose length is already a multiple of
    `window` receives no padding. Empty spans are skipped.

    Args:
        category: Label selecting the series to use (compared with `categories`).
        window: Number of time steps in every window.
        stride: Offset between consecutive window starts; must divide `window`.

    Returns:
        Array of shape `(n_windows, window, 1)`; shape `(0, window, 1)` when no
        window could be built.

    Raises:
        AssertionError: If `window` is not a multiple of `stride`.
    """
    sample_map = categories == category

    periods = valid_periods[sample_map]
    data = training_data[sample_map]

    # Sanity check to avoid runtime errors
    assert window % stride == 0, "window must be a multiple of stride"

    dataset = []
    for series, (start, end) in zip(data, periods):
        # Take only meaningful features
        temp = series[start:end]
        if len(temp) == 0:
            # Nothing to repeat and nothing to window
            continue
        # Padding needed to reach the next multiple of `window` (0 if already aligned)
        padding_len = (-len(temp)) % window
        # np.resize equals temp[:padding_len] when the series is long enough and
        # repeats the series cyclically when it is shorter than the padding
        padding = np.resize(temp, padding_len)
        temp = np.concatenate((padding, temp))
        # Slide the window over the padded series
        for idx in range(0, len(temp) - window + 1, stride):
            dataset.append(temp[idx:idx + window])

    # An empty list cannot be reshaped to three dimensions through shape[1]
    if not dataset:
        return np.empty((0, window, 1), dtype=data.dtype)

    dataset = np.array(dataset)
    return dataset.reshape((dataset.shape[0], dataset.shape[1], 1))

In [13]:
# Window length per category; paired position by position with `categories_unique`
# when the datasets are built, so the order has to follow that array
window_length=[20, 80, 20, 80, 20]
# Number of trailing time steps of each window that `train_models` uses as the target
telescope=10
# Stride between consecutive windows per category (same ordering as `window_length`)
stride=[5, 20, 5, 20, 5]

In [14]:
# Build one windowed dataset per category, each with its own window length and stride.
# `zip` stops at the shortest input, so categories beyond the end of the configuration
# lists would be dropped without notice; report them explicitly.
n_configured = min(len(window_length), len(stride))
if n_configured < len(categories_unique):
    print(
        "WARNING: no window_length/stride configured for categories:",
        list(categories_unique[n_configured:]),
    )

datasets = {}
for category, w, s in zip(categories_unique, window_length, stride):
    dataset = build_dataset(category, window_length=w, stride=s)
    datasets[category] = dataset

Skipped samples:  0
Skipped samples:  3683
Skipped samples:  0
Skipped samples:  2663
Skipped samples:  0


In [15]:
# Standardize data

standardize = False

# Fitted scalers are kept per category so that values in standardized space can be
# mapped back to the original scale later with `scalers[category].inverse_transform`.
scalers = {}

if standardize:
    for category in categories_unique:
        sample_map = categories==category

        periods = valid_periods[sample_map]
        ts = training_data[sample_map]

        # Fit on the valid part of every series of the category only.
        # The values are cast to float64 so the fitted statistics do not depend on
        # the float32 storage type of `training_data`.
        valid_values = np.concatenate([t[v[0]:v[1]] for t, v in zip(ts, periods)])
        scaler = StandardScaler().fit(valid_values.astype(np.float64).reshape(-1, 1))
        scalers[category] = scaler

        # NOTE: the windows are rescaled in place, so running this cell twice
        # standardizes the data twice; rebuild `datasets` before re-running it.
        dataset = datasets[category]
        # One vectorized transform over all windows (single feature column)
        dataset[...] = scaler.transform(dataset.reshape(-1, 1)).reshape(dataset.shape)


In [16]:
def build_CONV_LSTM_model(input_shape, output_shape):
    """Create and compile the bidirectional-LSTM + Conv1D sequence model.

    The network processes the whole input sequence (bidirectional LSTM with 64
    units returning sequences, then a 128-filter ReLU convolution, then a
    convolution with `output_shape[1]` filters) and crops the end of the time
    axis so that `output_shape[0]` steps remain.

    Args:
        input_shape: `(time_steps, channels)` of one input sample.
        output_shape: `(time_steps, channels)` of one target sample; it must not
            have more time steps than the input.

    Returns:
        A compiled Keras model named 'CONV_LSTM_model' (MSE loss, Adam
        optimizer, 'mse' and 'mae' metrics).
    """
    # The cropping below can only shorten the sequence, never extend it
    assert input_shape[0] >= output_shape[0], "For this exercise we want input time steps to be >= of output time steps"

    inputs = tfkl.Input(shape=input_shape, name='input_layer')

    # Recurrent feature extractor over the full input sequence
    recurrent_core = tfkl.LSTM(64, return_sequences=True, name='lstm')
    sequence_features = tfkl.Bidirectional(recurrent_core, name='bidirectional_lstm')(inputs)

    # Convolution over the recurrent features
    conv_features = tfkl.Conv1D(
        filters=128,
        kernel_size=3,
        padding='same',
        activation='relu',
        name='conv',
    )(sequence_features)

    # Projection to the requested number of output channels
    full_length_output = tfkl.Conv1D(
        filters=output_shape[1],
        kernel_size=3,
        padding='same',
        name='output_layer',
    )(conv_features)

    # Drop the surplus steps from the end of the time axis
    surplus_steps = full_length_output.shape[1] - output_shape[0]
    cropped_output = tfkl.Cropping1D(cropping=(0, surplus_steps), name='cropping')(full_length_output)

    model = tf.keras.Model(inputs=inputs, outputs=cropped_output, name='CONV_LSTM_model')
    model.compile(
        loss=tf.keras.losses.MeanSquaredError(),
        optimizer=tf.keras.optimizers.Adam(),
        metrics=['mse', "mae"],
    )

    return model

In [17]:
def train_models(datasets, testOne=None, save_dir="Vanilla_200Window"):
    """Train, save and report one CONV-LSTM model per category.

    For every selected category the windows in `datasets[category]` are split
    along the time axis: the last `telescope` steps are the target, the steps
    before them are the input. A new model from `build_CONV_LSTM_model` is
    fitted with a 10% validation split, early stopping and learning-rate
    reduction on plateau, then saved as `<save_dir>/Vanilla_<category>`.

    Args:
        datasets: Mapping from category label to an array of windows indexed as
            (window, time step, channel). Categories that are missing from the
            mapping are reported and skipped.
        testOne: Optional index (or indices) into `categories_unique` to train a
            subset only; None trains every category.
        save_dir: Directory under which the trained models are saved.

    Returns:
        Dict mapping each trained category to its Keras training history
        (metric name -> list of per-epoch values).
    """
    if testOne is None:
        selected_categories = categories_unique
    else:
        # atleast_1d keeps a single index from yielding a bare string, which
        # would otherwise be iterated character by character
        selected_categories = np.atleast_1d(categories_unique[testOne])

    batch_size = 64
    epochs = 200

    histories = {}
    for category in selected_categories:
        if category not in datasets:
            print("No dataset for category: ", category, "- skipping")
            continue

        print("Training model for category: ", category)
        X_data = datasets[category]
        # Input = everything but the last `telescope` steps, target = the last ones
        X_train, y_train = X_data[:, :-telescope], X_data[:, -telescope:]

        input_shape = X_train.shape[1:]
        output_shape = y_train.shape[1:]

        model = build_CONV_LSTM_model(input_shape, output_shape)
        history = model.fit(
            x = X_train,
            y = y_train,
            batch_size = batch_size,
            epochs = epochs,
            validation_split=.1,
            callbacks = [
                tfk.callbacks.EarlyStopping(monitor='val_loss', mode='min', patience=12, restore_best_weights=True),
                tfk.callbacks.ReduceLROnPlateau(monitor='val_loss', mode='min', patience=10, factor=0.1, min_lr=1e-5)
            ]
        ).history

        histories[category] = history

        model.save(f"{save_dir}/Vanilla_{category}")
        del model

    # Summary of the validation metrics recorded at the last epoch that ran.
    # NOTE(review): with restore_best_weights the saved weights may come from an
    # earlier epoch than the one reported here -- confirm which value is wanted.
    for c, history in histories.items():
        print()
        print("Model " + str(c) + " MSE: " + str(history["val_mse"][-1]))
        print("Model " + str(c) + " MAE: " + str(history["val_mae"][-1]))

    return histories

In [None]:
# Train the models on the windowed datasets built above (testOne is left at its default, None)
train_models(datasets)

Training model for category:  A
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200