In [None]:
import sys
from pathlib import Path

# Add the root project folder to the Python path in order to use its packages.
# NOTE(review): this is an absolute, machine-specific path; adjust it when the
# notebook is run from another environment.
path_root = Path( '/project_ghent/HADSP/hadsp/')
# Append only once so that re-running this cell does not keep growing sys.path.
if str(path_root) not in sys.path:
    sys.path.append(str(path_root))

In [None]:
import numpy as np
from scipy import sparse
from tqdm import tqdm
from importlib import reload

# SEED
# Fixed random seed, defined once here and passed to the reservoir
# initialisation calls further down in the notebook.
SEED = 49387

from matplotlib import pyplot as plt
from seaborn import heatmap, color_palette

# Datasets

Lots of different ones are available: https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad

Classification: 
https://arxiv.org/abs/1803.07870

https://github.com/FilippoMB/Time-series-classification-and-clustering-with-Reservoir-Computing

Multivariate:
https://www.timeseriesclassification.com/dataset.php

### HAART 
https://www.cs.ubc.ca/labs/spin/data/

In [None]:
# Fetch the HAART dataset from https://www.cs.ubc.ca/labs/spin/data/HAART%20DataSet.zip
# unless the extracted folder is already present, then load the train/test splits.
import os
import urllib.request
import zipfile

dataset_is_missing = not os.path.exists('datasets/HAART DataSet')
if dataset_is_missing:
    # Download the archive next to its future extraction folder.
    urllib.request.urlretrieve(
        'https://www.cs.ubc.ca/labs/spin/data/HAART%20DataSet.zip',
        'datasets/HAART DataSet.zip',
    )
    # Extract everything into the "datasets/HAART DataSet" folder.
    with zipfile.ZipFile('datasets/HAART DataSet.zip', 'r') as archive:
        archive.extractall('datasets/HAART DataSet')

    # The archive is no longer needed once extracted.
    os.remove('datasets/HAART DataSet.zip')

# Reload the loader module so that edits made to it are picked up without
# restarting the kernel.
import datasets.load_datasets
reload(datasets.load_datasets)
from datasets.load_datasets import load_haart_dataset

(
    sampling_rate,
    X_train_band,
    Y_train,
    X_test_band,
    Y_test,
) = load_haart_dataset(
    "datasets/HAART DataSet/training.csv",
    "datasets/HAART DataSet/testWITHLABELS.csv",
)

# The training instances from index 200 onwards are concatenated along axis 0
# and transposed to form the pretraining array.
pretrain_instances = X_train_band[200:]
X_pretrain_band = np.concatenate(pretrain_instances, axis=0).T

# Flag read by the following cells to select the multivariate code paths.
is_multivariate = True


## Multivariate generation if necessary

Spectrograms_vs_Cochleagrams : https://www.researchgate.net/publication/340510607_Speech_recognition_using_very_deep_neural_networks_Spectrograms_vs_Cochleagrams

### Spectral density and peak selection

In [None]:
from datasets.multivariate_generation import extract_peak_frequencies

# Pick the 1-D signal whose peak frequencies are extracted: the first row of the
# pretraining array for multivariate data, the whole pretraining series otherwise.
if is_multivariate:
    peak_search_signal = X_pretrain_band[0].flatten()
else:
    peak_search_signal = X_pretrain.flatten()

filtered_peak_freqs = extract_peak_frequencies(
    peak_search_signal, sampling_rate, threshold=1e-5, nperseg=1024, visualize=True
)

print("Number of frequencies selected :", len(filtered_peak_freqs))

### Applying normal band pass filter on data

In [None]:
import datasets.multivariate_generation 
reload(datasets.multivariate_generation)
from datasets.multivariate_generation import generate_multivariate_dataset, extract_peak_frequencies

# Only univariate data needs this step: the band arrays are generated from the
# raw series and the selected peak frequencies (presumably one band-pass
# filtered copy per frequency, going by the section title — TODO confirm).
# NOTE(review): X_pretrain, X_train and X_test are not assigned in the cells
# visible here; they must be defined before this branch can run.
if not is_multivariate:
    X_pretrain_band, X_train_band, X_test_band = generate_multivariate_dataset(
        filtered_peak_freqs, X_pretrain, X_train, X_test, sampling_rate, nb_jobs=-1
    )

### Standardizing the amplitudes

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))

# pretrain
# Be really careful with the column order here: MinMaxScaler rescales each
# column independently, so the array is transposed before scaling and
# transposed back afterwards.
filtered_data = scaler.fit_transform(X_pretrain_band.T)
filtered_data = filtered_data.T

if isinstance(X_train_band, list):  # Multiple instances -> classification
    # Each instance is rescaled on its own: the scaler is re-fitted per series.
    # train
    X_train_band = [scaler.fit_transform(time_series) for time_series in tqdm(X_train_band)]

    # test
    X_test_band = [scaler.fit_transform(time_series) for time_series in tqdm(X_test_band)]
else:  # TODO: add more check. One instance -> prediction
    # train: the scaling parameters are learnt on the training series only
    X_train_band = scaler.fit_transform(X_train_band)

    # test: reuse the training scaling instead of re-fitting on the test data,
    # so that no test information leaks in and both splits share one scale
    X_test_band = scaler.transform(X_test_band)


if not is_multivariate:
    if isinstance(X_train, list):  # Multiple instances -> classification
        # train
        X_train = [scaler.fit_transform(x).flatten() for x in tqdm(X_train)]

        # test
        X_test = [scaler.fit_transform(x).flatten() for x in tqdm(X_test)]
    else:  # TODO: add more check. One instance -> prediction
        # train
        X_train = scaler.fit_transform(X_train)

        # test: same reasoning as above, fit on train and only transform test
        X_test = scaler.transform(X_test)

        

In [None]:
# Show the shape of the scaled pretraining array as the cell output.
filtered_data.shape

# Generating datasets

## Reservoir functions

In [None]:
from reservoir.activation_functions import tanh, heaviside, sigmoid

# Activation function used for the rest of the experiment.
# Alternative kept for reference:
# activation_function = lambda x : sigmoid(2*(x-0.5))
activation_function = lambda x : tanh(x)

# Visual check of the activation's response over the interval [0, 1.1].
activation_inputs = np.linspace(0, 1.1, 100)
plt.plot(activation_inputs, activation_function(activation_inputs))
plt.grid()

## Plot  pretrain dataset

In [None]:
# Window-size bounds (in samples) derived from the selected peak frequencies:
# the last frequency gives the lower bound, the first one the upper bound.
# Open question from the author: is this the minimal window that captures all
# the dynamics?
first_peak_freq, last_peak_freq = filtered_peak_freqs[0], filtered_peak_freqs[-1]
min_window_size = sampling_rate / last_peak_freq
max_window_size = sampling_rate / first_peak_freq

print(min_window_size, max_window_size, sep="\n")

In [None]:
# Compute the moving average of every pretraining series and plot it against
# the series itself for two of them.
window_size = 10

# The window must lie strictly between the bounds computed from the peak frequencies.
if max_window_size <= window_size or  window_size <= min_window_size:
    raise ValueError(f"window_size must be greater than {min_window_size} and smaller than {max_window_size}. Current window_size is {window_size}.")

# Uniform weights: each output sample is the mean of `window_size` consecutive
# inputs ('valid' mode, so each averaged row is window_size - 1 samples shorter).
weights = np.repeat(1.0, window_size)/window_size
ma = np.array([np.convolve(d, weights, 'valid') for d in (filtered_data)])

# Slice of the series that is displayed.
END = 1500
START = 1000
DIFF = END - START

# Plot the series and its moving average for two different rows
NB_1 = 1
fig, ax = plt.subplots(3, 1, figsize=(24,18))
ax[0].plot(range(DIFF), filtered_data[NB_1, START:END], label='Time serie')
ax[0].plot(range(DIFF), ma[NB_1, START:END], label='Moving average')

NB_2 = 10
ax[1].plot(range(DIFF), filtered_data[NB_2, START:END], label='Time serie')
ax[1].plot(range(DIFF), ma[NB_2, START:END], label='Moving average')

# Check that the scaler did a good job: same row, before scaling
ax[2].plot(range(DIFF), X_pretrain_band[NB_2, START:END], label='Time serie')

# A dedicated loop variable is used so that `ax` keeps referring to the array
# of subplots after the loop.
for subplot_ax in ax:
    # Format subplot
    subplot_ax.spines['top'].set_visible(False)
    subplot_ax.spines['right'].set_visible(False)
    subplot_ax.tick_params(axis='both', labelsize=26)
    # Every subplot has labelled lines, so every subplot gets its legend.
    subplot_ax.legend(fontsize=26)
    # draw vertical bands to represent the window for some points
    for x in range(100, 500, 100):
        subplot_ax.axvspan(x, x + window_size, color='g', alpha=0.2)

plt.show()

## Data to feed to the reservoir

Here we calculate:
- **common_dimension**: the number of different dimensions (time series) in the input data
- **K**: the number of neurons that will receive a particular time series as input
- **n**: the dimension of the reservoir

n = K * common_dimension

In [None]:
import math 

def find_common_dimension(array1, array2):
    """Find an axis of ``array1`` whose length also appears in ``array2.shape``.

    When several axes of ``array1`` qualify, the one with the highest index is
    reported.

    Parameters:
        array1: array whose axes are inspected.
        array2: array whose shape provides the candidate lengths.

    Returns:
        A tuple ``(axis_index, axis_length)``, or ``(None, None)`` when the two
        arrays share no axis length.
    """
    candidate_sizes = array2.shape
    # Scan from the last axis to the first: the first hit is then the
    # highest-index match.
    for axis in reversed(range(len(array1.shape))):
        size = array1.shape[axis]
        if size in candidate_sizes:
            return axis, size

    return None, None

# Work out which axis of a training instance indexes the different time series.
if not isinstance(X_train_band, list):
    # Single instance: axis 0 of the pretraining array is used.
    common_xtrain_index = 0
    common_xtrain_size = filtered_data.shape[common_xtrain_index]
else:
    # Multiple instances -> classification: match the first instance against
    # the pretraining array.
    common_xtrain_index, common_xtrain_size = find_common_dimension(X_train_band[0], filtered_data)

print("Common dimension index is :", common_xtrain_index)
print("Number of different time series is :", common_xtrain_size)
if is_multivariate:
    print("\nCheck it ! \nFirst array ", X_train_band[0].shape, " and second array", X_train_band[1].shape)

# The reservoir must hold at least 200 units: K is the smallest number of
# copies of each time series that reaches that size.
K = math.ceil(200 / common_xtrain_size)
n = K * common_xtrain_size
print("Dimension of our reservoir :", n)
print("Copy of each time serie :", K)


# Hyperparameter search

In [None]:
from performances.esn_model_evaluation import train_and_predict_model, compute_score
from joblib import Parallel, delayed
from reservoir.reservoir import init_matrices
from connexion_generation.bounded_adsp import run_HADSP_algorithm
from connexion_generation.utility import TwoDimArrayWrapper

# Passed as the last argument of train_and_predict_model in the objective below.
# Presumably follows the joblib convention where -1 means "use all cores" — TODO confirm.
N_JOBS = -1

import optuna

def objective(trial):
    """Optuna objective: build a HADSP reservoir from sampled hyper-parameters and score it.

    The trial samples the input/bias scalings, the HADSP parameters (INCREMENT,
    VALUE, target_rate, growth_parameter), the ridge exponent and the reservoir
    size. The reservoir is then grown on the pretraining bands, a readout is
    trained on the training instances and the predictions on the test instances
    are scored.

    Parameters:
        trial: the optuna trial used to sample the hyper-parameters.

    Returns:
        The score averaged over the repeated runs.
    """
    # Suggest values for the parameters you want to optimize
    input_scaling = trial.suggest_float('input_scaling', 0.01, 1.0, step=0.01)
    bias_scaling = trial.suggest_float('bias_scaling', 0, 10, step=0.1)
    leaky_rate = 0
    connectivity = 0
    input_connectivity = 1
    # The window bounds are floats while suggest_int needs integers: use the
    # smallest integer strictly above the lower bound and the largest integer
    # strictly below the upper bound.
    increment_low = math.floor(min_window_size) + 1
    increment_high = math.ceil(max_window_size) - 1
    INCREMENT = trial.suggest_int('INCREMENT', increment_low, increment_high)
    VALUE = trial.suggest_float('VALUE', 0.01, 0.5, step=0.01)
    target_rate = trial.suggest_float('target_rate', 0.5, 1, step=0.01)
    growth_parameter = trial.suggest_float('growth_parameter', 0.01, target_rate, step=0.01)
    ridge = trial.suggest_int('ridge', -10, 1)
    RIDGE_COEF = 10**ridge

    # `step` is passed by keyword, as required by recent optuna versions.
    reservoir_size = trial.suggest_int('reservoir_size', 50, 1000, step=50)

    # Number of copies of each time series needed to reach the sampled
    # reservoir size (the sampled value was previously ignored in favour of a
    # hard-coded 200, which made this hyper-parameter a no-op).
    K = math.ceil(reservoir_size / common_xtrain_size)
    n = common_xtrain_size * K

    # We create an array of the same shape as X_pretrain_band but with the same time serie repeated K times
    frequency_bands = np.repeat(filtered_data, K, axis=0)
    frequency_bands = TwoDimArrayWrapper(frequency_bands)

    def repeat_series(instances):
        # Repeat each time series K times inside every instance; the axis to
        # repeat along depends on where the series axis sits in an instance.
        if common_xtrain_index == 1:
            return [np.repeat(instance, K, axis=1) for instance in instances]
        return [np.repeat(instance, K, axis=0).T for instance in instances]

    train_inputs = repeat_series(X_train_band)
    test_inputs = repeat_series(X_test_band)

    def initialise_and_train(input_scaling, n, input_connectivity, connectivity, bias_scaling, seed, training_set, visualize=False):
        # Initialise the matrices, grow the recurrent matrix with HADSP on
        # `training_set`, then report its resulting connectivity and spectral radius.
        Win, W, bias = init_matrices(n, input_connectivity, connectivity, seed=seed)
        bias *= bias_scaling
        Win *= input_scaling

        W, state_history = run_HADSP_algorithm(W, Win, bias, leaky_rate, activation_function, training_set, INCREMENT, VALUE,
                                target_rate, growth_parameter, max_increment=INCREMENT, visualize=visualize)

        connectivity =  W.count_nonzero() / (W.shape[0] * W.shape[1])
        eigen = sparse.linalg.eigs(W, k=1, which="LM", maxiter=W.shape[0] * 20, tol=0.1, return_eigenvectors=False)
        sr = max(abs(eigen))

        return Win, W, bias, connectivity, sr

    # NOTE(review): every repeat uses the same SEED; if the pipeline is fully
    # determined by it, the repeats are identical — consider one seed per repeat.
    n_repeats = 3
    total_score = 0
    for _ in range(n_repeats):  # Repeat the process and average the scores
        # HADSP + multi
        Win_hadsp_multi, W_hadsp_multi, bias_hadsp_multi, connectivity_band, sr_hadsp_multi = initialise_and_train(
            input_scaling, n, input_connectivity, connectivity, bias_scaling, SEED, frequency_bands
        )

        Y_pred_hadsp_multi = train_and_predict_model(
            W_hadsp_multi, Win_hadsp_multi, bias_hadsp_multi, activation_function, RIDGE_COEF, train_inputs, test_inputs, Y_train, N_JOBS
        )

        total_score += compute_score(Y_pred_hadsp_multi, Y_test, "HADSP multi", verbosity=0)

    # Return the average over the repeats, not just the score of the last run.
    average_score = total_score / n_repeats

    return average_score

# Persist the trials in a local SQLite database.
storage = optuna.storages.RDBStorage(
    url="sqlite:///optuna_hadsp_db.sqlite3",
    engine_kwargs={"pool_size": 20, "connect_args": {"timeout": 10}},
)


# Create a study object and specify the direction as 'maximize'.
# load_if_exists=True lets this cell be re-run: since the study is stored in the
# database under a fixed name, creating it a second time would otherwise fail.
# A re-run resumes the stored study and adds new trials to it.
study = optuna.create_study(direction='maximize', storage=storage, study_name="hadsp", load_if_exists=True)

# Optimize the study, the number of trials can be set as per computational resources.
study.optimize(objective, n_trials=100)

# Get the best parameters
best_params = study.best_params
print("Best parameters: ", best_params)

In [None]:
# Univariate comparison: a HADSP reservoir grown on the raw pretraining series
# versus a directly initialised reservoir, followed by a heatmap of the HADSP matrix.
# NOTE(review): in the cells visible here, `initialise_and_train` is defined
# inside `objective`, and `input_scaling`, `input_connectivity`, `connectivity`
# and `bias_scaling` are local to it; this branch needs them at module level.
# `X_pretrain` is not assigned in the visible cells either.
if not is_multivariate:
    # HADSP + uni
    Win_hadsp_uni, W_hadsp_uni, bias_hadsp_uni, connectivity_hadsp_uni, sr_hadsp_uni = initialise_and_train(input_scaling, n,  input_connectivity, connectivity, bias_scaling, SEED, X_pretrain.flatten())
    
    # random + uni
    # NOTE(review): elsewhere init_matrices is called with `seed=` as a keyword
    # after three positional arguments; here a fourth positional argument
    # (the HADSP spectral radius) is passed — confirm which parameter it binds to.
    Win_normal, W_normal, bias_normal =  init_matrices(n, 1, connectivity_hadsp_uni, sr_hadsp_uni)
    bias_normal= bias_normal*bias_scaling
    Win_normal= Win_normal*input_scaling   
    
    # Largest-magnitude eigenvalue of the matrix, i.e. its spectral radius.
    eigen_normal = sparse.linalg.eigs(W_normal, k=1, which="LM", maxiter=W_normal.shape[0] * 20, tol=0.1, return_eigenvectors=False)
    sr_normal = max(abs(eigen_normal))
    
    heatmap(W_hadsp_uni.todense(), cmap=color_palette("cividis", as_cmap=True))


### Spectral radius normalisation

In [None]:
# Print the spectral radii of the different reservoirs.
# NOTE(review): in the cells visible here, `sr_hadsp_multi` is only assigned
# inside `objective` and `sr_random_multi` is not assigned at all — confirm
# that both exist at module level before running this cell.
print(sr_hadsp_multi)
print(sr_random_multi)
if not is_multivariate:
    print(sr_normal)
    print(sr_hadsp_uni)

# Performance