In [None]:
import os
import sys
from pathlib import Path

# Add the root project folder to the Python path in order to use its packages.
# The location defaults to the original cluster path and can be overridden with the
# HADSP_ROOT environment variable, so the notebook is not tied to a single machine.
path_root = Path(os.environ.get('HADSP_ROOT', '/project_ghent/HADSP/hadsp/'))
# Guard against piling up duplicate entries when this cell is re-run in the same kernel.
if str(path_root) not in sys.path:
    sys.path.append(str(path_root))

In [None]:
import numpy as np
from scipy import sparse
from tqdm import tqdm
from importlib import reload

# SEED
# Seed NumPy's global generator so that the Gaussian noise drawn with np.random.normal
# in this notebook is the same on every "Restart & Run All".
SEED = 49387
np.random.seed(SEED)

from matplotlib import pyplot as plt
from seaborn import heatmap, color_palette

# Datasets

Many different datasets are available: https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad

Classification: 
https://arxiv.org/abs/1803.07870

https://github.com/FilippoMB/Time-series-classification-and-clustering-with-Reservoir-Computing

Multivariate:
https://www.timeseriesclassification.com/dataset.php

## Torchaudio

https://pytorch.org/audio/stable/datasets.html


## Prediction ahead

Datasets available :

* MackeyGlass
* Lorenz

# Select the prediction task and load the chosen dataset.
from datasets.load_datasets import load_dataset_prediction

is_instances_classification = False
dataset_name = "MackeyGlass"
step_ahead = 5

(
    is_multivariate,
    sampling_rate,
    X_train,
    X_test,
    Y_train,
    Y_test,
    X_pretrain,
) = load_dataset_prediction(dataset_name, step_ahead, visualize=True)


## Classification

Datasets available :

* FSDD
* HAART
* JapaneseVowels

In [None]:
# Select the classification task and load the chosen dataset.
from datasets.load_datasets import load_dataset_classification

is_instances_classification = True
dataset_name = "JapaneseVowels"

(
    is_multivariate,
    sampling_rate,
    X_train,
    X_test,
    Y_train,
    Y_test,
    X_pretrain,
) = load_dataset_classification(dataset_name)

## Multivariate generation if necessary

Spectrograms_vs_Cochleagrams : https://www.researchgate.net/publication/340510607_Speech_recognition_using_very_deep_neural_networks_Spectrograms_vs_Cochleagrams

Warning: for multivariate data, the shape should be (nb_of_timeseries, nb_of_timesteps).

In [None]:
if is_multivariate:
    # Data that is already multivariate is used directly under the "*_band" names;
    # the original names are then dropped so only one set of references remains.
    X_pretrain_band = X_pretrain
    X_train_band = X_train
    X_test_band = X_test
    del X_train, X_test, X_pretrain

In [None]:
import datasets.multivariate_generation
reload(datasets.multivariate_generation)

from datasets.multivariate_generation import extract_peak_frequencies

# Multivariate data is passed transposed, univariate data as loaded; every other
# argument of the extraction is the same in both cases.
peak_source = X_pretrain_band.T if is_multivariate else X_pretrain
filtered_peak_freqs = extract_peak_frequencies(peak_source, sampling_rate, threshold=1e-5, nperseg=1024, visualize=True)

#print("Filtered peak frequencies: ", filtered_peak_freqs)
print("Number of frequencies selected :", len(filtered_peak_freqs))

### Applying normal band pass filter on data and standardisation (inside the function)

In [None]:
from datasets.multivariate_generation import generate_multivariate_dataset, extract_peak_frequencies

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Scaler that maps values into the [0, 1] range; it is refitted by the scaling cell below.
scaler = MinMaxScaler(feature_range=(0, 1))

# Univariate data is expanded into one band per selected peak frequency.
if not is_multivariate:
    (
        X_pretrain_band,
        X_train_band,
        X_test_band,
    ) = generate_multivariate_dataset(
        filtered_peak_freqs,
        X_pretrain,
        X_train,
        X_test,
        sampling_rate,
        is_instances_classification,
        nb_jobs=-1,
    )

### Scaling the amplitudes to [0, 1] (min-max normalisation)

In [None]:
# pretrain
# Be really careful with the column order here: the scaler works column by column,
# hence the transposes around the call.
filtered_data = scaler.fit_transform(X_pretrain_band.T).T


if is_instances_classification:
    print("processing for Classification")
    # Every instance is rescaled on its own: the scaler is refitted for each of them.
    X_train_band, X_test_band = (
        [scaler.fit_transform(instance) for instance in tqdm(instances)]
        for instances in (X_train_band, X_test_band)
    )

    if not is_multivariate:
        # train, then test
        X_train, X_test = (
            [scaler.fit_transform(instance).flatten() for instance in tqdm(instances)]
            for instances in (X_train, X_test)
        )
else:
    print("processing for Prediction")
    train_len = X_train_band.shape[0]

    # Targets: the scaler is fitted on train and test together, then the result is split back.
    concatenated_Y = np.concatenate([Y_train, Y_test])
    standardized_Y = scaler.fit_transform(concatenated_Y)
    Y_train, Y_test = standardized_Y[:train_len], standardized_Y[train_len:]

    # FOR MULTIVARIATE DATA
    # Each entry along the transposed leading axis is rescaled separately.
    concatenated_X_band = np.concatenate([X_train_band, X_test_band])
    standardized_X_band = [
        scaler.fit_transform(timeseries.reshape(-1, 1))
        for timeseries in concatenated_X_band.T
    ]
    standardized_X_band = np.array(standardized_X_band).reshape(concatenated_X_band.T.shape).T
    X_train_band, X_test_band = standardized_X_band[:train_len], standardized_X_band[train_len:]

    if not is_multivariate:
        concatenated_X = np.concatenate([X_train.flatten(), X_test.flatten()])
        standardized_X = scaler.fit_transform(concatenated_X.reshape(-1, 1))
        X_train, X_test = standardized_X[:train_len, :], standardized_X[train_len:, :]


## Plot pretraining dataset

In [None]:
# Min window size to get all the dynamics ?
# Window bounds are the sampling rate divided by the highest / lowest selected peak frequency.
all_peak_freqs = np.hstack(filtered_peak_freqs)
min_window_size = sampling_rate / np.max(all_peak_freqs)
max_window_size = sampling_rate / np.min(all_peak_freqs)

print(min_window_size)
print(max_window_size)

In [None]:
# Compute the moving average
window_size = 10

# The window has to lie strictly between the two bounds derived from the peak frequencies.
if max_window_size <= window_size or window_size <= min_window_size:
    raise ValueError(f"window_size must be greater than {min_window_size} and smaller than {max_window_size}. Current window_size is {window_size}.")

# Uniform weights; with 'valid' mode each output sample is the mean of one full window.
weights = np.repeat(1.0, window_size)/window_size
ma = np.array([np.convolve(d, weights, 'valid') for d in filtered_data])

END = 1500
START = 1000
DIFF = END - START
# Plot the two for different frequencies
NB_1 = 1
# The array of subplots is kept under `axes`; the formatting loop below uses its own
# variable so the array is not overwritten by a single Axes object.
fig, axes = plt.subplots(3, 1, figsize=(24,18))
axes[0].plot(range(DIFF), filtered_data[NB_1, START:END], label='Time serie')
axes[0].plot(range(DIFF), ma[NB_1, START:END], label='Moving average')

NB_2 = 2
axes[1].plot(range(DIFF), filtered_data[NB_2, START:END], label='Time serie')
axes[1].plot(range(DIFF), ma[NB_2, START:END], label='Moving average')

# Check that the scaler did a good job (this is the not scaled version)
axes[2].plot(range(DIFF), X_pretrain_band[NB_2, START:END], label='Time serie')

for subplot in axes:
    # Format subplot
    subplot.spines['top'].set_visible(False)
    subplot.spines['right'].set_visible(False)
    subplot.tick_params(axis='both', labelsize=26)
    # Every panel has labelled curves, so every panel gets a legend.
    subplot.legend(fontsize=26)
    # draw vertical lines to represent the window for some points
    for x in range(100, 500, 100):
        subplot.axvspan(x, x + window_size, color='g', alpha=0.2)

plt.show()

## Find common dimension

In [None]:
def find_common_dimension(array1, array2):
    """Find an axis of ``array1`` whose length also occurs in ``array2.shape``.

    When several axes of ``array1`` qualify, the last one is reported.

    Returns:
        A pair ``(axis_index, axis_length)``, or ``(None, None)`` when no axis matches.
    """
    other_dims = array2.shape
    matches = [(axis, size) for axis, size in enumerate(array1.shape) if size in other_dims]
    if not matches:
        return None, None
    return matches[-1]

if not isinstance(X_train_band, list):
    # Single array -> prediction: the first axis of the pretraining data is used.
    common_xtrain_index = 0
    common_xtrain_size = filtered_data.shape[common_xtrain_index]
else:
    # Multiple instances -> classification: look the axis up on the first instance.
    common_xtrain_index, common_xtrain_size = find_common_dimension(X_train_band[0], filtered_data)

print("Common dimension index is :", common_xtrain_index)
print("Number of different time series is :", common_xtrain_size)
if is_multivariate:
    first_shape, second_shape = X_train_band[0].shape, X_train_band[1].shape
    print("\nCheck it ! \nFirst array ", first_shape, " and second array", second_shape)


# Generating reservoirs

## Reservoir functions

In [None]:
from reservoir.activation_functions import tanh, heaviside, sigmoid

# Activation function used for the rest of the experiment
# activation_function = lambda x : sigmoid(2*(x-0.5))
activation_function = lambda x : tanh(x)

# Quick visual check of the activation on the interval [0, 1.1]
sample_inputs = np.linspace(0, 1.1, 100)
plt.plot(sample_inputs, activation_function(sample_inputs))
plt.grid()

## Timeseries duplications to adapt to reservoir size

Here we calculate:

* **common_dimension**: the number of different dimensions (time series) in the input data
* **K**: the number of neurons that will receive a particular time series as input
* **n**: the dimension of the reservoir

n = K * common_dimension

In [None]:
import math 

RESERVOIR_SIZE = 500

# Every time series is copied K times; rounding K up makes the reservoir
# dimension n at least RESERVOIR_SIZE.
K = math.ceil(RESERVOIR_SIZE / common_xtrain_size)
n = K * common_xtrain_size
print("Dimension of our reservoir :", n)
print("Copy of each time serie :", K)


## Datasets formatting and noise

In [None]:
# Define noise parameter
# Standard deviation of the zero-mean Gaussian noise (np.random.normal) added to the
# pretrain, train and test data in the cells below.
noise_std = 0.001


### Pretrain

In [None]:
# PRETRAIN
# Each row of the scaled pretraining data receives its own Gaussian perturbation.
filtered_data_noisy = [
    series + np.random.normal(0, noise_std, series.shape)
    for series in filtered_data
]

if not is_multivariate:
    pretrain_noise = np.random.normal(0, noise_std, X_pretrain.shape)
    X_pretrain_noisy = X_pretrain + pretrain_noise


In [None]:
from connexion_generation.utility import TwoDimArrayWrapper

# Every row (time series) of the pretraining data is repeated K times along axis 0,
# once for the clean data and once for the noisy data, then wrapped.
pretrain_rows_clean = np.repeat(filtered_data, K, axis=0)
pretrain_rows_noisy = np.repeat(filtered_data_noisy, K, axis=0)
X_pretrain_multi = TwoDimArrayWrapper(pretrain_rows_clean)
X_pretrain_multi_noisy = TwoDimArrayWrapper(pretrain_rows_noisy)

if not is_multivariate:
    # Flattened univariate counterparts (clean and noisy)
    X_pretrain_uni, X_pretrain_uni_noisy = X_pretrain.flatten(), X_pretrain_noisy.flatten()

### For classification

#### Add noise

In [None]:
if is_instances_classification:
    # Noisy copies of every instance: independent Gaussian noise with the instance's shape.
    # TRAIN
    X_train_band_noisy = [
        sample + np.random.normal(0, noise_std, sample.shape)
        for sample in tqdm(X_train_band)
    ]

    # TEST
    X_test_band_noisy = [
        sample + np.random.normal(0, noise_std, sample.shape)
        for sample in tqdm(X_test_band)
    ]


#### Duplicate

In [None]:
if is_instances_classification:
    # The copies are made along axis 1, so the common dimension has to be axis 1.
    if common_xtrain_index != 1:
        raise ValueError("The data formating is not correct.")

    # TRAIN: every column of each instance is repeated K times (clean and noisy versions).
    X_train_band_duplicated, X_train_band_noisy_duplicated = [], []
    for idx in tqdm(range(len(X_train_band))):
        X_train_band_duplicated.append(np.repeat(X_train_band[idx], K, axis=1))
        X_train_band_noisy_duplicated.append(np.repeat(X_train_band_noisy[idx], K, axis=1))

    # TEST: same treatment.
    X_test_band_duplicated, X_test_band_noisy_duplicated = [], []
    for idx in tqdm(range(len(X_test_band))):
        X_test_band_duplicated.append(np.repeat(X_test_band[idx], K, axis=1))
        X_test_band_noisy_duplicated.append(np.repeat(X_test_band_noisy[idx], K, axis=1))

    # Drop the references to the non-duplicated data.
    X_test_band = X_test_band_noisy = X_train_band = X_train_band_noisy = None


### For prediction

#### Add noise

In [None]:
if not is_instances_classification:
    # UNI
    if not is_multivariate:
        X_train_noisy = X_train + np.random.normal(0, noise_std, X_train.shape)
        X_test_noisy = X_test + np.random.normal(0, noise_std, X_test.shape)

    # MULTI: noise is drawn entry by entry along the first axis, train first, then test.
    X_train_band_noisy = np.array([
        step + np.random.normal(loc=0, scale=noise_std, size=step.shape)
        for step in X_train_band
    ])

    X_test_band_noisy = np.array([
        step + np.random.normal(loc=0, scale=noise_std, size=step.shape)
        for step in X_test_band
    ])


#### Duplicate

In [None]:
if not is_instances_classification: #if prediction
    # Same treatment for the four datasets: squeeze, then repeat every column K times.
    (
        X_train_band_duplicated,
        X_test_band_duplicated,
        X_train_band_noisy_duplicated,
        X_test_band_noisy_duplicated,
    ) = (
        np.repeat(np.squeeze(np.array(bands)), K, axis=1)
        for bands in (X_train_band, X_test_band, X_train_band_noisy, X_test_band_noisy)
    )


# Hyperparameter search

## Generated matrix

In [None]:
from performances.esn_model_evaluation import init_and_train_model_for_classification, predict_model_for_classification, compute_score
from joblib import Parallel, delayed
from reservoir.reservoir import init_matrices
from connexion_generation.bounded_hadsp import run_hadsp_algorithm
from connexion_generation.utility import TwoDimArrayWrapper
from performances.esn_model_evaluation import init_and_train_model_for_prediction
from connexion_generation.desp import run_desp_algorithm
import connexion_generation.desp
reload(connexion_generation.desp)

# Worker count handed to the classification train/predict helpers
# (presumably joblib's "all cores" convention — TODO confirm in the helpers).
N_JOBS = -1

# TO CALCULATE SCORE FOR PREDICTION
# NOTE(review): slice_range is not referenced by the objective defined in this cell.
START_STEP = 0
END_STEP = 500
slice_range = slice(START_STEP, END_STEP)

# Algorithm used to build the reservoir matrix, and which test data the objective scores on.
function_name = "hadsp" # "desp" or "hadsp"
data_type = "noisy" # "normal" or "noisy"

def objective(trial):
    """Optuna objective for a reservoir whose recurrent matrix is built by HADSP or DESP.

    Hyperparameters are sampled from ``trial``. The notebook-level ``function_name``
    selects the algorithm. The matrix is built on the pretraining data, the model is
    trained on the clean training data and scored on the test data (the noisy copy
    when ``data_type == "noisy"``). The whole procedure is repeated ``average_nb`` times.

    Args:
        trial: the Optuna trial the hyperparameters are sampled from.

    Returns:
        The mean of ``compute_score`` over the repetitions.

    Raises:
        ValueError: if ``function_name`` is neither "hadsp" nor "desp".
    """
    # Suggest values for the parameters you want to optimize
    # COMMON
    input_scaling = trial.suggest_float('input_scaling', 0.01, 0.2, step=0.01)
    bias_scaling = trial.suggest_float('bias_scaling', 0, 0.2, step=0.01)
    # Ranges with identical bounds pin the value while still sampling it through the trial.
    leaky_rate = trial.suggest_float('leaky_rate', 1, 1)
    connectivity = trial.suggest_float('connectivity', 0, 0)
    input_connectivity = trial.suggest_float('input_connectivity', 1, 1)
    # Not used below; presumably sampled only so it is recorded with the trial.
    network_size = trial.suggest_float('network_size', RESERVOIR_SIZE, RESERVOIR_SIZE)
    weight_increment = trial.suggest_float('weight_increment', 0.01, 0.5, step=0.01)

    # The ridge coefficient is searched on a log10 scale.
    ridge = trial.suggest_int('ridge', -10, 1)
    RIDGE_COEF = 10**ridge

    # HADSP
    if function_name == "hadsp":
        target_rate = trial.suggest_float('target_rate', 0.5, 1, step=0.01)
        rate_spread = trial.suggest_float('rate_spread', 0.01, 0.4, step=0.01)
        TIME_INCREMENT = int(min_window_size+1) # int(min_window_size+1) or int(max_window_size)
        MAX_TIME_INCREMENT = int(max_window_size) #int(max_window_size) or None or TIME_INCREMENT
    # DESP
    else:
        min_variance = trial.suggest_float('min_variance', 0.001, 0.01, step=0.001)
        variance_window = trial.suggest_float('variance_window', 0.01, 0.02, step=0.005)
        max_variance = min_variance + variance_window
        TIME_INCREMENT = 100 # int(min_window_size+1) or int(max_window_size)
        MAX_TIME_INCREMENT = TIME_INCREMENT #int(max_window_size) or None or TIME_INCREMENT

    pretrain_data_multi = X_pretrain_multi
    train_data_multi = X_train_band_duplicated # X_train_band_noisy_duplicated or X_train_band_duplicated
    test_data_multi = X_test_band_noisy_duplicated if data_type == "noisy" else X_test_band_duplicated

    def initialise_and_train(input_scaling, n, input_connectivity, connectivity, bias_scaling, training_set, visualize=False):
        """Create scaled initial matrices and build W with the selected algorithm.

        Returns the input matrix, the recurrent matrix and the bias.
        """
        Win, W, bias = init_matrices(n, input_connectivity, connectivity)
        bias *= bias_scaling
        Win *= input_scaling

        if function_name == "hadsp":
            W, state_history = run_hadsp_algorithm(W, Win, bias, leaky_rate, activation_function, training_set, TIME_INCREMENT, weight_increment,
                                    target_rate, rate_spread, max_increment=MAX_TIME_INCREMENT, mi_based=False, visualize=visualize)
        elif function_name == "desp":
            W, state_history, _ = run_desp_algorithm(W, Win, bias, leaky_rate, activation_function, training_set, TIME_INCREMENT, weight_increment,
                        min_variance, max_variance, max_increment=MAX_TIME_INCREMENT, mi_based=True, n_jobs = 1, visualize=visualize)
        else:
            # Report the offending selector (the original formatted an undefined name here).
            raise ValueError(f"Invalid function: {function_name}")

        return Win, W, bias

    total_score = 0
    average_nb = 3
    for _ in range(average_nb):  # Repeat the process three times
        # HADSP + multi
        (Win_hadsp_multi,
         W_hadsp_multi,
         bias_hadsp_multi,
         ) = initialise_and_train(input_scaling, n, input_connectivity, connectivity, bias_scaling, pretrain_data_multi)

        if is_instances_classification:
            reservoir_hadsp_multi, readout_hadsp_multi = init_and_train_model_for_classification(W_hadsp_multi, np.diag(Win_hadsp_multi.A.T[0]), bias_hadsp_multi, leaky_rate, activation_function, train_data_multi, Y_train, N_JOBS, RIDGE_COEF, mode="sequence-to-vector")
            Y_pred = predict_model_for_classification(reservoir_hadsp_multi, readout_hadsp_multi, test_data_multi, N_JOBS)
        else:
            esn_hadsp_multi = init_and_train_model_for_prediction(W_hadsp_multi, np.diag(Win_hadsp_multi.A.T[0]), bias_hadsp_multi, leaky_rate, activation_function, train_data_multi, Y_train, RIDGE_COEF)
            Y_pred = esn_hadsp_multi.run(test_data_multi, reset=False)

        # Both task types are scored with the same call.
        total_score += compute_score(Y_pred, Y_test, is_instances_classification)

    average_score = total_score / average_nb  # Average the score

    return average_score


In [None]:
import optuna
from optuna.samplers import TPESampler
import re

def camel_to_snake(name):
    """Convert a CamelCase identifier to snake_case (e.g. "JapaneseVowels" -> "japanese_vowels")."""
    # First pass: break before a capital letter that is followed by lowercase letters.
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Second pass: break between a lowercase letter or digit and the capital that follows it.
    with_all_breaks = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return with_all_breaks.lower()

# One SQLite database per dataset; the study name combines algorithm, dataset and test variant.
storage_url = f"sqlite:///optuna_{camel_to_snake(dataset_name)}_db.sqlite3"
storage = optuna.storages.RDBStorage(
    url=storage_url,
    engine_kwargs={"pool_size": 20, "connect_args": {"timeout": 10}},
)
study_name = f"{function_name}_{dataset_name}_{data_type}"
# The classification score is maximised, the prediction score is minimised.
if is_instances_classification:
    direction = "maximize"
else:
    direction = "minimize"
sampler = TPESampler()

def optimize_study(n_trials):
    """Create the study, or load it if it already exists, and run ``n_trials`` trials of ``objective``."""
    shared_study = optuna.create_study(
        study_name=study_name,
        storage=storage,
        sampler=sampler,
        direction=direction,
        load_if_exists=True,
    )
    shared_study.optimize(objective, n_trials=n_trials)

N_TRIALS = 400
n_jobs = 10
# Every worker runs an equal share of the trials.
trials_per_process = N_TRIALS // n_jobs

# Use joblib to parallelize the optimization
worker_tasks = (delayed(optimize_study)(trials_per_process) for _ in range(n_jobs))
Parallel(n_jobs=n_jobs)(worker_tasks)


## Random matrix

In [None]:
from performances.esn_model_evaluation import init_and_train_model_for_classification, predict_model_for_classification, compute_score
from joblib import Parallel, delayed
from reservoir.reservoir import init_matrices
from connexion_generation.bounded_hadsp import run_hadsp_algorithm
from connexion_generation.utility import TwoDimArrayWrapper
from performances.esn_model_evaluation import init_and_train_model_for_prediction

# Worker count passed to the classification train/predict helpers in the objective below.
N_JOBS = -1

def objective(trial):
    """Optuna objective for the baseline reservoir with a random recurrent matrix.

    Hyperparameters are sampled from ``trial``, random matrices are drawn with
    ``init_matrices``, and the model is trained on the clean training data and scored
    on the test data (the noisy copy when ``data_type == "noisy"``). The procedure is
    repeated ``average_nb`` times.

    Args:
        trial: the Optuna trial the hyperparameters are sampled from.

    Returns:
        The mean of ``compute_score`` over the repetitions.
    """
    # Suggest values for the parameters you want to optimize
    input_scaling = trial.suggest_float('input_scaling', 0.01, 1.0, step=0.01)
    bias_scaling = trial.suggest_float('bias_scaling', 0, 1, step=0.05)
    # Ranges with identical bounds pin the value while still sampling it through the trial.
    leaky_rate = trial.suggest_float('leaky_rate', 1, 1)
    connectivity = trial.suggest_float('connectivity', 0, 1)
    input_connectivity = trial.suggest_float('input_connectivity', 1, 1)
    # Not used below; presumably sampled only so it is recorded with the trial.
    network_size = trial.suggest_float('network_size', RESERVOIR_SIZE, RESERVOIR_SIZE)
    sr = trial.suggest_float('spectral_radius', 0.4, 1.6, step=0.01)
    # The ridge coefficient is searched on a log10 scale.
    ridge = trial.suggest_int('ridge', -10, 1)
    RIDGE_COEF = 10**ridge

    train_data_multi = X_train_band_duplicated # X_train_band_noisy_duplicated or X_train_band_duplicated
    test_data_multi = X_test_band_noisy_duplicated if data_type == "noisy" else X_test_band_duplicated

    total_score = 0
    average_nb = 3
    for _ in range(average_nb):  # Repeat the process three times

        # random + multi
        # The sampled input connectivity is passed on (it used to be a hard-coded 1),
        # as the HADSP/DESP objective does.
        Win_random_multi, W_random_multi, bias_random_multi = init_matrices(n, input_connectivity, connectivity, sr)
        bias_random_multi = bias_random_multi*bias_scaling
        Win_random_multi = Win_random_multi*input_scaling

        if is_instances_classification:
            reservoir_random_multi, readout_random_multi = init_and_train_model_for_classification(W_random_multi, np.diag(Win_random_multi.A.T[0]), bias_random_multi, leaky_rate, activation_function, train_data_multi, Y_train, N_JOBS, RIDGE_COEF, mode="sequence-to-vector")
            Y_pred = predict_model_for_classification(reservoir_random_multi, readout_random_multi, test_data_multi, N_JOBS)
        else:
            esn_random_multi = init_and_train_model_for_prediction(W_random_multi, np.diag(Win_random_multi.A.T[0]), bias_random_multi, leaky_rate, activation_function, train_data_multi, Y_train, RIDGE_COEF)
            Y_pred = esn_random_multi.run(test_data_multi, reset=False)

        # Both task types are scored with the same call.
        total_score += compute_score(Y_pred, Y_test, is_instances_classification)

    average_score = total_score / average_nb  # Average the score

    return average_score


In [None]:
import optuna
from optuna.samplers import TPESampler
import re

def camel_to_snake(name):
    """Turn a CamelCase name into its lowercase, underscore-separated form."""
    snake = name
    # Insert an underscore at capitalised-word boundaries first, then between a
    # lowercase letter or digit and the capital that follows it.
    for pattern in ('(.)([A-Z][a-z]+)', '([a-z0-9])([A-Z])'):
        snake = re.sub(pattern, r'\1_\2', snake)
    return snake.lower()

# Same per-dataset SQLite database as the other search; the study name marks the random baseline.
storage_url = f"sqlite:///optuna_{camel_to_snake(dataset_name)}_db.sqlite3"
storage = optuna.storages.RDBStorage(
    url=storage_url,
    engine_kwargs={"pool_size": 20, "connect_args": {"timeout": 10}},
)
study_name = f"random_{dataset_name}_{data_type}"
# The classification score is maximised, the prediction score is minimised.
if is_instances_classification:
    direction = "maximize"
else:
    direction = "minimize"
sampler = TPESampler()


def optimize_study(n_trials):
    """Create the study, or load it if it already exists, and run ``n_trials`` trials of ``objective``.

    Args:
        n_trials: number of trials this call contributes to the study.
    """
    # storage and sampler are passed by keyword: optuna.create_study takes keyword-only
    # arguments in current Optuna releases.
    study = optuna.create_study(storage=storage, sampler=sampler, study_name=study_name, direction=direction, load_if_exists=True)
    study.optimize(objective, n_trials=n_trials)


N_TRIALS = 400
n_jobs = 10
# Every worker runs an equal share of the trials.
trials_per_process = N_TRIALS // n_jobs

# Use joblib to parallelize the optimization
worker_tasks = (delayed(optimize_study)(trials_per_process) for _ in range(n_jobs))
Parallel(n_jobs=n_jobs)(worker_tasks)


# Results



## Mackey Glass
### Ridge parameter
| Dataset     | Algorithm | test   | Curve shape | Best value |
|-------------|-----------|--------|-------------|------------|
| MackeyGlass | DESP      | normal | no extremum | 10         |
| MackeyGlass | HADSP     | normal | no extremum | 9,10       |
| MackeyGlass | random    | normal | no extremum | 9,10       |
| MackeyGlass | DESP      | noisy  | bell        | 7          |
| MackeyGlass | HADSP     | noisy  | bell        | 6          |
| MackeyGlass | random    | noisy  | no extremum | 10         |

### Results, no validation/test

|           |  HADSP |  DESP  | random |
|-----------|--------|--------|--------|
|  normal   | 0.0399 | 0.0379 | 0.0446 |
|  noisy    | 0.119  | 0.109  | 0.104  | 


## Japanese vowels

### Ridge parameter

| Dataset        | Algorithm | test   | Curve shape | Best value |
|----------------|-----------|--------|-------------|------------|
| JapaneseVowels | DESP      | normal | bell        | 2          |
| JapaneseVowels | HADSP     | normal | bell        | 4          |
| JapaneseVowels | random    | normal | bell        | 4          |
| JapaneseVowels | DESP      | noisy  | bell        | 2          |
| JapaneseVowels | HADSP     | noisy  | bell        | 5          |
| JapaneseVowels | random    | noisy  | bell        | 5          |

### Results, no validation/test

|           |  HADSP |  DESP  | random |
|-----------|--------|--------|--------|
|  normal   | 0.641  | 0.649  | 0.612  |
|  noisy    | 0.621  | 0.623  | 0.610  | 