In [None]:
import numpy as np
from joblib import Parallel, delayed
from tqdm import tqdm

from matplotlib import pyplot as plt

# Seed value, presumably meant for the notebook's random number generation.
# NOTE(review): confirm that every stochastic step actually receives it.
SEED = 923984

# Datasets loading

Many different audio datasets are available: https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad

Regression : http://tseregression.org/ + https://arxiv.org/pdf/2012.02974

## Prediction ahead

Datasets available :

* MackeyGlass
* Lorenz
* Sunspot

In [None]:
from datasets.load_datasets import load_dataset_prediction

# Task configuration: prediction `step_ahead` steps ahead on the chosen dataset.
dataset_name = "Sunspot"
step_ahead = 5
is_instances_classification = False

# Spectral options consumed by the preprocessing cell.
spectral_representation = None  # Can be None, "stft" or "mfcc"
use_spectral_representation = False

# Load the raw train/test splits together with the dataset metadata.
(
    is_multivariate,
    sampling_rate,
    X_train_raw,
    X_test_raw,
    Y_train_raw,
    Y_test,
) = load_dataset_prediction(dataset_name, step_ahead, visualize=True)


## Classification

Datasets available :

* Custom :  FSDD, HAART, JapaneseVowels
* Aeon : SpokenArabicDigits, CatsDogs, LSST
* Torchaudio: SPEECHCOMMANDS

More on https://www.timeseriesclassification.com/dataset.php or https://pytorch.org/audio/stable/datasets.html

# Activation function

In [None]:
from reservoir.activation_functions import tanh, heaviside, sigmoid


def activation_function(x):
    """Activation function chosen for the rest of the experiment (plain ``tanh``).

    Alternative kept for reference: ``sigmoid(2 * (x - 0.5))``.
    """
    return tanh(x)


# Visual sanity check of the chosen activation on [0, 3].
activation_inputs = np.linspace(0, 3, 100)
plt.plot(activation_inputs, activation_function(activation_inputs))
plt.xlabel("x")
plt.ylabel("activation_function(x)")
plt.title("Activation function")
plt.grid()
plt.show()

# Preprocessing

In [None]:
import math 
 
# Cross validation
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, StratifiedGroupKFold
from datasets.preprocessing import flexible_indexing

#Preprocessing
from datasets.multivariate_generation import generate_multivariate_dataset, extract_peak_frequencies
from sklearn.preprocessing import MinMaxScaler
from datasets.preprocessing import scale_data
from datasets.preprocessing import add_noise, duplicate_data

# Standard deviation handed to add_noise when data_type == "noisy"
noise_std = 0.001

data_type = "normal"  # "normal" or "noisy"

WINDOW_LENGTH = 10

# --- Working aliases --------------------------------------------------------
# The *_raw names are intentionally kept (no `del`): deleting a name that is
# still aliased frees no memory, and it made this cell fail with a NameError
# when executed a second time.
if is_multivariate:
    X_train_band, X_test_band = X_train_raw, X_test_raw
    X_val_band = None
else:
    X_test, X_train = X_test_raw, X_train_raw
    X_val, X_val_band = None, None
Y_train = Y_train_raw

# --- Peak frequency extraction ----------------------------------------------
# For instance classification the instances are concatenated along axis 0
# before the spectrum is analysed.
freq_train_data = X_train_band if is_multivariate else X_train
flat_train_data = np.concatenate(freq_train_data, axis=0) if is_instances_classification else freq_train_data

peak_extraction_kwargs = dict(smooth=True, window_length=WINDOW_LENGTH, threshold=1e-5, nperseg=1024)
# First call only displays the figure; the second one provides the values used below.
extract_peak_frequencies(flat_train_data, sampling_rate, visualize=True, **peak_extraction_kwargs)
peak_freqs = extract_peak_frequencies(flat_train_data, sampling_rate, visualize=False, **peak_extraction_kwargs)

# --- Multivariate (band) representation -------------------------------------
if is_multivariate and use_spectral_representation:
    print("Data is already spectral, nothing to do")
else:
    # Univariate data is expanded from X_train / X_test; multivariate data is
    # re-generated from its own band variables.
    band_train_source, band_test_source = (X_train_band, X_test_band) if is_multivariate else (X_train, X_test)
    X_train_band = generate_multivariate_dataset(
        band_train_source, sampling_rate, is_instances_classification, peak_freqs, spectral_representation, hop=100
    )
    X_test_band = generate_multivariate_dataset(
        band_test_source, sampling_rate, is_instances_classification, peak_freqs, spectral_representation, hop=100
    )

# --- Scaling to [0, 1] -------------------------------------------------------
scaler_multi = MinMaxScaler(feature_range=(0, 1))
X_train_band, X_val_band, X_test_band = scale_data(X_train_band, X_val_band, X_test_band, scaler_multi, is_instances_classification)

if not is_multivariate:
    # Use the dedicated univariate scaler: the original passed `scaler_multi`
    # here, leaving `scaler_x_uni` unused and handing the band scaler data
    # with a different number of features.
    scaler_x_uni = MinMaxScaler(feature_range=(0, 1))
    X_train, X_val, X_test = scale_data(X_train, X_val, X_test, scaler_x_uni, is_instances_classification)

# --- Optional noisy copies ---------------------------------------------------
if data_type == "noisy":
    if is_instances_classification:
        # Classification: noise is added instance by instance.
        if not is_multivariate:
            X_train_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_train, desc="TRAIN")]
            X_test_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_test, desc="TEST")]

        X_train_band_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_train_band, desc="TRAIN")]
        X_test_band_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_test_band, desc="TEST")]
    else:
        # Prediction: the whole series is passed to add_noise at once.
        if not is_multivariate:
            X_train_noisy = add_noise(X_train, noise_std)
            X_test_noisy = add_noise(X_test, noise_std)

        X_train_band_noisy = add_noise(X_train_band, noise_std)
        X_test_band_noisy = add_noise(X_test_band, noise_std)

# --- Pretraining subset ------------------------------------------------------
# At most 500 instances are used for pretraining.
x_size = len(X_train_band) if is_multivariate else len(X_train)
num_samples_for_pretrain = min(500, x_size)
if is_instances_classification:
    # Seeded draw so the same pretraining subset is selected on every run.
    indices = np.random.default_rng(SEED).choice(x_size, num_samples_for_pretrain, replace=False)
else:
    # Prediction: keep every index, in order.
    indices = np.arange(x_size)

if data_type == "noisy":
    if not is_multivariate:
        X_pretrain_noisy = np.array(X_train_noisy)[indices]
    X_pretrain_band_noisy = np.array(X_train_band_noisy)[indices]

if not is_multivariate:
    X_pretrain_uni = np.array(X_train)[indices]
X_pretrain_band = np.array(X_train_band)[indices]

# Test data evaluation

In [None]:
from scipy import sparse, stats
from numpy import random
import os

#Pretraining
from reservoir.reservoir import init_matrices
from connexion_generation.hag import run_algorithm

# Evaluating
from performances.esn_model_evaluation import init_and_train_model_for_classification, predict_model_for_classification, compute_score
from performances.esn_model_evaluation import init_and_train_model_for_prediction

import optuna
from performances.utility import camel_to_snake

# Results are stored in a task-specific CSV file (classification vs prediction).
file_name = (
    "test_results_classification.csv"
    if is_instances_classification
    else "test_results_prediction.csv"
)
    

def evaluate_dataset_on_test(dataset_name, function_name, data_type):
    """Score the best Optuna configuration of one generation method on the test set.

    The best trial of the study
    ``<function_name>_<dataset_name>_<data_type>_<variate_type>`` is loaded from
    ``optuna_<snake_case dataset name>_db.sqlite3``. A reservoir is then
    initialised, adapted according to ``function_name``, trained and scored
    ``nb_trials`` times with a different random initialisation each time.

    Besides its arguments the function reads these notebook-level globals, which
    must exist before the call: ``variate_type``, ``is_instances_classification``,
    ``peak_freqs``, ``activation_function``, ``X_pretrain_band``,
    ``X_train_band``, ``X_test_band``, ``Y_train`` and ``Y_test``.

    Parameters
    ----------
    dataset_name : str
        Dataset name; used for the database file name and the study name.
    function_name : str
        One of ``"hadsp"``, ``"desp"``, ``"random"`` or ``"random_ei"``.
    data_type : str
        Only used to build the study name.
        NOTE(review): the non-noisy band data is used whatever this value is —
        confirm this is intended for ``"noisy"``.

    Returns
    -------
    list
        One score per trial, as returned by ``compute_score``.

    Raises
    ------
    ValueError
        If ``function_name`` is not one of the supported methods.
    """
    # Load the best trial of the matching study
    url = "sqlite:///optuna_" + camel_to_snake(dataset_name) + "_db.sqlite3"
    study_name = function_name + "_" + dataset_name + "_" + data_type + "_" + variate_type
    study = optuna.load_study(study_name=study_name, storage=url)
    hyperparams = dict(study.best_trial.params)

    # Studies that stored 'min_variance' instead of 'variance_target'
    if 'variance_target' not in hyperparams and 'min_variance' in hyperparams:
        hyperparams['variance_target'] = hyperparams['min_variance']

    # Always bind use_full_instance. It used to be assigned only for prediction
    # tasks, which made it an unbound local (UnboundLocalError) as soon as
    # hadsp/desp was evaluated on a classification dataset.
    # NOTE(review): for classification the value is taken from the tuned
    # hyperparameters when present, otherwise None — confirm the expected source.
    use_full_instance = hyperparams.get('use_full_instance') if is_instances_classification else None

    nb_trials = 8
    nb_jobs = 8
    ridge_coef = 10 ** hyperparams['ridge']

    uses_pretraining = function_name in ("hadsp", "desp")
    if uses_pretraining:
        max_time_increment = hyperparams['time_increment'] + hyperparams['time_increment_span']

    # Reservoir size: the smallest multiple of the input dimension that is at
    # least `network_size`. It does not depend on the trial, so it is computed once.
    if variate_type == "multi":
        reference = X_train_band[0] if is_instances_classification else X_train_band
        common_size = reference.shape[1]
    else:
        common_size = len(peak_freqs)
    K = math.ceil(hyperparams['network_size'] / common_size)
    n = common_size * K

    pretrain_data = X_pretrain_band
    train_data = X_train_band
    test_data = X_test_band

    scores = []
    for _ in range(nb_trials):
        # RANDOM INITIALISATION
        init_seed = random.randint(0, 1000)
        if function_name == "random_ei":
            # NOTE(review): scipy's uniform(loc, scale) samples [loc, loc + scale],
            # so uniform(-1, 1) draws from [-1, 0], not [-1, 1]. Left unchanged to
            # stay consistent with the tuned hyperparameters — please confirm.
            Win, W, bias = init_matrices(n, hyperparams['input_connectivity'], hyperparams['connectivity'], K,
                                         w_distribution=stats.uniform(-1, 1), seed=init_seed)
        else:
            Win, W, bias = init_matrices(n, hyperparams['input_connectivity'], hyperparams['connectivity'], K,
                                         seed=init_seed)
        bias *= hyperparams['bias_scaling']
        Win *= hyperparams['input_scaling']

        # UNSUPERVISED PRETRAINING (hadsp / desp) or spectral-radius rescaling (random*)
        if function_name == "hadsp":
            W, (_, _, _) = run_algorithm(W, Win, bias, hyperparams['leaky_rate'], activation_function, pretrain_data,
                                         hyperparams['time_increment'], hyperparams['weight_increment'],
                                         hyperparams['target_rate'], hyperparams['rate_spread'], function_name,
                                         is_instance=is_instances_classification, use_full_instance=use_full_instance,
                                         max_increment=max_time_increment, max_partners=hyperparams['max_partners'],
                                         method=hyperparams['method'], n_jobs=nb_jobs)
        elif function_name == "desp":
            W, (_, _, _) = run_algorithm(W, Win, bias, hyperparams['leaky_rate'], activation_function, pretrain_data,
                                         hyperparams['time_increment'], hyperparams['weight_increment'],
                                         hyperparams['variance_target'], hyperparams['variance_spread'], function_name,
                                         is_instance=is_instances_classification, use_full_instance=use_full_instance,
                                         max_increment=max_time_increment, max_partners=hyperparams['max_partners'],
                                         method=hyperparams['method'],
                                         intrinsic_saturation=hyperparams['intrinsic_saturation'],
                                         intrinsic_coef=hyperparams['intrinsic_coef'], n_jobs=nb_jobs)
        elif function_name == "random" or function_name == "random_ei":
            # Rescale W so that its largest-magnitude eigenvalue matches the tuned spectral radius
            eigen = sparse.linalg.eigs(W, k=1, which="LM", maxiter=W.shape[0] * 20, tol=0.1, return_eigenvectors=False)
            W *= hyperparams['spectral_radius'] / max(abs(eigen))
        else:
            raise ValueError(f"Invalid function: {function_name}")

        # TRAINING and EVALUATION
        if is_instances_classification:
            reservoir, readout = init_and_train_model_for_classification(
                W, Win, bias, hyperparams['leaky_rate'], activation_function, train_data, Y_train,
                n_jobs=nb_jobs, ridge_coef=ridge_coef, mode="sequence-to-vector", hide_progress=True)
            Y_pred = predict_model_for_classification(reservoir, readout, test_data, n_jobs=nb_jobs, hide_progress=True)
        else:
            esn = init_and_train_model_for_prediction(W, Win, bias, hyperparams['leaky_rate'], activation_function,
                                                      train_data, Y_train, ridge_coef)
            Y_pred = esn.run(test_data, reset=False)

        scores.append(compute_score(Y_pred, Y_test, is_instances_classification))

    return scores


In [None]:
import numpy as np
import pandas as pd
from datetime import datetime

# Column layout shared by the new results and the CSV history
RESULT_COLUMNS = ['Dataset', 'Function', 'Average Score', 'Standard Deviation', 'Date']

variate_type = "multi"  # "multi" or "uni"; read as a global by evaluate_dataset_on_test

# Evaluate every generation method on the test set
print(dataset_name)
result_rows = []
for function_name in ["desp", "hadsp", "random", "random_ei"]:
    print(function_name)
    scores = evaluate_dataset_on_test(dataset_name, function_name, data_type)

    # Mean and standard deviation over the trials
    average_score = np.mean(scores)
    std_deviation = np.std(scores)

    if is_instances_classification:
        # Classification scores are reported as percentages
        formatted_average = f"{round(average_score * 100, 5)} %"
        formatted_std = f"± {round(std_deviation * 100, 5)} %"
    else:
        formatted_average = f"{round(average_score, 5)}"
        formatted_std = f"± {round(std_deviation, 5)}"

    current_date = datetime.now().strftime('%Y-%m-%d')

    # Collect plain records and build the DataFrame once after the loop,
    # instead of growing it with pd.concat on every iteration.
    result_rows.append({
        'Dataset': dataset_name,
        'Function': function_name,
        'Average Score': formatted_average,
        'Standard Deviation': formatted_std,
        'Date': current_date,
    })

new_results = pd.DataFrame(result_rows, columns=RESULT_COLUMNS)

# Display the DataFrame
print(new_results)

# Load the existing history, if any
history_exists = os.path.exists(file_name)
if history_exists:
    previous_results = pd.read_csv(file_name)
else:
    previous_results = pd.DataFrame(columns=RESULT_COLUMNS)

# Newest results first; skip the concat when there is no history so that no
# empty frame is concatenated.
if previous_results.empty:
    tots_results = new_results.copy()
else:
    tots_results = pd.concat([new_results, previous_results], axis=0)

tots_results.to_csv(file_name, index=False)
if not history_exists:
    print(f"{file_name} created successfully.")
# Report the file that was actually written (the message used to name a different file).
print(f"Results saved to '{file_name}'.")

# Visualisation

In [None]:
import os
import pandas as pd 

# Allow this cell to run on its own: fall back to a default results file when
# the evaluation cells above have not defined `file_name`.
if 'file_name' not in globals():
    file_name = "test_results_classification.csv"    #  test_results_classification.csv or test_results_prediction.csv


if os.path.exists(file_name):
    previous_results = pd.read_csv(file_name)
else:
    # File does not exist, create it with the necessary columns
    columns = ['Dataset', 'Function', 'Average Score', 'Standard Deviation', 'Date']
    previous_results = pd.DataFrame(columns=columns)
    # Save the empty DataFrame as a CSV
    previous_results.to_csv(file_name, index=False)
    print(f"{file_name} created successfully.")

print(previous_results)
# This cell only reads the results; the former "Results saved to
# 'pipeline_results.csv'." message was misleading and named the wrong file.
print(f"Loaded {len(previous_results)} result rows from '{file_name}'.")

In [None]:
from seaborn import heatmap, color_palette
from matplotlib import pyplot as plt
import numpy as np

all_results = pd.read_csv(file_name)
df = all_results.copy()

# Scores are stored as text such as "95.1 %" / "± 0.3 %": strip the symbols and convert to float
df['Average Score'] = df['Average Score'].astype(str).str.replace('%', '', regex=False).astype(float)
df['Standard Deviation'] = (
    df['Standard Deviation'].astype(str)
    .str.replace('±', '', regex=False)
    .str.replace('%', '', regex=False)
    .astype(float)
)

# Exact-match renaming. Chained substring replacements turned 'random_ei' into
# 'E-ESN_ei' (the 'random' rule ran first), so it was never labelled 'ESN'.
function_labels = {
    'desp': 'variance HAG',
    'hadsp': 'mean HAG',
    'random': 'E-ESN',
    'random_ei': 'ESN',
}
df['Function'] = df['Function'].replace(function_labels)

if file_name == "test_results_classification.csv":
    # Line breaks keep the long dataset names readable as tick labels
    dataset_labels = {
        'SpokenArabicDigits': 'Spoken\nArabic\nDigits',
        'SPEECHCOMMANDS': 'SPEECH\nCOMMANDS',
        'JapaneseVowels': 'Japanese\nVowels',
    }
    df['Dataset'] = df['Dataset'].replace(dataset_labels)

# Display order follows the order of first appearance in the file
datasets = df['Dataset'].unique()
functions = df['Function'].unique()

# The CSV accumulates one row per run: keep only the most recent row of each
# (Dataset, Function) pair so every method has exactly one bar per dataset.
if 'Date' in df.columns:
    df = df.sort_values('Date', ascending=False, kind='stable')
df = df.drop_duplicates(subset=['Dataset', 'Function'], keep='first')

# Dataset x Function tables; missing combinations become NaN (no bar drawn)
mean_table = df.pivot(index='Dataset', columns='Function', values='Average Score').reindex(index=datasets, columns=functions)
std_table = df.pivot(index='Dataset', columns='Function', values='Standard Deviation').reindex(index=datasets, columns=functions)

fig, ax = plt.subplots(figsize=(10, 6))
x = np.arange(len(datasets))  # The label locations
width = 0.2  # Width of the bars
colors = color_palette("tab20")
fontsize = 18

for i, func in enumerate(functions):
    ax.bar(
        x + i * width,
        mean_table[func].to_numpy(),
        width,
        label=func,
        yerr=std_table[func].to_numpy(),
        capsize=5,
        color=colors[i % len(colors)],
    )

# Axis labels describe what is plotted (they were previously overwritten by
# unrelated 'Steps' / 'Average Uncoupled Dynamics' labels).
ax.set_xlabel('Dataset', size=fontsize)
ax.set_ylabel('Average Score', size=fontsize)
ax.spines['top'].set_visible(False)
ax.spines['right'].set_visible(False)
ax.tick_params(axis='both', labelsize=fontsize)
# Centre each tick under its group of bars
ax.set_xticks(x + width * (len(functions) - 1) / 2)
ax.set_xticklabels(datasets)
ax.legend(fontsize=fontsize)

plt.tight_layout()
plt.show()