In [None]:
import numpy as np

from matplotlib import pyplot as plt

# Single seed for the whole notebook.
SEED = 923984
# Seed NumPy's global generator so that any np.random.* call made later in the
# notebook gives the same results after "Restart Kernel -> Run All".
np.random.seed(SEED)

# Datasets loading

Many different datasets are available: https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad

Regression : http://tseregression.org/ + https://arxiv.org/pdf/2012.02974

## Classification

### Aeon


https://www.timeseriesclassification.com/dataset.php

* SpokenArabicDigits
* CatsDogs
* LSST

### Torchaudio

https://pytorch.org/audio/stable/datasets.html


### Custom

Datasets available :

* FSDD
* HAART
* JapaneseVowels

## Prediction ahead

Datasets available :

* MackeyGlass
* Lorenz

In [None]:
from datasets.load_datasets import load_dataset_prediction

# Experiment selection for this notebook.
dataset_name = "Lorenz"
step_ahead = 5
is_instances_classification = False
use_spectral_representation = False

# Load the chosen dataset; visualize=True is forwarded to the loader.
(
    is_multivariate,
    sampling_rate,
    X_train_raw,
    X_test_raw,
    Y_train_raw,
    Y_test,
) = load_dataset_prediction(dataset_name, step_ahead, visualize=True)

# Activation function

In [None]:
from reservoir.activation_functions import tanh, heaviside, sigmoid


def activation_function(x):
    """Activation function used for the rest of the experiment (delegates to `tanh`)."""
    # A sigmoid-based variant, sigmoid(2*(x-0.5)), was left commented out in the
    # original cell as an alternative.
    return tanh(x)


# Plot the chosen activation on 100 evenly spaced points of [0, 3].
activation_inputs = np.linspace(0, 3, 100)
plt.plot(activation_inputs, activation_function(activation_inputs))
plt.grid()

# Preprocessing

In [None]:
import math

# Cross validation
from sklearn.model_selection import StratifiedKFold, TimeSeriesSplit, StratifiedGroupKFold
from datasets.preprocessing import flexible_indexing

# Preprocessing
from datasets.multivariate_generation import generate_multivariate_dataset, extract_peak_frequencies
from sklearn.preprocessing import MinMaxScaler
from datasets.preprocessing import scale_data
from datasets.preprocessing import add_noise, duplicate_data

# Progress bars for the noisy classification path. tqdm is called below, so it
# must be in scope; fall back to a plain pass-through when it is not installed.
try:
    from tqdm.auto import tqdm
except ImportError:
    def tqdm(iterable, **_kwargs):
        """Fallback when tqdm is unavailable: return the iterable unchanged."""
        return iterable

# Standard deviation handed to add_noise when data_type == "noisy"
noise_std = 0.001

data_type = "normal"  # "normal" or "noisy"

WINDOW_LENGTH = 10
# Settings shared by both extract_peak_frequencies calls in this cell
PEAK_EXTRACTION_KWARGS = dict(smooth=True, window_length=WINDOW_LENGTH, threshold=1e-5, nperseg=1024)

# First pass on the raw training data with visualize=True; the return value is discarded.
freq_train_data = X_train_raw
flat_train_data = np.concatenate(freq_train_data, axis=0) if is_instances_classification else freq_train_data
extract_peak_frequencies(flat_train_data, sampling_rate, visualize=True, **PEAK_EXTRACTION_KWARGS)

# Rebind the raw arrays to their working names. The `del` statements remove the
# *_raw names, so the loading cell must be re-run before re-running this one.
if is_multivariate:
    X_train_band, X_test_band = X_train_raw, X_test_raw
    del X_train_raw, X_test_raw
    X_val_band = None
else:
    X_test, X_train = X_test_raw, X_train_raw
    X_val, X_val_band = None, None
    del X_train_raw, X_test_raw
Y_train = Y_train_raw
del Y_train_raw

# PREPROCESSING
# Second pass (visualize=False): this one provides the peak_freqs used from here on.
freq_train_data = X_train_band if is_multivariate else X_train
flat_train_data = np.concatenate(freq_train_data, axis=0) if is_instances_classification else freq_train_data
peak_freqs = extract_peak_frequencies(flat_train_data, sampling_rate, visualize=False, **PEAK_EXTRACTION_KWARGS)

spectral_representation = None  # Can be None, "stft" or "mfcc"

if is_multivariate and use_spectral_representation:
    print("Data is already spectral, nothing to do")
else:
    # Univariate data is expanded from X_train / X_test; multivariate data that is
    # not yet spectral is transformed from the existing *_band arrays.
    band_source_train, band_source_test = (X_train_band, X_test_band) if is_multivariate else (X_train, X_test)
    X_train_band = generate_multivariate_dataset(
        band_source_train, sampling_rate, is_instances_classification, peak_freqs, spectral_representation, hop=100
    )
    X_test_band = generate_multivariate_dataset(
        band_source_test, sampling_rate, is_instances_classification, peak_freqs, spectral_representation, hop=100
    )

scaler_multi = MinMaxScaler(feature_range=(0, 1))
X_train_band, X_val_band, X_test_band = scale_data(X_train_band, X_val_band, X_test_band, scaler_multi, is_instances_classification)

if not is_multivariate:
    # The univariate data gets its own scaler so that scaler_multi is not reused
    # on data of a different dimensionality.
    scaler_x_uni = MinMaxScaler(feature_range=(0, 1))
    X_train, X_val, X_test = scale_data(X_train, X_val, X_test, scaler_x_uni, is_instances_classification)

# NOISE
if data_type == "noisy":
    if is_instances_classification:
        # UNI: noise is added instance by instance
        if not is_multivariate:
            X_train_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_train, desc="TRAIN")]
            X_test_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_test, desc="TEST")]

        # MULTI
        X_train_band_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_train_band, desc="TRAIN")]
        X_test_band_noisy = [add_noise(instance, noise_std) for instance in tqdm(X_test_band, desc="TEST")]

    else:  # prediction: noise is added to the whole series at once
        # UNI
        if not is_multivariate:
            X_train_noisy = add_noise(X_train, noise_std)
            X_test_noisy = add_noise(X_test, noise_std)

        # MULTI
        X_train_band_noisy = add_noise(X_train_band, noise_std)
        X_test_band_noisy = add_noise(X_test_band, noise_std)

# Define the number of instances you want to select (at most 500)
x_size = len(X_train_band) if is_multivariate else len(X_train)
num_samples_for_pretrain = min(500, x_size)
if is_instances_classification:
    # Seeded generator so the pretraining subset is the same on every run.
    indices = np.random.default_rng(SEED).choice(x_size, num_samples_for_pretrain, replace=False)
else:
    # Prediction: keep every index, in order.
    indices = range(x_size)


if data_type == "noisy":
    # Defining pretrain
    if not is_multivariate:
        X_pretrain_noisy = np.array(X_train_noisy)[indices]
    X_pretrain_band_noisy = np.array(X_train_band_noisy)[indices]

if not is_multivariate:
    X_pretrain_uni = np.array(X_train)[indices]
X_pretrain_band = np.array(X_train_band)[indices]

# Test data evaluation

In [None]:
import optuna
import re

def camel_to_snake(name):
    """Convert a CamelCase name to snake_case, e.g. "MackeyGlass" -> "mackey_glass"."""
    # Pass 1: put an underscore before every capitalised word (an uppercase letter
    # followed by lowercase letters) that is preceded by some character.
    with_word_breaks = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name)
    # Pass 2: split any remaining lowercase-or-digit / uppercase boundary.
    fully_split = re.sub('([a-z0-9])([A-Z])', r'\1_\2', with_word_breaks)
    return fully_split.lower()
    
function_name = "random"  # "desp", "hadsp" or "random"
variate_type = "multi"  # "multi" or "uni"
if variate_type == "uni" and is_multivariate:
    raise ValueError(f"Invalid variable type: {variate_type}")

# Study name and SQLite storage URL are both derived from the experiment settings.
study_name = f"{function_name}_{dataset_name}_{data_type}_{variate_type}"
url = f"sqlite:///optuna_{camel_to_snake(dataset_name)}_db.sqlite3"

# Load an existing study and keep its best trial.
study = optuna.load_study(study_name=study_name, storage=url)
best_trial = study.best_trial

print("Best hyperparameters: ")
use_full_instance = bool(is_instances_classification)

# Each tuned hyperparameter becomes a notebook-level variable of the same name;
# the evaluation cell reads them as plain globals.
for param_name, param_value in best_trial.params.items():
    globals()[param_name] = param_value
    print(param_name, param_value)

In [None]:
import math
import random
from scipy import sparse
# Import eigs from its submodule explicitly instead of relying on
# `sparse.linalg` having been loaded as a side effect.
from scipy.sparse.linalg import eigs

# Pretraining
from reservoir.reservoir import init_matrices
from connexion_generation.hadsp import run_algorithm

# Evaluating
from performances.esn_model_evaluation import init_and_train_model_for_classification, predict_model_for_classification, compute_score
from performances.esn_model_evaluation import init_and_train_model_for_prediction

nb_trials = 8
nb_jobs = 8

# score for prediction
start_step, end_step = 30, 500
SLICE_RANGE = slice(start_step, end_step)
min_reservoir_size = 500
all_peak_freqs = np.hstack(peak_freqs)
min_window_size = sampling_rate / np.max(all_peak_freqs)
max_window_size = sampling_rate / np.min(all_peak_freqs)
# `ridge` is one of the hyperparameters injected from the best Optuna trial.
RIDGE_COEF = 10**ridge

if function_name == "hadsp" or function_name == "desp":
    # NOTE(review): max_increment_span is computed but not used anywhere in this cell.
    max_increment_span = int(max_window_size) if int(max_window_size) - 100 < 0 else int(max_window_size) - 100
    MAX_TIME_INCREMENT = time_increment + time_increment_span  # int(max_window_size) or None or TIME_INCREMENT

# Reservoir sizing is the same for every trial, so it is computed once here.
if variate_type == "multi":
    common_index = 1  # presumably the feature axis of the band data; verify against the loaders
    if is_instances_classification:
        common_size = X_train_band[0].shape[common_index]
    else:
        common_size = X_train_band.shape[common_index]
else:
    common_size = len(peak_freqs)

# We want the size of the reservoir to be at least network_size
K = math.ceil(network_size / common_size)
n = common_size * K

pretrain_data = X_pretrain_band
train_data = X_train_band  # X_train_band_noisy_duplicated or X_train_band_duplicated
test_data = X_test_band

# Dedicated generator seeded from SEED: each trial still gets its own reservoir
# seed, but the sequence of seeds (and so the averaged score) is the same on every run.
trial_rng = random.Random(SEED)

total_score = 0
for i in range(nb_trials):
    # UNSUPERVISED PRETRAINING
    Win, W, bias = init_matrices(n, input_connectivity, connectivity, K, seed=trial_rng.randint(0, 1000))
    bias *= bias_scaling
    Win *= input_scaling

    if function_name == "hadsp":
        W, (_, _, _) = run_algorithm(W, Win, bias, leaky_rate, activation_function, pretrain_data, time_increment, weight_increment,
                             target_rate, rate_spread, function_name, is_instance=is_instances_classification, use_full_instance = use_full_instance,
                             max_increment=MAX_TIME_INCREMENT, max_partners=max_partners, method = method, n_jobs = nb_jobs)
    elif function_name == "desp":
        W, (_, _, _) = run_algorithm(W, Win, bias, leaky_rate, activation_function, pretrain_data, time_increment, weight_increment,
                                min_variance, variance_spread, function_name, is_instance=is_instances_classification, use_full_instance = use_full_instance,
                                max_increment=MAX_TIME_INCREMENT, max_partners=max_partners, method = method,
                                intrinsic_saturation=intrinsic_saturation, intrinsic_coef=intrinsic_coef, n_jobs = nb_jobs)
    elif function_name == "random":
        # No pretraining: rescale W so its largest-magnitude eigenvalue equals spectral_radius.
        eigen = eigs(W, k=1, which="LM", maxiter=W.shape[0] * 20, tol=0.1, return_eigenvectors=False)
        W *= spectral_radius / max(abs(eigen))
    else:
        raise ValueError(f"Invalid function: {function_name}")

    # TRAINING and EVALUATION
    if is_instances_classification:
        reservoir, readout = init_and_train_model_for_classification(W, Win, bias, leaky_rate, activation_function, train_data, Y_train, n_jobs = nb_jobs, ridge_coef=RIDGE_COEF, mode="sequence-to-vector")
        Y_pred = predict_model_for_classification(reservoir, readout, test_data, n_jobs = nb_jobs)
    else:
        esn = init_and_train_model_for_prediction(W, Win, bias, leaky_rate, activation_function, train_data, Y_train, RIDGE_COEF)
        Y_pred = esn.run(test_data, reset=False)

    score = compute_score(Y_pred, Y_test, is_instances_classification)
    total_score += score

average_score = total_score / nb_trials  # Average the score

if is_instances_classification:
    print(average_score*100, " %")
else:
    print(average_score)