In [None]:
import sys
from pathlib import Path

# Add the root project folder to the Python path so that the project packages
# imported further down (datasets, reservoir, connexion_generation, ...) resolve.
# NOTE(review): this is a hard-coded absolute path; the notebook only runs as-is
# on a machine where this folder exists.
path_root = Path( '/project_ghent/HADSP/hadsp/')
sys.path.append(str(path_root))

In [None]:
import numpy as np
from scipy import sparse
from tqdm import tqdm
from importlib import reload

# SEED
SEED = 49387

from matplotlib import pyplot as plt
from seaborn import heatmap, color_palette

# Datasets

Many different datasets are available: https://towardsdatascience.com/a-data-lakes-worth-of-audio-datasets-b45b88cd4ad

Classification: 
https://arxiv.org/abs/1803.07870

https://github.com/FilippoMB/Time-series-classification-and-clustering-with-Reservoir-Computing

Multivariate:
https://www.timeseriesclassification.com/dataset.php

## Torchaudio

https://pytorch.org/audio/stable/datasets.html


In [None]:
# load dataset using torchaudio
from sklearn.model_selection import StratifiedShuffleSplit
from torchaudio.datasets import VoxCeleb1Identification, SPEECHCOMMANDS
from torch.utils.data import random_split, DataLoader

# Share of the samples held out for testing.
TEST_SPLIT = 0.2

dataset = SPEECHCOMMANDS(root="datasets/", download=True)

dataset_size = len(dataset)  # Total number of samples in the dataset

# Sampling rate as stored in the first sample (second field of a sample).
sampling_rate = dataset[0][1]

# Read every sample exactly once (each access loads the audio from disk).
# Field 0 is the waveform, of which the first row is kept; field 2 is the label.
X, Y = [], []
for sample_index in range(dataset_size):
    sample = dataset[sample_index]
    X.append(sample[0][0])
    Y.append(sample[2])

# Number of test samples implied by TEST_SPLIT.
split = int(np.floor(TEST_SPLIT * dataset_size))

# Use StratifiedShuffleSplit to get train/test indices.
# Only the labels are needed to build the split, so a placeholder is passed as X
# (the waveforms may have different lengths and cannot be stacked into an array).
sss = StratifiedShuffleSplit(n_splits=1, test_size=TEST_SPLIT, random_state=SEED)
train_index, test_index = next(sss.split(np.zeros(dataset_size), Y))

# Split data and labels using the indices.
# X and Y are plain Python lists, which cannot be indexed with an index array.
X_train = [X[i] for i in train_index]
Y_train = [Y[i] for i in train_index]
X_test = [X[i] for i in test_index]
Y_test = [Y[i] for i in test_index]


is_multivariate = False

## Prediction ahead

Datasets available :

* MackeyGlass
* Lorenz

In [None]:
from datasets.load_datasets import load_dataset_prediction
# Select the "prediction ahead" task and load the chosen dataset.
# NOTE(review): the classification loading cell further down sets the same
# variables again, so whichever of the two cells is run last wins.
is_instances_classification = False
name = "MackeyGlass"
# Passed to the loader as the prediction horizon — presumably the number of
# time steps between an input and its target; confirm in load_dataset_prediction.
step_ahead=5

is_multivariate, sampling_rate, X_train, X_test, Y_train, Y_test, X_pretrain = load_dataset_prediction(name, step_ahead, visualize=True)


## Classification

Datasets available :

* FSDD
* HAART
* JapaneseVowels

In [None]:
from datasets.load_datasets import load_dataset_classification
# Select the classification task and load the chosen dataset.
# NOTE(review): this overrides the variables set by the prediction loading cell
# above when both cells are executed.
is_instances_classification = True
name = "HAART"

is_multivariate, sampling_rate, X_train, X_test, Y_train, Y_test, X_pretrain = load_dataset_classification(name)

## Multivariate generation if necessary

Spectrograms_vs_Cochleagrams : https://www.researchgate.net/publication/340510607_Speech_recognition_using_very_deep_neural_networks_Spectrograms_vs_Cochleagrams

Attention! For multivariate data, the shape should be (nb_of_timeseries, nb_of_timesteps).

In [None]:
# A dataset that is already multivariate is used directly as the "band" data:
# the original names are re-bound to the *_band names and then removed.
if is_multivariate:
    X_train_band = X_train
    X_test_band = X_test
    X_pretrain_band = X_pretrain
    del X_train, X_test, X_pretrain

In [None]:
import datasets.multivariate_generation
reload(datasets.multivariate_generation)

from datasets.multivariate_generation import extract_peak_frequencies

# Pick the pretraining signal to analyse: the transposed band data when the
# dataset is already multivariate, the raw pretraining series otherwise.
pretrain_signal = X_pretrain_band.T if is_multivariate else X_pretrain
filtered_peak_freqs = extract_peak_frequencies(
    pretrain_signal, sampling_rate, threshold=1e-5, nperseg=1024, visualize=True
)

#print("Filtered peak frequencies: ", filtered_peak_freqs)
print("Number of frequencies selected :", len(filtered_peak_freqs))

### Applying normal band pass filter on data and standardisation (inside the function)

In [None]:
from datasets.multivariate_generation import generate_multivariate_dataset, extract_peak_frequencies

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Scaler mapping values to [0, 1]; it is handed to the generation function and
# reused by the scaling cell below.
scaler = MinMaxScaler(feature_range=(0, 1))

# Univariate datasets only: build the *_band datasets from the selected peak
# frequencies (multivariate datasets already have their *_band variables).
# nb_jobs=-1 presumably means "use all available cores" — confirm in
# generate_multivariate_dataset.
if not is_multivariate:
    X_pretrain_band, X_train_band, X_test_band = generate_multivariate_dataset(
        filtered_peak_freqs, X_pretrain, X_train, X_test, sampling_rate, scaler, is_instances_classification, nb_jobs=-1
    )

### Standardizing the amplitudes

In [None]:
# pretrain
# Be really careful with the column order here: MinMaxScaler rescales each
# column independently, so the band data is transposed before scaling and
# transposed back afterwards.
filtered_data = scaler.fit_transform(X_pretrain_band.T).T


if is_instances_classification:
    print("processing for Classification")
    # Every instance is rescaled on its own: the scaler is refitted per instance.
    X_train_band = [scaler.fit_transform(time_series) for time_series in tqdm(X_train_band)]
    X_test_band = [scaler.fit_transform(time_series) for time_series in tqdm(X_test_band)]

    if not is_multivariate:
        # train
        X_train = [scaler.fit_transform(x).flatten() for x in tqdm(X_train)]

        # test
        X_test = [scaler.fit_transform(x).flatten() for x in tqdm(X_test)]
else :
    print("processing for Prediction")
    train_len = X_train_band.shape[0]

    # The scaler is fitted on the training split only and then applied to the
    # test split, so that no statistic of the test data leaks into the
    # preprocessing (test values may therefore fall slightly outside the
    # scaler's feature range).
    Y_train = scaler.fit_transform(Y_train)
    Y_test = scaler.transform(Y_test)

    # FOR MULTIVARIATE DATA
    X_train_band = scaler.fit_transform(X_train_band)
    X_test_band = scaler.transform(X_test_band)

    if not is_multivariate:
        # The univariate series are scaled as a single column.
        X_train = scaler.fit_transform(X_train.flatten().reshape(-1, 1))
        X_test = scaler.transform(X_test.flatten().reshape(-1, 1))


In [None]:
# Quick sanity check: display the shape of the training targets after the
# preprocessing cell above.
Y_train.shape

## Plot pretraining dataset

In [None]:
# Min window size to get all the dynamics ?
# The bounds are the sampling rate divided by the highest / lowest selected
# peak frequency.
all_peak_freqs = np.hstack(filtered_peak_freqs)
min_window_size = sampling_rate / np.max(all_peak_freqs)
max_window_size = sampling_rate / np.min(all_peak_freqs)

for window_bound in (min_window_size, max_window_size):
    print(window_bound)

In [None]:
# Compute the moving average
window_size = 10

if max_window_size <= window_size or  window_size <= min_window_size:
    raise ValueError(f"window_size must be greater than {min_window_size} and smaller than {max_window_size}. Current window_size is {window_size}.")

# Uniform kernel: each output sample is the mean of `window_size` consecutive samples.
weights = np.repeat(1.0, window_size)/window_size
ma = np.array([np.convolve(d, weights, 'valid') for d in (filtered_data)])

# Range of time steps displayed in the figure.
END = 1500
START = 1000
DIFF = END - START

# Plot the series and its moving average for two different frequencies
NB_1 = 1
NB_2 = 2
fig, ax = plt.subplots(3, 1, figsize=(24,18))
ax[0].plot(range(DIFF), filtered_data[NB_1, START:END], label='Time serie')
ax[0].plot(range(DIFF), ma[NB_1, START:END], label='Moving average')
ax[0].legend(fontsize=26)

ax[1].plot(range(DIFF), filtered_data[NB_2, START:END], label='Time serie')
ax[1].plot(range(DIFF), ma[NB_2, START:END], label='Moving average')
ax[1].legend(fontsize=26)

# Check that the scaler did a good job (this is the not scaled version)
ax[2].plot(range(DIFF), X_pretrain_band[NB_2, START:END], label='Time serie')
ax[2].legend(fontsize=26)

# A dedicated loop variable is used so that `ax` keeps referring to the array of
# axes after the loop.
for panel in ax:
    # Format subplot
    panel.spines['top'].set_visible(False)
    panel.spines['right'].set_visible(False)
    panel.tick_params(axis='both', labelsize=26)
    # draw vertical spans to represent the window for some points
    for x in range(100, 500, 100):
        panel.axvspan(x, x + window_size, color='g', alpha=0.2)

plt.show()

## Find common dimension

In [None]:
def find_common_dimension(array1, array2):
    """Find an axis of `array1` whose size also appears in `array2.shape`.

    Returns a tuple (axis_index, axis_size). When several axes of `array1`
    match, the last one is returned; when none matches, (None, None) is
    returned.
    """
    other_sizes = array2.shape
    matches = [(axis, size) for axis, size in enumerate(array1.shape) if size in other_sizes]
    if not matches:
        return None, None
    return matches[-1]

# A plain array means a single multivariate series (prediction): the series are
# then taken along axis 0 of the pretraining data. A list means multiple
# instances (classification): the axis is looked up on the first instance.
if not isinstance(X_train_band, list):
    common_xtrain_index = 0
    common_xtrain_size = filtered_data.shape[common_xtrain_index]
else:
    common_xtrain_index, common_xtrain_size = find_common_dimension(X_train_band[0], filtered_data)

print("Common dimension index is :", common_xtrain_index)
print("Number of different time series is :", common_xtrain_size)
if is_multivariate:
    first_shape, second_shape = X_train_band[0].shape, X_train_band[1].shape
    print("\nCheck it ! \nFirst array ", first_shape, " and second array", second_shape)


# Generating reservoirs

## Reservoir functions

In [None]:
from reservoir.activation_functions import tanh, heaviside, sigmoid

# the activation function chosen for the rest of the experiment
# activation_function = lambda x : sigmoid(2*(x-0.5))
activation_function = lambda x : tanh(x)

# Visual check of the activation on [0, 1.1]
activation_inputs = np.linspace(0, 1.1, 100)
plt.plot(activation_inputs, activation_function(activation_inputs))
plt.grid()

## Timeseries duplications to adapt to reservoir size

Here we calculate:

* **common_dimension**: the number of different dimensions (time series) in the input data
* **K**: the number of neurons that will receive a particular time series as input
* **n**: the dimension of the reservoir

n = K * common_dimension

In [None]:
import math 

# Minimum number of neurons wanted in the reservoir.
RESERVOIR_SIZE = 1000

# K is rounded up so that the reservoir has at least RESERVOIR_SIZE neurons:
# n = K * number of time series >= RESERVOIR_SIZE.
K = math.ceil(RESERVOIR_SIZE / common_xtrain_size)
n = common_xtrain_size * K
print("Dimension of our reservoir :", n)
print("Copy of each time serie :", K)


## Datasets formatting and noise

In [None]:
# Define noise parameter: standard deviation of the Gaussian noise added below
noise_std = 0.001

# Seed NumPy's global generator so that the noise drawn by the np.random.normal
# calls in the following cells is reproducible after Restart & Run All.
np.random.seed(SEED)


### Pretrain

In [None]:
# PRETRAIN
# One noisy copy per row of the scaled pretraining data (zero-mean Gaussian
# noise of standard deviation noise_std).
filtered_data_noisy = [
    series + np.random.normal(0, noise_std, series.shape)
    for series in filtered_data
]

# The raw univariate pretraining series only exists for univariate datasets.
if not is_multivariate:
    pretrain_noise = np.random.normal(0, noise_std, X_pretrain.shape)
    X_pretrain_noisy = X_pretrain + pretrain_noise


In [None]:
from connexion_generation.utility import TwoDimArrayWrapper

# Each row (time series) of the pretraining data is repeated K times along
# axis 0, giving K consecutive rows per series, then wrapped in the project's
# TwoDimArrayWrapper (presumably so that downstream code can tell it apart
# from a univariate series — confirm in connexion_generation.utility).
X_pretrain_multi = TwoDimArrayWrapper(np.repeat(filtered_data, K, axis=0)) # filtered_data_noisy or filtered_data
X_pretrain_multi_noisy = TwoDimArrayWrapper(np.repeat(filtered_data_noisy, K, axis=0)) # filtered_data_noisy or filtered_data

# Univariate datasets: flattened clean and noisy pretraining series.
if not is_multivariate:
    X_pretrain_uni = X_pretrain.flatten() # X_pretrain_noisy or X_pretrain
    X_pretrain_uni_noisy = X_pretrain_noisy.flatten() # X_pretrain_noisy or X_pretrain

### For classification

In [None]:
if is_instances_classification:
    # We give Xtrain_band and Xtest_band the same shape as the expected input of the reservoir

    def _noisy_and_duplicated(instances):
        """Return (duplicated, noisy_duplicated) lists for a list of instances.

        Noise is first added to every instance; then each series of the clean
        and of the noisy instance is repeated K times, along axis 1 when the
        series lie on axis 1, otherwise along axis 0 followed by a transpose.
        """
        noisy_instances = [
            ts + np.random.normal(0, noise_std, ts.shape) for ts in tqdm(instances)
        ]

        duplicated, noisy_duplicated = [], []
        for idx in tqdm(range(len(instances))):
            if common_xtrain_index == 1:
                duplicated.append(np.repeat(instances[idx], K, axis=1))
                noisy_duplicated.append(np.repeat(noisy_instances[idx], K, axis=1))
            else:
                # correct axis depends on the shape of the instances
                duplicated.append(np.repeat(instances[idx], K, axis=0).T)
                noisy_duplicated.append(np.repeat(noisy_instances[idx], K, axis=0).T)
        return duplicated, noisy_duplicated

    #TRAIN
    X_train_band_duplicated, X_train_band_noisy_duplicated = _noisy_and_duplicated(X_train_band)

    #TEST
    X_test_band_duplicated, X_test_band_noisy_duplicated = _noisy_and_duplicated(X_test_band)

    # The un-duplicated versions are not needed any more: drop the references.
    X_test_band, X_test_band_noisy, X_train_band, X_train_band_noisy = None, None, None, None


### For prediction

In [None]:
if not is_instances_classification:
    # Repeat every column K times, then transpose so that the duplicated
    # columns become rows.
    X_train_band = np.repeat(np.squeeze(np.array(X_train_band)), K, axis=1).T
    X_test_band = np.repeat(np.squeeze(np.array(X_test_band)), K, axis=1).T

    # UNI
    if not is_multivariate:
        X_train_noisy = X_train + np.random.normal(0, noise_std, X_train.shape)
        X_test_noisy = X_test + np.random.normal(0, noise_std, X_test.shape)

    # MULTI
    # Noise is drawn row by row, train first and test afterwards.
    X_train_band_noisy = np.array([
        row + np.random.normal(loc=0, scale=noise_std, size=row.shape)
        for row in X_train_band
    ])

    X_test_band_noisy = np.array([
        row + np.random.normal(loc=0, scale=noise_std, size=row.shape)
        for row in X_test_band
    ])


## Construct matrices

### Shared parameters

In [None]:
# Parameters forwarded to the HADSP algorithm by initialise_and_train.
# The increment bounds are derived from the window sizes computed from the
# selected peak frequencies.
TIME_INCREMENT = int(min_window_size+1)
MAX_TIME_INCREMENT = int(max_window_size) #int(max_window_size) or None
WEIGHT_INCREMENT = 0.05
target_rate = 0.7
RATE_SPREAD = 0.1

target_variance = 0.1
VARIANCE_SPREAD = 0.3

# Scaling factors applied to the bias and input matrices, and reservoir leak rate
bias_scaling = 1
input_scaling = 0.1
leaky_rate = 1

# The message previously referenced an undefined name (INCREMENT), which turned
# this check into a NameError instead of the intended ValueError.
if max_window_size <= TIME_INCREMENT or  TIME_INCREMENT <= min_window_size:
    raise ValueError(f"INCREMENT must be greater than {min_window_size} and smaller than {max_window_size}. Current INCREMENT is {TIME_INCREMENT}.")


### Function to initialise and generate reservoir

In [None]:
from reservoir.reservoir import init_matrices
from connexion_generation.bounded_hadsp import run_hadsp_algorithm
from connexion_generation.desp import run_desp_algorithm

input_connectivity = 1
connectivity = 0

def initialise_and_train(input_scaling, n, input_connectivity, connectivity, bias_scaling, seed, training_set, visualize=False):
    """Initialise the reservoir matrices and grow the recurrent matrix with HADSP.

    The matrices returned by `init_matrices` are scaled (bias by `bias_scaling`,
    input weights by `input_scaling`), then `run_hadsp_algorithm` is run on
    `training_set` using the notebook-level parameters (leaky_rate,
    activation_function, TIME_INCREMENT, WEIGHT_INCREMENT, target_rate,
    RATE_SPREAD, MAX_TIME_INCREMENT).

    Returns (Win, W, bias, connectivity, sr, state_history) where `connectivity`
    is the fraction of non-zero entries of the resulting W and `sr` the largest
    eigenvalue magnitude of W as estimated by `sparse.linalg.eigs`.
    """
    Win, W, bias = init_matrices(n, input_connectivity, connectivity, seed=seed)
    bias *= bias_scaling
    Win *= input_scaling

    W, state_history = run_hadsp_algorithm(
        W, Win, bias, leaky_rate, activation_function, training_set,
        TIME_INCREMENT, WEIGHT_INCREMENT, target_rate, RATE_SPREAD,
        max_increment=MAX_TIME_INCREMENT, mi_based=False, visualize=visualize,
    )

    n_rows, n_cols = W.shape
    final_connectivity = W.count_nonzero() / (n_rows * n_cols)

    # Loose tolerance: only a rough estimate of the leading eigenvalue is needed
    leading_eigenvalues = sparse.linalg.eigs(
        W, k=1, which="LM", maxiter=n_rows * 20, tol=0.1, return_eigenvectors=False
    )
    spectral_radius = max(abs(leading_eigenvalues))

    return Win, W, bias, final_connectivity, spectral_radius, state_history


### Multivariate matrices

In [None]:
# HADSP + multi
# Reservoir grown with HADSP on the noisy, K-times duplicated pretraining data.
(Win_hadsp_multi, 
 W_hadsp_multi, 
 bias_hadsp_multi, 
 connectivity_band, 
 sr_hadsp_multi, 
 state_history_hadsp_multi) = initialise_and_train(input_scaling, n, input_connectivity, connectivity, bias_scaling, SEED, X_pretrain_multi_noisy)

# random + multi
# Random baseline initialised with the connectivity measured on the HADSP matrix.
# NOTE(review): the 4th positional argument receives sr_hadsp_multi — presumably
# the target spectral radius; confirm against init_matrices' signature
# (initialise_and_train passes the seed by keyword).
Win_random_multi, W_random_multi, bias_random_multi =  init_matrices(n, 1, connectivity_band, sr_hadsp_multi)
bias_random_multi= bias_random_multi*bias_scaling
Win_random_multi= Win_random_multi*input_scaling

# Largest eigenvalue magnitude of the random matrix, estimated with a loose tolerance.
eigen_random_multi = sparse.linalg.eigs(W_random_multi, k=1, which="LM", maxiter=W_random_multi.shape[0] * 20, tol=0.1, return_eigenvectors=False)
sr_random_multi = max(abs(eigen_random_multi))

from matplotlib.colors import ListedColormap

# Colormap made of cividis(0.0) followed by 128 samples of the upper half of cividis.
custom_colormap = ListedColormap(np.vstack((plt.cm.cividis(0.0), plt.cm.cividis(np.linspace(0.5, 1, 128)))))
# NOTE(review): the random matrix is displayed here, whereas the univariate cell
# displays the HADSP matrix — confirm which one is intended.
heatmap(W_random_multi.todense(), cmap=custom_colormap, cbar=True)

### Univariate matrices

In [None]:
# Univariate counterparts of the matrices above; only built when a raw
# univariate series exists.
if not is_multivariate:
    # HADSP + uni
    # Reservoir grown with HADSP on the noisy flattened pretraining series.
    (Win_hadsp_uni, 
     W_hadsp_uni, 
     bias_hadsp_uni, 
     connectivity_hadsp_uni, 
     sr_hadsp_uni,
     state_history_hadsp_uni) = initialise_and_train(input_scaling, n,  input_connectivity, connectivity, bias_scaling, SEED, X_pretrain_uni_noisy)
    
    # random + uni
    # Random baseline initialised with the connectivity measured on the HADSP matrix.
    # NOTE(review): the 4th positional argument receives sr_hadsp_uni — presumably
    # the target spectral radius; confirm against init_matrices' signature.
    Win_normal, W_normal, bias_normal =  init_matrices(n, 1, connectivity_hadsp_uni, sr_hadsp_uni)
    bias_normal= bias_normal*bias_scaling
    Win_normal= Win_normal*input_scaling   
    
    # Largest eigenvalue magnitude of the random matrix, estimated with a loose tolerance.
    eigen_normal = sparse.linalg.eigs(W_normal, k=1, which="LM", maxiter=W_normal.shape[0] * 20, tol=0.1, return_eigenvectors=False)
    sr_normal = max(abs(eigen_normal))
    
    from matplotlib.colors import ListedColormap
    
    # Colormap made of cividis(0.0) followed by 128 samples of the upper half of cividis.
    custom_colormap = ListedColormap(np.vstack((plt.cm.cividis(0.0), plt.cm.cividis(np.linspace(0.5, 1, 128)))))
    heatmap(W_hadsp_uni.todense(), cmap=custom_colormap, cbar=True)

### Spectral radius normalisation

In [None]:
# Spectral radii of the multivariate reservoirs (HADSP, then random) ...
print(sr_hadsp_multi, sr_random_multi, sep="\n")
# ... and of the univariate ones (random, then HADSP) when they exist.
if not is_multivariate:
    print(sr_normal, sr_hadsp_uni, sep="\n")

# Performance

In [None]:
from joblib import Parallel, delayed
from sklearn.metrics import mutual_info_score

# Value passed as the last argument of the classification helpers below;
# -1 presumably means "use all available cores" (joblib convention) — confirm.
N_JOBS = -1
# Value passed to the training helpers, presumably the ridge regularisation
# coefficient of the readout — confirm in performances.esn_model_evaluation.
RIDGE_COEF = 1e-7

## Classification

In [None]:
# Guard: stop here (e.g. during "Run All") when the loaded dataset is not a
# classification one, so that the classification cells below are not executed.
if not is_instances_classification:
    raise ValueError("This is not the right Prediction ahead section.")

### Classification for multivariate

In [None]:
# Sanity check: the second axis of a duplicated instance must match the
# reservoir dimension n.
print("X_train_band_duplicated example shape :", X_train_band_duplicated[1].shape)     
print("We should have :", X_train_band_duplicated[0].shape[1], "==", n)

In [None]:
from performances.esn_model_evaluation import init_and_train_model_for_classification
# To remember : 
#  For reservoirpy   pre_s = W @ r + Win @ (u + noise_gen(dist=dist, shape=u.shape, gain=g_in)) + bias

# NOTE(review): training uses the clean duplicated data while testing uses the
# noisy one — confirm this asymmetry is intended (alternatives are listed in the
# trailing comments).
train_data_multi = X_train_band_duplicated # X_train_band_noisy_duplicated or X_train_band_duplicated
test_data_multi = X_test_band_noisy_duplicated # X_test_band_noisy_duplicated or X_test_band_duplicated

# The input matrix handed to the model is a diagonal matrix built from the first
# column of Win (one input weight per neuron, since the inputs were duplicated
# to the reservoir dimension).
reservoir_hadsp_multi, readout_hadsp_multi = init_and_train_model_for_classification(W_hadsp_multi, np.diag(Win_hadsp_multi.A.T[0]), bias_hadsp_multi, activation_function, RIDGE_COEF, train_data_multi, Y_train, N_JOBS)

reservoir_random_multi, readout_random_multi = init_and_train_model_for_classification(W_random_multi, np.diag(Win_random_multi.A.T[0]), bias_random_multi, activation_function, RIDGE_COEF, train_data_multi, Y_train, N_JOBS)


In [None]:
from performances.esn_model_evaluation import predict_model_for_classification, compute_score

# Predict on the test instances with both multivariate models and score them.
# `score` is overwritten by the second call; only the last value is kept.
Y_pred_hadsp_multi = predict_model_for_classification(reservoir_hadsp_multi, readout_hadsp_multi, test_data_multi, N_JOBS)
score = compute_score(Y_pred_hadsp_multi, Y_test, "HADSP multi")

Y_pred_random_multi = predict_model_for_classification(reservoir_random_multi, readout_random_multi, test_data_multi, N_JOBS)
score = compute_score(Y_pred_random_multi, Y_test, "random multi")

### Classification for univariate

In [None]:
if not is_multivariate: 
    # Create a list to store the arrays with the same shape as the expected input of the reservoir

    # Each series is stacked n times and transposed, so the second axis of every
    # instance has size n (one copy of the series per neuron).
    train_data_uni = [np.stack([ts]*n).T for ts in X_train]
    test_data_uni = [np.stack([ts]*n).T for ts in X_test]

    # Sanity checks on the number of instances and on their shape.
    print("number of instances in pipi :", len(train_data_uni), "should be equal to", len(X_train))     
    print("pipi example shape :", train_data_uni[0].shape)     
    print("We should have :", train_data_uni[0].shape[1], "==", n)

In [None]:
if not is_multivariate:
    # init_and_train_model_for_classification returns a (reservoir, readout)
    # pair, as in the multivariate cell; both are needed by the prediction cell
    # that follows (they were previously stored under the Y_pred_* names, which
    # left the reservoir_*/readout_* names undefined).

    # HADSP + uni
    reservoir_hadsp_uni, readout_hadsp_uni = init_and_train_model_for_classification(W_hadsp_uni, Win_hadsp_uni, bias_hadsp_uni, activation_function, RIDGE_COEF, train_data_uni, Y_train, N_JOBS)

    # random + uni
    reservoir_random_uni, readout_random_uni = init_and_train_model_for_classification(W_normal, Win_normal, bias_normal, activation_function, RIDGE_COEF, train_data_uni, Y_train, N_JOBS)

In [None]:
# Predict on the univariate test instances with both univariate models and score
# them. Relies on reservoir_hadsp_uni / readout_hadsp_uni and
# reservoir_random_uni / readout_random_uni being defined by the training step.
if not is_multivariate:
    Y_pred_hadsp_uni = predict_model_for_classification(reservoir_hadsp_uni, readout_hadsp_uni, test_data_uni, N_JOBS)
    score = compute_score(Y_pred_hadsp_uni, Y_test, "HADSP uni")
    
    Y_pred_normal = predict_model_for_classification(reservoir_random_uni, readout_random_uni, test_data_uni, N_JOBS)
    score = compute_score(Y_pred_normal, Y_test, "random uni")

## Prediction ahead

In [None]:
# Guard: stop here (e.g. during "Run All") when the loaded dataset is a
# classification one, so that the prediction cells below are not executed.
if is_instances_classification:
    raise ValueError("This is not the right Classification section.")

### Plot datasets
Either the noisy or the normal dataset can be plotted

In [None]:
# Concatenate train and test arrays (along the time axis) for plotting
combined_data = np.concatenate((X_train_band, X_test_band), axis=1)

# noisy version: both the train and the test parts are the noisy arrays
combined_data_noisy = np.concatenate((X_train_band_noisy, X_test_band_noisy), axis=1)
combined_Y =np.concatenate((Y_train, Y_test), axis=0)

# Calculate the merge point index (number of training time steps)
merge_point_index = len(X_train_band[0])

# Define the range around the merge point to plot, clipped to the available data
start_index = max(merge_point_index - 100, 0)
end_index = min(merge_point_index + 100, combined_data_noisy.shape[1])

# Plot a subset of N features within a range around the transition from train to test.
# Only the candidate rows that exist for the current reservoir size are kept.
N = 3
candidate_features = (63, 478, 873)
feature_indices = [i for i in candidate_features if i < combined_data_noisy.shape[0]][:N]

plt.figure(figsize=(16, 5))
for i in feature_indices:
    plt.plot(range(start_index, end_index), combined_data_noisy[i, start_index:end_index], label=f'Feature {i}')
plt.plot(range(start_index, end_index), combined_Y[start_index:end_index], label="Prediction")
plt.title('Feature Values Around Merge Point')
plt.xlabel('Sample Index')
plt.ylabel('Feature Value')
plt.legend()
plt.show()

### Training
Noisy or normal dataset can be used

In [None]:
from performances.esn_model_evaluation import init_and_train_model_for_prediction

# Univariate models are only trained when a raw univariate series exists.
if not is_multivariate:
    train_data_uni = X_train # X_train_noisy or X_train
    
    # Training random + MG
    reservoir_random_uni, readout_random_uni = init_and_train_model_for_prediction(W_normal, Win_normal, bias_normal, activation_function, RIDGE_COEF, train_data_uni, Y_train)
    
    # Training for HADSP + MG
    reservoir_hadsp_uni, readout_hadsp_uni = init_and_train_model_for_prediction(W_hadsp_uni, Win_hadsp_uni, bias_hadsp_uni, activation_function, RIDGE_COEF, train_data_uni, Y_train)

# The duplicated band data is transposed before being handed to the model.
train_data_multi = X_train_band.T # X_train_band_noisy or train_band_inputs

# For the multivariate models the input matrix is a diagonal matrix built from
# the first column of Win (one input weight per neuron).

# Training random + bandfilter
reservoir_random_multi, readout_random_multi = init_and_train_model_for_prediction(W_random_multi, np.diag(Win_random_multi.A.T[0]), bias_random_multi, activation_function, RIDGE_COEF, train_data_multi, Y_train)

# Training output HADSP + bandfilter
reservoir_hadsp_multi, readout_hadsp_multi = init_and_train_model_for_prediction(W_hadsp_multi, np.diag(Win_hadsp_multi.A.T[0]), bias_hadsp_multi, activation_function, RIDGE_COEF, train_data_multi, Y_train)
                                                                                   
                                                                                   

### Prediction
Noisy or normal dataset can be used

In [None]:
from reservoir.reservoir import run

# NOTE(review): the models were trained on the clean data (X_train, X_train_band)
# but are evaluated on the noisy test data — confirm this is intended.
# Every reservoir is run with reset=False, presumably to continue from the state
# reached at the end of training — confirm against the reservoir implementation.
if not is_multivariate:
    test_data_uni = X_test_noisy # X_test_noisy or X_test

    # Prediction for random + MG
    y_pred_random_uni = readout_random_uni.run(reservoir_random_uni.run(test_data_uni, reset=False)) 

    # Prediction for HADSP + MG
    y_pred_hadsp_uni = readout_hadsp_uni.run(reservoir_hadsp_uni.run(test_data_uni, reset=False)) 

test_data_multi = X_test_band_noisy.T # X_test_band_noisy or test_band_inputs

# Prediction for random + bandfilter
y_pred_random_multi = readout_random_multi.run(reservoir_random_multi.run(test_data_multi, reset=False)) 

# Prediction for HADSP + bandfilter
y_pred_hadsp_multi = readout_hadsp_multi.run(reservoir_hadsp_multi.run(test_data_multi, reset=False)) 


In [None]:
from plots.performances import plot_results
from reservoir.losses import nrmse, nrmse_multivariate
from sklearn.metrics import mean_squared_error

# Window of test time steps on which the errors are computed
START_STEP = 100
END_STEP = 500
slice_range = slice(START_STEP, END_STEP)

# Univariate models (only available for univariate datasets)
if not is_multivariate:
    for uni_label, uni_prediction in (
        ("nrmse normal         :", y_pred_random_uni),
        ("nrmse hadsp          :", y_pred_hadsp_uni),
    ):
        print(uni_label, float(nrmse(Y_test[slice_range], uni_prediction[slice_range])))

# Multivariate (band-filtered) models
for multi_label, multi_prediction in (
    ("nrmse random + band  :", y_pred_random_multi),
    ("nrmse hadsp + band   :", y_pred_hadsp_multi),
):
    print(multi_label, float(nrmse_multivariate(Y_test[slice_range], multi_prediction[slice_range])))

plot_results(y_pred_hadsp_multi, Y_test, 0, 300)


In [None]:
# moving average of the y
span=5
pad_width = span // 2

ave_y_random_uni = np.convolve(np.pad(y_pred_random_uni.flatten(), pad_width, mode='edge'), np.ones(span), 'valid') / span
ave_y_hadsp_uni = np.convolve(np.pad(y_pred_hadsp_uni.flatten(), pad_width, mode='edge') , np.ones(span), 'valid') / span
ave_y_random_multi = np.convolve(np.pad(y_pred_random_multi.flatten(), pad_width, mode='edge'), np.ones(span), 'valid') / span
ave_y_hadsp_multi = np.convolve(np.pad(y_pred_hadsp_multi.flatten(), pad_width, mode='edge'), np.ones(span), 'valid') / span

print("nrmse normal         :", float(nrmse(Y_test[slice_range], ave_y_random_uni[slice_range])))
print("nrmse hadsp          :", float(nrmse(Y_test[slice_range], ave_y_hadsp_uni[slice_range])))
print("nrmse random + band  :", float(nrmse(Y_test[slice_range], ave_y_random_multi[slice_range])))
print("nrmse hadsp + band   :", float(nrmse(Y_test[slice_range], ave_y_hadsp_multi[slice_range])))
 
plot_results(ave_y_hadsp_multi.reshape(-1,1), Y_test, 300)

In [None]:
# Length (in time steps) of the sliding window on which each NRMSE is computed
NRMSE_WINDOW = 100


def _rolling_nrmse(y_pred):
    """Return the NRMSE of `y_pred` against Y_test on every sliding window."""
    return [
        nrmse(Y_test[i:NRMSE_WINDOW + i], y_pred[i:NRMSE_WINDOW + i])
        for i in range(len(Y_test) - NRMSE_WINDOW - step_ahead)
    ]


nrmse_array_random_multi = _rolling_nrmse(y_pred_random_multi)
nrmse_array_hadsp_multi = _rolling_nrmse(y_pred_hadsp_multi)
log10_nrmse_random_multi = np.log10(nrmse_array_random_multi)
log10_nrmse_hadsp_multi = np.log10(nrmse_array_hadsp_multi)

plt.figure()

# The univariate predictions only exist when the dataset is not multivariate
if not is_multivariate:
    nrmse_array_random_uni = _rolling_nrmse(y_pred_random_uni)
    nrmse_array_hadsp_uni = _rolling_nrmse(y_pred_hadsp_uni)
    log10_nrmse_random_uni = np.log10(nrmse_array_random_uni)
    log10_nrmse_hadsp_uni = np.log10(nrmse_array_hadsp_uni)
    plt.plot(log10_nrmse_random_uni[:1000], label="random")
    plt.plot(log10_nrmse_hadsp_uni[:1000], label="HADSP")

# Each curve carries its own label so that the legend cannot get out of sync
# with the plotting order.
plt.plot(log10_nrmse_random_multi[:1000], label="random + bandfilter")
plt.plot(log10_nrmse_hadsp_multi[:1000], label="HADSP + bandfilter")

plt.xlabel('Time steps')
plt.ylabel('Log10 NRMSE')
plt.legend()
plt.show()

# Analysis

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.cluster.hierarchy import linkage, leaves_list

scaler = MinMaxScaler(feature_range=(-1, 1))

# pretrain
# Be really careful with the column order here: the scaler works column-wise, so
# the band data is transposed before scaling. `df_data` is the scaled data in
# the orientation of X_pretrain_band; `df` has one column per time series.
df_data = scaler.fit_transform(X_pretrain_band.T).T
df = pd.DataFrame(df_data.T)

# Pairwise Pearson correlation between all the series (columns of df), computed
# in one vectorised call instead of a Python double loop.
correlation_matrix = df.corr(method='pearson', min_periods=5)

# Perform hierarchical clustering
linked = linkage(correlation_matrix, 'single')

# Get the order of rows/columns after hierarchical clustering
row_order = leaves_list(linked)

# Reorder the correlation matrix
sorted_corr_matrix = correlation_matrix.iloc[row_order, :].iloc[:, row_order]

# Visualize the sorted correlation matrix with a heatmap
plt.figure(figsize=(10,7))
sns.heatmap(sorted_corr_matrix, annot=False, cmap='vlag', vmin=-1, vmax=1)
plt.title('Clustered Pairwise Correlation of Time Series')
plt.show()


In [None]:
from scipy.sparse import coo_matrix

# Expand the clustered order of the series to neuron indices: every entry of
# row_order is turned into the K consecutive indices starting at entry * K.
row_order_r = np.array([
    block_start + offset
    for block_start in row_order * K
    for offset in range(K)
])

# Convert the sparse matrix to a dense format (if memory allows)
dense_matrix = W_hadsp_multi.toarray()

# Reorder the dense matrix using the repeated ordering: rows first, then columns
reordered_matrix = dense_matrix[row_order_r][:, row_order_r]

# Convert the reordered dense matrix back to a sparse format if needed
sparse_reordered_matrix = coo_matrix(reordered_matrix)

In [None]:
# Display the reordered HADSP matrix with the diverging "vlag" palette.
heatmap(sparse_reordered_matrix.todense(), cmap=color_palette("vlag", as_cmap=True))

## Motifs

In [None]:
import analysis.topology
reload(analysis.topology)
from analysis.topology import motif_distribution, draw_motifs_distribution

# Compute the motif distribution on the dense form of the HADSP matrix and plot
# it (both helpers come from the project's analysis.topology module).
motifs_count = motif_distribution(W_hadsp_multi.A)
draw_motifs_distribution(motifs_count)

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import poisson, binom

def _estimate_binomial_params(weights):
    """Method-of-moments estimate (n_est, p_est) of a Binomial law for `weights`."""
    mean = np.mean(weights)
    variance = np.var(weights)

    # For Binomial(n, p): mean = n*p and variance = n*p*(1-p),
    # hence p = 1 - variance/mean and n = mean/p.
    p_est = 1 - variance / mean if mean != 0 else 0
    n_est = int(round(mean / p_est)) if p_est != 0 else 0  # n should be integer
    return n_est, p_est


def analyze_connectivity_matrix(matrix):
    """Plot the distribution of the non-zero weights of `matrix` with two fits.

    A bar chart of the distinct non-zero weights (diagonal included) is drawn,
    together with a Poisson curve and, when its estimated parameters are valid,
    a Binomial curve. Both curves are evaluated on the rank of each distinct
    weight (0, 1, 2, ...) and scaled by the count of the smallest weight.

    Args:
        matrix: dense 2D array of connection weights.

    Returns:
        dict with the estimated parameters:
        {"Poisson": lambda_est, "Binomial": (n_est, p_est)}.

    Raises:
        ValueError: if `matrix` contains no non-zero weight.
    """
    # Extract the non-zero weights from the matrix
    weights = np.asarray(matrix).flatten()
    weights = weights[weights != 0]
    if weights.size == 0:
        raise ValueError("The connectivity matrix has no non-zero weight to analyze.")
    bin_centers, counts = np.unique(weights, return_counts=True)

    # Bar width: smallest gap between two distinct weights
    # (arbitrary unit width when there is a single distinct weight).
    bar_width = np.diff(bin_centers).min() if len(bin_centers) > 1 else 1.0

    # Plot histogram
    plt.bar(bin_centers, counts, align='center', alpha=0.6, width=bar_width)

    x_vals = range(len(bin_centers))

    # Fit to Poisson distribution
    lambda_est = np.mean(weights)
    plt.plot(bin_centers, poisson.pmf(x_vals, lambda_est)*counts[0], 'r-', label='Poisson fit')

    # Fit to Binomial distribution (method of moments, independent of any
    # notebook-level variable)
    n_est, p_est = _estimate_binomial_params(weights)

    # Check parameter validity
    if not(0 < p_est < 1):
        print("Estimated parameters are not valid for the Binomial distribution.")
    else:
        plt.plot(bin_centers, binom.pmf(x_vals, n_est, p_est) * counts[0], 'g-', label='Binomial fit')

    plt.legend()
    plt.show()

    return {"Poisson": lambda_est, "Binomial": (n_est, p_est)}


# Assuming W_hadsp_multi.A is your connectivity matrix
analyze_connectivity_matrix(W_hadsp_multi.A)

In [None]:
# Scratch check of the Poisson probability mass function on the first six
# integers (0..5) for a small rate.
bin_centers = np.array([1.5, 3.5, 5.5, 7.5])
poisson.pmf(np.arange(6), 0.08)

In [None]:
# NOTE(review): at notebook level `weights` is the moving-average kernel defined
# in the plotting cell earlier, not the local `weights` used inside
# analyze_connectivity_matrix — confirm which one was meant to be inspected.
weights.shape

In [None]:
# Total number of entries (zeros included) of the dense HADSP matrix.
W_hadsp_multi.A.flatten().shape