# Notebook for making test runs

Libraries


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.fft import fft, fftfreq
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from xgboost import XGBClassifier
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.neighbors import KNeighborsClassifier
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from keras.regularizers import l2
from keras.callbacks import EarlyStopping

# GTDA (giotto-tda) libraries
from gtda.time_series import SingleTakensEmbedding, takens_embedding_optimal_parameters
from gtda.homology import VietorisRipsPersistence
from gtda.plotting import plot_point_cloud
import plotly.graph_objects as go
#from nolitsa import dimension, delay (nolitsa==0.1)

import math
import gudhi
import ripser
from persim import plot_diagrams, PersistenceImager

In [2]:
def compute_frequency_domain(signals, start_idx, end_idx, sampling_rate=1.0, freq_limit=0.05):
    """Compute the low-frequency part of the FFT spectrum for a range of signals.

    Args:
        signals: Indexable collection of signals; ``signals[i]`` is passed to ``fft``.
        start_idx: Index of the first signal to process (inclusive).
        end_idx: Index one past the last signal to process (exclusive).
        sampling_rate: Sampling rate used to build the frequency axis
            (the sample spacing handed to ``fftfreq`` is ``1 / sampling_rate``).
        freq_limit: Half-width of the retained band. Only FFT bins whose
            frequency lies in ``[-freq_limit, freq_limit]`` are kept.
            The default of 0.05 reproduces the previously hard-coded band.

    Returns:
        A list with one dict per processed signal, with the keys ``'Signal'``
        (1-based signal number), ``'Frequencies'`` and ``'Magnitudes'``
        (plain Python lists, in ``fftfreq`` bin order).
    """
    frequency_domain_data = []
    for i in range(start_idx, end_idx):
        spectrum = fft(signals[i])
        frequencies = fftfreq(len(spectrum), d=1 / sampling_rate)
        # Boolean mask of the bins inside the retained band.
        in_band = (frequencies >= -freq_limit) & (frequencies <= freq_limit)
        frequency_domain_data.append({
            'Signal': i + 1,
            'Frequencies': frequencies[in_band].tolist(),
            'Magnitudes': np.abs(spectrum[in_band]).tolist(),
        })
    return frequency_domain_data

def procesamiento_pca(datos_np, hacerPCA: bool=True, n_components: int=50):
    """Turn raw signals into standardized, optionally PCA-reduced, spectral features.

    The signals are converted to band-limited FFT magnitudes with
    ``compute_frequency_domain``, standardized column-wise with
    ``StandardScaler`` and, when requested, projected with ``PCA``.

    Args:
        datos_np: Sized, indexable collection of signals. Every signal must
            yield the same number of retained FFT bins so the magnitudes can
            be stacked into one matrix.
        hacerPCA: When True (default) apply PCA after standardization;
            when False return the standardized magnitudes unchanged.
        n_components: Number of principal components to keep. The default of
            50 reproduces the previously hard-coded value.

    Returns:
        A 2-D array with one row per signal.

    NOTE(review): both the scaler and the PCA are fitted on the data passed to
    this call. If the downstream models were trained on features produced by a
    different fit, the feature spaces may not match; confirm whether the
    fitted transformers from training should be reused instead.
    """
    frequency_domain_results = compute_frequency_domain(
        datos_np, start_idx=0, end_idx=len(datos_np), sampling_rate=1.0
    )

    # Stack the per-signal magnitude lists into a single 2-D matrix.
    magnitudes = np.vstack(
        [result['Magnitudes'] for result in frequency_domain_results]
    )

    normalized_magnitudes = StandardScaler().fit_transform(magnitudes)

    if not hacerPCA:
        return normalized_magnitudes

    return PCA(n_components=n_components).fit_transform(normalized_magnitudes)

# Load the models

In [3]:
# Deserialize the previously saved models used by the evaluation cells.
import pickle

import joblib
from keras.models import load_model

# Models stored as .pkl files are read with joblib.
# NOTE: joblib.load unpickles arbitrary objects, so only load trusted files.
rf_model = joblib.load('models/random_forest_model.pkl')
xgb_model = joblib.load('models/xgb_model.pkl')
logistic_model = joblib.load('models/logistic_model.pkl')
svm_model = joblib.load('models/svm_model.pkl')

# Models stored in the .keras format are read with keras' load_model.
nn_model1 = load_model('models/neural_network_model1.keras')
nn_model2 = load_model('models/neural_network_model2.keras')
cnn_model = load_model('models/cnn_model.keras')

# The TDA model is read from data/, unlike the other models.
tda_model = load_model('data/cnn1_model_0.4.keras')


AttributeError: module 'ml_dtypes' has no attribute 'float4_e2m1fn'


# Load the data

In [4]:
# Dataset identifiers; each one is later interpolated into the data file names.
list_of_R = [
    0.6, 0.5, 0.4, 0.2, 0.15, 0.065,
    'rand100', 'rand250', 'rand500',
]

# Models and their display names, kept in the same order.
# The evaluation loop treats the LAST entry differently: it is fed the images.
lista_modelos = [
    rf_model,
    xgb_model,
    svm_model,
    logistic_model,
    nn_model1,
    nn_model2,
    cnn_model,
    tda_model,
]
nombres_modelos = [
    'Random Forest',
    'XGBoost',
    'SVM',
    'Logistic Regression',
    'Neural Network 1',
    'Neural Network 2',
    'CNN',
    'TDA CNN',
]

# Empty results table: one row per model, one column per dataset identifier.
df = pd.DataFrame(index=nombres_modelos, columns=list_of_R)
df

Unnamed: 0,0.6,0.5,0.4,0.2,0.15,0.065,rand100,rand250,rand500
Random Forest,,,,,,,,,
XGBoost,,,,,,,,,
SVM,,,,,,,,,
Logistic Regression,,,,,,,,,
Neural Network 1,,,,,,,,,
Neural Network 2,,,,,,,,,
CNN,,,,,,,,,
TDA CNN,,,,,,,,,


In [None]:
def flatten_images(imagenes):
    """Flatten each image with ``.flatten()`` and collect the results via ``np.array``."""
    return np.array([img.flatten() for img in imagenes])

def create_folds(X, Y, Z, k=10):
    """Split three aligned arrays into the same ``k`` shuffled folds.

    Args:
        X, Y, Z: Arrays of equal length that support integer-array indexing
            (e.g. NumPy arrays). Row ``n`` of each array belongs to the same
            sample.
        k: Number of folds. Fold sizes follow ``np.array_split``, so they
            differ by at most one when the length is not divisible by ``k``.

    Returns:
        A tuple ``(X_folded, Y_folded, Z_folded)`` of lists with ``k`` entries
        each; entry ``n`` of every list holds the rows selected for fold ``n``.

    Raises:
        ValueError: If the three inputs do not have the same length.
    """
    n_samples = len(X)
    if len(Y) != n_samples or len(Z) != n_samples:
        raise ValueError(
            f"X, Y and Z must have the same length, got "
            f"{n_samples}, {len(Y)} and {len(Z)}"
        )

    # Shuffle the sample indices. The fixed seed makes the folds reproducible.
    # Note that this seeds NumPy's global random state, as the original did.
    indices = np.arange(n_samples)
    np.random.seed(42)
    np.random.shuffle(indices)

    fold_indices = np.array_split(indices, k)

    # Apply the same index sets to all three arrays so they stay aligned.
    X_folded = [X[fold] for fold in fold_indices]
    Y_folded = [Y[fold] for fold in fold_indices]
    Z_folded = [Z[fold] for fold in fold_indices]

    return X_folded, Y_folded, Z_folded


# Evaluate every loaded model on each dataset, fold by fold, and collect the
# per-model mean and standard deviation of accuracy and recall.
signal_name = 'noisy_signals'
labels_name = 'labels'
N_FOLDS = 10  # number of folds each dataset is split into

print(len(lista_modelos), "models loaded.")

# One entry per R value; each entry is a list with one number per model.
acc_global = []
rec_global = []
acc_desv_global = []
rec_desv_global = []

for r_value in list_of_R:
    print(f"----Processing R value: {r_value}-----")
    # Load the signals, images and labels stored for this R value.
    datos_np = np.load(f'data/{signal_name}_{r_value}.npy')
    imagenes = np.load(f'data/imagenes_{r_value}.npy', allow_pickle=True)
    labels = np.load(f'data/{labels_name}_{r_value}.npy')
    imagenes = flatten_images(imagenes)

    # Split the three aligned arrays into the same shuffled folds.
    signal_folds, label_folds, image_folds = create_folds(
        datos_np, labels, imagenes, k=N_FOLDS
    )
    n_folds = len(signal_folds)

    # Per-fold scores: row = fold, column = model.
    acc_scores = np.zeros((n_folds, len(lista_modelos)))
    rec_scores = np.zeros((n_folds, len(lista_modelos)))

    for w in range(n_folds):
        print(f"Fold {w+1} of {n_folds}")
        # NOTE(review): procesamiento_pca fits its scaler and PCA on this fold
        # alone; confirm this matches how the models' training features were built.
        X_fold = procesamiento_pca(signal_folds[w])
        y_true = label_folds[w]
        imagenes_fold = image_folds[w]

        for j, model in enumerate(lista_modelos):
            # The models are only used for prediction here, never trained.
            # The last model in the list receives the flattened images;
            # every other model receives the spectral features.
            is_last_model = j == len(lista_modelos) - 1
            y_pred = model.predict(imagenes_fold if is_last_model else X_fold)

            print(f"Model {j+1} ({nombres_modelos[j]}) - Fold {w+1}")
            # Objects exposing ``summary`` are treated as Keras models whose
            # outputs are thresholded at 0.5 to obtain binary labels.
            if hasattr(model, 'summary'):
                y_pred = (y_pred > 0.5).astype(int)

            accuracy = accuracy_score(y_true, y_pred)
            recall = recall_score(y_true, y_pred)
            print("Accuracy:", accuracy)
            acc_scores[w, j] = accuracy
            rec_scores[w, j] = recall

    # Average over the folds actually produced instead of a hard-coded 10.
    acc_fold = acc_scores.mean(axis=0).tolist()
    rec_fold = rec_scores.mean(axis=0).tolist()

    print("Average accuracy for this R value:", acc_fold)
    print("Average recall for this R value:", rec_fold)
    acc_global.append(acc_fold)
    rec_global.append(rec_fold)
    # Spread of each model's score across the folds.
    acc_desv_global.append(acc_scores.std(axis=0).tolist())
    rec_desv_global.append(rec_scores.std(axis=0).tolist())

    print(acc_global)
    print(rec_global)

    


8 models loaded.
----Processing R value: 0.6-----
Fold 1 of 10
Model 1 (Random Forest) - Fold 1
Accuracy: 0.9733333333333334
Model 2 (XGBoost) - Fold 1
Accuracy: 0.96
Model 3 (SVM) - Fold 1
Accuracy: 0.9533333333333334
Model 4 (Logistic Regression) - Fold 1
Accuracy: 0.96
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Model 5 (Neural Network 1) - Fold 1
Accuracy: 0.94
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Model 6 (Neural Network 2) - Fold 1
Accuracy: 0.9733333333333334
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 
Model 7 (CNN) - Fold 1
Accuracy: 0.6533333333333333
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 182ms/step
Model 8 (TDA CNN) - Fold 1
Accuracy: 0.7733333333333333
Fold 2 of 10
Model 1 (Random Forest) - Fold 2
Accuracy: 0.98
Model 2 (XGBoost) - Fold 2
Accuracy: 0.98
Model 3 (SVM) - Fold 2
Accuracy: 0.9666666666666667
Model 4 (Logistic Regression) - Fold 2
Accuracy: 0.96666666

In [29]:
# Sanity check: number of result rows collected in acc_global.
len(acc_global)

9

In [31]:
# Convert every column label to str (list_of_R mixes floats and strings).
df.columns = list(map(str, df.columns))
df

Unnamed: 0,0.6,0.5,0.4,0.2,0.15,0.065,rand100,rand250,rand500
Random Forest,,,,,,,,,
XGBoost,,,,,,,,,
SVM,,,,,,,,,
Logistic Regression,,,,,,,,,
Neural Network 1,,,,,,,,,
Neural Network 2,,,,,,,,,
CNN,,,,,,,,,
TDA CNN,,,,,,,,,


In [32]:
# Fill each column of the table with the accuracy list stored at the same
# position in acc_global.
for pos, column in enumerate(df.columns):
    df[column] = acc_global[pos]



In [33]:
# Display the table after it has been filled from acc_global.
df

Unnamed: 0,0.6,0.5,0.4,0.2,0.15,0.065,rand100,rand250,rand500
Random Forest,0.948,0.839333,0.803333,0.584,0.546,0.491333,0.530667,0.518,0.510667
XGBoost,0.969333,0.953333,0.896667,0.620667,0.576667,0.498,0.532,0.517333,0.51
SVM,0.955333,0.848,0.728667,0.562,0.54,0.510667,0.528667,0.516667,0.507333
Logistic Regression,0.951333,0.899333,0.834667,0.611333,0.572667,0.494,0.528667,0.518,0.507333
Neural Network 1,0.924667,0.824667,0.79,0.61,0.579333,0.498,0.526,0.514667,0.518
Neural Network 2,0.945333,0.874,0.812667,0.614,0.571333,0.490667,0.526667,0.513333,0.508667
CNN,0.625333,0.558,0.564,0.529333,0.52,0.492667,0.46,0.492,0.454
TDA CNN,0.773333,0.752667,0.726,0.608,0.579333,0.513333,0.538,0.504,0.528667


In [34]:
# Write the current contents of df to CSV, keeping the row index
# (the model names) as the first column.
df.to_csv('data/accuracy_results.csv', index=True)

In [36]:
# Overwrite each column of df with the recall list stored at the same position
# in rec_global, then write the table to CSV.
# NOTE: df is reused here, so whatever values it held before are replaced.
for pos, column in enumerate(df.columns):
    df[column] = rec_global[pos]
df.to_csv('data/recall_results.csv', index=True)

In [37]:
# Display the table after it has been filled from rec_global.
df

Unnamed: 0,0.6,0.5,0.4,0.2,0.15,0.065,rand100,rand250,rand500
Random Forest,0.952102,0.945705,0.938042,0.935436,0.918745,0.841908,0.370489,0.364714,0.356156
XGBoost,0.940882,0.92085,0.914019,0.903061,0.866836,0.770141,0.373499,0.366075,0.354789
SVM,0.970531,0.942041,0.954979,0.947772,0.920208,0.891079,0.349261,0.343203,0.329254
Logistic Regression,0.912299,0.868607,0.870745,0.823072,0.788446,0.714182,0.364364,0.364782,0.354618
Neural Network 1,0.874273,0.779299,0.789309,0.719423,0.713387,0.65525,0.336348,0.332333,0.33943
Neural Network 2,0.905555,0.845006,0.857395,0.800552,0.765322,0.695213,0.35758,0.351207,0.346487
CNN,0.981885,0.955433,0.953374,0.917855,0.906283,0.899446,0.963868,0.937759,0.945691
TDA CNN,0.603733,0.601812,0.610386,0.688541,0.68995,0.65796,0.108309,0.102416,0.107165


In [None]:
# Scratch cell: rehearses the accumulate-then-average bookkeeping with dummy
# scores, using each model's index as its "accuracy" in every fold.
# WARNING: this rebinds acc_global, discarding any results it held.
acc_global = []
for _ in list_of_R:
    # One running total per model, starting at zero.
    acc_fold = [0] * len(lista_modelos)
    for _fold in range(10):
        for model_idx in range(len(lista_modelos)):
            acc_fold[model_idx] += model_idx
    # Average the totals over the 10 simulated folds.
    acc_fold = [total / 10 for total in acc_fold]
    acc_global.append(acc_fold)
            
    

In [14]:
# Scratch cell: mean and (population) standard deviation of a small list.
liston = list(range(1, 9))
avg_liston = sum(liston) / len(liston)
std_liston = np.std(liston)
print("Average of liston:", avg_liston)
print("Standard deviation of liston:", std_liston)
print(liston)

Average of liston: 4.5
Standard deviation of liston: 2.29128784747792
[1, 2, 3, 4, 5, 6, 7, 8]


In [None]:
# Display the current contents of acc_global.
acc_global

[[0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0],
 [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0]]

In [None]:
# Number of top-level entries in acc_global.
len(acc_global)

9

In [None]:
# Length of the first entry of acc_global.
len(acc_global[0])

10

In [None]:
# Length of the first element inside the first entry of acc_global.
# NOTE(review): this needs acc_global[0][0] to be a sized object; when the
# entries are lists of floats (as the averaging cells build them) it raises
# TypeError. Presumably written for an earlier nested layout; confirm.
len(acc_global[0][0])

8

In [None]:
# Copy the first entry with [:] and then slice it with [:0]; a [:0] slice is
# always empty, so this evaluates to [] whatever acc_global[0] contains.
(acc_global[0][:][:0])

[]

In [None]:
# Evaluate the TDA model on the image dataset stored for every R value.
signal_name = 'imagenes'
labels_name = 'labels'
#list_of_R=[0.6, 0.5, 0.4, 0.2, 0.15, 0.065]
#list_of_R = ['rand100', 'rand250', 'rand500']

# A dedicated loop variable avoids the inner loop reusing (and rebinding)
# the outer loop's name.
for r_value in list_of_R:
    # Load the images (stored with pickling enabled) and their labels.
    imagenes = np.load(f'data/{signal_name}_{r_value}.npy', allow_pickle=True)
    labels = np.load(f'data/{labels_name}_{r_value}.npy')

    # Flatten every image so the data becomes a 2-D array, one row per image.
    imagenes = np.array([img.flatten() for img in imagenes])

    # Add a trailing channel axis: (n_images, n_values) -> (n_images, n_values, 1).
    X = imagenes.reshape(-1, imagenes.shape[1], 1)
    y = labels

    # Evaluate the model
    val_loss, val_accuracy = tda_model.evaluate(X, y)
    print(f'Validation Loss: {val_loss}, Validation Accuracy: {val_accuracy}')

[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 220ms/step - accuracy: 0.5080 - loss: 0.7493
Validation Loss: 0.7508118748664856, Validation Accuracy: 0.503333330154419
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 223ms/step - accuracy: 0.4939 - loss: 0.7515
Validation Loss: 0.7493586540222168, Validation Accuracy: 0.5113333463668823
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 222ms/step - accuracy: 0.5006 - loss: 0.7446
Validation Loss: 0.7461341023445129, Validation Accuracy: 0.5059999823570251
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 230ms/step - accuracy: 0.4940 - loss: 0.7488
Validation Loss: 0.7502086758613586, Validation Accuracy: 0.4973333477973938
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 219ms/step - accuracy: 0.4951 - loss: 0.7507
Validation Loss: 0.7497367262840271, Validation Accuracy: 0.49933332204818726
[1m47/47[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 

: 