# Ruido grande

In [1]:
import numpy as np
# Read the three arrays used throughout the notebook from the working directory.
labels = np.load("all_labels_alto.npy")
gw_signals = np.load("all_gw_signals_alto.npy")
noisy_signals = np.load("all_noisy_signals_alto.npy")

# Quick sanity check on the size of the dataset.
print(
    f"Number of noisy signals: {len(noisy_signals)}",
    f"Number of timesteps per series: {len(noisy_signals[0])}",
    sep="\n",
)

Number of noisy signals: 5000
Number of timesteps per series: 8692


In [2]:
print(noisy_signals.shape)

(5000, 8692)


In [3]:
print(labels.shape)

(5000,)


In [4]:
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# argmin/argmax return the first occurrence of the smallest/largest label, i.e. the
# first pure-noise series and the first series containing a gravitational wave.
background_idx = np.argmin(labels)
signal_idx = np.argmax(labels)

ts_noise = noisy_signals[background_idx]
ts_background = noisy_signals[signal_idx]
ts_signal = gw_signals[signal_idx]

# Left panel: the noise-only series. Right panel: the noisy series at `signal_idx`
# with the corresponding entry of `gw_signals` drawn on top of it.
fig = make_subplots(rows=1, cols=2)
for _series, _trace_name, _column in (
    (ts_noise, "noise", 1),
    (ts_background, "background", 2),
    (ts_signal, "signal", 2),
):
    fig.add_trace(
        go.Scatter(
            x=list(range(len(_series))), y=_series, mode="lines", name=_trace_name
        ),
        row=1,
        col=_column,
    )
fig.show()

In [5]:
from gtda.time_series import SingleTakensEmbedding
# Delay-embedding settings; with parameters_type="search" the embedder is asked to
# search for the time delay / dimension itself (presumably bounded by these values —
# confirm against the giotto-tda documentation).
embedding_dimension, embedding_time_delay, stride = 30, 30, 5

embedder = SingleTakensEmbedding(
    parameters_type="search",
    time_delay=embedding_time_delay,
    dimension=embedding_dimension,
    stride=stride,
    n_jobs=6,
)

# Embed the first series stored in `gw_signals`.
y_gw_embedded = embedder.fit_transform(gw_signals[0])

In [6]:
from sklearn.decomposition import PCA
from gtda.plotting import plot_point_cloud

# Reduce the embedded point cloud to 3 principal components so it can be drawn.
pca = PCA(n_components=3)
y_gw_embedded_pca = pca.fit_transform(y_gw_embedded)

# Last expression of the cell: the returned figure is what the notebook displays.
plot_point_cloud(y_gw_embedded_pca)

In [7]:
# Same embedding settings as used for the clean signal, now applied to the
# series selected by `background_idx`.
embedding_dimension, embedding_time_delay, stride = 30, 30, 5

embedder = SingleTakensEmbedding(
    parameters_type="search",
    time_delay=embedding_time_delay,
    dimension=embedding_dimension,
    stride=stride,
    n_jobs=6,
)
y_noise_embedded = embedder.fit_transform(noisy_signals[background_idx])

# Project onto 3 principal components and display the resulting point cloud.
pca = PCA(n_components=3)
y_noise_embedded_pca = pca.fit_transform(y_noise_embedded)

plot_point_cloud(y_noise_embedded_pca)

# TDA con PCA

In [55]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding

# Takens embedding settings applied to every series in the batch.
embedding_dimension, embedding_time_delay, stride = 200, 10, 10
embedder = TakensEmbedding(
    time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

# PCA wrapped in CollectionTransformer, followed by Vietoris-Rips persistence
# restricted to homology dimensions 0 and 1.
batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)
persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

# Diagram rescaling and persistence-entropy features (NaNs replaced by -10).
scaling = Scaler()
entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)

steps = [
    ("embedder", embedder),
    ("pca", batch_pca),
    ("persistence", persistence),
    ("scaling", scaling),
    ("entropy", entropy),
]
topological_transfomer = Pipeline(steps)

In [56]:
# Run the full pipeline on every noisy series; the result is stored in `features`.
features = topological_transfomer.fit_transform(noisy_signals)

In [57]:
print(features.shape)

(5000, 2)


In [58]:
from sklearn.metrics import accuracy_score, roc_auc_score


def print_scores(fitted_model):
    """Print accuracy and ROC AUC of a fitted classifier on the train and validation sets.

    The data is not passed in: the function reads the module-level ``X_train``,
    ``y_train``, ``X_valid`` and ``y_valid`` at call time, so it always reports on
    whichever split was created most recently.

    Parameters
    ----------
    fitted_model
        A fitted estimator exposing ``predict`` and ``predict_proba``; column 1 of
        ``predict_proba`` is used as the score for ROC AUC.
    """
    res = {
        # scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
        # printed numbers do not change, but the call now follows the documented order.
        "Accuracy on train:": accuracy_score(y_train, fitted_model.predict(X_train)),
        "ROC AUC on train:": roc_auc_score(
            y_train, fitted_model.predict_proba(X_train)[:, 1]
        ),
        "Accuracy on valid:": accuracy_score(y_valid, fitted_model.predict(X_valid)),
        "ROC AUC on valid:": roc_auc_score(
            y_valid, fitted_model.predict_proba(X_valid)[:, 1]
        ),
    }

    for k, v in res.items():
        print(k, round(v, 3))

# Pruebas con TDA en varios modelos

In [59]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, random_state=42, test_size=0.1)

In [60]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression; fit() returns the estimator, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.696
ROC AUC on train: 0.765
Accuracy on valid: 0.674
ROC AUC on valid: 0.741


In [61]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the reported scores can change between runs
# even though the train/validation split itself is seeded.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.628
ROC AUC on valid: 0.629


In [62]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state the reported scores change between runs
# even though the train/validation split itself is seeded.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.67
ROC AUC on valid: 0.719


In [63]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with default settings; fit() returns the estimator itself.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
print_scores(knn_model)


Accuracy on train: 0.775
ROC AUC on train: 0.855
Accuracy on valid: 0.648
ROC AUC on valid: 0.699


In [64]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.689
ROC AUC on train: 0.746
Accuracy on valid: 0.684
ROC AUC on valid: 0.743


# Pruebas con solamente PCA

In [18]:
print(features.shape)

(5000, 2)


In [19]:

# Create a PCA object that reduces the feature space to 2 dimensions
pca = PCA(n_components=2)

# Fit the PCA on the raw series and project them onto the new space
noisy_signals_reduced = pca.fit_transform(noisy_signals)

# 'noisy_signals_reduced' now has one 2-component row per input series: (5000, 2) for this dataset
print(noisy_signals_reduced.shape)

(5000, 2)


In [20]:
from sklearn.model_selection import train_test_split

# Same 90/10 seeded split as before, now on the PCA-reduced raw series.
X_train, X_valid, y_train, y_valid = train_test_split(noisy_signals_reduced, labels, random_state=42, test_size=0.1)

In [21]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression; fit() returns the estimator, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.507
ROC AUC on train: 0.5
Accuracy on valid: 0.476
ROC AUC on valid: 0.5


In [22]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the reported scores can change between runs
# even though the train/validation split itself is seeded.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 0.507
ROC AUC on train: 0.5
Accuracy on valid: 0.476
ROC AUC on valid: 0.5


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state the reported scores change between runs
# even though the train/validation split itself is seeded.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 0.507
ROC AUC on train: 0.5
Accuracy on valid: 0.476
ROC AUC on valid: 0.5


In [24]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with default settings; fit() returns the estimator itself.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.7
ROC AUC on train: 0.77
Accuracy on valid: 0.506
ROC AUC on valid: 0.538


In [25]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.549
ROC AUC on train: 0.57
Accuracy on valid: 0.568
ROC AUC on valid: 0.551


# TDA con UMAP

In [26]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from umap import UMAP


# Takens embedding settings applied to every series in the batch.
embedding_dimension, embedding_time_delay, stride = 200, 10, 10
embedder = TakensEmbedding(
    time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

# The reducer is UMAP here; the variable and the step keep the "pca" name used by
# the other pipelines in this notebook.
# NOTE(review): UMAP is created without random_state, so features may differ between runs.
batch_pca = CollectionTransformer(
    UMAP(n_components=3, n_neighbors=15, min_dist=0.001, metric='euclidean'),
    n_jobs=-1,
)
persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

# Diagram rescaling and persistence-entropy features (NaNs replaced by -10).
scaling = Scaler()
entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)

steps = [
    ("embedder", embedder),
    ("pca", batch_pca),
    ("persistence", persistence),
    ("scaling", scaling),
    ("entropy", entropy),
]
topological_transfomer = Pipeline(steps)

In [27]:
# Run the UMAP-based pipeline on every noisy series; this rebinds `features`.
features = topological_transfomer.fit_transform(noisy_signals)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [28]:
print(features.shape)

(5000, 2)


# Pruebas con TDA en varios modelos

In [29]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, random_state=42, test_size=0.1)

In [30]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression; fit() returns the estimator, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.582
ROC AUC on train: 0.616
Accuracy on valid: 0.572
ROC AUC on valid: 0.631


In [31]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the reported scores can change between runs
# even though the train/validation split itself is seeded.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.54
ROC AUC on valid: 0.541


In [32]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state the reported scores change between runs
# even though the train/validation split itself is seeded.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.558
ROC AUC on valid: 0.583


In [33]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with default settings; fit() returns the estimator itself.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.71
ROC AUC on train: 0.775
Accuracy on valid: 0.562
ROC AUC on valid: 0.566


In [34]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.561
ROC AUC on train: 0.582
Accuracy on valid: 0.584
ROC AUC on valid: 0.576


# TDA con TSNE

In [35]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.manifold import TSNE


# Takens embedding settings applied to every series in the batch.
embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

# The reducer is t-SNE here; the variable and the step keep the "pca" name used by
# the other pipelines in this notebook. random_state pins the otherwise random
# t-SNE optimisation so the extracted features are the same on every run.
batch_pca = CollectionTransformer(
    TSNE(n_components=3, perplexity=30, learning_rate=200, random_state=42),
    n_jobs=-1,
)

# Vietoris-Rips persistence restricted to homology dimensions 0 and 1.
persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

# Persistence-entropy features; NaNs are replaced by -10.
entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [36]:
noisy_signals.shape

(5000, 8692)

In [37]:
import numpy as np

# Draw a random subset of rows from `noisy_signals` (shape (5000, 8692) in this
# notebook); the t-SNE based pipeline is fitted on this subset only.

# Number of rows to keep
num_filas_seleccionar = 1500

# Row indices sampled without replacement. A seeded generator is used so the same
# rows (and therefore the same features and scores) are obtained on every run.
indices_aleatorios = np.random.default_rng(42).choice(
    noisy_signals.shape[0], num_filas_seleccionar, replace=False
)

# Rows of `noisy_signals` at the sampled indices
noisy_signals_ = noisy_signals[indices_aleatorios]

In [38]:
# Labels for the sampled rows, taken with the same indices so they stay aligned with `noisy_signals_`.
labels_ = labels[indices_aleatorios]

In [39]:
# Run the t-SNE based pipeline on the sampled subset only; this rebinds `features`.
features = topological_transfomer.fit_transform(noisy_signals_)

In [40]:
print(features.shape)

(1500, 2)


# Pruebas con TDA en varios modelos

In [41]:
from sklearn.model_selection import train_test_split

# 90/10 seeded split of the subsampled features and their matching labels.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels_, random_state=42, test_size=0.1)

In [42]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression; fit() returns the estimator, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.612
ROC AUC on train: 0.659
Accuracy on valid: 0.553
ROC AUC on valid: 0.568


In [43]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the reported scores can change between runs
# even though the train/validation split itself is seeded.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.487
ROC AUC on valid: 0.486


In [44]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state the reported scores change between runs
# even though the train/validation split itself is seeded.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.5
ROC AUC on valid: 0.51


In [45]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with default settings; fit() returns the estimator itself.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.702
ROC AUC on train: 0.774
Accuracy on valid: 0.513
ROC AUC on valid: 0.536


In [46]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.503
ROC AUC on train: 0.565
Accuracy on valid: 0.507
ROC AUC on valid: 0.535


# TDA + PCA + Amplitude

In [47]:
from gtda.diagrams import PersistenceEntropy, Scaler, Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
import numpy as np

from sklearn.preprocessing import FunctionTransformer

# Debug helper: report the shape of the data flowing through a pipeline step
def print_shape(X):
    """Print ``X.shape`` and hand ``X`` back unchanged."""
    dims = X.shape
    print(f"Shape of data: {dims}")
    return X

# Pass-through step that prints the array shape wherever it is inserted in the pipeline.
print_shape_transformer = FunctionTransformer(print_shape)

# Takens embedding settings for this experiment.
embedding_dimension, embedding_time_delay, stride = 100, 10, 5
embedder = TakensEmbedding(
    time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

# PCA is still wrapped in CollectionTransformer, exactly as in the earlier pipelines.
batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)
persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)
scaling = Scaler()

# Two diagram vectorisations (persistence entropy and bottleneck amplitude) whose
# outputs are concatenated by FeatureUnion, amplitude columns first.
entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)
amplitude = Amplitude(metric='bottleneck')
parallel_processing = FeatureUnion([("amplitude", amplitude), ("entropy", entropy)])

# Full pipeline, with a shape report after each of the first four stages.
steps = [
    ("embedder", embedder),
    ("print_shape_after_embedding", print_shape_transformer),
    ("pca", batch_pca),
    ("print_shape_after_pca", print_shape_transformer),
    ("persistence", persistence),
    ("print_shape_after_persistence", print_shape_transformer),
    ("scaling", scaling),
    ("print_shape_after_scaling", print_shape_transformer),
    ("combined_features", parallel_processing),
]
topological_transfomer = Pipeline(steps)

features = topological_transfomer.fit_transform(noisy_signals)

Shape of data: (5000, 1541, 100)
Shape of data: (5000, 1541, 3)
Shape of data: (5000, 2432, 3)
Shape of data: (5000, 2432, 3)


In [48]:
print(features.shape)

(5000, 4)


# Pruebas con TDA en varios modelos

In [49]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, random_state=42, test_size=0.1)

In [50]:
from sklearn.linear_model import LogisticRegression

# Default logistic regression; fit() returns the estimator, so it can be chained.
model = LogisticRegression().fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.777
ROC AUC on train: 0.85
Accuracy on valid: 0.782
ROC AUC on valid: 0.853


In [51]:
from sklearn.tree import DecisionTreeClassifier

# Seed the tree: without random_state the reported scores can change between runs
# even though the train/validation split itself is seeded.
tree_model = DecisionTreeClassifier(random_state=42)
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.708
ROC AUC on valid: 0.708


In [52]:
from sklearn.ensemble import RandomForestClassifier

# Seed the forest: without random_state the reported scores change between runs
# even though the train/validation split itself is seeded.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.762
ROC AUC on valid: 0.854


In [53]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with default settings; fit() returns the estimator itself.
knn_model = KNeighborsClassifier().fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.825
ROC AUC on train: 0.91
Accuracy on valid: 0.746
ROC AUC on valid: 0.816


In [54]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; fit() returns the estimator itself.
nb_model = GaussianNB().fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.759
ROC AUC on train: 0.826
Accuracy on valid: 0.776
ROC AUC on valid: 0.827


# Búsqueda de parámetros para TDA + PCA + Amplitude

In [65]:
import numpy as np
# Reload the three arrays from disk so this section starts from the original data.
labels = np.load("all_labels_alto.npy")
gw_signals = np.load("all_gw_signals_alto.npy")
noisy_signals = np.load("all_noisy_signals_alto.npy")

# Quick sanity check on the size of the dataset.
print(
    f"Number of noisy signals: {len(noisy_signals)}",
    f"Number of timesteps per series: {len(noisy_signals[0])}",
    sep="\n",
)

Number of noisy signals: 5000
Number of timesteps per series: 8692


In [66]:
from gtda.diagrams import PersistenceEntropy, Scaler, Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
import numpy as np
from sklearn.preprocessing import FunctionTransformer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [68]:
# Parameter configurations to evaluate. "tam_entrenamiento" is the fraction of the
# samples used for training; the remainder is used for validation.
parametros = [
    {"emb_dim": 20, "emb_time": 25, "stride_variable": 5, "pca_components": 5, "tam_entrenamiento": 0.75},
    {"emb_dim": 30, "emb_time": 20, "stride_variable": 7, "pca_components": 4, "tam_entrenamiento": 0.8},
    {"emb_dim": 50, "emb_time": 15, "stride_variable": 10, "pca_components": 3, "tam_entrenamiento": 0.85},
    {"emb_dim": 100, "emb_time": 10, "stride_variable": 3, "pca_components": 2, "tam_entrenamiento": 0.9},
    {"emb_dim": 150, "emb_time": 5, "stride_variable": 13, "pca_components": 1, "tam_entrenamiento": 0.85}
]


def _score_split(fitted_model, X_tr, y_tr, X_va, y_va):
    """Return accuracy and ROC AUC of a fitted classifier on a train/validation split.

    The data is passed in explicitly, and the helper has its own name so that the
    notebook-level ``print_scores(fitted_model)`` used by other cells is not replaced
    by a function with a different signature. The dict keys are the labels printed
    next to each value.
    """
    return {
        "Accuracy on train:": accuracy_score(y_tr, fitted_model.predict(X_tr)),
        "ROC AUC on train:": roc_auc_score(y_tr, fitted_model.predict_proba(X_tr)[:, 1]),
        "Accuracy on valid:": accuracy_score(y_va, fitted_model.predict(X_va)),
        "ROC AUC on valid:": roc_auc_score(y_va, fitted_model.predict_proba(X_va)[:, 1]),
    }


for param in parametros:
    emb_dim = param['emb_dim']
    emb_time = param['emb_time']
    stride_variable = param['stride_variable']
    pca_components = param['pca_components']
    tam_entrenamiento = param['tam_entrenamiento']

    print(f"Dimensión Embedding: {emb_dim} Tiempo de retardo Embedding: {emb_time} Stride: {stride_variable} Componentes PCA: {pca_components} Tamaño de entrenamiento: {tam_entrenamiento}")

    # Feature-extraction pipeline for this configuration.
    embedder = TakensEmbedding(time_delay=emb_time,
                               dimension=emb_dim,
                               stride=stride_variable)

    # PCA wrapped in CollectionTransformer, as in the earlier pipelines.
    batch_pca = CollectionTransformer(PCA(n_components=pca_components), n_jobs=-1)

    persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

    scaling = Scaler()

    entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)

    amplitude = Amplitude(metric='bottleneck')

    # Amplitude and entropy are computed on the same diagrams and concatenated.
    parallel_processing = FeatureUnion([
        ("amplitude", amplitude),
        ("entropy", entropy)
    ])

    steps = [
        ("embedder", embedder),
        ("pca", batch_pca),
        ("persistence", persistence),
        ("scaling", scaling),
        ("combined_features", parallel_processing)
    ]

    topological_transfomer = Pipeline(steps)

    features = topological_transfomer.fit_transform(noisy_signals)

    # Pass the training fraction directly. Computing test_size as 1 - fraction
    # picks up floating-point error (1 - 0.85 == 0.15000000000000002), which
    # train_test_split rounds up to one extra validation sample.
    X_train, X_valid, y_train, y_valid = train_test_split(
        features, labels, train_size=tam_entrenamiento, random_state=42
    )

    # Validation results and best-model tracking for this configuration
    accuracy_valid_scores = []
    roc_auc_valid_scores = []
    best_accuracy = {"model": None, "score": 0}
    best_roc_auc = {"model": None, "score": 0}

    # Models to compare; the tree-based ones are seeded so the comparison between
    # configurations is repeatable.
    models = {
        "Logistic Regression": LogisticRegression(),
        "Decision Tree": DecisionTreeClassifier(random_state=42),
        "Random Forest": RandomForestClassifier(random_state=42),
        "KNN": KNeighborsClassifier(),
        "Gaussian NB": GaussianNB()
    }

    # Train and evaluate each model
    for name, model in models.items():
        print(f"Evaluating {name}")
        model.fit(X_train, y_train)
        res = _score_split(model, X_train, y_train, X_valid, y_valid)

        for k, v in res.items():
            print(k, round(v, 3))

        accuracy_valid_scores.append(res["Accuracy on valid:"])
        roc_auc_valid_scores.append(res["ROC AUC on valid:"])

        # Strict comparison: on a tie the model evaluated first is kept.
        if res["Accuracy on valid:"] > best_accuracy["score"]:
            best_accuracy["model"] = name
            best_accuracy["score"] = res["Accuracy on valid:"]
        if res["ROC AUC on valid:"] > best_roc_auc["score"]:
            best_roc_auc["model"] = name
            best_roc_auc["score"] = res["ROC AUC on valid:"]
        print("-" * 30)

    # Averages over the five models, and the best model by each metric
    print("Average Accuracy on valid:", sum(accuracy_valid_scores) / len(accuracy_valid_scores))
    print("Average ROC AUC on valid:", sum(roc_auc_valid_scores) / len(roc_auc_valid_scores))
    print(f"Best model by Accuracy: {best_accuracy['model']} with a score of {best_accuracy['score']:.3f}")
    print(f"Best model by ROC AUC: {best_roc_auc['model']} with a score of {best_roc_auc['score']:.3f}")

Dimensión Embedding: 20 Tiempo de retardo Embedding: 25 Stride: 5 Componentes PCA: 5 Tamaño de entrenamiento: 0.75
Evaluating Logistic Regression
Accuracy on train: 0.698
ROC AUC on train: 0.77
Accuracy on valid: 0.677
ROC AUC on valid: 0.748
------------------------------
Evaluating Decision Tree
Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.698
ROC AUC on valid: 0.699
------------------------------
Evaluating Random Forest
Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.739
ROC AUC on valid: 0.817
------------------------------
Evaluating KNN
Accuracy on train: 0.813
ROC AUC on train: 0.9
Accuracy on valid: 0.718
ROC AUC on valid: 0.774
------------------------------
Evaluating Gaussian NB
Accuracy on train: 0.733
ROC AUC on train: 0.801
Accuracy on valid: 0.726
ROC AUC on valid: 0.793
------------------------------
Average Accuracy on valid: 0.71168
Average ROC AUC on valid: 0.7660918259390475
Best model by Accuracy: Random Forest with a score o

# Búsqueda de hiperparámetros en los modelos

In [69]:
# Embedding settings used for the model hyper-parameter comparison below.
embedding_dimension, embedding_time_delay, stride = 100, 10, 3
embedder = TakensEmbedding(
    time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

# PCA (2 components) wrapped in CollectionTransformer, then H0/H1 persistence.
batch_pca = CollectionTransformer(PCA(n_components=2), n_jobs=-1)
persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)
scaling = Scaler()

# Persistence entropy and bottleneck amplitude, concatenated by FeatureUnion
# with the amplitude columns first.
entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)
amplitude = Amplitude(metric='bottleneck')
parallel_processing = FeatureUnion([("amplitude", amplitude), ("entropy", entropy)])

steps = [
    ("embedder", embedder),
    ("pca", batch_pca),
    ("persistence", persistence),
    ("scaling", scaling),
    ("combined_features", parallel_processing),
]
topological_transfomer = Pipeline(steps)

features = topological_transfomer.fit_transform(noisy_signals)

# Hold out 10% of the samples for validation; the fixed seed keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(features, labels, random_state=42, test_size=0.1)

In [76]:
def print_scores(fitted_model):
    """Print validation accuracy and ROC AUC of a fitted classifier, then a separator.

    Only the validation metrics are reported, so only those are computed; the
    train-set predictions of the previous version were evaluated and then discarded.
    ``X_valid`` and ``y_valid`` are read from the module level at call time.

    Parameters
    ----------
    fitted_model
        A fitted estimator exposing ``predict`` and ``predict_proba``; column 1 of
        ``predict_proba`` is used as the score for ROC AUC.
    """
    res = {
        # scikit-learn metrics take (y_true, y_pred)
        "Accuracy on valid:": accuracy_score(y_valid, fitted_model.predict(X_valid)),
        "ROC AUC on valid:": roc_auc_score(
            y_valid, fitted_model.predict_proba(X_valid)[:, 1]
        ),
    }

    for k, v in res.items():
        print(k, round(v, 3))
    print("-" * 30)


In [77]:
# Manual hyper-parameter comparison on the current train/validation split.
# Every estimator with a stochastic component (sag/saga solvers, trees, forests)
# is given a fixed random_state so the printed scores are the same on every run.

print('Regresión Logística')
model_1 = LogisticRegression(C=0.01, solver='liblinear', max_iter=100)
model_1.fit(X_train, y_train)
print_scores(model_1)

model_2 = LogisticRegression(C=1, solver='sag', max_iter=200, random_state=42)
model_2.fit(X_train, y_train)
print_scores(model_2)

model_3 = LogisticRegression(C=10, solver='newton-cg', max_iter=300)
model_3.fit(X_train, y_train)
print_scores(model_3)

model_4 = LogisticRegression(C=0.05, solver='saga', max_iter=500, penalty='l2', random_state=42)
model_4.fit(X_train, y_train)
print_scores(model_4)

model_5 = LogisticRegression(C=0.5, solver='lbfgs', max_iter=100, penalty='l2')
model_5.fit(X_train, y_train)
print_scores(model_5)

print('')
print('Árboles de decisión')
tree_model_1 = DecisionTreeClassifier(max_depth=3, min_samples_split=2, criterion='gini', random_state=42)
tree_model_1.fit(X_train, y_train)
print_scores(tree_model_1)

tree_model_2 = DecisionTreeClassifier(max_depth=10, min_samples_split=10, criterion='entropy', random_state=42)
tree_model_2.fit(X_train, y_train)
print_scores(tree_model_2)

tree_model_3 = DecisionTreeClassifier(max_depth=15, min_samples_split=5, criterion='gini', random_state=42)
tree_model_3.fit(X_train, y_train)
print_scores(tree_model_3)

print('')
print('Random Forest')
rf_model_1 = RandomForestClassifier(n_estimators=50, max_features='sqrt', max_depth=5, random_state=42)
rf_model_1.fit(X_train, y_train)
print_scores(rf_model_1)

rf_model_2 = RandomForestClassifier(n_estimators=100, max_features='log2', max_depth=10, random_state=42)
rf_model_2.fit(X_train, y_train)
print_scores(rf_model_2)

rf_model_3 = RandomForestClassifier(n_estimators=200, max_features='sqrt', max_depth=20, random_state=42)
rf_model_3.fit(X_train, y_train)
print_scores(rf_model_3)

rf_model_4 = RandomForestClassifier(n_estimators=150, max_features='sqrt', max_depth=25, min_samples_split=4, random_state=42)
rf_model_4.fit(X_train, y_train)
print_scores(rf_model_4)

rf_model_5 = RandomForestClassifier(n_estimators=300, max_features='log2', max_depth=30, min_samples_leaf=2, random_state=42)
rf_model_5.fit(X_train, y_train)
print_scores(rf_model_5)

print('')
print('KNN')
knn_model_1 = KNeighborsClassifier(n_neighbors=3, weights='uniform')
knn_model_1.fit(X_train, y_train)
print_scores(knn_model_1)

knn_model_2 = KNeighborsClassifier(n_neighbors=7, weights='distance')
knn_model_2.fit(X_train, y_train)
print_scores(knn_model_2)

knn_model_3 = KNeighborsClassifier(n_neighbors=15, weights='uniform')
knn_model_3.fit(X_train, y_train)
print_scores(knn_model_3)

print('')
print('Naive Bayes')
nb_model_1 = GaussianNB(var_smoothing=1e-9)
nb_model_1.fit(X_train, y_train)
print_scores(nb_model_1)

nb_model_2 = GaussianNB(var_smoothing=1e-8)
nb_model_2.fit(X_train, y_train)
print_scores(nb_model_2)

nb_model_3 = GaussianNB(var_smoothing=1e-10)
nb_model_3.fit(X_train, y_train)
print_scores(nb_model_3)


Regresión Logística
Accuracy on valid: 0.724
ROC AUC on valid: 0.839
------------------------------
Accuracy on valid: 0.852
ROC AUC on valid: 0.922
------------------------------
Accuracy on valid: 0.87
ROC AUC on valid: 0.927
------------------------------
Accuracy on valid: 0.802
ROC AUC on valid: 0.919
------------------------------
Accuracy on valid: 0.844
ROC AUC on valid: 0.921
------------------------------

Árboles de decisión
Accuracy on valid: 0.846
ROC AUC on valid: 0.922
------------------------------
Accuracy on valid: 0.822
ROC AUC on valid: 0.862
------------------------------
Accuracy on valid: 0.816
ROC AUC on valid: 0.814
------------------------------

Random Forest
Accuracy on valid: 0.88
ROC AUC on valid: 0.93
------------------------------
Accuracy on valid: 0.872
ROC AUC on valid: 0.926
------------------------------
Accuracy on valid: 0.864
ROC AUC on valid: 0.925
------------------------------
Accuracy on valid: 0.868
ROC AUC on valid: 0.924
------------------