# Ruido Intervalo

In [1]:
import numpy as np
noisy_signals = np.load("all_noisy_signals_ambos.npy")
gw_signals = np.load("all_gw_signals_ambos.npy")
labels = np.load("all_labels_ambos.npy")

print(f"Number of noisy signals: {len(noisy_signals)}")
print(f"Number of timesteps per series: {len(noisy_signals[0])}")

Number of noisy signals: 2500
Number of timesteps per series: 8692


In [2]:
print(noisy_signals.shape)

(2500, 8692)


In [3]:
print(labels.shape)

(2500,)


In [4]:
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# get the index corresponding to the first pure noise time series
background_idx = np.argmin(labels)
# get the index corresponding to the first noise + gravitational wave time series
signal_idx = np.argmax(labels)

ts_noise = noisy_signals[background_idx]
ts_background = noisy_signals[signal_idx]
ts_signal = gw_signals[signal_idx]

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(x=list(range(len(ts_noise))), y=ts_noise, mode="lines", name="noise"),
    row=1,
    col=1,
)

fig.add_trace(
    go.Scatter(
        x=list(range(len(ts_background))),
        y=ts_background,
        mode="lines",
        name="background",
    ),
    row=1,
    col=2,
)

fig.add_trace(
    go.Scatter(x=list(range(len(ts_signal))), y=ts_signal, mode="lines", name="signal"),
    row=1,
    col=2,
)
fig.show()

In [5]:
from gtda.time_series import SingleTakensEmbedding
embedding_dimension = 30
embedding_time_delay = 30
stride = 5

embedder = SingleTakensEmbedding(
    parameters_type="search", n_jobs=6, time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

y_gw_embedded = embedder.fit_transform(gw_signals[0])

In [6]:
from sklearn.decomposition import PCA
from gtda.plotting import plot_point_cloud

pca = PCA(n_components=3)
y_gw_embedded_pca = pca.fit_transform(y_gw_embedded)

plot_point_cloud(y_gw_embedded_pca)

In [7]:
embedding_dimension = 30
embedding_time_delay = 30
stride = 5

embedder = SingleTakensEmbedding(
    parameters_type="search", n_jobs=6, time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

y_noise_embedded = embedder.fit_transform(noisy_signals[background_idx])

pca = PCA(n_components=3)
y_noise_embedded_pca = pca.fit_transform(y_noise_embedded)

plot_point_cloud(y_noise_embedded_pca)

# TDA con PCA

In [55]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding

embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [56]:
features = topological_transfomer.fit_transform(noisy_signals)

In [57]:
print(features.shape)

(2500, 2)


In [58]:
from sklearn.metrics import accuracy_score, roc_auc_score


def print_scores(fitted_model):
    res = {
        "Accuracy on train:": accuracy_score(fitted_model.predict(X_train), y_train),
        "ROC AUC on train:": roc_auc_score(
            y_train, fitted_model.predict_proba(X_train)[:, 1]
        ),
        "Accuracy on valid:": accuracy_score(fitted_model.predict(X_valid), y_valid),
        "ROC AUC on valid:": roc_auc_score(
            y_valid, fitted_model.predict_proba(X_valid)[:, 1]
        ),
    }

    for k, v in res.items():
        print(k, round(v, 3))

# Pruebas con TDA en varios modelos

In [59]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

In [60]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.546
ROC AUC on train: 0.577
Accuracy on valid: 0.56
ROC AUC on valid: 0.571


In [61]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.576
ROC AUC on valid: 0.582


In [62]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.54
ROC AUC on valid: 0.552


In [63]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)


Accuracy on train: 0.699
ROC AUC on train: 0.756
Accuracy on valid: 0.524
ROC AUC on valid: 0.559


In [64]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.546
ROC AUC on train: 0.577
Accuracy on valid: 0.58
ROC AUC on valid: 0.57


# Pruebas con solamente PCA

In [97]:
print(features.shape)

(2500, 2)


In [98]:

# Crear un objeto PCA que reducirá el espacio de características a 2 dimensiones
pca = PCA(n_components=2)

# Ajustar el PCA a los datos y transformar los datos al nuevo espacio
noisy_signals_reduced = pca.fit_transform(noisy_signals)

# Ahora 'noisy_signals_reduced' tiene forma (500, 2)
print(noisy_signals_reduced.shape)

(2500, 2)


In [99]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    noisy_signals_reduced, labels, test_size=0.1, random_state=42
)

In [100]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.5
ROC AUC on train: 0.5
Accuracy on valid: 0.544
ROC AUC on valid: 0.5


In [101]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 0.5
ROC AUC on train: 0.5
Accuracy on valid: 0.544
ROC AUC on valid: 0.5


In [102]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 0.5
ROC AUC on train: 0.5
Accuracy on valid: 0.456
ROC AUC on valid: 0.5


In [103]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.707
ROC AUC on train: 0.774
Accuracy on valid: 0.544
ROC AUC on valid: 0.568


In [104]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.548
ROC AUC on train: 0.563
Accuracy on valid: 0.604
ROC AUC on valid: 0.557


# TDA con UMAP

In [26]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from umap import UMAP


embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

batch_pca = CollectionTransformer(UMAP(n_components=3, n_neighbors=15, min_dist=0.001, metric='euclidean'), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [27]:
features = topological_transfomer.fit_transform(noisy_signals)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [28]:
print(features.shape)

(2500, 2)


# Pruebas con TDA en varios modelos

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

In [30]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.541
ROC AUC on train: 0.561
Accuracy on valid: 0.544
ROC AUC on valid: 0.576


In [31]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.516
ROC AUC on valid: 0.513


In [32]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.508
ROC AUC on valid: 0.5


In [33]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.712
ROC AUC on train: 0.78
Accuracy on valid: 0.52
ROC AUC on valid: 0.521


In [34]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.528
ROC AUC on train: 0.56
Accuracy on valid: 0.608
ROC AUC on valid: 0.592


# TDA con TSNE

In [35]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.manifold import TSNE


embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

batch_pca = CollectionTransformer(TSNE(n_components=3, perplexity=30, learning_rate=200), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [36]:
noisy_signals.shape

(2500, 8692)

In [37]:
import numpy as np

# Supongamos que 'datos' es tu array original de NumPy con forma (3000, 8692)
# datos = np.random.rand(3000, 8692)  # Ejemplo de creación de un array grande

# Determinamos el número de filas a seleccionar
num_filas_seleccionar = 1500

# Generamos índices aleatorios sin repetición
indices_aleatorios = np.random.choice(noisy_signals.shape[0], num_filas_seleccionar, replace=False)

# Seleccionamos las filas correspondientes a esos índices
noisy_signals_ = noisy_signals[indices_aleatorios]

In [38]:
labels_ = labels[indices_aleatorios]

In [39]:
features = topological_transfomer.fit_transform(noisy_signals_)

In [40]:
print(features.shape)

(1500, 2)


# Pruebas con TDA en varios modelos

In [41]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels_, test_size=0.1, random_state=42
)

In [42]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.557
ROC AUC on train: 0.581
Accuracy on valid: 0.593
ROC AUC on valid: 0.657


In [43]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.54
ROC AUC on valid: 0.54


In [44]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.513
ROC AUC on valid: 0.551


In [45]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.71
ROC AUC on train: 0.772
Accuracy on valid: 0.567
ROC AUC on valid: 0.612


In [46]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.49
ROC AUC on train: 0.567
Accuracy on valid: 0.513
ROC AUC on valid: 0.526


# Otro TDA 

In [47]:
from gtda.diagrams import PersistenceEntropy, Scaler, Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
import numpy as np

from sklearn.preprocessing import FunctionTransformer

# Función para imprimir la forma de los datos
def print_shape(X):
    print(f"Shape of data: {X.shape}")
    return X

# Envoltura de la función para usar en el pipeline
print_shape_transformer = FunctionTransformer(print_shape)

# Reducir la dimensión del embedding si no se justifica una tan alta
embedding_dimension = 100  # Reducido de 200 para simplificar el modelo
embedding_time_delay = 10
stride = 5  # Reducido para una mayor granularidad en el embedding

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

# Utilizar PCA directamente, sin CollectionTransformer, para simplificar
batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)

# Añadir un cálculo de la amplitud para evaluar la importancia de las características topológicas
amplitude = Amplitude(metric='bottleneck') #, metric_params={'n_jobs': -1}

# Define los transformadores que actúan en paralelo
parallel_processing = FeatureUnion([
    ("amplitude", amplitude),
    ("entropy", entropy)
])

# Ahora construimos el pipeline incluyendo esta transformación en varios puntos
steps = [
    ("embedder", embedder),
    ("print_shape_after_embedding", print_shape_transformer),  # Después de embedding
    ("pca", batch_pca),
    ("print_shape_after_pca", print_shape_transformer),  # Después de PCA
    ("persistence", persistence),
    ("print_shape_after_persistence", print_shape_transformer),  # Después de calcular la persistencia
    ("scaling", scaling),
    ("print_shape_after_scaling", print_shape_transformer),
    ("combined_features", parallel_processing)
    
]

topological_transfomer = Pipeline(steps)

features = topological_transfomer.fit_transform(noisy_signals)

Shape of data: (2500, 1541, 100)
Shape of data: (2500, 1541, 3)
Shape of data: (2500, 2430, 3)
Shape of data: (2500, 2430, 3)


In [48]:
print(features.shape)

(2500, 4)


# Pruebas con TDA en varios modelos

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

In [50]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.558
ROC AUC on train: 0.599
Accuracy on valid: 0.564
ROC AUC on valid: 0.62


In [51]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.508
ROC AUC on valid: 0.507


In [52]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.628
ROC AUC on valid: 0.652


In [53]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.715
ROC AUC on train: 0.79
Accuracy on valid: 0.604
ROC AUC on valid: 0.628


In [54]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.55
ROC AUC on train: 0.597
Accuracy on valid: 0.576
ROC AUC on valid: 0.612


In [None]:
from gtda.diagrams import PersistenceEntropy, Scaler, Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
import numpy as np

# Reducir la dimensión del embedding si no se justifica una tan alta
embedding_dimension = 100  # Reducido de 200 para simplificar el modelo
embedding_time_delay = 10
stride = 5  # Reducido para una mayor granularidad en el embedding

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

# Utilizar PCA directamente, sin CollectionTransformer, para simplificar
batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)

# Añadir un cálculo de la amplitud para evaluar la importancia de las características topológicas
amplitude = Amplitude(metric='bottleneck')

# Define los transformadores que actúan en paralelo
parallel_processing = FeatureUnion([
    ("amplitude", amplitude),
    ("entropy", entropy)
])

# Ahora construimos el pipeline incluyendo esta transformación en varios puntos
steps = [
    ("embedder", embedder),
    ("pca", batch_pca),
    ("persistence", persistence),
    ("scaling", scaling),
    ("combined_features", parallel_processing)
    
]

topological_transfomer = Pipeline(steps)

features = topological_transfomer.fit_transform(noisy_signals)