# Ruido chico

In [163]:
import numpy as np
noisy_signals = np.load("all_noisy_signals_Rchico.npy")
gw_signals = np.load("all_gw_signals_Rchico.npy")
labels = np.load("all_labels_Rchico.npy")

print(f"Number of noisy signals: {len(noisy_signals)}")
print(f"Number of timesteps per series: {len(noisy_signals[0])}")

Number of noisy signals: 3000
Number of timesteps per series: 8692


In [164]:
print(noisy_signals.shape)

(3000, 8692)


In [165]:
print(labels.shape)

(3000,)


In [166]:
import numpy as np
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# get the index corresponding to the first pure noise time series
background_idx = np.argmin(labels)
# get the index corresponding to the first noise + gravitational wave time series
signal_idx = np.argmax(labels)

ts_noise = noisy_signals[background_idx]
ts_background = noisy_signals[signal_idx]
ts_signal = gw_signals[signal_idx]

fig = make_subplots(rows=1, cols=2)

fig.add_trace(
    go.Scatter(x=list(range(len(ts_noise))), y=ts_noise, mode="lines", name="noise"),
    row=1,
    col=1,
)

fig.add_trace(
    go.Scatter(
        x=list(range(len(ts_background))),
        y=ts_background,
        mode="lines",
        name="background",
    ),
    row=1,
    col=2,
)

fig.add_trace(
    go.Scatter(x=list(range(len(ts_signal))), y=ts_signal, mode="lines", name="signal"),
    row=1,
    col=2,
)
fig.show()

In [167]:
from gtda.time_series import SingleTakensEmbedding
embedding_dimension = 30
embedding_time_delay = 30
stride = 5

embedder = SingleTakensEmbedding(
    parameters_type="search", n_jobs=6, time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

y_gw_embedded = embedder.fit_transform(gw_signals[0])

In [168]:
from sklearn.decomposition import PCA
from gtda.plotting import plot_point_cloud

pca = PCA(n_components=3)
y_gw_embedded_pca = pca.fit_transform(y_gw_embedded)

plot_point_cloud(y_gw_embedded_pca)

In [169]:
embedding_dimension = 30
embedding_time_delay = 30
stride = 5

embedder = SingleTakensEmbedding(
    parameters_type="search", n_jobs=6, time_delay=embedding_time_delay, dimension=embedding_dimension, stride=stride
)

y_noise_embedded = embedder.fit_transform(noisy_signals[background_idx])

pca = PCA(n_components=3)
y_noise_embedded_pca = pca.fit_transform(y_noise_embedded)

plot_point_cloud(y_noise_embedded_pca)

# TDA con PCA

In [170]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding

embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [171]:
features = topological_transfomer.fit_transform(noisy_signals)

In [172]:
print(features.shape)

(3000, 2)


In [173]:
from sklearn.metrics import accuracy_score, roc_auc_score


def print_scores(fitted_model):
    res = {
        "Accuracy on train:": accuracy_score(fitted_model.predict(X_train), y_train),
        "ROC AUC on train:": roc_auc_score(
            y_train, fitted_model.predict_proba(X_train)[:, 1]
        ),
        "Accuracy on valid:": accuracy_score(fitted_model.predict(X_valid), y_valid),
        "ROC AUC on valid:": roc_auc_score(
            y_valid, fitted_model.predict_proba(X_valid)[:, 1]
        ),
    }

    for k, v in res.items():
        print(k, round(v, 3))

# Pruebas con TDA en varios modelos

In [174]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

In [175]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.513
ROC AUC on train: 0.5
Accuracy on valid: 0.52
ROC AUC on valid: 0.505


In [176]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.56
ROC AUC on valid: 0.558


In [177]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.523
ROC AUC on valid: 0.525


In [178]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)


Accuracy on train: 0.694
ROC AUC on train: 0.75
Accuracy on valid: 0.55
ROC AUC on valid: 0.544


In [179]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.499
ROC AUC on train: 0.497
Accuracy on valid: 0.507
ROC AUC on valid: 0.494


# Pruebas con solamente PCA

In [180]:
print(features.shape)

(3000, 2)


In [181]:

# Crear un objeto PCA que reducirá el espacio de características a 2 dimensiones
pca = PCA(n_components=2)

# Ajustar el PCA a los datos y transformar los datos al nuevo espacio
noisy_signals_reduced = pca.fit_transform(noisy_signals)

# Ahora 'noisy_signals_reduced' tiene forma (500, 2)
print(noisy_signals_reduced.shape)

(3000, 2)


In [182]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    noisy_signals_reduced, labels, test_size=0.1, random_state=42
)

In [183]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.513
ROC AUC on train: 0.5
Accuracy on valid: 0.52
ROC AUC on valid: 0.5


In [184]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 0.513
ROC AUC on train: 0.5
Accuracy on valid: 0.52
ROC AUC on valid: 0.5


In [185]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 0.513
ROC AUC on train: 0.5
Accuracy on valid: 0.52
ROC AUC on valid: 0.5


In [186]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.678
ROC AUC on train: 0.739
Accuracy on valid: 0.507
ROC AUC on valid: 0.497


In [187]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.512
ROC AUC on train: 0.513
Accuracy on valid: 0.517
ROC AUC on valid: 0.48


# TDA con UMAP

In [188]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from umap import UMAP


embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

batch_pca = CollectionTransformer(UMAP(n_components=3, n_neighbors=15, min_dist=0.001, metric='euclidean'), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [189]:
features = topological_transfomer.fit_transform(noisy_signals)


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [190]:
print(features.shape)

(3000, 2)


# Pruebas con TDA en varios modelos

In [197]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

In [198]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.512
ROC AUC on train: 0.529
Accuracy on valid: 0.517
ROC AUC on valid: 0.474


In [199]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.47
ROC AUC on valid: 0.47


In [200]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.48
ROC AUC on valid: 0.476


In [201]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.677
ROC AUC on train: 0.738
Accuracy on valid: 0.467
ROC AUC on valid: 0.458


In [202]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.535
ROC AUC on train: 0.543
Accuracy on valid: 0.523
ROC AUC on valid: 0.521


# TDA con TSNE

In [203]:
from gtda.diagrams import PersistenceEntropy, Scaler
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.manifold import TSNE


embedding_dimension = 200
embedding_time_delay = 10
stride = 10

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

batch_pca = CollectionTransformer(TSNE(n_components=3, perplexity=30, learning_rate=200), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)


steps = [("embedder", embedder),
         ("pca", batch_pca),
         ("persistence", persistence),
         ("scaling", scaling),
         ("entropy", entropy)]
topological_transfomer = Pipeline(steps)

In [207]:
noisy_signals.shape

(3000, 8692)

In [208]:
import numpy as np

# Supongamos que 'datos' es tu array original de NumPy con forma (3000, 8692)
# datos = np.random.rand(3000, 8692)  # Ejemplo de creación de un array grande

# Determinamos el número de filas a seleccionar
num_filas_seleccionar = 1500

# Generamos índices aleatorios sin repetición
indices_aleatorios = np.random.choice(noisy_signals.shape[0], num_filas_seleccionar, replace=False)

# Seleccionamos las filas correspondientes a esos índices
noisy_signals_ = noisy_signals[indices_aleatorios]

In [213]:
labels_ = labels[indices_aleatorios]

In [209]:
features = topological_transfomer.fit_transform(noisy_signals_)

In [212]:
print(features.shape)

(1500, 2)


# Pruebas con TDA en varios modelos

In [214]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels_, test_size=0.1, random_state=42
)

In [215]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.514
ROC AUC on train: 0.513
Accuracy on valid: 0.613
ROC AUC on valid: 0.453


In [216]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.573
ROC AUC on valid: 0.574


In [217]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.48
ROC AUC on valid: 0.46


In [218]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.681
ROC AUC on train: 0.735
Accuracy on valid: 0.5
ROC AUC on valid: 0.494


In [219]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.491
ROC AUC on train: 0.512
Accuracy on valid: 0.373
ROC AUC on valid: 0.504


# Otro TDA 

In [249]:
from gtda.diagrams import PersistenceEntropy, Scaler, Amplitude
from gtda.homology import VietorisRipsPersistence
from gtda.metaestimators import CollectionTransformer
from gtda.pipeline import Pipeline
from gtda.time_series import TakensEmbedding
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA
import numpy as np

from sklearn.preprocessing import FunctionTransformer

# Función para imprimir la forma de los datos
def print_shape(X):
    print(f"Shape of data: {X.shape}")
    return X

# Envoltura de la función para usar en el pipeline
print_shape_transformer = FunctionTransformer(print_shape)

# Reducir la dimensión del embedding si no se justifica una tan alta
embedding_dimension = 100  # Reducido de 200 para simplificar el modelo
embedding_time_delay = 10
stride = 5  # Reducido para una mayor granularidad en el embedding

embedder = TakensEmbedding(time_delay=embedding_time_delay,
                           dimension=embedding_dimension,
                           stride=stride)

# Utilizar PCA directamente, sin CollectionTransformer, para simplificar
batch_pca = CollectionTransformer(PCA(n_components=3), n_jobs=-1)

persistence = VietorisRipsPersistence(homology_dimensions=[0, 1], n_jobs=-1)

scaling = Scaler()

entropy = PersistenceEntropy(normalize=True, nan_fill_value=-10)

# Añadir un cálculo de la amplitud para evaluar la importancia de las características topológicas
amplitude = Amplitude(metric='bottleneck') #, metric_params={'n_jobs': -1}

# Define los transformadores que actúan en paralelo
parallel_processing = FeatureUnion([
    ("amplitude", amplitude),
    ("entropy", entropy)
])

# Ahora construimos el pipeline incluyendo esta transformación en varios puntos
steps = [
    ("embedder", embedder),
    ("print_shape_after_embedding", print_shape_transformer),  # Después de embedding
    ("pca", batch_pca),
    ("print_shape_after_pca", print_shape_transformer),  # Después de PCA
    ("persistence", persistence),
    ("print_shape_after_persistence", print_shape_transformer),  # Después de calcular la persistencia
    ("scaling", scaling),
    ("print_shape_after_scaling", print_shape_transformer),
    ("combined_features", parallel_processing)
    
]

topological_transfomer = Pipeline(steps)

features = topological_transfomer.fit_transform(noisy_signals)

Shape of data: (3000, 1541, 100)
Shape of data: (3000, 1541, 3)
Shape of data: (3000, 2409, 3)
Shape of data: (3000, 2409, 3)


In [251]:
print(features.shape)

(3000, 4)


# Pruebas con TDA en varios modelos

In [252]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

In [253]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)
print_scores(model)

Accuracy on train: 0.516
ROC AUC on train: 0.5
Accuracy on valid: 0.513
ROC AUC on valid: 0.523


In [254]:
from sklearn.tree import DecisionTreeClassifier

tree_model = DecisionTreeClassifier()
tree_model.fit(X_train, y_train)
print_scores(tree_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.547
ROC AUC on valid: 0.546


In [255]:
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier()
rf_model.fit(X_train, y_train)
print_scores(rf_model)

Accuracy on train: 1.0
ROC AUC on train: 1.0
Accuracy on valid: 0.533
ROC AUC on valid: 0.545


In [256]:
from sklearn.neighbors import KNeighborsClassifier

knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
print_scores(knn_model)

Accuracy on train: 0.711
ROC AUC on train: 0.768
Accuracy on valid: 0.523
ROC AUC on valid: 0.527


In [257]:
from sklearn.naive_bayes import GaussianNB

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
print_scores(nb_model)

Accuracy on train: 0.514
ROC AUC on train: 0.507
Accuracy on valid: 0.527
ROC AUC on valid: 0.497
