# **Importar Librerías**

In [1]:
import scipy.io # To use the '.mat' files
import seaborn as sns
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
# Transformada Rápida de Fourier
from scipy.fft import fft, fftfreq
# Filtrado Pasa Bajo
from scipy.signal import butter, filtfilt
# Validación cruzada
from sklearn.model_selection import cross_val_score

# **Datos**

In [2]:
from google.colab import drive
drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [2]:
# Importar el archivo CSV con los datos filtrados (Pasa Bajo)
df = pd.read_csv('all_faults_filtered.csv')

# Obtener tipos de fallos únicos
faults = df['fault'].unique()

# Frecuencia de muestreo y frecuencia de corte para el filtro
fs = 48000  # Por ejemplo, 48 kHz
fc = 2000   # Frecuencia de corte, ajustar según sea necesario

print("Fallas:",faults)

Fallas: ['7IR' '7B' '7OR6' '7OR3' '7OR12' '14IR' '14B' '14OR' '21IR' '21B' '21OR6'
 '21OR3' '21OR21' 'N']


# **Transformada Rápida de Fourier**

In [3]:

# Obtener los valores únicos de la columna 'fault'
faults = df['fault'].unique()

# Crear una lista vacía para guardar los arreglos X
X_list = []

# Crear un bucle para cada valor de 'fault'
for f in faults:
    # Obtener los datos del tipo de fallo específico
    data = df[df['fault'] == f]['DE_data'].values
    # Aplicar la Transformada Rápida de Fourier (FFT)
    fft_result = fft(data)
    # Calcular la frecuencia correspondiente
    freq = fftfreq(len(fft_result), d=1/fs)
    # Apilar las frecuencias y las amplitudes como columnas
    X = np.column_stack((freq, np.abs(fft_result)))
    # Añadir el arreglo X a la lista
    X_list.append(X)

# Apilar verticalmente los arreglos X de la lista
X = np.vstack(X_list)

# Arreglo de Y
Y = df['fault'].to_numpy().reshape(-1, 1)
Y = Y.ravel()


In [4]:
print("X Shape: (",X.shape[0],",",X.shape[1],")")
print("Y Shape: (",Y.shape[0],")")

X Shape: ( 2782629 , 2 )
Y Shape: ( 2782629 )


# Análisis

In [5]:
#X = df['DE_data'].to_numpy().reshape(-1, 1)
#Y = df['fault'].to_numpy().reshape(-1, 1)
#Y = Y.ravel()

# Separar datos
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size = 0.75)

## 1. Modelo Random Forest

Referencia: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html


In [6]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators = 30,
                                          criterion = 'entropy',
                                          max_features= 'sqrt',
                                          verbose=1)

In [7]:
%%time
rfc.fit(X_train, Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


CPU times: total: 8min 42s
Wall time: 8min 46s


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  8.7min finished


## Prueba

In [11]:
from sklearn.metrics import precision_score, recall_score

In [9]:
Y_pred = rfc.predict(X_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   10.5s finished


In [13]:
# Calcular la precisión
precision = precision_score(Y_test, Y_pred, average='macro')  # Usa 'micro', 'macro', 'weighted' o None para problemas multiclase

# Calcular el recall
recall = recall_score(Y_test, Y_pred, average='macro')  # Usa 'micro', 'macro', 'weighted' o None para problemas multiclase

# Imprimir los resultados
print("Precisión: {:.5f}".format(precision))
print("Recall: {:.5f}".format(recall))

Precisión: 0.85314
Recall: 0.84766


In [8]:
%%time
rfc.score(X_train, Y_train)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:   49.3s finished


CPU times: total: 32.8 s
Wall time: 58.3 s


0.9991676932741279

In [9]:
%%time
rfc.score(X_test, Y_test)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    8.0s finished


CPU times: total: 9.16 s
Wall time: 10.5 s


0.8507743747646114

In [None]:
# GRAFICAR ÁRBOLES (Parece que se satura en google colab)
from sklearn.tree import plot_tree
plt.figure(figsize=(12,8))
plot_tree(rfc.estimators_[0], filled = True)
plt.show()

## 2. Evaluación con validación cruzada

In [None]:
%%time
score_rfc = cross_val_score( rfc, X, Y, cv = 30) # cv: Numero de splits que va a generar


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  5.5min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    1.6s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  5.1min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:  5.0min finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    0.7s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_j

In [None]:
print(score_rfc)
print("Promedio: ",score_rfc.mean()*100,"%")