1. Cargar los datasets

In [9]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler

# Directorio donde están los CSV reducidos
DATA_DIR = "DatasetReducido"

# Cargar los datasets ya reducidos y convertidos a CSV
datasets = {
    "FaultFree_Training": pd.read_csv(os.path.join(DATA_DIR, "FaultFree_Training_reduced.csv")),
    "FaultFree_Testing": pd.read_csv(os.path.join(DATA_DIR, "FaultFree_Testing_reduced.csv")),
    "Faulty_Training": pd.read_csv(os.path.join(DATA_DIR, "Faulty_Training_reduced.csv")),
    "Faulty_Testing": pd.read_csv(os.path.join(DATA_DIR, "Faulty_Testing_reduced.csv")),
}

print("Datasets CSV cargados correctamente.")

Datasets CSV cargados correctamente.


In [10]:
process_vars = [f"xmeas_{i}" for i in range(1, 42)]
actuator_vars = [f"xmv_{i}" for i in range(1, 12)]
all_vars = process_vars + actuator_vars

for name, df in datasets.items():
    df["fault_present"] = np.where(df["faultNumber"] > 0, 1, 0)

print("Columnas 'fault_present' creadas.")

Columnas 'fault_present' creadas.


In [11]:
df_normal_full = pd.concat([
    datasets["FaultFree_Training"],
    datasets["FaultFree_Testing"]
], ignore_index=True)

scaler = StandardScaler()
scaler.fit(df_normal_full[all_vars])

print("Normalizador entrenado únicamente con datos sin fallo.")

Normalizador entrenado únicamente con datos sin fallo.


In [12]:
def add_temporal_features(df, variables, ventanas=[5, 10]):
    """
    Añade diferencias temporales, medias móviles y desviaciones móviles
    por cada variable, usando agrupación por simulationRun.
    Esta versión evita fragmentación creando todas las columnas primero
    y concatenándolas de una vez al final.
    """

    # Ordenamos por si acaso
    df = df.sort_values(["simulationRun", "sample"]).copy()

    # Diccionario donde guardaremos todas las nuevas columnas
    nuevas_cols = {}

    for var in variables:

        # ---- Diferencia temporal de primer orden ----
        nuevas_cols[f"{var}_diff1"] = (
            df.groupby("simulationRun")[var].diff()
        )

        # ---- Medias y desviaciones móviles ----
        for w in ventanas:

            # Media móvil
            nuevas_cols[f"{var}_ma{w}"] = (
                df.groupby("simulationRun")[var]
                  .rolling(w)
                  .mean()
                  .reset_index(0, drop=True)
            )

            # Desviación típica móvil
            nuevas_cols[f"{var}_std{w}"] = (
                df.groupby("simulationRun")[var]
                  .rolling(w)
                  .std()
                  .reset_index(0, drop=True)
            )

    # Convertimos el diccionario en DataFrame
    df_nuevas = pd.DataFrame(nuevas_cols)

    # Concatenamos TODO de una sola vez → evita el PerformanceWarning
    df = pd.concat([df, df_nuevas], axis=1).copy()

    return df

In [13]:
def add_fault_timing_features(df):
    df = df.sort_values(["simulationRun", "sample"]).copy()

    # Tiempo desde el inicio del fallo
    df["time_since_fault"] = df.groupby("simulationRun")["fault_present"].cumsum()
    df["time_since_fault"] = df["time_since_fault"].where(df["fault_present"] == 1, 0)

    # Etapas del fallo (segmentadas por intervalos)
    df["fault_stage"] = 0
    positive_idx = df["fault_present"] == 1
    df.loc[positive_idx, "fault_stage"] = (df.loc[positive_idx, "time_since_fault"] // 10).astype(int)

    return df

In [14]:
OUTPUT_DIR = "DatasetProcesado"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Lista de features finales (se actualiza más adelante)
final_features = []

def process_and_save(df, name):
    global final_features

    # Añadir ingeniería de características
    df = add_temporal_features(df, all_vars)
    df = add_fault_timing_features(df)

    # Seleccionamos las columnas generadas
    feature_cols = [c for c in df.columns if any(k in c for k in ["_scaled", "diff1", "ma", "std", "time_since_fault", "fault_stage"])]

    # Primera vez: almacenamos features finales
    if not final_features:
        final_features = feature_cols

    # Asegurar que todas las features existen
    for c in final_features:
        if c not in df.columns:
            df[c] = 0

    # Escalar usando el scaler ajustado antes
    df_scaled = df.copy()
    df_scaled[all_vars] = scaler.transform(df[all_vars].fillna(0))

    # Guardar como CSV final
    out_file = os.path.join(OUTPUT_DIR, f"TEP_features_{name}.csv")
    df_scaled[["simulationRun", "sample", "faultNumber", "fault_present"] + final_features].to_csv(out_file, index=False)

    print("Guardado:", out_file)

In [15]:
# Procesar datasets de entrenamiento y prueba
process_and_save(datasets["Faulty_Training"], "faulty_training")
process_and_save(datasets["FaultFree_Training"], "faultfree_training")
process_and_save(datasets["Faulty_Testing"], "faulty_testing")
process_and_save(datasets["FaultFree_Testing"], "faultfree_testing")

print("Todos los CSV procesados correctamente.")

Guardado: DatasetProcesado\TEP_features_faulty_training.csv
Guardado: DatasetProcesado\TEP_features_faultfree_training.csv
Guardado: DatasetProcesado\TEP_features_faulty_testing.csv
Guardado: DatasetProcesado\TEP_features_faultfree_testing.csv
Todos los CSV procesados correctamente.


2. Comprobación

In [16]:
df = pd.read_csv("DatasetProcesado/TEP_features_faulty_training.csv")
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
simulationRun,100000.0,249.000000,146.694973,69.000000,105.000000,259.000000,378.000000,451.000000
sample,100000.0,250.500000,144.338000,1.000000,125.750000,250.500000,375.250000,500.000000
faultNumber,100000.0,10.500000,5.766310,1.000000,5.750000,10.500000,15.250000,20.000000
fault_present,100000.0,1.000000,0.000000,1.000000,1.000000,1.000000,1.000000,1.000000
xmeas_1_diff1,99990.0,-0.000002,0.207191,-0.770380,-0.042550,0.000000,0.038720,0.800960
...,...,...,...,...,...,...,...,...
xmv_11_std5,99960.0,2.335955,4.218431,0.000000,0.556895,1.099803,2.043104,38.057158
xmv_11_ma10,99910.0,18.775350,1.906220,11.991693,17.602800,18.608350,19.660775,30.625800
xmv_11_std10,99910.0,2.879244,3.883368,0.000000,0.925123,1.570825,3.124215,26.901302
time_since_fault,100000.0,5000.500000,2886.765765,1.000000,2500.750000,5000.500000,7500.250000,10000.000000
