# Feature Engineering & Extraction (FD001)

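Ce notebook charge les données brutes C-MAPSS FD001, calcule la RUL (Remaining Useful Life) du jeu d'entraînement, normalise les capteurs (MinMaxScaler ajusté sur le train uniquement), puis effectue l'extraction de features par moteur (moyennes/écarts-types glissants sur une fenêtre de 20 cycles) et le nettoyage associé (valeurs manquantes des features glissantes remplacées par 0). Les jeux enrichis sont sauvegardés en CSV dans `data/processed`.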


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

# Configuration: raw input directory and output directory (created if missing)
DATA_PATH = Path('../data/raw/NASA_CMAPSS')
PROCESSED_PATH = Path('../data/processed')
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

# Column layout of the header-less, whitespace-separated input files:
# 2 index columns, 3 setting columns, then sensor_1 .. sensor_21.
index_cols = ['unit_number', 'time_cycles']
setting_cols = ['setting_1', 'setting_2', 'setting_3']
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
cols = [*index_cols, *setting_cols, *sensor_cols]

# Loading
print("Chargement des données...")
# Train and test share the same layout, so they are read with identical options.
train, test = (
    pd.read_csv(DATA_PATH / file_name, sep='\\s+', header=None, names=cols)
    for file_name in ('train_FD001.txt', 'test_FD001.txt')
)
# Single-column file read into a 'RUL' column.
# NOTE(review): y_test is not used by the visible part of this script.
y_test = pd.read_csv(
    DATA_PATH / 'RUL_FD001.txt',
    sep='\\s+',
    header=None,
    names=['RUL'],
)

print(f"Train: {train.shape}, Test: {test.shape}")

# --- 1. RUL (target) computation on the training set ---
def add_rul(df):
    """Add a 'RUL' column: cycles remaining until each unit's last recorded cycle.

    For every row, RUL = (max 'time_cycles' of the row's 'unit_number')
    minus the row's own 'time_cycles', so the last recorded row of each
    unit gets 0.

    The column is written into ``df`` in place; the same object is returned
    for call-chaining convenience.
    """
    last_cycle = df.groupby('unit_number')['time_cycles'].transform('max')
    df['RUL'] = last_cycle.sub(df['time_cycles'])
    return df

# Attach the RUL target to the training frame.
train = add_rul(train)

# --- 2. Normalization ---
# The scaler is fitted on the training sensors only and then applied,
# unchanged, to both frames, so test data never influences the scaling.
scaler = MinMaxScaler().fit(train[sensor_cols])
train[sensor_cols] = scaler.transform(train[sensor_cols])
test[sensor_cols] = scaler.transform(test[sensor_cols])

# --- 3. Feature engineering (rolling windows) ---
def compute_rolling_features(df, window=20):
    """Append per-unit rolling mean and standard deviation of each sensor column.

    Every column whose name contains 'sensor' is rolled separately within
    each 'unit_number' group, in the frame's existing row order (no sorting
    is performed here). ``min_periods=1`` means the first rows of a unit use
    a shorter, partial window instead of yielding NaN.

    Args:
        df: Frame containing a 'unit_number' column and the sensor columns.
        window: Number of rows in the rolling window.

    Returns:
        A new frame with the columns of ``df`` followed by
        '<sensor>_mean<window>' and '<sensor>_std<window>' columns, aligned
        on the index of ``df``. Missing values in the *new* feature columns
        (e.g. the std of a one-row window) are replaced by 0; the input
        columns are returned untouched, so missing raw values or targets are
        not silently turned into zeros.
    """
    sensor_cols = [c for c in df.columns if 'sensor' in c]
    rolled = df.groupby('unit_number')[sensor_cols].rolling(window=window, min_periods=1)

    # groupby().rolling() prepends the group key as an index level; dropping
    # it restores the original row index so the features align with ``df``.
    feat_mean = rolled.mean().reset_index(level=0, drop=True).add_suffix(f'_mean{window}')
    feat_std = rolled.std().reset_index(level=0, drop=True).add_suffix(f'_std{window}')

    combined = pd.concat([df, feat_mean, feat_std], axis=1)

    # Zero-fill only the appended feature columns. They are selected by
    # position so the split stays correct even if a generated name collides
    # with an existing column.
    n_input = df.shape[1]
    engineered = combined.iloc[:, n_input:].fillna(0)
    return pd.concat([combined.iloc[:, :n_input], engineered], axis=1)

print("Calcul des features...")
# Same rolling-feature extraction for both frames (train first, then test).
train_feat, test_feat = map(compute_rolling_features, (train, test))

# --- 4. Save ---
# Written without the index column.
train_feat.to_csv(PROCESSED_PATH.joinpath('train_FD001_features.csv'), index=False)
test_feat.to_csv(PROCESSED_PATH.joinpath('test_FD001_features.csv'), index=False)

print("Données traitées sauvegardées dans", PROCESSED_PATH)