In [1]:
# @title Setup for Google Colab
# Run this cell if you are using Google Colab to set up the environment.

# Detect the Google Colab runtime by attempting to import its helper package.
try:
    import google.colab  # noqa: F401 -- imported only for the detection side effect
    IN_COLAB = True
except ImportError:
    # google.colab could not be imported, so assume a non-Colab environment.
    # Only ImportError is caught so that unrelated failures (or Ctrl-C) are
    # not silently turned into "not on Colab".
    IN_COLAB = False

if IN_COLAB:
    print("Running in Google Colab. Setting up environment...")

    # 1. Clone the repository
    !git clone https://github.com/Boussyf0/MANTIS-Maintenance-Intelligence-System-.git mantis_repo

    # 2. Change working directory
    import os
    os.chdir('mantis_repo')

    # 3. Create data directories
    if not os.path.exists('data/raw/NASA_CMAPSS'):
        os.makedirs('data/raw/NASA_CMAPSS')

    # 4. Download and unzip dataset (Robust w/ mirrors)
    if not os.path.exists('data/raw/NASA_CMAPSS/train_FD001.txt'):
        print("Downloading NASA CMAPSS Data...")

        urls = [
            'https://data.nasa.gov/api/views/s96h-rxk2/files/8b8e05a8-6f16-43b6-96b6-81a171ef9948?download=true&filename=CMAPSSData.zip',
            'https://raw.githubusercontent.com/senthilnayagan/CMS_DeepLearning/master/CMAPSSData.zip',
            'https://data.nasa.gov/docs/legacy/CMAPSSData.zip'
        ]

        success = False
        for url in urls:
            print(f"Trying {url}...")
            try:
                exit_code = os.system(f'wget "{url}" -O data/raw/NASA_CMAPSS/CMAPSSData.zip')
                if exit_code == 0:
                    success = True
                    print("Download successful.")
                    break
            except Exception as e:
                print(f"Failed: {e}")

        if success:
            !unzip -o data/raw/NASA_CMAPSS/CMAPSSData.zip -d data/raw/NASA_CMAPSS/
            print("Data extracted.")
        else:
            print("CRITICAL: All download mirrors failed. Please upload data manually.")

    # 5. Switch to notebooks directory so relative paths work
    os.chdir('notebooks')
    print("Setup complete. Current working directory:", os.getcwd())

Running in Google Colab. Setting up environment...
Cloning into 'mantis_repo'...
remote: Enumerating objects: 749, done.[K
remote: Counting objects: 100% (95/95), done.[K
remote: Compressing objects: 100% (79/79), done.[K
remote: Total 749 (delta 4), reused 70 (delta 4), pack-reused 654 (from 1)[K
Receiving objects: 100% (749/749), 102.09 MiB | 18.37 MiB/s, done.
Resolving deltas: 100% (182/182), done.
Downloading NASA CMAPSS Data...
Trying https://data.nasa.gov/api/views/s96h-rxk2/files/8b8e05a8-6f16-43b6-96b6-81a171ef9948?download=true&filename=CMAPSSData.zip...
Trying https://raw.githubusercontent.com/senthilnayagan/CMS_DeepLearning/master/CMAPSSData.zip...
Trying https://data.nasa.gov/docs/legacy/CMAPSSData.zip...
Download successful.
Archive:  data/raw/NASA_CMAPSS/CMAPSSData.zip
  inflating: data/raw/NASA_CMAPSS/Damage Propagation Modeling.pdf  
  inflating: data/raw/NASA_CMAPSS/readme.txt  
  inflating: data/raw/NASA_CMAPSS/RUL_FD001.txt  
  inflating: data/raw/NASA_CMAPSS/RU

# Feature Engineering & Extraction (FD001)

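Ce notebook charge les données brutes C-MAPSS FD001, calcule la RUL (cible) sur le jeu d'entraînement, normalise les capteurs (MinMax ajusté sur le train uniquement), puis extrait des features glissantes par moteur (moyennes et écarts-types sur une fenêtre de 20 cycles). Les valeurs manquantes résultantes sont remplacées par 0 et les jeux train/test enrichis sont sauvegardés en CSV.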


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from sklearn.preprocessing import MinMaxScaler

# Config
# Locate the repository's `data/` directory relative to the working directory.
# '../../data' is the original default; '../data' is where the data lives when
# the kernel runs from `<repo>/notebooks`, which is where the Colab setup cell
# leaves it. The first root that actually contains the FD001 training file
# wins; if none does, fall back to the original default.
_DATA_ROOT_CANDIDATES = [Path('../../data'), Path('../data')]
DATA_ROOT = next(
    (
        root for root in _DATA_ROOT_CANDIDATES
        if (root / 'raw' / 'NASA_CMAPSS' / 'train_FD001.txt').is_file()
    ),
    _DATA_ROOT_CANDIDATES[0],
)
DATA_PATH = DATA_ROOT / 'raw' / 'NASA_CMAPSS'
PROCESSED_PATH = DATA_ROOT / 'processed'
PROCESSED_PATH.mkdir(parents=True, exist_ok=True)

# Columns: 2 identifier columns, 3 operating settings and 21 sensor channels,
# concatenated in that order to name the columns of the raw files.
index_cols = ['unit_number', 'time_cycles']
setting_cols = ['setting_1', 'setting_2', 'setting_3']
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
cols = [*index_cols, *setting_cols, *sensor_cols]

# Loading: the three FD001 files are read as whitespace-separated text
# without a header row; column names are supplied explicitly.
print("Chargement des données...")
_read_opts = dict(sep='\\s+', header=None)
train = pd.read_csv(DATA_PATH / 'train_FD001.txt', names=cols, **_read_opts)
test = pd.read_csv(DATA_PATH / 'test_FD001.txt', names=cols, **_read_opts)
y_test = pd.read_csv(DATA_PATH / 'RUL_FD001.txt', names=['RUL'], **_read_opts)

print(f"Train: {train.shape}, Test: {test.shape}")

# --- 1. RUL (target) computation on the training set ---
def add_rul(df):
    """Add a 'RUL' column to ``df`` in place and return the same frame.

    For each row, RUL is the largest 'time_cycles' value observed for the
    row's 'unit_number' minus the row's own 'time_cycles', so the last
    recorded cycle of every unit gets RUL 0.
    """
    cycles = df['time_cycles']
    last_cycle = df.groupby('unit_number')['time_cycles'].transform('max')
    df['RUL'] = last_cycle.sub(cycles)
    return df

train = add_rul(train)

# --- 2. Normalization ---
# The scaler is fitted on the training sensors only and then applied to both
# splits, so the test set is scaled with the training min/max.
scaler = MinMaxScaler()
scaler.fit(train[sensor_cols])
train[sensor_cols] = scaler.transform(train[sensor_cols])
test[sensor_cols] = scaler.transform(test[sensor_cols])

# --- 3. Feature Engineering (Rolling Windows) ---
def compute_rolling_features(df, window=20):
    """Append per-unit rolling mean and standard deviation of the sensor columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'unit_number' and the sensor columns (any column whose
        name contains 'sensor'). Rows are rolled in their existing order
        within each unit; no sorting is performed here.
    window : int, default 20
        Rolling window length in rows. Because ``min_periods=1`` is used,
        the first rows of each unit are aggregated over a shorter window.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the columns of ``df`` followed by
        '<sensor>_mean<window>' and '<sensor>_std<window>' columns. Every NaN
        in the result is replaced by 0 -- this covers the undefined standard
        deviation of a one-row window, and also any NaN already present in
        the original columns.
    """
    import re

    # Roll only the base sensor columns. A plain substring match would also
    # select previously generated '<sensor>_meanN' / '<sensor>_stdN' columns,
    # so chaining calls with different windows would compute features of
    # features (e.g. 'sensor_1_mean10_mean20').
    derived_suffix = re.compile(r'_(?:mean|std)\d+$')
    sensor_cols = [
        c for c in df.columns
        if 'sensor' in c and not derived_suffix.search(c)
    ]
    if not sensor_cols:
        # Nothing to roll: return the input with the same NaN handling.
        return df.fillna(0)

    rolled = df.groupby('unit_number')[sensor_cols].rolling(window=window, min_periods=1)

    # The grouped result is indexed by (unit_number, original index); dropping
    # the first level restores the original index so concat aligns row by row.
    feat_mean = rolled.mean().reset_index(level=0, drop=True).add_suffix(f'_mean{window}')
    feat_std = rolled.std().reset_index(level=0, drop=True).add_suffix(f'_std{window}')

    return pd.concat([df, feat_mean, feat_std], axis=1).fillna(0)

print("Calcul des features...")
# Same rolling-feature extraction (default window) for both splits.
train_feat, test_feat = (compute_rolling_features(frame) for frame in (train, test))

# --- 4. Save ---
# Each feature frame is written as CSV without the index column.
_outputs = {
    'train_FD001_features.csv': train_feat,
    'test_FD001_features.csv': test_feat,
}
for _filename, _frame in _outputs.items():
    _frame.to_csv(PROCESSED_PATH / _filename, index=False)

print("Données traitées sauvegardées dans", PROCESSED_PATH)

Chargement des données...
Train: (20631, 26), Test: (13096, 26)
Calcul des features...
Données traitées sauvegardées dans ../data/processed
