# Exploration des données NASA C-MAPSS

Ce notebook explore le dataset NASA C-MAPSS pour la maintenance prédictive.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from src.data_loader import DataLoader

# Global plotting defaults: darkgrid style sheet, "husl" seaborn palette,
# and a wide default figure size for every figure created afterwards.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
plt.rcParams.update({'figure.figsize': (12, 6)})

In [None]:
# Load the data through the project loader; the result is indexed by
# 'train' and 'test' to obtain the two splits.
loader = DataLoader()
data = loader.load_nasa_data()

train_df = data['train']
test_df = data['test']

# Report the shape of each split. The labels were mojibake (UTF-8 text
# mis-decoded) and are restored to proper French here.
print(f"Données d'entraînement: {train_df.shape}")
print(f"Données de test: {test_df.shape}")

In [None]:
# Display the first rows of the training data
train_df.head()

In [None]:
# Print general information about the training data
train_df.info()

In [None]:
# Descriptive statistics of the training data
train_df.describe()

In [None]:
# Bar chart of the 20 units with the most rows in the training data.
plt.figure(figsize=(10, 5))
# value_counts() orders by count, so head(20) keeps the 20 largest units.
train_df['unit_id'].value_counts().head(20).plot(kind='bar')
# Labels restored from mojibake to proper French.
plt.title("Nombre d'observations par unité (top 20)")
plt.xlabel('ID Unité')
plt.ylabel("Nombre d'observations")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Histogram of engine lifetimes: the largest time_cycle recorded for each
# unit_id is used as that unit's lifetime.
plt.figure(figsize=(10, 5))
train_df.groupby('unit_id')['time_cycle'].max().hist(bins=30)
# Labels restored from mojibake to proper French.
plt.title('Distribution des durées de vie des moteurs')
plt.xlabel('Durée de vie (cycles)')
plt.ylabel('Fréquence')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Compute RUL for the training data via the loader helper; the result is
# read below through its 'RUL' column.
train_with_rul = loader.calculate_rul(train_df, 'train')

# Histogram of the RUL values.
plt.figure(figsize=(10, 5))
plt.hist(train_with_rul['RUL'], bins=50, alpha=0.7, color='skyblue')
plt.title('Distribution du RUL (Remaining Useful Life)')
plt.xlabel('RUL (cycles)')
# Label restored from mojibake to proper French.
plt.ylabel('Fréquence')
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Histograms for a handful of sensors.
# Keep the first five columns whose name contains 'sensor'; this list is
# reused by later cells, so its length is left unchanged.
sensor_cols = [col for col in train_df.columns if 'sensor' in col][:5]

fig, axes = plt.subplots(2, 3, figsize=(15, 8))
axes = axes.flatten()

# One histogram per selected sensor, pairing each with its own panel.
for ax, sensor in zip(axes, sensor_cols):
    ax.hist(train_df[sensor].dropna(), bins=50, alpha=0.7)
    ax.set_title(f'Distribution {sensor}')
    ax.set_xlabel('Valeur')
    # Label restored from mojibake to proper French.
    ax.set_ylabel('Fréquence')
    ax.grid(True, alpha=0.3)

# The 2x3 grid has more panels than selected sensors; hide the unused
# ones so the figure does not show empty axes.
for ax in axes[len(sensor_cols):]:
    ax.set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the sensor columns selected in sensor_cols.
correlation_matrix = train_df[sensor_cols].corr()

plt.figure(figsize=(10, 8))
# Diverging colour map centred on zero so positive and negative
# correlations are visually distinct; each cell is annotated to 2 decimals.
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm',
            center=0, square=True, linewidths=0.5)
# Title restored from mojibake to proper French.
plt.title('Matrice de corrélation entre capteurs')
plt.tight_layout()
plt.show()

In [None]:
# Sensor readings over time for one specific engine.
unit_id = 1
unit_data = train_df[train_df['unit_id'] == unit_id]

plt.figure(figsize=(12, 6))
# Only the first three selected sensors are drawn, one line each.
for sensor in sensor_cols[:3]:
    plt.plot(unit_data['time_cycle'], unit_data[sensor], label=sensor)

# Title restored from mojibake to proper French.
plt.title(f'Évolution des capteurs - Unité {unit_id}')
plt.xlabel('Cycle')
plt.ylabel('Valeur')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()

In [None]:
# Missing-value report: per-column count of nulls and the share of rows
# they represent, sorted from most to least affected.
missing_values = train_df.isna().sum()
missing_percentage = missing_values.div(len(train_df)).mul(100)

missing_df = pd.DataFrame(
    {
        'Valeurs manquantes': missing_values,
        'Pourcentage': missing_percentage,
    }
)
missing_df = missing_df.sort_values('Pourcentage', ascending=False)

# Only columns that actually contain missing values are printed.
print("Valeurs manquantes par colonne:")
has_missing = missing_df['Valeurs manquantes'].gt(0)
print(missing_df.loc[has_missing])

In [None]:
# Print the dataset summary reported by the loader. The summary is read
# as a nested mapping: summary['train'] / summary['test'] -> named fields.
summary = loader.get_data_summary()

# Labels restored from mojibake (UTF-8 text mis-decoded) to proper French;
# the header prefix was a garbled clipboard emoji.
print("📋 RÉSUMÉ DU DATASET")
print("=" * 40)
print(f"Données d'entraînement: {summary['train']['shape']}")
print(f"Unités d'entraînement: {summary['train']['units']}")
print(f"Cycles max moyens: {summary['train']['cycles_max']:.1f}")
print(f"Nombre de capteurs: {summary['train']['sensors']}")
print(f"Données de test: {summary['test']['shape']}")
print(f"RUL dans test: {summary['test']['has_rul']}")