# Exploratory Data Analysis - Predictive Maintenance

This notebook performs exploratory analysis on the NASA C-MAPSS turbofan engine dataset.

## Contents:
1. Load Configuration and Data
2. Dataset Overview
3. Add RUL Labels
4. Sensor Degradation Visualization
5. RUL Distribution Analysis
6. Correlation Analysis
7. Operating Settings Analysis
8. Feature Engineering Preview
9. Summary

In [None]:
import sys
sys.path.append('..')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from src.utils.config_loader import load_config
from src.utils.logger import get_logger
from src.ingestion.data_loader import get_data_loader
from src.preprocessing import FeatureEngineer
from src.visualization import PredictiveMaintenanceVisualizer

# Plot appearance: seaborn grid theme plus a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 6)})

# Install a blanket "ignore" filter so every warning category is silenced
# from this point on in the session
import warnings
warnings.simplefilter('ignore')

print("Libraries imported successfully!")

## 1. Load Configuration and Data

In [None]:
# Read the project configuration and echo its identifying fields
config = load_config()
project_cfg = config['project']

print("Project:", project_cfg['name'])
print("Random Seed:", project_cfg['seed'])

In [None]:
# Build the C-MAPSS loader from the project configuration
loader = get_data_loader('cmapss', config)

# FD001 subset: single operating condition, single fault mode.
# The loader's result is unpacked into the train / test / RUL objects.
fd001_frames = loader.load_dataset('FD001')
train_df, test_df, rul_df = fd001_frames

print(f"Training data shape: {train_df.shape}")
print(f"Test data shape: {test_df.shape}")
print(f"RUL data shape: {rul_df.shape}")

## 2. Dataset Overview

In [None]:
# Peek at the leading ten rows of the training frame
print("\nTraining Data Sample:")
train_df.head(n=10)

In [None]:
# Basic statistics
# Largest cycle number recorded for each engine. Computed once and reused,
# instead of re-running the identical groupby for every statistic below.
cycles_per_engine = train_df.groupby('unit_id')['cycle'].max()

print("\nDataset Statistics:")
print(f"Number of engines: {train_df['unit_id'].nunique()}")
print(f"Total cycles: {len(train_df)}")  # one row per (engine, cycle) record
print(f"Average cycles per engine: {cycles_per_engine.mean():.2f}")
print(f"Min cycles: {cycles_per_engine.min()}")
print(f"Max cycles: {cycles_per_engine.max()}")

In [None]:
# Grand total of missing cells: per-column counts, then summed across columns
missing_total = train_df.isna().sum().sum()

print("\nMissing Values:")
print(missing_total)

In [None]:
# Collect, in column order, every column whose name carries the 'sensor_' prefix
sensor_cols = list(filter(lambda name: name.startswith('sensor_'), train_df.columns))

print(f"\nNumber of sensors: {len(sensor_cols)}")
print(f"Sensors: {sensor_cols}")

## 3. Add RUL Labels

In [None]:
# Run the training frame through the loader's RUL step and then its labelling
# step (w1=30, w0=15). The preview below expects the result to carry the
# RUL, label_binary and RUL_clipped columns.
train_df = loader.add_labels(loader.add_rul_column(train_df), w1=30, w0=15)

print("\nTraining data with RUL:")
preview_cols = ['unit_id', 'cycle', 'RUL', 'label_binary', 'RUL_clipped']
train_df[preview_cols].head(10)

## 4. Sensor Degradation Visualization

In [None]:
# Visualizer built from the project config and the '../results' path
viz = PredictiveMaintenanceVisualizer(config, '../results')

# Sensor degradation plot restricted to units 1, 2 and 3
first_units = [1, 2, 3]
viz.plot_sensor_degradation(train_df, sensor_cols, unit_ids=first_units)

In [None]:
# Raw traces of the first six sensor columns for unit 1, one panel per sensor
unit_1_data = train_df[train_df['unit_id'] == 1]
cycles = unit_1_data['cycle']

fig, axes = plt.subplots(3, 2, figsize=(14, 10))
axes = axes.ravel()  # flatten the 3x2 grid so panels pair up with sensors

for panel, sensor in zip(axes, sensor_cols[:6]):
    panel.plot(cycles, unit_1_data[sensor])
    panel.set(xlabel='Cycle', ylabel=sensor, title=f'{sensor} - Unit 1')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 5. RUL Distribution Analysis

In [None]:
# Plot RUL distribution
# Delegates entirely to the project visualizer, passing the training frame that
# had the loader's RUL/label steps applied in an earlier cell. Whatever the
# call returns is this cell's displayed output.
# NOTE(review): whether the figure is also written to disk depends on
# PredictiveMaintenanceVisualizer, which is not visible here — confirm.
viz.plot_rul_distribution(train_df)

In [None]:
# Summaries are computed up front, then printed in the same order as before
rul_summary = train_df['RUL'].describe()
label_counts = train_df['label_binary'].value_counts()
failure_ratio = train_df['label_binary'].mean()  # mean of the binary label column

print("\nRUL Statistics:")
print(rul_summary)

print("\nLabel distribution:")
print(label_counts)
print(f"\nFailure ratio: {failure_ratio:.2%}")

## 6. Correlation Analysis

In [None]:
# Correlation with RUL
# Pairwise correlation (DataFrame.corr default) of every sensor column with RUL.
sensor_rul_corr = train_df[sensor_cols + ['RUL']].corr()['RUL'].drop('RUL')

# A column with no variance (e.g. a sensor that never changes) yields a NaN
# correlation. sort_values() places NaN last, so without this filter the
# "Bottom 10" listing would be filled with undefined entries instead of the
# most negatively correlated sensors, and the NaN rows would also flow into
# the ranking used by later cells.
undefined_sensors = sensor_rul_corr[sensor_rul_corr.isna()].index.tolist()
correlations = sensor_rul_corr.dropna().sort_values(ascending=False)

if undefined_sensors:
    print(f"\nSensors with undefined (NaN) correlation, excluded from ranking: {undefined_sensors}")

print("\nTop 10 sensors correlated with RUL:")
print(correlations.head(10))

print("\nBottom 10 sensors correlated with RUL:")
print(correlations.tail(10))

In [None]:
# Rank sensors by correlation magnitude (sign dropped), smallest first;
# barh draws the entries from the bottom of the axis upwards
correlations_sorted = correlations.abs().sort_values()

fig, ax = plt.subplots(figsize=(10, 8))
correlations_sorted.plot.barh(ax=ax, color='steelblue')

ax.set(xlabel='Absolute Correlation with RUL', title='Sensor Correlation with RUL')
ax.grid(True, alpha=0.3, axis='x')

plt.tight_layout()
plt.show()

In [None]:
# Ten sensors with the largest absolute correlation to RUL
abs_corr_desc = correlations.abs().sort_values(ascending=False)
top_sensors = abs_corr_desc.head(10).index.tolist()

# Pairwise correlation among just those sensors
top_corr_matrix = train_df[top_sensors].corr()

# Explicit figure/axes pair instead of relying on pyplot's "current axes"
heat_fig, heat_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(top_corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', center=0,
            square=True, linewidths=1, ax=heat_ax)
heat_ax.set_title('Correlation Heatmap - Top 10 Sensors')
plt.tight_layout()
plt.show()

## 7. Operating Settings Analysis

In [None]:
# One histogram per operating setting, laid out side by side
setting_names = ['setting_1', 'setting_2', 'setting_3']
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

for panel, setting in zip(axes, setting_names):
    panel.hist(train_df[setting], bins=30, edgecolor='black', alpha=0.7)
    panel.set(xlabel=setting, ylabel='Frequency', title=f'{setting} Distribution')
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 8. Feature Engineering Preview

In [None]:
# Feature engineer driven by the project configuration
engineer = FeatureEngineer(config)

# Work on an independent copy holding only units 1, 2 and 3
subset_mask = train_df['unit_id'].isin([1, 2, 3])
sample_df = train_df[subset_mask].copy()
sample_features = engineer.create_all_features(sample_df, sensor_cols)

# Column counts are read after feature creation has run
n_before = sample_df.shape[1]
n_after = sample_features.shape[1]

print(f"\nOriginal features: {n_before}")
print(f"Engineered features: {n_after}")
print(f"Added features: {n_after - n_before}")

In [None]:
# Columns present in the engineered frame but absent from the input subset,
# kept in the engineered frame's column order
known_cols = set(sample_df.columns)
new_cols = [name for name in sample_features.columns if name not in known_cols]

print(f"\nSample of new features ({len(new_cols)} total):")
print(new_cols[:20])

In [None]:
# Raw signal vs. its rolling statistics for unit 1.
# NOTE(review): the '<sensor>_rolling_mean' / '<sensor>_rolling_std' column
# names are assumed to match what FeatureEngineer produces — confirm.
unit_1_features = sample_features[sample_features['unit_id'] == 1]
sensor = 'sensor_2'
cycles = unit_1_features['cycle']

fig, axes = plt.subplots(2, 1, figsize=(14, 8))
mean_ax, std_ax = axes

# Top panel: original readings with the rolling mean overlaid
mean_ax.plot(cycles, unit_1_features[sensor], label='Original')
mean_ax.plot(cycles, unit_1_features[f'{sensor}_rolling_mean'],
             label='Rolling Mean', linestyle='--')
mean_ax.set(xlabel='Cycle', ylabel='Value',
            title=f'{sensor} - Original vs Rolling Mean')
mean_ax.legend()
mean_ax.grid(True, alpha=0.3)

# Bottom panel: rolling standard deviation on its own scale
std_ax.plot(cycles, unit_1_features[f'{sensor}_rolling_std'],
            label='Rolling Std', color='orange')
std_ax.set(xlabel='Cycle', ylabel='Value',
           title=f'{sensor} - Rolling Standard Deviation')
std_ax.legend()
std_ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 9. Summary

### Key Findings:
1. **Dataset**: 100 training engines with varying operational cycles
2. **Sensors**: 21 sensors with different correlation strengths to RUL
3. **RUL Distribution**: Right-skewed, with most recorded cycles concentrated at low RUL values
4. **Feature Engineering**: Successfully created rolling statistics and degradation features
5. **Next Steps**: 
   - Build baseline models (Random Forest, XGBoost)
   - Train deep learning models (LSTM, GRU, CNN)
   - Compare model performance