In [None]:
import dask.dataframe as dd
import glob
import pandas as pd
import numpy as np
import os
from dask.diagnostics import ProgressBar
import matplotlib.pyplot as plt 
import seaborn as sns

In [3]:
# Dataset location.
# Set the ENGINE_DATA_DIR environment variable to point at the dataset
# directory, or replace the fallback path below by the path of the dataset.
# An unset or empty variable falls back to the original hard-coded path.
PATH = os.environ.get("ENGINE_DATA_DIR") or (
    r'/home/ensai/Bureau/SAFRAN_ENSAI_2025_GNN_Engine/Project_ENSAI_2025'
)
# Glob pattern that selects the per-engine CSV files inside PATH.
FILE_PATTERN = os.path.join(PATH, "Cie_*_Engine*.csv")

In [4]:
# Columns needed for the analysis.

# Label columns that are summed per cycle further down in the notebook.
FAULT_COLS = ['corrosion', 'erosion', 'fouling', 'tip_clearance']

# Every feature column is named "<PHASE>_<suffix>". The list is generated so
# that both phases always carry exactly the same set of suffixes.
_PHASES = ('CRUISE', 'TAKEOFF')
_CONDITION_SUFFIXES = ('DTAMB', 'ALT', 'MACH', 'COMMAND')
_DECK_SUFFIXES = (
    'DECKSMRHPC_Tout', 'DECKSMRHP_Nmech', 'DECKSMRHPC_Tin',
    'DECKSMRLPT_Tin', 'DECKSMRFuel_flow', 'DECKSMRHPC_Pout_st',
)

# Order: condition columns (CRUISE then TAKEOFF), followed by the DECKSMR*
# columns (CRUISE then TAKEOFF).
FEATURE_COLS = [
    f"{phase}_{suffix}"
    for suffix_group in (_CONDITION_SUFFIXES, _DECK_SUFFIXES)
    for phase in _PHASES
    for suffix in suffix_group
]

# Placeholders that the loading cell overwrites.
total_cycles_count = 0
ddf_fleet = None  # Will hold the Dask DataFrame

In [5]:
# PARALLEL DATA LOADING AND CONSOLIDATION
#
# Lazily reads every CSV matching FILE_PATTERN into a single Dask DataFrame
# (`ddf_fleet`) and computes the total number of rows (`total_cycles_count`).
#
# Failures are raised as exceptions instead of calling exit(): inside a
# notebook, exit() asks the kernel to shut down, which throws away all state
# and hides the traceback. A raised exception stops "Run All" at this cell and
# keeps the kernel alive for debugging.

print(f"Reading files from: {FILE_PATTERN}")

# Resolve the glob up front so that "no files found" is reported as a path
# problem rather than surfacing later as a generic Dask error.
matched_files = glob.glob(FILE_PATTERN)
if not matched_files:
    raise FileNotFoundError(
        f"No files match {FILE_PATTERN!r}. "
        "Please re-verify the path and file pattern."
    )

try:
    ddf_fleet = dd.read_csv(
        FILE_PATTERN,
        # Read the analysed columns as float32.
        dtype={col: np.float32 for col in FAULT_COLS + FEATURE_COLS},
        # Integer columns without an explicit dtype are read as floats so that
        # missing values cannot break dtype inference between files.
        assume_missing=True,
    )

    # shape[0] is lazy; computing it counts the rows of every partition.
    with ProgressBar():
        total_cycles_count = ddf_fleet.shape[0].compute()
except Exception as e:
    # Keep the original exception chained so the full traceback stays visible.
    raise RuntimeError(
        f"Dask failed to load the fleet data: {e}. "
        "Check for column name consistency or type issues across all "
        f"{len(matched_files)} files."
    ) from e

# Downstream cells divide by the row count, so an empty fleet is an error.
if total_cycles_count == 0:
    raise ValueError("No data to process: the matched files contain 0 rows.")

print(f"\nFleet data loaded into Dask DataFrame with {ddf_fleet.npartitions} partitions.")
print(f"Total cycles (rows) in the fleet: {total_cycles_count:,}")


Reading files from: /home/ensai/Bureau/SAFRAN_ENSAI_2025_GNN_Engine/Project_ENSAI_2025/Cie_*_Engine*.csv
[########################################] | 100% Completed | 2.36 sms

Fleet data loaded into Dask DataFrame with 249 partitions.
Total cycles (rows) in the fleet: 249,000


In [6]:
# STATISTICS
#
# Three fleet-wide summaries of the fault label columns:
#   1. per-fault frequency (column sum divided by the total row count),
#   2. distribution of the number of faults per row,
#   3. fault-by-fault co-occurrence matrix (labels^T . labels).

fault_labels = ddf_fleet[FAULT_COLS]

# 1. Per-fault frequency ------------------------------------------------------
print("Calculating individual fault frequencies...")
with ProgressBar():
    # Column-wise sums, reduced across all partitions
    class_counts = fault_labels.sum().compute()
# Divide by the row count computed in the loading cell
class_frequency = class_counts / total_cycles_count

print("\nIndividual Fault Frequency (Fraction of cycles affected):")
print(class_frequency.to_string())

# 2. Number of faults per row -------------------------------------------------
# Row-wise sum of the fault columns, stored on the fleet frame for later cells
ddf_fleet['num_faults'] = fault_labels.sum(axis=1)

print("\nCalculating multi-label distribution (0, 1, 2, 3, or 4 faults active)...")
with ProgressBar():
    # Normalised value counts, evaluated in parallel over the partitions
    fault_distribution = (
        ddf_fleet['num_faults']
        .value_counts(normalize=True)
        .compute()
    )

print("\nMulti-Label Distribution (Percentage of Cycles):")
fault_percentages = fault_distribution.mul(100).sort_index()
print(fault_percentages.to_string())


# 3. Co-occurrence matrix -----------------------------------------------------
print("Calculating global co-occurrence matrix")
with ProgressBar():
    # Materialise only the label columns in pandas, then multiply the
    # transposed label frame by the label frame.
    labels_df = fault_labels.compute()
    co_occurrence_matrix = labels_df.T @ labels_df

print("\nCo-occurrence Matrix (Total Cycles with both faults):")
print(co_occurrence_matrix)

Calculating individual fault frequencies...
[########################################] | 100% Completed | 2.37 sms

Individual Fault Frequency (Fraction of cycles affected):
corrosion        0.178004
erosion          0.205888
fouling          0.214133
tip_clearance    0.191446

Calculating multi-label distribution (0, 1, 2, 3, or 4 faults active)...
[########################################] | 100% Completed | 3.89 sms

Multi-Label Distribution (Percentage of Cycles):
num_faults
0.0    37.965462
1.0    48.897189
2.0     9.745382
3.0     3.008835
4.0     0.383133
Calculating global co-occurrence matrix (requires consolidation of labels)...
[########################################] | 100% Completed | 2.36 sms

Co-occurrence Matrix (Total Cycles with both faults):
               corrosion  erosion  fouling  tip_clearance
corrosion        44323.0   9980.0   9342.0         9027.0
erosion           9980.0  51266.0   8740.0         7409.0
fouling           9342.0   8740.0  53319.0         79

In [7]:
# Binary flag: 1 when the row has at least one fault, 0 otherwise.
has_any_fault = ddf_fleet['num_faults'] >= 1
ddf_fleet['is_degraded'] = has_any_fault.astype(np.int8)

In [11]:
# Per-feature mean and standard deviation, split by the is_degraded flag
# (0 = healthy, 1 = degraded).
features_by_health = ddf_fleet.groupby('is_degraded')[FEATURE_COLS]

with ProgressBar():
    mean_stats = features_by_health.mean().compute()

with ProgressBar():
    std_stats = features_by_health.std().compute()

# Label the two groups, then transpose so that each feature becomes a row.
mean_stats = mean_stats.rename(index={0: 'Mean_Healthy', 1: 'Mean_Degraded'}).T
std_stats = std_stats.rename(index={0: 'StdDev_Healthy', 1: 'StdDev_Degraded'}).T

[########################################] | 100% Completed | 4.13 sms
[########################################] | 100% Completed | 5.07 sms


In [12]:
# Side-by-side comparison of healthy vs degraded feature statistics.
REPORT_COLUMNS = [
    'Mean_Healthy', 'Mean_Degraded', 'Abs_Difference', 'Percent_Change_vs_Healthy',
    'StdDev_Healthy', 'StdDev_Degraded'
]

comparison_df = (
    pd.concat([mean_stats, std_stats], axis=1)
    # NOTE: despite its name this is the signed difference (degraded - healthy);
    # the column name is kept because it is part of the table's layout.
    .assign(Abs_Difference=lambda t: t['Mean_Degraded'] - t['Mean_Healthy'])
    # Relative shift of the degraded mean, expressed in percent of the healthy mean
    .assign(
        Percent_Change_vs_Healthy=lambda t: (
            (t['Mean_Degraded'] - t['Mean_Healthy']) / t['Mean_Healthy']
        ) * 100
    )
    [REPORT_COLUMNS]
)

print("\nFeature Comparison: Healthy (0) vs Degraded (1)")
print(comparison_df.to_string(float_format='%.4f'))


Feature Comparison: Healthy (0) vs Degraded (1)
is_degraded                 Mean_Healthy  Mean_Degraded  Abs_Difference  Percent_Change_vs_Healthy  StdDev_Healthy  StdDev_Degraded
CRUISE_DTAMB                     10.0017         9.9986         -0.0031                    -0.0310          0.5773           0.5773
CRUISE_ALT                    34999.8529     34999.7076         -0.1453                    -0.0004         58.2340          57.8512
CRUISE_MACH                       0.7800         0.7800         -0.0000                    -0.0036          0.0116           0.0115
CRUISE_COMMAND                25000.0408     25000.3326          0.2918                     0.0012         57.3175          57.4364
TAKEOFF_DTAMB                    15.0004        14.9986         -0.0019                    -0.0124          0.5782           0.5765
TAKEOFF_ALT                      99.8539       178.1460         78.2922                    78.4068        182.1276         272.2083
TAKEOFF_MACH               