# THE DATASET

# Library and Dataset Importation

In [1]:
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
import gdown
from pathlib import Path
import zipfile

# 1️⃣ Set paths
RAW_DIR = Path("data/raw")
RAW_DIR.mkdir(parents=True, exist_ok=True)  # create folder if missing

# 2️⃣ Google Drive file ID for your zip
# Replace with the "id" from your shared link
GDRIVE_FILE_ID = "1ID1g0ZTkSfGSJy8hSTNAxbxXKIwnRrr7"
ZIP_PATH = RAW_DIR / "nasa_cmaps.zip"
# Sentinel written after a successful extraction, so that re-running the
# notebook does not download the archive again.
EXTRACTED_MARKER = RAW_DIR / ".nasa_cmaps_extracted"

if EXTRACTED_MARKER.exists():
    print("Dataset already extracted; skipping download.")
else:
    # 3️⃣ Download the zip (works without credentials)
    print("Downloading dataset from Google Drive...")
    downloaded = gdown.download(
        f"https://drive.google.com/uc?id={GDRIVE_FILE_ID}", str(ZIP_PATH), quiet=False
    )
    # Stop early with a clear message instead of failing inside ZipFile when
    # the download did not produce a usable archive.
    if downloaded is None or not zipfile.is_zipfile(ZIP_PATH):
        raise RuntimeError(f"Download failed or {ZIP_PATH} is not a valid zip archive")

    # 4️⃣ Extract all files
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(RAW_DIR)

    # 5️⃣ Optional: remove the zip to keep things clean
    ZIP_PATH.unlink()
    EXTRACTED_MARKER.touch()

print(f"Dataset ready at: {RAW_DIR.resolve()}")


Downloading dataset from Google Drive...


Downloading...
From: https://drive.google.com/uc?id=1ID1g0ZTkSfGSJy8hSTNAxbxXKIwnRrr7
To: C:\Users\CJ\turbofan-engine-rul\notebooks\data\raw\nasa_cmaps.zip
 61%|██████████████████████████████████████████████▋                              | 7.86M/13.0M [11:40<06:20, 13.5kB/s]

KeyboardInterrupt: 

In [None]:
import sys
sys.path.append("..")

from src.data.load_cmapss import load_cmapss_subset

# Load the train, test and RUL tables for every C-MAPSS subset
datasets = ["FD001", "FD002", "FD003", "FD004"]

train_dfs, test_dfs, rul_dfs = {}, {}, {}

# Map each split name understood by the loader to the dict that stores it
stores = {"train": train_dfs, "test": test_dfs, "RUL": rul_dfs}

for ds in datasets:
    for split, store in stores.items():
        store[ds] = load_cmapss_subset(ds, split)


In [None]:
# Preview the first rows of the FD001 training frame returned by the loader.
train_dfs["FD001"].head()


# Exploratory Data Analysis

### a) Assigning Column Names

First, we assign descriptive column names to the dataset.

We use a sensor naming convention based on commonly referenced turbofan sensor descriptions to improve
interpretability.

Note: These mappings are approximate and used for analysis clarity.


In [None]:
# Define column names

# Identifier and operating-setting columns that come before the sensor readings.
base_columns = [
    'unit', 'cycle',
    'op_setting_1', 'op_setting_2', 'op_setting_3'
]

# Human-readable label for each generic sensor column; used only for display.
# NOTE(review): these labels are approximate. Several do not appear to match
# the sensor table published with the C-MAPSS dataset (which lists, e.g.,
# sensor_3 as the HPC outlet temperature) — confirm against the dataset
# documentation before using them to interpret results.
sensor_mapping = {
    'sensor_1':  'Fan_Inlet_Temperature',
    'sensor_2':  'LPC_Outlet_Temperature',
    'sensor_3':  'LPC_Outlet_Pressure',
    'sensor_4':  'Fan_Speed',
    'sensor_5':  'LPC_Speed',
    'sensor_6':  'HPC_Outlet_Temperature',
    'sensor_7':  'HPC_Outlet_Pressure',
    'sensor_8':  'HPT_Outlet_Temperature',
    'sensor_9':  'Fuel_Flow',
    'sensor_10': 'LPT_Outlet_Temperature',
    'sensor_11': 'HPT_Coolant_Bleed',
    'sensor_12': 'LPC_Coolant_Bleed',
    'sensor_13': 'Bypass_Duct_Pressure',
    'sensor_14': 'Core_Speed',
    'sensor_15': 'Fan_Speed_Ratio',
    'sensor_16': 'Fuel_Air_Ratio',
    'sensor_17': 'Bleed_Enthalpy',
    'sensor_18': 'HPT_Seal_Leakage',
    'sensor_19': 'LPT_Seal_Leakage',
    'sensor_20': 'High_Pressure_Spool_Speed',
    'sensor_21': 'Low_Pressure_Spool_Speed'
}

# Generate original column names: sensor_1 ... sensor_21
sensor_cols = [f'sensor_{i}' for i in range(1, 22)]
# Full column list: 5 base columns followed by the 21 sensor columns
columns = base_columns + sensor_cols


In [None]:
# Look up the descriptive label mapped to sensor_13.
sensor_mapping['sensor_13']

In [None]:
# Give the train and test frame of every subset the same column names
for ds in datasets:
    for frame in (train_dfs[ds], test_dfs[ds]):
        frame.columns = columns


In [None]:
 # Preview the FD001 training frame again, now with the assigned column names.
 train_dfs["FD001"].head()

###  b) Dataframe inspection and Exploration

We'll begin our analysis with the first engine failure simulation (FD001).

In [None]:
# Work on a copy of the FD001 training frame under the name 'FD001_train'.
# Copying keeps columns added to FD001_train later out of the frame stored in
# train_dfs, so the earlier cells remain re-runnable.
FD001_train = train_dfs["FD001"].copy()

In [None]:
# Print the column dtypes and non-null counts of the FD001 training frame.
FD001_train.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
FD001_train.describe()

In [None]:
# Inspect the first 800 rows of the FD001 training frame.
FD001_train.head(800)

There are no null or missing values in any column of the dataset, and the datatypes are appropriate.

However, some of the columns have a standard deviation of 0 and will therefore have little effect on our models.

So eventually we'll need to drop all sensor columns with a very low standard deviation.

In [None]:
# Show every row recorded for engine unit 60.
FD001_train[FD001_train['unit']==60]

In [None]:
# To Confirm Engine & Cycle Behavior
# Only the last expression of a cell is rendered, so the engine count is
# printed explicitly instead of being silently discarded.
print('Number of engines:', FD001_train['unit'].nunique())

# Distribution of engine lifetimes (last recorded cycle of each unit)
FD001_train.groupby('unit')['cycle'].max().describe()

This tells us that 100 engines (units) were tested and that, on average, an engine operates for approximately 206 cycles before failure. This gives us a baseline lifespan for the FD001 engines.
It is also worth noting that engine lifetimes vary significantly: the engines do not all fail at the same time, which is realistic.

The shortest-lived engine failed after 128 cycles, while the longest-running engine survived 362 cycles.
This gives an idea of the range of failure scenarios and degradation speeds across the engines.


Note: A cycle is a single operating cycle of an engine; the full sequence of cycles recorded for a unit is that engine's run-to-failure history.



#### Visual inspection

Let's pick some sensors and plot their readings for a specific engine unit,
measured across the engine’s lifetime (i.e. from engine start to engine failure).

In [None]:
unit_id = 9  # engine (unit) whose history is plotted
subset = FD001_train[FD001_train['unit'] == unit_id]

# NOTE(review): all sensors share one y-axis; if their magnitudes differ a lot,
# small trends will look flat — consider one subplot per sensor.
for s in ['sensor_2', 'sensor_3', 'sensor_14']:  # sensors to compare (edit this list to plot others)
    plt.plot(subset['cycle'], subset[s], label=s)

plt.title(f'Sensor readings over the life of Unit {unit_id}')
plt.legend()
plt.xlabel('Cycle')
plt.ylabel('Sensor value')
plt.show()

This plot shows that some sensors tend to trend (indicating signs of degradation or variance), while others stay flat throughout the engine's life (indicating low variability) and are hence less informative or non-informative.

This means that feature selection will be needed for our models.

Note: In the above code, you can change the value of "unit_id" to view the life cycle of other engines, or change the sensors listed in the 'for' loop to plot different sensors.

# Data Pre-Processing

### 1) Feature Engineering

The training sets contain no Remaining Useful Life (RUL) values, so we compute them ourselves:

`RUL = max_cycle_for_engine - current_cycle`

In [None]:
# Last recorded cycle of each engine, repeated on every row of that engine
last_cycle_per_unit = FD001_train.groupby('unit')['cycle'].transform('max')
FD001_train['max_cycle'] = last_cycle_per_unit

# Remaining useful life = cycles left until the engine's final recorded cycle
FD001_train['RUL'] = last_cycle_per_unit - FD001_train['cycle']
FD001_train

In [None]:
# Sanity Check RUL

rul_check = FD001_train[['unit', 'cycle', 'RUL']]

# RUL is max_cycle - cycle, so it can never be negative and must reach 0 on
# the final recorded cycle of every engine.
assert (rul_check['RUL'] >= 0).all(), "negative RUL found"
assert (rul_check.groupby('unit')['RUL'].min() == 0).all(), "an engine never reaches RUL 0"

# Only the last expression of a cell is rendered, so show the first and last
# ten rows together in one table.
pd.concat([rul_check.head(10), rul_check.tail(10)])


In [None]:
unit_id = 5  # engine (unit) to inspect
subset = FD001_train[FD001_train['unit'] == unit_id]

# RUL against cycle for the selected engine
fig, ax = plt.subplots()
ax.plot(subset['cycle'], subset['RUL'])
ax.set_xlabel('Cycle')
ax.set_ylabel('RUL')
ax.set_title(f'RUL decay for Unit {unit_id}')
plt.show()


### 2) Feature Selection

Now we identify and remove the sensors with very low variance across all engines.

In [None]:
# Every column whose name begins with 'sensor_'
sensor_cols = [name for name in FD001_train.columns if name.startswith('sensor_')]

# Standard deviation of each sensor over the whole frame, smallest first
overall_sensor_std = FD001_train[sensor_cols].std()
overall_sensor_std.sort_values()


In [None]:
# Identify sensors whose overall standard deviation is below 0.001
# (nothing is dropped here; this only lists the candidates).
sensor_std = FD001_train[sensor_cols].std()
low_variance = sensor_std[sensor_std < 1e-3].index
low_variance


Next, we'll analyse the variance of each sensor on a per-engine level, to make sure that we only remove sensors with low variance across all engines.

In [None]:
# Standard deviation of every sensor within each engine (one row per unit)
readings_by_unit = FD001_train.groupby('unit')
per_engine_var = readings_by_unit[sensor_cols].std()


In [None]:
# Average the per-engine spread of each sensor over all engines,
# ordered from the least to the most variable sensor
mean_engine_var = per_engine_var.mean().sort_values(ascending=True)
mean_engine_var


In [None]:
# Sensors whose mean per-engine spread falls below the 0.001 threshold
below_threshold = mean_engine_var < 1e-3
low_var_sensors = mean_engine_var.loc[below_threshold].index

# Report exactly which sensors are about to be dropped, with their labels
print('List of Sensors to be dropped')
for sensor_name in low_var_sensors:
    print(sensor_name, ' : ', sensor_mapping[sensor_name])

In [None]:
# To drop the columns
# errors='ignore' keeps this cell re-runnable once the columns are already gone.
FD001_train = FD001_train.drop(columns=low_var_sensors, errors='ignore')
FD001_train

Next, we'll check how strongly the remaining sensors correlate with RUL.
This helps us understand which sensors change in a way that is related to RUL.

As the cycle count of each engine unit increases, its RUL decreases and the engine degrades.

We therefore expect informative sensors either to drift monotonically with degradation or to show a clear correlation with RUL (negative for sensors that rise as the engine degrades, positive for sensors that fall).

While variance tells us which sensors change over the course of the record, correlation tells us which sensors change because of degradation.

NOTE: A sensor can have high variance and yet be unrelated to the degradation process (for example, when it mainly reflects the operating conditions).

In [None]:
# Sensors whose mean per-engine spread exceeds the 0.001 threshold
above_threshold = mean_engine_var > 1e-3
high_var_sensors = mean_engine_var.loc[above_threshold].index
high_var_sensors

In [None]:
# Check Correlation with RUL
# high_var_sensors is a pandas Index, and `Index + list` is arithmetic rather
# than list concatenation, so it is converted to a list before 'RUL' is appended.
rul_corr_columns = list(high_var_sensors) + ['RUL']
FD001_train[rul_corr_columns].corr()['RUL'].sort_values()