# 01 - Load and Explore CMAPSS (FD001)

This notebook loads the cleaned CMAPSS (FD001) dataset, previews its rows, computes the Remaining Useful Life (RUL) target, and explores feature distributions and correlations.



In [7]:
# Imports and setup
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Consistent style
sns.set(style="whitegrid")
np.random.seed(42)

project_root = Path("..").resolve().parents[0] if (Path.cwd().name == "notebooks") else Path(".").resolve().parents[0]
data_dir = Path("../data/CMaps").resolve()

# Ensure cleaned data exists
train_clean = data_dir / "train_cleaned.csv"
if not train_clean.exists():
    # Attempt to run loader if file missing
    %run ../scripts/load_data.py



Saved cleaned train to: C:\Users\Daksh Mehta\OneDrive\Desktop\AkashInsights\data\train_cleaned.csv
Saved cleaned test to:  C:\Users\Daksh Mehta\OneDrive\Desktop\AkashInsights\data\test_cleaned.csv


In [8]:
# Ensure we point to cleaned CSVs under data/, and raw .txts under data/CMaps
from pathlib import Path

data_dir = Path("../data").resolve()
raw_dir = data_dir / "CMaps"

# If cleaned files are missing, run the loader (which reads from data/CMaps and writes to data/)
if not (data_dir / "train_cleaned.csv").exists():
    %run ../scripts/load_data.py


In [5]:
# Read the cleaned train and test splits from data_dir
train_path = data_dir / "train_cleaned.csv"
test_path = data_dir / "test_cleaned.csv"

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

# Preview the first rows of the training split
train_df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Daksh Mehta\\OneDrive\\Desktop\\AkashInsights\\data\\CMaps\\train_cleaned.csv'

In [None]:
# Add an RUL column for exploration:
# RUL = (last recorded cycle of the unit) - (current cycle)
cycles = train_df["time_in_cycles"]
max_cycle = cycles.groupby(train_df["unit_number"]).transform("max")

# assign() returns a new frame, so the frame that was loaded is not modified in place
train_df = train_df.assign(RUL=(max_cycle - cycles).astype(int))

print("Rows:", train_df.shape[0])
print("Columns:", train_df.columns.tolist())
train_df.loc[:, ["unit_number", "time_in_cycles", "RUL"]].head()


In [6]:
# Histogram of RUL over all training rows, with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 4))
sns.histplot(train_df["RUL"], bins=50, kde=True, ax=ax)
ax.set_title("RUL Distribution (Train)")
ax.set_xlabel("RUL (cycles)")
ax.set_ylabel("Count")
fig.tight_layout()
plt.show()


NameError: name 'train_df' is not defined

<Figure size 800x400 with 0 Axes>

In [None]:
# Feature correlation heatmap.
# The identifier column (unit_number) and the RUL target are left out;
# every other column of train_df is included.
excluded_cols = {"unit_number", "RUL"}
feature_cols = [c for c in train_df.columns if c not in excluded_cols]

# Keep only numeric columns so .corr() cannot fail on a non-numeric column
corr = train_df[feature_cols].select_dtypes(include="number").corr()

fig, ax = plt.subplots(figsize=(12, 10))
# Correlations are signed and bounded by [-1, 1]: use a diverging colormap pinned
# to that range so 0 maps to the neutral colour and colours are comparable across runs
sns.heatmap(corr, cmap="coolwarm", vmin=-1, vmax=1, center=0, ax=ax)
ax.set_title("Feature Correlation Heatmap")
fig.tight_layout()
plt.show()
