In [6]:
# Enhanced Feature Engineering Script for CMAPSS Dataset

import pandas as pd
import numpy as np

# File paths
# RAW_FILE is the input read by load_raw_data() in the main block;
# OUTPUT_FILE is where the main block writes the engineered feature table as CSV.
# NOTE(review): both are absolute paths under a single user's home directory —
# presumably they must be edited before running on another machine.
RAW_FILE = "/Users/aryan/Desktop/PrognosticEngine/data/train_FD001.txt"
OUTPUT_FILE = "/Users/aryan/Desktop/PrognosticEngine/data/train_FD001_features_enhanced.csv"

# Load the raw data
def load_raw_data(file_path):
    """Read a headerless, whitespace-delimited data file into a DataFrame.

    The file is expected to have 26 fields per row, which are named, in order:
    ``engine_id``, ``cycle``, ``operational_setting_1`` .. ``operational_setting_3``
    and ``sensor_measurement_1`` .. ``sensor_measurement_21``.

    Args:
        file_path: Path (or any object accepted by ``pandas.read_csv``) of the
            raw text file.

    Returns:
        A DataFrame with one row per line of the file and the 26 named columns.
    """
    column_names = [
        'engine_id', 'cycle', 'operational_setting_1', 'operational_setting_2', 'operational_setting_3'
    ] + [f'sensor_measurement_{i}' for i in range(1, 22)]

    # `delim_whitespace=True` is deprecated in recent pandas releases and emits
    # a FutureWarning; `sep=r"\s+"` is the documented equivalent (any run of
    # whitespace acts as one delimiter).
    df = pd.read_csv(file_path, sep=r"\s+", header=None, names=column_names)
    return df

# Calculate Remaining Useful Life (RUL)
def calculate_rul(df):
    """Attach an ``RUL`` column: cycles left until each engine's last recorded cycle.

    For every row, RUL is the largest ``cycle`` seen for that row's
    ``engine_id`` minus the row's own ``cycle``. The column is written into
    ``df`` itself, and the same object is returned.
    """
    # Broadcast each engine's final cycle number back onto all of its rows.
    final_cycle = df.groupby('engine_id')['cycle'].transform('max')
    df['RUL'] = final_cycle.sub(df['cycle'])
    return df

# Create lag features
def add_lag_features(df, sensor_columns, lags=(1, 2, 3)):
    """Add per-engine lagged copies of the given columns.

    For every ``lag`` in ``lags`` and every ``col`` in ``sensor_columns`` a
    column named ``"<col>_lag<lag>"`` is written into ``df``. It holds the
    value ``col`` had ``lag`` rows earlier within the same ``engine_id``
    group; the first ``lag`` rows of each engine have no earlier value and
    are NaN.

    Lags follow the existing row order of ``df``; this function does not sort.

    Args:
        df: DataFrame containing ``engine_id`` and every column in
            ``sensor_columns``. Modified in place.
        sensor_columns: Iterable of column names to lag.
        lags: Iterable of row offsets. The default is an immutable tuple so
            the default value cannot be shared and mutated across calls.

    Returns:
        The same ``df`` object, with the lag columns added.
    """
    # The engine grouping is the same for every (lag, column) pair, so build
    # it once instead of re-grouping inside the nested loops.
    by_engine = df.groupby('engine_id')
    for lag in lags:
        for col in sensor_columns:
            df[f"{col}_lag{lag}"] = by_engine[col].shift(lag)
    return df

# Create rolling average features
def add_rolling_features(df, sensor_columns, windows=(3, 5)):
    """Add per-engine rolling-mean columns for the given columns.

    For every ``window`` in ``windows`` and every ``col`` in
    ``sensor_columns`` a column named ``"<col>_roll<window>"`` is written
    into ``df``. It holds the mean of ``col`` over the current row and the
    preceding ``window - 1`` rows of the same ``engine_id``. Rows that do not
    yet have ``window`` observations within their engine are NaN (pandas'
    default ``min_periods`` for a fixed-size window).

    Windows follow the existing row order of ``df``; this function does not sort.

    Args:
        df: DataFrame containing ``engine_id`` and every column in
            ``sensor_columns``. Modified in place.
        sensor_columns: Iterable of column names to average.
        windows: Iterable of window sizes in rows. The default is an
            immutable tuple so the default value cannot be shared and
            mutated across calls.

    Returns:
        The same ``df`` object, with the rolling-mean columns added.
    """
    # The engine grouping is the same for every (window, column) pair, so
    # build it once instead of re-grouping inside the nested loops.
    by_engine = df.groupby('engine_id')
    for window in windows:
        for col in sensor_columns:
            rolled = by_engine[col].rolling(window).mean()
            # A grouped rolling result is indexed by (engine_id, original row
            # label). Dropping level 0 leaves the original row labels so the
            # assignment aligns with df's own index.
            df[f"{col}_roll{window}"] = rolled.reset_index(0, drop=True)
    return df

# Create interaction features
def add_interaction_features(df, sensor_columns):
    """Add element-wise product columns for a fixed set of sensor pairs.

    The pairs are sensors (4, 11), (4, 9) and (11, 9). A pair is used only
    when both of its column names appear in ``sensor_columns``; the product
    is stored in ``df`` as ``"<first>_x_<second>"``. ``df`` is modified in
    place and returned.
    """
    candidate_pairs = (
        ('sensor_measurement_4', 'sensor_measurement_11'),
        ('sensor_measurement_4', 'sensor_measurement_9'),
        ('sensor_measurement_11', 'sensor_measurement_9'),
    )
    for left, right in candidate_pairs:
        # Skip any pair with a member missing from the caller's column list.
        if left not in sensor_columns or right not in sensor_columns:
            continue
        product_name = f"{left}_x_{right}"
        df[product_name] = df[left] * df[right]
    return df

# Drop columns with no useful data
def drop_unused_columns(df, unused_columns):
    """Return ``df`` without the columns named in ``unused_columns``.

    Names that are not present in ``df`` are silently skipped. The input
    frame is left as it was; a new frame is returned.
    """
    trimmed = df.drop(labels=unused_columns, axis=1, errors='ignore')
    return trimmed

# Script entry point: runs the full pipeline from RAW_FILE to OUTPUT_FILE.
# The stages are order-dependent; each one consumes the frame produced by the
# previous stage.
if __name__ == "__main__":
    print("Loading raw data...")
    raw_df = load_raw_data(RAW_FILE)

    print("Calculating RUL...")
    raw_df = calculate_rul(raw_df)

    print("Dropping unused columns...")
    # These two sensors are removed before the sensor list below is built, so
    # no lag, rolling or interaction features are derived from them.
    unused_columns = ['sensor_measurement_16', 'sensor_measurement_17']
    raw_df = drop_unused_columns(raw_df, unused_columns)

    print("Adding lag features...")
    # Captured once, before any derived columns exist: every column whose name
    # contains 'sensor_measurement'. Because the list is fixed here, the later
    # stages operate on the raw sensor columns only, not on the lag columns
    # (whose names also contain that substring).
    sensor_columns = [col for col in raw_df.columns if 'sensor_measurement' in col]
    raw_df = add_lag_features(raw_df, sensor_columns)

    print("Adding rolling average features...")
    raw_df = add_rolling_features(raw_df, sensor_columns)

    print("Adding interaction features...")
    raw_df = add_interaction_features(raw_df, sensor_columns)

    print("Filling missing values...")
    # Replaces every NaN in every column with 0. This includes the leading
    # rows of each engine, where lag and rolling values are undefined.
    # NOTE(review): 0 may lie far outside the real sensor ranges — confirm that
    # zero-filling (rather than dropping or back-filling those rows) is intended.
    raw_df.fillna(0, inplace=True)

    print("Saving enhanced engineered features...")
    # index=False: the row index is not written as a CSV column.
    raw_df.to_csv(OUTPUT_FILE, index=False)

    print("Enhanced feature engineering complete! File saved at:", OUTPUT_FILE)

Loading raw data...
Calculating RUL...
Dropping unused columns...
Adding lag features...
Adding rolling average features...
Adding interaction features...
Filling missing values...
Saving enhanced engineered features...


  df = pd.read_csv(file_path, delim_whitespace=True, header=None, names=column_names)
  df[f"{col1}_x_{col2}"] = df[col1] * df[col2]


Enhanced feature engineering complete! File saved at: /Users/aryan/Desktop/PrognosticEngine/data/train_FD001_features_enhanced.csv
