In [1]:
# Preprocessing Script for CMAPSS Dataset

import pandas as pd
import numpy as np
import os

# Dataset locations (edit DATA_DIR if the files live somewhere else)
DATA_DIR = "/Users/aryan/Desktop/PrognosticEngine/data"
TRAIN_FILE, TEST_FILE, RUL_FILE = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("train_FD001.txt", "test_FD001.txt", "RUL_FD001.txt")
)

# Column layout: 2 identifier columns, 3 operational settings, 21 sensor
# measurements (26 names in total)
COLUMN_NAMES = (
    ["engine_id", "cycle"]
    + [f"operational_setting_{i}" for i in range(1, 4)]
    + [f"sensor_measurement_{i}" for i in range(1, 22)]
)

# Load the data
def load_data(file_path, column_names):
    """Load a whitespace-delimited, header-less data file into a DataFrame.

    Args:
        file_path: Path of the text file to read.
        column_names: Names assigned to the columns, in file order.

    Returns:
        A DataFrame with a default integer index whose columns are named after
        ``column_names``. Columns that contain no values at all are dropped.
    """
    frame = pd.read_csv(
        file_path,
        # Split on runs of whitespace rather than on every single blank: with
        # sep=" ", trailing blanks at the end of a line become extra empty
        # fields, and pandas then turns the leading columns into the index,
        # shifting every name onto the wrong column.
        sep=r"\s+",
        header=None,
        names=column_names,
        # Never promote data columns to the index, even if a line carries
        # more fields than there are names.
        index_col=False,
    )
    # Only discard columns that are empty throughout; a single missing reading
    # must not silently remove a whole column.
    return frame.dropna(axis=1, how="all")

# Add Remaining Useful Life (RUL)
def add_rul_column(df):
    """Return a copy of ``df`` with a Remaining Useful Life (``RUL``) column.

    The RUL of a row is the largest ``cycle`` recorded for that row's
    ``engine_id`` minus the row's own ``cycle``, so the last recorded cycle of
    each engine gets ``RUL == 0``.

    Args:
        df: DataFrame containing ``engine_id`` and ``cycle`` columns.

    Returns:
        A new DataFrame with the index and columns of ``df`` plus ``RUL`` as
        the last column. ``df`` itself is not modified.

    Raises:
        KeyError: If ``engine_id`` or ``cycle`` is missing from ``df``.
    """
    # transform() broadcasts each engine's maximum back onto its own rows, so
    # no merge and no temporary helper column are needed; the caller's index
    # is kept and a pre-existing "max_cycle" column cannot clash.
    last_cycle = df.groupby('engine_id')['cycle'].transform('max')
    return df.assign(RUL=last_cycle - df['cycle'])

# Normalize the data
def normalize_data(df, columns):
    """Standardize the selected columns to zero mean and unit spread (z-score).

    Every listed column is rescaled with its own mean and standard deviation
    (as returned by ``Series.mean`` / ``Series.std``) computed from ``df``.

    A column whose standard deviation is NaN (e.g. a single row) or zero to
    within ``np.isclose``'s default tolerance carries no usable spread;
    dividing by it would yield NaN/inf, so such a column is only centred: its
    non-missing entries become ``0.0``.

    Args:
        df: DataFrame holding the columns to rescale. It is modified in place.
        columns: Iterable of column names to standardize.

    Returns:
        The same ``df`` object, with the selected columns overwritten.
    """
    for col in columns:
        mean = df[col].mean()
        std = df[col].std()
        centered = df[col] - mean
        if pd.isna(std) or np.isclose(std, 0.0):
            # Degenerate spread: keep missing entries missing, zero the rest.
            df[col] = centered.mask(centered.notna(), 0.0)
        else:
            df[col] = centered / std
    return df

# Save preprocessed data
def save_preprocessed_data(df, output_path):
    """Write ``df`` to ``output_path`` as CSV, leaving out the index column."""
    csv_options = {"index": False}
    df.to_csv(output_path, **csv_options)

if __name__ == "__main__":
    # Pipeline: load train/test files, label the training rows with RUL,
    # standardize the sensor columns, then write both frames back as CSV.
    print("Loading data...")
    train_df = load_data(TRAIN_FILE, COLUMN_NAMES)
    test_df = load_data(TEST_FILE, COLUMN_NAMES)

    print("Adding RUL column to training data...")
    train_df = add_rul_column(train_df)

    print("Normalizing data...")
    sensor_columns = [col for col in train_df.columns if 'sensor_measurement' in col]

    # Capture the training statistics BEFORE normalize_data overwrites the
    # training columns: the test set has to be scaled with the very same
    # mean/std, otherwise train and test features live on different scales.
    train_mean = train_df[sensor_columns].mean()
    train_std = train_df[sensor_columns].std()
    # A (near-)zero or undefined spread would turn the division into NaN/inf;
    # dividing by 1 instead leaves such columns merely centred.
    train_std = train_std.mask(train_std.isna() | np.isclose(train_std, 0.0), 1.0)

    train_df = normalize_data(train_df, sensor_columns)
    test_df[sensor_columns] = (test_df[sensor_columns] - train_mean) / train_std

    print("Saving preprocessed data...")
    save_preprocessed_data(train_df, os.path.join(DATA_DIR, "train_FD001_preprocessed.csv"))
    save_preprocessed_data(test_df, os.path.join(DATA_DIR, "test_FD001_preprocessed.csv"))

    print("Preprocessing complete!")


Loading data...
Adding RUL column to training data...
Normalizing data...
Saving preprocessed data...
Preprocessing complete!
