In [1]:
# Data Preparation Script: Splitting Engineered Dataset

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

# File paths
INPUT_FILE = "/Users/aryan/Desktop/PrognosticEngine/data/train_FD001_features.csv"
TRAIN_OUTPUT = "/Users/aryan/Desktop/PrognosticEngine/data/train_split.csv"
VALIDATION_OUTPUT = "/Users/aryan/Desktop/PrognosticEngine/data/validation_split.csv"

# Load the dataset
def load_data(file_path):
    """Read the CSV at ``file_path`` and return it as a pandas DataFrame.

    ``file_path`` is passed straight to ``pd.read_csv`` with default options.
    """
    frame = pd.read_csv(file_path)
    return frame

# Split data into features and target
def split_features_target(df, target_column='RUL'):
    """Separate ``df`` into a feature frame and a target series.

    Args:
        df: DataFrame that contains ``target_column``.
        target_column: Name of the column to use as the target.

    Returns:
        A tuple ``(X, y)``: ``X`` is ``df`` without ``target_column`` and
        ``y`` is the ``target_column`` column. ``df`` itself is not modified.
    """
    # The tuple is evaluated left to right, so the drop still happens first.
    return df.drop(columns=[target_column]), df[target_column]

# Standardize features
def standardize_features(X):
    """Fit a default ``StandardScaler`` on ``X`` and scale ``X`` with it.

    Returns:
        A tuple ``(X_scaled, scaler)``: the transformed features as returned
        by the scaler, and the fitted scaler so the same transformation can
        be applied to other data later.
    """
    fitted_scaler = StandardScaler().fit(X)
    return fitted_scaler.transform(X), fitted_scaler

# Save split data
def save_split_data(X, y, train_output, validation_output, test_size=0.2, random_state=42,
                    target_column='RUL'):
    """Split the data into training and validation sets and save them to CSV files.

    Args:
        X: Feature DataFrame (its ``columns`` are reused for the output files).
        y: Target values aligned with ``X``; must expose ``.values``
            (e.g. a pandas Series).
        train_output: Destination of the training CSV.
        validation_output: Destination of the validation CSV.
        test_size: Passed to ``train_test_split``; share of rows that go to
            the validation file.
        random_state: Passed to ``train_test_split`` so the split is
            reproducible.
        target_column: Column name under which the target is written.
            Defaults to ``'RUL'``, the name that was previously hard-coded, so
            existing callers get identical files.

    Returns:
        None. The two CSV files are written without the index column.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Both partitions are assembled and written the same way.
    for features, target, destination in (
        (X_train, y_train, train_output),
        (X_val, y_val, validation_output),
    ):
        split_frame = pd.DataFrame(features, columns=X.columns)
        # ``.values`` drops the shuffled index so the target is attached by
        # position rather than by label.
        split_frame[target_column] = target.values
        split_frame.to_csv(destination, index=False)

if __name__ == "__main__":
    # Entry point: load the engineered dataset, split it, scale the features,
    # and write the training and validation CSV files.
    print("Loading data...")
    data = load_data(INPUT_FILE)

    print("Splitting features and target...")
    X, y = split_features_target(data)

    # Split BEFORE scaling. Fitting the scaler on the full dataset would let
    # the validation rows influence the mean/variance applied to the training
    # features (data leakage) and make validation scores optimistic.
    # NOTE(review): rows are split at random; if rows belonging to the same
    # unit/engine are correlated, a group-wise split may be needed - confirm.
    print("Splitting data into training and validation sets...")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    print("Standardizing features...")
    # The scaler learns its statistics from the training rows only and is then
    # reused unchanged on the validation rows.
    X_train_scaled, scaler = standardize_features(X_train)
    X_val_scaled = scaler.transform(X_val)

    for features, target, destination in (
        (X_train_scaled, y_train, TRAIN_OUTPUT),
        (X_val_scaled, y_val, VALIDATION_OUTPUT),
    ):
        split_frame = pd.DataFrame(features, columns=X.columns)
        # Attach the target by position; the split shuffled the index.
        split_frame['RUL'] = target.values
        split_frame.to_csv(destination, index=False)

    print("Data preparation complete!")
    print(f"Training data saved to: {TRAIN_OUTPUT}")
    print(f"Validation data saved to: {VALIDATION_OUTPUT}")


Loading data...
Splitting features and target...
Standardizing features...
Splitting data into training and validation sets...
Data preparation complete!
Training data saved to: /Users/aryan/Desktop/PrognosticEngine/data/train_split.csv
Validation data saved to: /Users/aryan/Desktop/PrognosticEngine/data/validation_split.csv
