In [None]:
import os
import glob
from pathlib import Path
import yaml

import logging

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

# Widen the pandas display so wide frames are not truncated when rendered
pd.options.display.max_columns = 100
pd.options.display.width = 200

# Configure root logging at INFO and create this notebook's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [33]:
def get_directories():
    """
    Resolve the application root and its data directory.

    The application root is chosen in this order:
      1. the APP_ROOT environment variable, when it is set and non-empty;
      2. two levels above this file, when __file__ is defined (script run);
      3. the parent of the current working directory (notebook run, where
         __file__ is not defined).

    The working directory, the application root and the data directory are
    logged at INFO level.

    Returns
    -------
    tuple of pathlib.Path
        (APP_ROOT, DATA_DIR), where DATA_DIR is APP_ROOT / "data".
    """
    working_dir = Path().resolve()
    root_override = os.getenv("APP_ROOT")

    if not root_override:
        try:
            # Running as a script: __file__ exists
            app_root = Path(__file__).resolve().parents[1]
        except NameError:
            # Running in a notebook: fall back to the parent of the CWD
            app_root = working_dir.parent
    else:
        app_root = Path(root_override).resolve()

    data_dir = app_root / "data"

    # Log the resolved locations for debugging
    for message in (
        f"CWD:             {working_dir}",
        f"APP_ROOT:        {app_root}",
        f"DATA Directory:  {data_dir}",
    ):
        logger.info(message)

    return app_root, data_dir

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 

def load_data(DATA_PATH, DATA_FILE, **kwargs):
    """
    Read a delimited text file into a pandas DataFrame.

    Parameters
    ----------
    DATA_PATH : pathlib.Path
        Directory that contains the file; joined to DATA_FILE with `/`.
    DATA_FILE : str
        Name of the file inside DATA_PATH.
    **kwargs
        Forwarded unchanged to pd.read_csv.

    Returns
    -------
    pandas.DataFrame or None
        The loaded frame, or None when building the path or reading the file
        fails for any reason. The failure is logged with its traceback.
    """
    try:
        data = pd.read_csv(DATA_PATH / DATA_FILE, **kwargs)
    except Exception as e:
        # Best-effort contract kept from the original: callers get None, not
        # an exception. Report through the logger (with traceback) rather
        # than print so the failure shows up alongside the other log records.
        logger.exception(f"Error reading CSV file: {e}")
        return None

    # DataFrame.info() prints to stdout and returns None, so interpolating it
    # into the message logged the text "None". Log the shape instead.
    n_rows, n_columns = data.shape
    logger.info(f"Data File Loaded: {DATA_FILE} ({n_rows} rows x {n_columns} columns)")
    return data

    
    #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 


In [44]:
# Project-wide path constants

# Root folders, resolved by get_directories()
APP_ROOT, DATA_DIR = get_directories()

# Raw layer: <DATA_DIR>/raw/<dataset sub-folder>
RAW_DATA_SUBDIR = "NASA_Turbofan_Jet_Engine_Dataset/CMaps"
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
RAW_DATA_PATH = RAW_DATA_DIR.joinpath(RAW_DATA_SUBDIR)

# Bronze layer: <DATA_DIR>/bronze/<output file>
BRONZE_DATA_FILE_NAME = "turbofan_train_bronze.csv"
BRONZE_DATA_DIR = DATA_DIR.joinpath("bronze")
BRONZE_DATA_OUTPUT_PATH = BRONZE_DATA_DIR.joinpath(BRONZE_DATA_FILE_NAME)


INFO:__main__:CWD:             /notebooks
INFO:__main__:APP_ROOT:        /
INFO:__main__:DATA Directory:  /data


In [35]:
# Show the resolved raw-data folder before any file is read from it.
print(RAW_DATA_PATH)

/data/raw/NASA_Turbofan_Jet_Engine_Dataset/CMaps


In [36]:
# The dataset has 26 columns and the files carry no header row, so the
# names are defined here:
#   - 2 identifier columns: "unit" and "cycle"
#   - 3 operating settings: "op_setting_1" .. "op_setting_3"
#   - 21 sensor readings:   "s1" .. "s21"
# The numbered groups share a naming pattern, so they are generated rather
# than typed out.
COLUMN_NAMES = [
    "unit",
    "cycle",
    *[f"op_setting_{n}" for n in range(1, 4)],
    *[f"s{n}" for n in range(1, 22)],
]

# Display the count and the first ten names to verify the list.
len(COLUMN_NAMES), COLUMN_NAMES[:10]


(26,
 ['unit',
  'cycle',
  'op_setting_1',
  'op_setting_2',
  'op_setting_3',
  's1',
  's2',
  's3',
  's4',
  's5'])

In [None]:
# Keyword arguments forwarded to pd.read_csv through load_data: fields are
# separated by runs of whitespace, there is no header row, and the column
# names come from COLUMN_NAMES.
# The separator is a raw string because "\s" is not a valid escape sequence
# in a regular string literal.
data_import_parameters = {"sep": r"\s+", "header": None, "names": COLUMN_NAMES}

# Rows with RUL at or below this many cycles get anomaly_flag = 1
ANOMALY_HORIZON = 30

FD_IDS = ["FD001", "FD002", "FD003", "FD004"]

# Leading columns of every bronze frame, in output order. These do not depend
# on the file being processed, so they are defined once outside the loop.
core_columns = ["dataset_name", "fd_id", "machine_id", "time_index", "event_time"]
label_columns = ["RUL", "anomaly_flag"]


def _to_bronze(raw_df, fd_id):
    """
    Turn one raw training frame into the bronze column layout.

    Adds RUL (the unit's largest cycle minus the row's cycle), anomaly_flag
    (1 where RUL <= ANOMALY_HORIZON, else 0), dataset_name, fd_id and
    event_time (a copy of cycle); renames unit -> machine_id and
    cycle -> time_index; and orders the columns as core, label, then every
    remaining column in its original order. raw_df itself is not modified.

    Parameters
    ----------
    raw_df : pandas.DataFrame
        Frame with at least the "unit" and "cycle" columns.
    fd_id : str
        Identifier of the source file, stored in the fd_id column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows, in the same order, as raw_df.
    """
    df = raw_df.copy()

    # Per-row maximum cycle of the row's unit; equivalent to merging the
    # grouped maximum back on "unit" but without the temporary column.
    last_cycle = df.groupby("unit")["cycle"].transform("max")

    df["RUL"] = last_cycle - df["cycle"]
    df["anomaly_flag"] = (df["RUL"] <= ANOMALY_HORIZON).astype(int)
    df["dataset_name"] = "TURBOFAN"
    df["fd_id"] = fd_id
    df["event_time"] = df["cycle"]

    df = df.rename(columns={"unit": "machine_id", "cycle": "time_index"})

    leading_columns = core_columns + label_columns
    data_columns = [name for name in df.columns if name not in leading_columns]
    return df[leading_columns + data_columns]


# One bronze frame per FD file, concatenated after the loop
all_training_dfs = []

for fd_id in FD_IDS:

    RAW_DATA_FILE = f"train_{fd_id}.txt"

    raw_df = load_data(RAW_DATA_PATH, RAW_DATA_FILE, **data_import_parameters)

    # load_data returns None when the read fails; stop here with a clear
    # message instead of failing later with an unrelated AttributeError.
    if raw_df is None:
        raise RuntimeError(f"Could not load {RAW_DATA_PATH / RAW_DATA_FILE}")

    df_ordered = _to_bronze(raw_df, fd_id)

    # Log a one-line summary. DataFrame.info() prints its table to stdout and
    # returns None, so it cannot be interpolated into a log message.
    logger.info(
        f"{fd_id}: bronze frame built with {df_ordered.shape[0]} rows x "
        f"{df_ordered.shape[1]} columns"
    )

    all_training_dfs.append(df_ordered)


bronze_dfs = pd.concat(all_training_dfs, axis=0, ignore_index=True)
logger.info(
    f"All Dataframe concatenated into a single dataframe, vertically: "
    f"{bronze_dfs.shape[0]} rows x {bronze_dfs.shape[1]} columns"
)

NameError: name 'COLUMN_NAMES' is not defined

In [38]:
# (rows, columns) of the concatenated bronze frame.
bronze_dfs.shape

(160359, 31)

In [39]:
# Column names, non-null counts and dtypes of the concatenated bronze frame.
bronze_dfs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160359 entries, 0 to 160358
Data columns (total 31 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   dataset_name  160359 non-null  object 
 1   fd_id         160359 non-null  object 
 2   machine_id    160359 non-null  int64  
 3   time_index    160359 non-null  int64  
 4   event_time    160359 non-null  int64  
 5   RUL           160359 non-null  int64  
 6   anomaly_flag  160359 non-null  int64  
 7   op_setting_1  160359 non-null  float64
 8   op_setting_2  160359 non-null  float64
 9   op_setting_3  160359 non-null  float64
 10  s1            160359 non-null  float64
 11  s2            160359 non-null  float64
 12  s3            160359 non-null  float64
 13  s4            160359 non-null  float64
 14  s5            160359 non-null  float64
 15  s6            160359 non-null  float64
 16  s7            160359 non-null  float64
 17  s8            160359 non-null  float64
 18  s9  

In [40]:
# Preview the first ten rows to check the column order and values.
bronze_dfs.head(10)

Unnamed: 0,dataset_name,fd_id,machine_id,time_index,event_time,RUL,anomaly_flag,op_setting_1,op_setting_2,op_setting_3,s1,s2,s3,s4,s5,s6,s7,s8,s9,s10,s11,s12,s13,s14,s15,s16,s17,s18,s19,s20,s21
0,TURBOFAN,FD001,1,1,1,191,0,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,TURBOFAN,FD001,1,2,2,190,0,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,TURBOFAN,FD001,1,3,3,189,0,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,TURBOFAN,FD001,1,4,4,188,0,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,TURBOFAN,FD001,1,5,5,187,0,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044
5,TURBOFAN,FD001,1,6,6,186,0,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,21.61,554.67,2388.02,9049.68,1.3,47.16,521.68,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669
6,TURBOFAN,FD001,1,7,7,185,0,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,21.61,554.34,2388.02,9059.13,1.3,47.36,522.32,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774
7,TURBOFAN,FD001,1,8,8,184,0,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,21.61,553.85,2388.0,9040.8,1.3,47.24,522.47,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106
8,TURBOFAN,FD001,1,9,9,183,0,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,21.61,553.69,2388.05,9046.46,1.3,47.29,521.79,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066
9,TURBOFAN,FD001,1,10,10,182,0,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,21.61,553.59,2388.05,9051.7,1.3,47.03,521.79,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694


In [45]:
# Write the bronze frame to CSV without the index column.
# to_csv does not create missing folders, so make sure the target directory
# exists first (a no-op when it is already there).
BRONZE_DATA_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
bronze_dfs.to_csv(BRONZE_DATA_OUTPUT_PATH, index=False)