In [None]:
import os
import glob
from pathlib import Path
import yaml

import logging

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# Custom Utilities Module
from utils.paths import get_paths
from utils.file_io import load_data


# Widen pandas' rendering limits (up to 100 columns, 200 characters per row)
# so wide frames are not truncated when displayed.
pd.set_option("display.max_columns", 100, "display.width", 200)


# Configure root logging at INFO level and create the logger used by the cells below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
# Resolve the project's path container once; later cells read locations from it.
paths = get_paths()

# Log each location in turn so the run output records which directories were used.
for path_label, path_attribute in (
    ("Project Root", "root"),
    ("Project Data", "data"),
    ("Data Raw", "data_raw"),
    ("Data Bronze", "data_bronze"),
):
    logger.info(f"{path_label} Path Loaded: {getattr(paths, path_attribute)}")

In [None]:
# The dataset has 26 columns. The first five (unit, cycle and three
# operational settings) are named individually; the remaining 21 are sensor
# channels that share the pattern "s<number>", so they are generated.
COLUMN_NAMES = [
    "unit",
    "cycle",
    *(f"op_setting_{setting_no}" for setting_no in range(1, 4)),
    *(f"s{sensor_no}" for sensor_no in range(1, 22)),
]

# Sanity check: total number of names and a preview of the first ten.
len(COLUMN_NAMES), COLUMN_NAMES[:10]


In [None]:
# Keyword arguments passed to load_data for every raw file.
# The separator is a raw string so that "\s" is read as a regex token rather
# than an (invalid) Python string escape.
data_import_parameters = {"sep": r"\s+", "header": None, "names": COLUMN_NAMES}

# Rows whose remaining useful life (RUL) is at most this many cycles are
# flagged as anomalous.
ANOMALY_HORIZON = 30

FD_IDS = ["FD001", "FD002", "FD003", "FD004"]

# Column groups that lead every bronze frame; they do not change per file,
# so they are defined once instead of inside the loop.
core_columns = ["dataset_name", "fd_id", "machine_id", "time_index", "event_time"]
label_columns = ["RUL", "anomaly_flag"]


def build_bronze_frame(
    raw_df: pd.DataFrame,
    fd_id: str,
    anomaly_horizon: int = ANOMALY_HORIZON,
) -> pd.DataFrame:
    """Label one raw training frame and arrange it in the bronze column order.

    Parameters
    ----------
    raw_df : pd.DataFrame
        Raw frame containing at least the "unit" and "cycle" columns.
    fd_id : str
        Identifier of the source file (e.g. "FD001"), stored in the "fd_id" column.
    anomaly_horizon : int
        Rows with RUL <= this value get anomaly_flag = 1, all others 0.

    Returns
    -------
    pd.DataFrame
        A new frame with core columns first, then label columns, then every
        remaining column in its original order. `raw_df` is not modified.
    """
    # RUL = last observed cycle of the unit minus the current cycle.
    # transform("max") broadcasts each unit's maximum back onto its rows,
    # which replaces the earlier merge-then-drop of a temporary column.
    max_cycle = raw_df.groupby("unit")["cycle"].transform("max")
    remaining_life = max_cycle - raw_df["cycle"]

    labelled = raw_df.assign(
        RUL=remaining_life,
        anomaly_flag=(remaining_life <= anomaly_horizon).astype(int),
        dataset_name="TURBOFAN",
        fd_id=fd_id,
        event_time=raw_df["cycle"],
    ).rename(columns={"unit": "machine_id", "cycle": "time_index"})

    leading_columns = core_columns + label_columns
    data_columns = [name for name in labelled.columns if name not in leading_columns]
    return labelled[leading_columns + data_columns]


# One labelled frame per FD subset, concatenated once after the loop.
all_training_dfs = []

for fd_id in FD_IDS:

    raw_file_name = f"train_{fd_id}.txt"

    raw_df = load_data(paths.data_raw / "CMaps", raw_file_name, **data_import_parameters)

    # Re-apply the names explicitly in case load_data does not forward
    # `names` to the underlying reader — TODO confirm and drop if redundant.
    raw_df.columns = COLUMN_NAMES
    # DataFrame.info() prints to stdout and returns None, so shapes are
    # logged instead of interpolating info() into the message.
    logger.info("Loaded %s: %d rows x %d columns", raw_file_name, *raw_df.shape)

    df_ordered = build_bronze_frame(raw_df, fd_id)
    logger.info(
        "Labelled %s: %d rows x %d columns, %d rows flagged as anomalies",
        fd_id,
        df_ordered.shape[0],
        df_ordered.shape[1],
        int(df_ordered["anomaly_flag"].sum()),
    )

    all_training_dfs.append(df_ordered)


bronze_dfs = pd.concat(all_training_dfs, axis=0, ignore_index=True)
logger.info(
    "Concatenated %d frames vertically into bronze_dfs: %d rows x %d columns",
    len(all_training_dfs),
    *bronze_dfs.shape,
)

In [None]:
# Display the (rows, columns) shape of the concatenated frame.
bronze_dfs.shape

In [None]:
# Print the column dtypes, non-null counts and memory usage of the concatenated frame.
bronze_dfs.info()

In [None]:
# Preview the first ten rows of the concatenated frame.
bronze_dfs.head(10)

In [None]:
# Define the export target in this cell so it runs on a fresh kernel
# (Restart & Run All); the name is not assigned anywhere earlier.
# NOTE(review): the file name is a placeholder — confirm it against the
# project's naming convention. Assumes paths.data_bronze is path-like.
BRONZE_DATA_OUTPUT_PATH = Path(paths.data_bronze) / "turbofan_train_bronze.csv"

# Create the bronze directory if it does not exist yet, so to_csv cannot fail
# on a missing parent folder.
BRONZE_DATA_OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

bronze_dfs.to_csv(BRONZE_DATA_OUTPUT_PATH, index=False)
logger.info("Bronze dataset written to %s", BRONZE_DATA_OUTPUT_PATH)