In [8]:
import os
import glob
from pathlib import Path
import yaml

import logging

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# Custom Utilities Module
from utils.paths import get_paths
from utils.file_io import load_data, ingest_data, save_data


# Widen pandas' display limits so wide frames are shown without column truncation.
pd.options.display.max_columns = 100
pd.options.display.width = 200

# Send INFO-and-above records to the default handler and create this notebook's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [9]:
# Resolve the project's directory layout via the utils.paths helper.
paths = get_paths()

# Log each resolved location. The values are passed as logging arguments, so the
# logger substitutes them into the %s placeholder when the record is emitted.
logger.info("Project Root Path Loaded: %s", paths.root)
logger.info("Project Data Path Loaded: %s", paths.data)
logger.info("Data Raw Path Loaded: %s", paths.data_raw)
logger.info("Data Bronze Path Loaded: %s", paths.data_bronze)
logger.info("Data Bronze Training Path Loaded:  %s", paths.data_bronze_train)
logger.info("Data Bronze Testing Path Loaded:  %s", paths.data_bronze_test)

INFO:__main__:Project Root Path Loaded: /workspace
INFO:__main__:Project Data Path Loaded: /workspace/data
INFO:__main__:Data Raw Path Loaded: /workspace/data/raw
INFO:__main__:Data Bronze Path Loaded: /workspace/data/bronze
INFO:__main__:Data Bronze Training Path Loaded:  /workspace/data/bronze/train
INFO:__main__:Data Bronze Testing Path Loaded:  /workspace/data/bronze/test


In [10]:
# Load the four raw TEP CSV files: fault-free and faulty, each with a train and a test file.
tep_csv_dir = paths.data_raw / "TEP_Dataset_CSV"


def _ingest_tep_csv(file_name, split, label_type):
    """Ingest one TEP CSV from ``tep_csv_dir`` with the tags shared by all four files.

    Every file is ingested with ``dataset_name="TEP"`` and ``is_labeled="yes"``;
    only the file name, the split and the label type differ between calls.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside ``tep_csv_dir``.
    split : str
        Split tag forwarded to ``ingest_data`` ("train" or "test" here).
    label_type : str
        Label tag forwarded to ``ingest_data`` ("normal" or "abnormal" here).

    Returns
    -------
    Whatever ``ingest_data`` returns for that file.
    """
    return ingest_data(
        tep_csv_dir,
        file_name=file_name,
        dataset_name="TEP",
        split=split,
        is_labeled="yes",
        label_type=label_type,
    )


# Fault-free runs.
ff_train_df = _ingest_tep_csv("TEP_FaultFree_Training.csv", "train", "normal")
ff_test_df = _ingest_tep_csv("TEP_FaultFree_Testing.csv", "test", "normal")

# Faulty runs.
f_train_df = _ingest_tep_csv("TEP_Faulty_Training.csv", "train", "abnormal")
f_test_df = _ingest_tep_csv("TEP_Faulty_Testing.csv", "test", "abnormal")



INFO:utils.file_io:Loading CSV file: /workspace/data/raw/TEP_Dataset_CSV/TEP_FaultFree_Training.csv
INFO:utils.file_io:Loaded CSV: TEP_FaultFree_Training.csv | shape=(250000, 55) | columns=['faultNumber', 'simulationRun', 'sample', 'xmeas_1', 'xmeas_2', 'xmeas_3', 'xmeas_4', 'xmeas_5', 'xmeas_6', 'xmeas_7', 'xmeas_8', 'xmeas_9', 'xmeas_10', 'xmeas_11', 'xmeas_12', 'xmeas_13', 'xmeas_14', 'xmeas_15', 'xmeas_16', 'xmeas_17', 'xmeas_18', 'xmeas_19', 'xmeas_20', 'xmeas_21', 'xmeas_22', 'xmeas_23', 'xmeas_24', 'xmeas_25', 'xmeas_26', 'xmeas_27', 'xmeas_28', 'xmeas_29', 'xmeas_30', 'xmeas_31', 'xmeas_32', 'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmeas_37', 'xmeas_38', 'xmeas_39', 'xmeas_40', 'xmeas_41', 'xmv_1', 'xmv_2', 'xmv_3', 'xmv_4', 'xmv_5', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_10', 'xmv_11']
INFO:utils.file_io:Loading CSV file: /workspace/data/raw/TEP_Dataset_CSV/TEP_FaultFree_Testing.csv
INFO:utils.file_io:Loaded CSV: TEP_FaultFree_Testing.csv | shape=(480000, 55) | colum

In [11]:

# Stack the fault-free and faulty frames row-wise, one combined frame per split.
# ignore_index=True discards the input indexes and numbers the rows 0..n-1.
# (A single frame holding all four inputs, tep_all_df, is left disabled.)

# Training split: fault-free + faulty.
tep_all_training_df = pd.concat(
    (ff_train_df, f_train_df),
    axis=0,
    ignore_index=True,
)

# Testing split: fault-free + faulty.
tep_all_testing_df = pd.concat(
    (ff_test_df, f_test_df),
    axis=0,
    ignore_index=True,
)

In [12]:
# Disabled: optional "global_row_id" column for each combined frame.
# Each line below would assign a 0..len(frame)-1 int64 sequence to the frame as a
# new column. All three are commented out, so this cell currently does nothing.
#tep_all_df["global_row_id"] = np.arange(len(tep_all_df), dtype=np.int64)


#tep_all_training_df["global_row_id"] = np.arange(len(tep_all_training_df), dtype=np.int64)

#tep_all_testing_df["global_row_id"] = np.arange(len(tep_all_testing_df), dtype=np.int64)

In [13]:
# Reorder both combined frames so the ingestion-metadata columns come first and
# every other column keeps its existing relative order.
#
# Candidate metadata names, in the order they should lead the frame. Names a frame
# does not contain are skipped. "row_id" and "bronze_ingested_at" are retained from
# the earlier list; "ingest_utc" and "_source_row_id" are the names that appear in
# the saved training frame's column list, and they are placed last so the resulting
# order matches the column order shown in that save log.
META_COLUMN_CANDIDATES = [
    "_source_file",
    "row_id",
    "dataset_name",
    "bronze_ingested_at",
    "run_id",
    "split",
    "is_labeled",
    "label_type",
    "ingest_utc",
    "_source_row_id",
]


def _with_meta_columns_first(frame: pd.DataFrame, candidates=META_COLUMN_CANDIDATES) -> pd.DataFrame:
    """Return ``frame`` with its metadata columns moved to the front.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose columns should be reordered. It is not modified.
    candidates : sequence of str
        Metadata column names in the desired leading order. Names missing from
        ``frame`` are ignored.

    Returns
    -------
    pandas.DataFrame
        A frame with the same columns as ``frame``: the candidates that exist
        first (in ``candidates`` order), then all remaining columns in their
        original order.
    """
    leading = [name for name in candidates if name in frame.columns]
    already_placed = set(leading)
    trailing = [name for name in frame.columns if name not in already_placed]
    return frame[leading + trailing]


# Each frame is filtered against the full candidate list on its own, so a column
# that exists in only one of the two frames is still moved to the front there.
tep_all_training_df = _with_meta_columns_first(tep_all_training_df)
tep_all_testing_df = _with_meta_columns_first(tep_all_testing_df)

# Kept for any later cell that reads `meta_columns`: the candidates present in both frames.
meta_columns = [
    name
    for name in META_COLUMN_CANDIDATES
    if name in tep_all_training_df.columns and name in tep_all_testing_df.columns
]


In [14]:
# Write each combined split into its own bronze directory via save_data.
# (Writing a single all-splits file, "tep_combined_bronze", is left disabled.)
training_parquet_path = save_data(
    tep_all_training_df,
    paths.data_bronze_train,
    "tep_training_combined_bronze",
)
testing_parquet_path = save_data(
    tep_all_testing_df,
    paths.data_bronze_test,
    "tep_testing_combined_bronze",
)

# Show the value returned for the testing file as the cell's output.
testing_parquet_path


INFO:utils.file_io:Saving DataFrame to Parquet: /workspace/data/bronze/train/tep_training_combined_bronze.parquet
INFO:utils.file_io:Saved: tep_training_combined_bronze.parquet | shape=(5250000, 63) | columns=['_source_file', 'dataset_name', 'run_id', 'split', 'is_labeled', 'label_type', 'ingest_utc', '_source_row_id', 'faultNumber', 'simulationRun', 'sample', 'xmeas_1', 'xmeas_2', 'xmeas_3', 'xmeas_4', 'xmeas_5', 'xmeas_6', 'xmeas_7', 'xmeas_8', 'xmeas_9', 'xmeas_10', 'xmeas_11', 'xmeas_12', 'xmeas_13', 'xmeas_14', 'xmeas_15', 'xmeas_16', 'xmeas_17', 'xmeas_18', 'xmeas_19', 'xmeas_20', 'xmeas_21', 'xmeas_22', 'xmeas_23', 'xmeas_24', 'xmeas_25', 'xmeas_26', 'xmeas_27', 'xmeas_28', 'xmeas_29', 'xmeas_30', 'xmeas_31', 'xmeas_32', 'xmeas_33', 'xmeas_34', 'xmeas_35', 'xmeas_36', 'xmeas_37', 'xmeas_38', 'xmeas_39', 'xmeas_40', 'xmeas_41', 'xmv_1', 'xmv_2', 'xmv_3', 'xmv_4', 'xmv_5', 'xmv_6', 'xmv_7', 'xmv_8', 'xmv_9', 'xmv_10', 'xmv_11']
INFO:utils.file_io:Saving DataFrame to Parquet: /work

PosixPath('/workspace/data/bronze/test/tep_testing_combined_bronze.parquet')