In [4]:
import os
import glob
from pathlib import Path
import yaml

import logging

import pandas as pd 
import numpy as np 

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn.model_selection import train_test_split, KFold

from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

from sklearn.ensemble import RandomForestClassifier, IsolationForest
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# Custom Utilities Module
from utils.paths import get_paths
from utils.file_io import ingest_data, save_data


# Show more columns
pd.set_option("display.max_columns", 100)
pd.set_option("display.width", 200)

# Initiate Logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [6]:
# Get Path's Object
paths = get_paths()

logger.info(f"Project Root Path Loaded: {paths.root}")    
    
logger.info(f"Project Data Path Loaded: {paths.data}")
logger.info(f"Data Raw Path Loaded: {paths.data_raw}")

logger.info(f"Data Bronze Path Loaded: {paths.data_bronze}")

logger.info(f"Data Bronze Training Path Loaded:  {paths.data_bronze_train}")
logger.info(f"Data Bronze Testing Path Loaded:  {paths.data_bronze_test}")


INFO:__main__:Project Root Path Loaded: /workspace
INFO:__main__:Project Data Path Loaded: /workspace/data
INFO:__main__:Data Raw Path Loaded: /workspace/data/raw
INFO:__main__:Data Bronze Path Loaded: /workspace/data/bronze
INFO:__main__:Data Bronze Training Path Loaded:  /workspace/data/bronze/train
INFO:__main__:Data Bronze Testing Path Loaded:  /workspace/data/bronze/test


In [3]:
# Load Data

pump_df = ingest_data(
    paths.data_raw / "pump_sensor_data",
    file_name="sensor.csv",
    dataset_name="PUMP",
    split="unsplit",
    is_labeled="yes",
    label_type=pd.NA,
)

INFO:utils.file_io:Loading CSV file: /workspace/data/raw/pump_sensor_data/sensor.csv
INFO:utils.file_io:Loaded CSV: sensor.csv | shape=(220320, 55) | columns=['Unnamed: 0', 'timestamp', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'sensor_22', 'sensor_23', 'sensor_24', 'sensor_25', 'sensor_26', 'sensor_27', 'sensor_28', 'sensor_29', 'sensor_30', 'sensor_31', 'sensor_32', 'sensor_33', 'sensor_34', 'sensor_35', 'sensor_36', 'sensor_37', 'sensor_38', 'sensor_39', 'sensor_40', 'sensor_41', 'sensor_42', 'sensor_43', 'sensor_44', 'sensor_45', 'sensor_46', 'sensor_47', 'sensor_48', 'sensor_49', 'sensor_50', 'sensor_51', 'machine_status']


In [5]:
# Basic Dataframe Information/Summary

print("Shape:", pump_df.shape)
print("\nData types:")
print(pump_df.dtypes)

print("\nMemory usage (MB):")
print(pump_df.memory_usage(deep=True).sum() / (1024 ** 2))

print("\nFirst 15 rows:")
display(pump_df.head(15))

print("\nBasic numeric summary:")
display(pump_df.describe().T)

print("\nBasic object / categorical summary:")
display(pump_df.describe(include="object").T)

Shape: (220320, 63)

Data types:
_source_file          category
dataset_name          category
bronze_ingested_at      object
_source_row_id           int64
run_id                category
                        ...   
sensor_48              float64
sensor_49              float64
sensor_50              float64
sensor_51              float64
machine_status          object
Length: 63, dtype: object

Memory usage (MB):
139.99181365966797

First 15 rows:


Unnamed: 0.1,_source_file,dataset_name,bronze_ingested_at,_source_row_id,run_id,split,is_labeled,label_type,Unnamed: 0,timestamp,sensor_00,sensor_01,sensor_02,sensor_03,sensor_04,sensor_05,sensor_06,sensor_07,sensor_08,sensor_09,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21,sensor_22,sensor_23,sensor_24,sensor_25,sensor_26,sensor_27,sensor_28,sensor_29,sensor_30,sensor_31,sensor_32,sensor_33,sensor_34,sensor_35,sensor_36,sensor_37,sensor_38,sensor_39,sensor_40,sensor_41,sensor_42,sensor_43,sensor_44,sensor_45,sensor_46,sensor_47,sensor_48,sensor_49,sensor_50,sensor_51,machine_status
0,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,0,,unsplit,yes,,0,2018-04-01 00:00:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,37.2274,47.52422,31.11716,1.681353,419.5747,,461.8781,466.3284,2.565284,665.3993,398.9862,880.0001,498.8926,975.9409,627.674,741.7151,848.0708,429.0377,785.1935,684.9443,594.4445,682.8125,680.4416,433.7037,171.9375,341.9039,195.0655,90.32386,40.36458,31.51042,70.57291,30.98958,31.770832,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
1,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,1,,unsplit,yes,,1,2018-04-01 00:01:00,2.465394,47.09201,53.2118,46.31076,634.375,76.45975,13.41146,16.13136,15.56713,15.05353,37.2274,47.52422,31.11716,1.681353,419.5747,,461.8781,466.3284,2.565284,665.3993,398.9862,880.0001,498.8926,975.9409,627.674,741.7151,848.0708,429.0377,785.1935,684.9443,594.4445,682.8125,680.4416,433.7037,171.9375,341.9039,195.0655,90.32386,40.36458,31.51042,70.57291,30.98958,31.770832,41.92708,39.6412,65.68287,50.92593,38.19444,157.9861,67.70834,243.0556,201.3889,NORMAL
2,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,2,,unsplit,yes,,2,2018-04-01 00:02:00,2.444734,47.35243,53.2118,46.39757,638.8889,73.54598,13.32465,16.03733,15.61777,15.01013,37.86777,48.17723,32.08894,1.708474,420.848,,462.7798,459.6364,2.500062,666.2234,399.9418,880.4237,501.3617,982.7342,631.1326,740.8031,849.8997,454.239,778.5734,715.6266,661.574,721.875,694.7721,441.2635,169.982,343.1955,200.9694,93.90508,41.40625,31.25,69.53125,30.46875,31.77083,41.66666,39.351852,65.39352,51.21528,38.194443,155.9606,67.12963,241.3194,203.7037,NORMAL
3,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,3,,unsplit,yes,,3,2018-04-01 00:03:00,2.460474,47.09201,53.1684,46.397568,628.125,76.98898,13.31742,16.24711,15.69734,15.08247,38.57977,48.65607,31.67221,1.579427,420.7494,,462.898,460.8858,2.509521,666.0114,399.1046,878.8917,499.043,977.752,625.4076,739.2722,847.7579,474.8731,779.5091,690.4011,686.1111,754.6875,683.3831,446.2493,166.4987,343.9586,193.1689,101.0406,41.92708,31.51042,72.13541,30.46875,31.51042,40.88541,39.0625,64.81481,51.21528,38.19444,155.9606,66.84028,240.4514,203.125,NORMAL
4,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,4,,unsplit,yes,,4,2018-04-01 00:04:00,2.445718,47.13541,53.2118,46.397568,636.4583,76.58897,13.35359,16.21094,15.69734,15.08247,39.48939,49.06298,31.95202,1.683831,419.8926,,461.4906,468.2206,2.604785,663.2111,400.5426,882.5874,498.5383,979.5755,627.183,737.6033,846.9182,408.8159,785.2307,704.6937,631.4814,766.1458,702.4431,433.9081,164.7498,339.963,193.877,101.7038,42.70833,31.51042,76.82291,30.98958,31.51042,41.40625,38.77315,65.10416,51.79398,38.77315,158.2755,66.55093,242.1875,201.3889,NORMAL
5,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,5,,unsplit,yes,,5,2018-04-01 00:05:00,2.453588,47.09201,53.1684,46.397568,637.6157,78.18568,13.41146,16.16753,15.89265,15.16204,39.29406,49.37051,32.23816,1.673484,418.9049,,461.8948,461.9289,2.507935,663.4962,398.6428,872.4973,498.4064,974.6847,624.3462,739.2036,846.4617,429.6945,777.282,700.7193,623.1481,734.375,711.2128,438.6517,162.8758,333.7975,196.4579,90.84048,42.70833,31.77083,79.94791,30.98958,31.25,42.70833,38.77315,63.65741,51.79398,38.77315,164.6412,66.55093,241.6088,201.6782,NORMAL
6,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,6,,unsplit,yes,,6,2018-04-01 00:06:00,2.455556,47.04861,53.1684,46.397568,633.3333,75.81614,13.43316,16.13136,15.65393,15.08247,38.29974,49.57146,32.00982,1.684984,420.3324,,464.2402,467.5146,2.598702,667.4751,401.1847,882.7164,500.4944,981.2908,631.2756,740.9517,853.9647,458.3623,771.8188,722.5254,674.074,707.8125,716.1951,433.5065,161.9678,330.3747,194.0652,89.23161,42.70833,32.29166,79.94791,31.25,31.51042,43.22916,38.19444,61.9213,51.79398,39.0625,171.875,67.70834,240.162,200.2315,NORMAL
7,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,7,,unsplit,yes,,7,2018-04-01 00:07:00,2.449653,47.13541,53.1684,46.397568,630.6713,75.77331,13.25231,16.12413,16.19647,15.08247,37.3396,49.32732,31.8832,1.646842,417.552,,462.4563,463.8936,2.533115,662.9967,395.2946,864.6103,496.4218,966.2091,618.3017,741.4593,834.6233,476.1262,784.8675,703.1723,653.2407,768.75,723.7064,451.6564,162.8504,327.9788,196.9991,87.68256,42.96875,32.55208,78.38541,30.98958,31.510416,42.96875,38.194443,59.60648,50.92593,39.35185,178.5301,68.57639,241.3194,201.0995,NORMAL
8,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,8,,unsplit,yes,,8,2018-04-01 00:08:00,2.463426,47.09201,53.1684,46.397568,631.9444,74.58916,13.28848,16.13136,15.47309,15.11863,38.45401,50.28795,32.09234,1.686156,422.0777,,463.4988,461.546,2.52659,666.7677,400.1693,881.6849,500.8832,982.4542,629.5605,743.5971,852.8547,480.1708,791.9694,716.4828,666.6666,718.2291,671.4195,439.8698,155.7845,316.8784,197.1554,89.55923,42.70833,32.55208,75.52083,30.989582,31.510416,42.1875,38.19444,57.87037,50.63657,39.35185,182.0023,69.44444,243.0556,201.6782,NORMAL
9,sensor.csv,PUMP,2025-12-19T20:50:08.957597+00:00,9,,unsplit,yes,,9,2018-04-01 00:09:00,2.445718,47.17882,53.1684,46.397568,641.7823,74.57428,13.38252,16.24711,15.61777,15.11863,39.52119,50.44635,32.25679,1.637774,421.4344,,463.4123,468.8477,2.630246,666.2795,400.21,881.1141,501.1399,980.8168,629.5303,746.6206,854.307,438.7823,799.5127,717.6339,635.1852,736.4583,699.0274,445.2378,153.0564,313.2721,195.057,98.83604,42.70833,32.55208,73.4375,30.989582,31.510416,41.66666,39.0625,56.42361,50.63657,39.0625,186.6319,69.7338,246.5278,200.8102,NORMAL



Basic numeric summary:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
_source_row_id,220320.0,110159.5,63601.049991,0.0,55079.75,110159.5,165239.25,220319.0
Unnamed: 0,220320.0,110159.5,63601.049991,0.0,55079.75,110159.5,165239.25,220319.0
sensor_00,210112.0,2.372221,0.412227,0.0,2.438831,2.456539,2.499826,2.549016
sensor_01,219951.0,47.591611,3.296666,0.0,46.31076,48.133678,49.47916,56.72743
sensor_02,220301.0,50.867392,3.66682,33.15972,50.39062,51.6493,52.77777,56.03299
sensor_03,220301.0,43.752481,2.418887,31.64062,42.838539,44.227428,45.3125,48.22049
sensor_04,220301.0,590.673936,144.023912,2.798032,626.6204,632.638916,637.615723,800.0
sensor_05,220301.0,73.396414,17.298247,0.0,69.97626,75.57679,80.91215,99.99988
sensor_06,215522.0,13.501537,2.163736,0.014468,13.34635,13.64294,14.53993,22.25116
sensor_07,214869.0,15.843152,2.201155,0.0,15.90712,16.16753,16.42795,23.59664



Basic object / categorical summary:


Unnamed: 0,count,unique,top,freq
bronze_ingested_at,220320,1,2025-12-19T20:50:08.957597+00:00,220320
timestamp,220320,220320,2018-08-31 23:43:00,1
machine_status,220320,3,NORMAL,205836


In [7]:
# Save Data as Parquet
save_data(pump_df, paths.data_bronze_train, "pump_train_bronze")

INFO:utils.file_io:Saving DataFrame to Parquet: /workspace/data/bronze/train/pump_train_bronze.parquet
INFO:utils.file_io:Saved: pump_train_bronze.parquet | shape=(220320, 63) | columns=['_source_file', 'dataset_name', 'bronze_ingested_at', '_source_row_id', 'run_id', 'split', 'is_labeled', 'label_type', 'Unnamed: 0', 'timestamp', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09', 'sensor_10', 'sensor_11', 'sensor_12', 'sensor_13', 'sensor_14', 'sensor_15', 'sensor_16', 'sensor_17', 'sensor_18', 'sensor_19', 'sensor_20', 'sensor_21', 'sensor_22', 'sensor_23', 'sensor_24', 'sensor_25', 'sensor_26', 'sensor_27', 'sensor_28', 'sensor_29', 'sensor_30', 'sensor_31', 'sensor_32', 'sensor_33', 'sensor_34', 'sensor_35', 'sensor_36', 'sensor_37', 'sensor_38', 'sensor_39', 'sensor_40', 'sensor_41', 'sensor_42', 'sensor_43', 'sensor_44', 'sensor_45', 'sensor_46', 'sensor_47', 'sensor_48', 'sensor_49', 'sensor_50', 'sen

PosixPath('/workspace/data/bronze/train/pump_train_bronze.parquet')