In [1]:
import os
import glob
from pathlib import Path
import yaml

import logging

import pandas as pd 
import numpy as np 

# Custom Utilities Module
from utils.paths import get_paths
from utils.file_io import load_data


# Display settings: render up to 100 columns and allow 200-character-wide output
pd.options.display.max_columns = 100
pd.options.display.width = 200

# Logging: configure the root logger at INFO and create this notebook's logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
# Fetch the project paths object from utils.paths
paths = get_paths()

# Report every location this notebook relies on, one log line per path
for message in (
    f"Project Root Path Loaded: {paths.root}",
    f"Project Data Path Loaded: {paths.data}",
    f"Data Raw Path Loaded: {paths.data_raw}",
    f"Data Bronze Path Loaded: {paths.data_bronze}",
):
    logger.info(message)


INFO:__main__:Project Root Path Loaded: /workspace
INFO:__main__:Project Data Path Loaded: /workspace/data
INFO:__main__:Data Raw Path Loaded: /workspace/data/raw
INFO:__main__:Data Bronze Path Loaded: /workspace/data/bronze


In [None]:
# Load Data

# Directory under the raw-data path that holds the AI4I CSV
ai4i_raw_dir = paths.data_raw / "AI4I_2020_Predictive_Maintenance_Dataset"

# NOTE(review): the trailing positional arguments ("AI4I", "unsplit", "") are
# forwarded to load_data unchanged; their meaning is defined in utils.file_io
# and is not visible from this notebook -- confirm against that module.
df = load_data(ai4i_raw_dir, "predictive_maintenance.csv", "AI4I", "unsplit", "")

INFO:utils.file_io:Loading CSV file: /workspace/data/raw/AI4I_2020_Predictive_Maintenance_Dataset/predictive_maintenance.csv
INFO:utils.file_io:Loaded CSV: predictive_maintenance.csv | shape=(10000, 10) | columns=['UDI', 'Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]', 'Target', 'Failure Type']


In [4]:
# Structural overview of the loaded frame: size, dtypes, memory, sample rows, summaries

print("Shape:", df.shape)
print("\nData types:", df.dtypes, sep="\n")

# deep=True also measures the contents of object columns; convert bytes to MB
bytes_per_mb = 1024 ** 2
print("\nMemory usage (MB):", df.memory_usage(deep=True).sum() / bytes_per_mb, sep="\n")

# Each section prints its heading, then renders the table with rich display
for heading, table in (
    ("\nFirst 15 rows:", df.head(15)),
    ("\nBasic numeric summary:", df.describe().T),
    ("\nBasic object / categorical summary:", df.describe(include="object").T),
):
    print(heading)
    display(table)

Shape: (10000, 10)

Data types:
UDI                          int64
Product ID                  object
Type                        object
Air temperature [K]        float64
Process temperature [K]    float64
Rotational speed [rpm]       int64
Torque [Nm]                float64
Tool wear [min]              int64
Target                       int64
Failure Type                object
dtype: object

Memory usage (MB):
2.329835891723633

First 15 rows:


Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Target,Failure Type
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,No Failure
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,No Failure
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,No Failure
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,No Failure
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,No Failure
5,6,M14865,M,298.1,308.6,1425,41.9,11,0,No Failure
6,7,L47186,L,298.1,308.6,1558,42.4,14,0,No Failure
7,8,L47187,L,298.1,308.6,1527,40.2,16,0,No Failure
8,9,M14868,M,298.3,308.7,1667,28.6,18,0,No Failure
9,10,M14869,M,298.5,309.0,1741,28.0,21,0,No Failure



Basic numeric summary:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
UDI,10000.0,5000.5,2886.89568,1.0,2500.75,5000.5,7500.25,10000.0
Air temperature [K],10000.0,300.00493,2.000259,295.3,298.3,300.1,301.5,304.5
Process temperature [K],10000.0,310.00556,1.483734,305.7,308.8,310.1,311.1,313.8
Rotational speed [rpm],10000.0,1538.7761,179.284096,1168.0,1423.0,1503.0,1612.0,2886.0
Torque [Nm],10000.0,39.98691,9.968934,3.8,33.2,40.1,46.8,76.6
Tool wear [min],10000.0,107.951,63.654147,0.0,53.0,108.0,162.0,253.0
Target,10000.0,0.0339,0.180981,0.0,0.0,0.0,0.0,1.0



Basic object / categorical summary:


Unnamed: 0,count,unique,top,freq
Product ID,10000,10000,L57163,1
Type,10000,3,L,6000
Failure Type,10000,6,No Failure,9652


In [16]:
# Count distinct product IDs.
# dropna=False counts NaN as a value, matching what len(unique()) would report.

df["Product ID"].nunique(dropna=False)



10000

In [15]:

# List the distinct Failure Type labels, in order of first appearance
df["Failure Type"].unique().tolist()

['No Failure',
 'Power Failure',
 'Tool Wear Failure',
 'Overstrain Failure',
 'Random Failures',
 'Heat Dissipation Failure']

In [None]:
# Add Metadata Column Data
# Each assignment writes a constant or derived column onto df; re-running the
# cell overwrites the same columns with the same values.

# dataset_name: constant tag applied to every row
df["dataset_name"] = "AI4I"

# dataset split flag
# NOTE(review): the column is literally named "unknown" although it holds the
# split flag; the name is kept as-is for downstream compatibility -- confirm
# whether a more descriptive column name (e.g. "split") was intended.
df["unknown"] = "unknown"

# Fault label flag: 1 where Target equals 1, otherwise 0.
# Target is an int64 column, so it must be compared with the integer 1;
# comparing against the string "1" never matches and yields all zeros.
df["label_type"] = (df["Target"] == 1).astype(int)

