In [1]:
import os
import glob
import numpy as np
import pandas as pd

# Setup
labels = ['Bearing', 'Flywheel', 'Healthy', 'LIV', 'LOV', 'NRV', 'Piston', 'Riderbelt']
Faults = {label: idx for idx, label in enumerate(labels)}
MaxExpNo = 225

# Base path to dataset
base_path = os.path.expanduser('~/Downloads/AirCompressor_Data')

# Raw data store
raw_data = []  # list of dicts: [{'signal': ndarray, 'label': str, 'fault': int}, ...]

for label in labels:
    path = os.path.join(base_path, label, 'preprocess_Reading*.dat')
    files = sorted(glob.glob(path))
    
    if not files:
        print(f"⚠️ No files found for label: {label}")
        continue

    for file in files:
        try:
            signal = np.loadtxt(file, delimiter=',')
            raw_data.append({
                'signal': signal,
                'label': label,
                'fault': Faults[label]
            })
        except Exception as e:
            print(f"❌ Error loading {file}: {e}")

print(f"\n✅ Loaded {len(raw_data)} total files.")



✅ Loaded 1800 total files.


In [5]:
import pandas as pd
import numpy as np
from scipy.stats import kurtosis, skew
from scipy.fftpack import fft

# --- Feature Extraction Functions ---

def PrimaryFeatureExtractor(X):
    rms = np.sqrt(np.mean(np.square(X)))
    CrestFactor = np.abs(X).max() / rms
    ShapeFactor = rms / np.mean(np.abs(X))
    return [[
        np.mean(np.abs(X)),
        np.min(X),
        np.max(X),
        np.std(X),
        rms,
        skew(X),
        kurtosis(X),
        CrestFactor,
        ShapeFactor
    ]]

def FFT_BasedFeatures(X, NBins=100):
    N = len(X)
    T = 1.0 / N
    FreqList = np.linspace(0.0, 1.0 / (2.0 * T), N // 2)
    X_fft = fft(X)
    X_fft[0] = 0
    X_fft_magnitude = 2.0 / N * np.abs(X_fft[0:N // 2])

    BinCounts = (N // 2) // NBins
    SpecEnergy = []
    labels = []

    for i in range(NBins):
        SpecEnergy.append(X_fft_magnitude[i * BinCounts:(i + 1) * BinCounts].sum())
        labels.append(f'FFT{i + 1}')

    return SpecEnergy, labels

# --- Data Processing ---

feature_rows = []
data_columns_FFT_Features = None  # to be initialized during first FFT extraction

for idx, entry in enumerate(raw_data):
    try:
        X = entry['signal']
        
        # Extract statistical and FFT features
        StatFeatures = PrimaryFeatureExtractor(X)[0]
        FFT_Features, data_columns_FFT_Features = FFT_BasedFeatures(X)

        # Merge all features + fault label
        row = StatFeatures + FFT_Features + [entry['fault']]
        feature_rows.append(row)

        if idx % 10 == 0:
            print(f"✅ Processed {idx}/{len(raw_data)} entries")

    except Exception as e:
        print(f"❌ Error extracting features for index {idx}: {e}")

# --- Build DataFrame using original names ---

data_columns_PrimaryStatFeatures = [
    'Mean', 'Min', 'Max', 'StdDv', 'RMS', 'Skewness',
    'Kurtosis', 'CrestFactor', 'ShapeFactor'
]
data_columns_Target = ['Fault']
data_columns = data_columns_PrimaryStatFeatures + data_columns_FFT_Features + data_columns_Target

# Create final DataFrame
data = pd.DataFrame(feature_rows, columns=data_columns)

# Split into inputs and labels
input_data = data.drop(columns=['Fault'])
target_data = pd.DataFrame(data['Fault'], columns=['Fault'], dtype=int)

print("\n✅ Feature extraction and DataFrame construction complete.")
print(f"📊 input_data shape: {input_data.shape}")
print(f"🎯 target_data shape: {target_data.shape}")


✅ Processed 0/1800 entries
✅ Processed 10/1800 entries
✅ Processed 20/1800 entries
✅ Processed 30/1800 entries
✅ Processed 40/1800 entries
✅ Processed 50/1800 entries
✅ Processed 60/1800 entries
✅ Processed 70/1800 entries
✅ Processed 80/1800 entries
✅ Processed 90/1800 entries
✅ Processed 100/1800 entries
✅ Processed 110/1800 entries
✅ Processed 120/1800 entries
✅ Processed 130/1800 entries
✅ Processed 140/1800 entries
✅ Processed 150/1800 entries
✅ Processed 160/1800 entries
✅ Processed 170/1800 entries
✅ Processed 180/1800 entries
✅ Processed 190/1800 entries
✅ Processed 200/1800 entries
✅ Processed 210/1800 entries
✅ Processed 220/1800 entries
✅ Processed 230/1800 entries
✅ Processed 240/1800 entries
✅ Processed 250/1800 entries
✅ Processed 260/1800 entries
✅ Processed 270/1800 entries
✅ Processed 280/1800 entries
✅ Processed 290/1800 entries
✅ Processed 300/1800 entries
✅ Processed 310/1800 entries
✅ Processed 320/1800 entries
✅ Processed 330/1800 entries
✅ Processed 340/1800 entr

In [3]:
input_data

Unnamed: 0,Mean,Min,Max,StdDv,RMS,Skewness,Kurtosis,CrestFactor,ShapeFactor,FFT1,...,FFT91,FFT92,FFT93,FFT94,FFT95,FFT96,FFT97,FFT98,FFT99,FFT100
0,0.140067,-1.5920,1.3448,0.186773,0.192985,-0.186858,4.518489,8.249357,1.377799,0.032111,...,0.000141,0.000132,0.000123,0.000114,0.000106,0.000099,0.000093,0.000087,0.000084,0.000082
1,0.187374,-1.4158,1.3472,0.202243,0.239738,-0.297777,3.182940,5.905620,1.279463,0.035926,...,0.001102,0.001102,0.001103,0.001103,0.001103,0.001103,0.001104,0.001104,0.001103,0.001104
2,0.201925,-1.3502,1.1123,0.230012,0.259409,-0.205917,1.754806,5.204907,1.284679,0.038505,...,0.001468,0.001472,0.001475,0.001477,0.001480,0.001482,0.001483,0.001485,0.001485,0.001486
3,0.177199,-1.3254,1.1532,0.223281,0.235517,-0.205812,2.146448,5.627623,1.329111,0.063200,...,0.000268,0.000253,0.000239,0.000224,0.000211,0.000199,0.000189,0.000179,0.000174,0.000171
4,0.182402,-1.2071,1.2910,0.237785,0.239437,-0.075352,1.286465,5.391815,1.312691,0.155544,...,0.003041,0.003039,0.003037,0.003035,0.003035,0.003033,0.003033,0.003033,0.003032,0.003031
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1795,0.143902,-1.3802,1.5637,0.194700,0.194728,0.025085,2.808758,8.030176,1.353194,0.029575,...,0.000227,0.000229,0.000230,0.000231,0.000231,0.000232,0.000233,0.000233,0.000233,0.000234
1796,0.137908,-1.4787,1.4960,0.180747,0.187902,-0.053696,4.104418,7.961604,1.362513,0.025515,...,0.000054,0.000049,0.000045,0.000040,0.000035,0.000031,0.000027,0.000024,0.000021,0.000020
1797,0.131200,-1.2317,1.5648,0.178472,0.181081,0.075692,4.157256,8.641426,1.380187,0.021799,...,0.000821,0.000821,0.000821,0.000821,0.000821,0.000821,0.000821,0.000821,0.000821,0.000821
1798,0.131577,-1.3973,1.3526,0.178887,0.180757,0.011308,3.914086,7.730254,1.373775,0.039722,...,0.001398,0.001396,0.001395,0.001393,0.001391,0.001390,0.001389,0.001388,0.001388,0.001388


In [6]:
from sklearn import preprocessing
normalization_status='RobustScaler'   
''' Choices:
                                        1. Normalization
                                        2. StandardScaler
                                        3. MinMaxScaler
                                        4. RobustScaler
                                        5. Normalizer
                                        6. WithoutNormalization   '''
input_data_columns=data_columns_PrimaryStatFeatures+data_columns_FFT_Features

if (normalization_status=='Normalization'):
    data_array=preprocessing.normalize(input_data,norm='l2',axis=0)
    input_data=pd.DataFrame(data_array,columns=input_data_columns)
elif (normalization_status=='StandardScaler'):
    scaler = preprocessing.StandardScaler()
    scaled_df = scaler.fit_transform(input_data)
    input_data = pd.DataFrame(scaled_df, columns=input_data_columns)
elif (normalization_status=='MinMaxScaler'):
    scaler = preprocessing.MinMaxScaler()
    scaled_df = scaler.fit_transform(input_data)
    input_data = pd.DataFrame(scaled_df, columns=input_data_columns)
elif (normalization_status=='RobustScaler'):
    scaler = preprocessing.RobustScaler()
    scaled_df = scaler.fit_transform(input_data)
    input_data = pd.DataFrame(scaled_df, columns=input_data_columns)
elif (normalization_status=='Normalizer'):
    scaler = preprocessing.Normalizer()
    scaled_df = scaler.fit_transform(input_data)
    input_data = pd.DataFrame(scaled_df, columns=input_data_columns)
elif (normalization_status=='WithoutNormalization'):
    print ('No normalization is required')

target_data=pd.DataFrame(data['Fault'],columns=['Fault'],dtype=int)