In [12]:
#1:import necessary libraries
import os
import numpy as np
import pandas as pd
import librosa
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import joblib

#suppress warnings for cleaner output
#NOTE: no category or module is passed, so from this point on every warning is hidden,
#not only the noisy ones; narrow the filter when debugging unexpected results
warnings.filterwarnings('ignore')


In [13]:
#2:extracting audio features MFCCs, chroma, spectral features, ZCR, RMSE
def extract_features(file_path, n_mfcc=13):
    """Summarise one audio file as a flat dict of mean/std statistics.

    The file is loaded with librosa and the following frame-level features are
    computed: MFCCs, chroma, spectral centroid, spectral bandwidth, spectral
    rolloff, zero-crossing rate and RMS energy. Each feature is reduced to its
    mean and standard deviation.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.
    n_mfcc : int, default 13
        Number of MFCC coefficients to compute. The default reproduces the
        original fixed value.

    Returns
    -------
    dict or None
        A dict with ``2 * n_mfcc + 12`` entries, in a fixed key order:
        ``mfcc_mean_<i>`` / ``mfcc_std_<i>`` for each coefficient (1-based),
        then ``<name>_mean`` / ``<name>_std`` for chroma, spec_centroid,
        spec_bandwidth, spec_rolloff, zcr and rmse.
        ``None`` if anything goes wrong; the failure is printed, not raised,
        so one bad file does not stop a batch run.
    """
    try:
        y, sr = librosa.load(file_path, sr=None) #sr=None keeps the file's own sample rate

        #an empty clip has no meaningful statistics; treat it like any other bad file
        if y.size == 0:
            raise ValueError("audio file contains no samples")

        #frame-level features
        mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        spec_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
        spec_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
        spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
        zcr = librosa.feature.zero_crossing_rate(y)
        rmse = librosa.feature.rms(y=y)

        #one mean/std pair per MFCC coefficient (keys are 1-based)
        features = {}
        for i, coefficient in enumerate(mfccs, start=1):
            features[f'mfcc_mean_{i}'] = np.mean(coefficient)
            features[f'mfcc_std_{i}'] = np.std(coefficient)

        #one mean/std pair over the whole array for every other feature;
        #the order here fixes the column order of the resulting dataframe
        for name, values in (
            ('chroma', chroma),
            ('spec_centroid', spec_centroid),
            ('spec_bandwidth', spec_bandwidth),
            ('spec_rolloff', spec_rolloff),
            ('zcr', zcr),
            ('rmse', rmse),
        ):
            features[f'{name}_mean'] = np.mean(values)
            features[f'{name}_std'] = np.std(values)
        return features

    #error handling: report which file failed AND why, then let the caller skip it
    except Exception as e:
        print(f"Error: {file_path} ({type(e).__name__}: {e})")
        return None

print("Feature extraction function ready")

Feature extraction function ready


In [14]:
#3:loading data and building dataframe

data = []
base_path = r"C:\Users\siona\Desktop\Data\Info_4000\Valve"  # Change to your data folder


print("Loading audio files...")
#os.listdir returns entries in arbitrary order; sorting keeps the row order of the
#dataframe (and therefore the seeded train/test split) the same on every machine
for valve_id in sorted(os.listdir(base_path)):
    valve_path = os.path.join(base_path, valve_id)
    if not os.path.isdir(valve_path):
        continue

    #handles nested structure: Valve/id_00/id_00/abnormal
    inner_valve_path = os.path.join(valve_path, valve_id)
    if os.path.isdir(inner_valve_path):
        valve_path = inner_valve_path  #use the nested folder

    #each valve folder is expected to hold a "normal" and an "abnormal" subfolder
    for condition in ["normal", "abnormal"]:
        condition_path = os.path.join(valve_path, condition)
        if not os.path.isdir(condition_path):
            continue

        #extract features from every wav file of this valve/condition
        for file in sorted(os.listdir(condition_path)):
            if not file.lower().endswith(".wav"): #only wav files, any letter case
                continue
            file_path = os.path.join(condition_path, file) #full file path
            features = extract_features(file_path) #extract features
            if not features: #failed files are reported by extract_features and skipped
                continue
            features["label"] = 0 if condition == "normal" else 1
            features["valve_id"] = valve_id
            data.append(features) #store features
            if len(data) % 100 == 0:
                print(f" Processed {len(data)} files...")#print progress

#create dataframe
df = pd.DataFrame(data)
#count feature columns by name so an empty dataframe reports 0 instead of -2
n_features = len(df.columns.difference(["label", "valve_id"]))
if df.empty:
    print(f"\nNo audio files could be loaded from '{base_path}' - check the path and folder layout")
else:
    print(f"\nDataset created with,  {df.shape[0]} samples, {n_features} features")


Loading audio files...
 Processed 100 files...
 Processed 200 files...
 Processed 300 files...
 Processed 400 files...
 Processed 500 files...
 Processed 600 files...
 Processed 700 files...
 Processed 800 files...
 Processed 900 files...
 Processed 1000 files...
 Processed 1100 files...
 Processed 1200 files...
 Processed 1300 files...
 Processed 1400 files...
 Processed 1500 files...
 Processed 1600 files...
 Processed 1700 files...
 Processed 1800 files...
 Processed 1900 files...
 Processed 2000 files...
 Processed 2100 files...
 Processed 2200 files...
 Processed 2300 files...
 Processed 2400 files...
 Processed 2500 files...
 Processed 2600 files...
 Processed 2700 files...
 Processed 2800 files...
 Processed 2900 files...
 Processed 3000 files...
 Processed 3100 files...
 Processed 3200 files...
 Processed 3300 files...
 Processed 3400 files...
 Processed 3500 files...
 Processed 3600 files...
 Processed 3700 files...
 Processed 3800 files...
 Processed 3900 files...
 Processed 

In [15]:
#4: data overview
#headline numbers first: overall shape of the feature table
print("\n  Dataset Overview:")
overview_shape = df.shape
print(f"Shape: {overview_shape}")

#how many samples fall into each class (0 = normal, 1 = abnormal)
label_counts = df['label'].value_counts()
print(f"\nthe label distribution:\n{label_counts}")

#peek at the first rows to sanity-check the extracted values
print("\nPrinting first 5 rows: ")
first_rows = df.head()
print(first_rows)


  Dataset Overview:
Shape: (4170, 40)

the label distribution:
label
0    3691
1     479
Name: count, dtype: int64

Printing first 5 rows: 
   mfcc_mean_1  mfcc_std_1  mfcc_mean_2  mfcc_std_2  mfcc_mean_3  mfcc_std_3  \
0  -453.536346   62.326683   111.516434   36.946873     9.088650   12.933658   
1  -407.746124   43.768436    93.851753   27.782482    -3.282425   10.550710   
2  -435.306610   51.851673    98.593742   31.520517     1.550137   10.447041   
3  -430.467712   51.209702   113.662689   34.732361    -3.186110   12.423004   
4  -473.426910   69.769539   124.112320   42.873428     9.627452   14.904742   

   mfcc_mean_4  mfcc_std_4  mfcc_mean_5  mfcc_std_5  ...  spec_bandwidth_mean  \
0    17.291801    9.700444    12.015554    5.246408  ...          1725.775371   
1    17.772728   11.152828    -1.624269    7.094705  ...          1835.443473   
2    -1.708624    5.369768     4.929927    3.855741  ...          1883.026426   
3    20.563763   11.926163     3.592756    6.686066  .

In [16]:
#5: model training
# prep data
#features are every column except the target and the valve identifier
X = df.drop(columns=['label', 'valve_id'])
y = df['label']

#hold out 20% for testing; stratify keeps the class ratio equal in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

#learn the scaling statistics from the training part only, then apply them to both parts
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

#random forest with 200 trees, seeded for repeatable results
model = RandomForestClassifier(n_estimators=200, random_state=42).fit(X_train, y_train)

print("Model trained")

Model trained


In [17]:
#6: model eval
#score the held-out test part
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

print(f"\nAccuracy: {accuracy*100:.2f}%")
#requirement check: 90% accuracy or better, otherwise the model might need tuning
print("Meets 90%" if accuracy >= 0.90 else "Below 90%")

#confusion matrix (rows = true class, columns = predicted class)
cm = confusion_matrix(y_test, y_pred)
print(f"\nConfusion Matrix:\n{cm}")

#per-class precision / recall / f1
report = classification_report(y_test, y_pred, target_names=['Normal', 'Abnormal'])
print(f"\n{report}")



Accuracy: 96.52%
Meets 90%

Confusion Matrix:
[[738   0]
 [ 29  67]]

              precision    recall  f1-score   support

      Normal       0.96      1.00      0.98       738
    Abnormal       1.00      0.70      0.82        96

    accuracy                           0.97       834
   macro avg       0.98      0.85      0.90       834
weighted avg       0.97      0.97      0.96       834



In [18]:
#7:predict on new data
new_data_folder = r"C:\Users\siona\Desktop\Projects\INFO_4000\Week 09\Exercise MP2\Attempt\Part2_MP2\Valve_Data_for_Prediction"  # Change to your test folder

#check that the path is really a folder: os.path.exists would also accept a regular
#file, and os.listdir would then crash instead of reaching the "not found" message
if os.path.isdir(new_data_folder):
    print("\nPredictions on new data:")
    results = []

    #predict file by file; sorted so the output order is the same on every run
    for file in sorted(os.listdir(new_data_folder)):
        if not file.lower().endswith(".wav"): #only wav files, any letter case
            continue
        file_path = os.path.join(new_data_folder, file)
        features = extract_features(file_path)
        if not features: #extraction failed; extract_features already reported it
            continue
        new_df = pd.DataFrame([features])
        new_scaled = scaler.transform(new_df) #same scaling as the training data
        prediction = model.predict(new_scaled)[0] #predict
        label = "Normal" if prediction == 0 else "Abnormal" #label
        results.append({'File': file, 'Prediction': label}) #store results
        print(f"  {file}: {label}") # print

    #print summary
    if results:
        print(f"\nSummary: {pd.DataFrame(results)}")
else:
    print(f"\n'{new_data_folder}' folder not found") #if folder is not found


Predictions on new data:
  Valve1_000NB.wav: Normal
  Valve2_000AB.wav: Abnormal

Summary:                File Prediction
0  Valve1_000NB.wav     Normal
1  Valve2_000AB.wav   Abnormal


In [19]:
#8: save model
#persist the classifier together with the scaler it was trained with;
#both are needed to score new audio later
for artifact, filename in ((model, "valve_classifier.pkl"), (scaler, "scaler.pkl")):
    joblib.dump(artifact, filename)
print("\nModel and scaler saved")


Model and scaler saved


Assignment asks for the questions to be put into a cell, does it mean code or markdown? I did markdown.

1. While you built this application in the classical ML way using tabular data, what is the other
way you could have built this application where Deep Learning can be used? Explain the
basic concept of that method and explain how it works.

    An alternative would be to train a CNN on mel-spectrograms, which are 2D images (time x frequency, with amplitude as the pixel intensity). Instead of manually extracting features, each audio clip is converted into a mel-spectrogram image and the network learns the relevant features itself.

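    The pipeline converts the audio to a mel-spectrogram and then feeds it into the CNN layers (Conv2D, Pooling, Flatten, Dense, Output). The convolution layers learn local time-frequency patterns, pooling shrinks the image while keeping the strongest responses, and the dense layers turn those patterns into a class prediction.
    For example: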
    Audio to Mel-spectrogram to Conv2D to MaxPool to Conv2D to MaxPool to Flatten to Dense to Output (Normal/Abnormal)

    This approach works well with larger datasets and can find patterns that a classical ML model might miss.


2. Explain how one would have to preprocess the data to adopt the above deep learning
method and how that can be implemented?

   The first step would be to load the audio with librosa and resample it to a fixed rate. Then pad or trim every clip to the same length (e.g. 3 seconds) so that all inputs have the same shape. Then create a mel-spectrogram,
   in our case:

   mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128, n_fft=2048, hop_length=512)
   mel_db = librosa.power_to_db(mel)

   Then normalize the spectrogram values to the range [0, 1] so that training converges more easily. If needed, you can also add data augmentation such as time stretching or noise addition.
   After all this, each audio clip is a clean, standardized 2D input ready for a CNN.



3. In completing this application what were the challenges and what were three takeaways
that you would consider significant in understanding audio analytics?

   There were quite a few challenges. Outside of the assignment itself, my laptop kept acting up and crashing every so often, which made feature extraction (already slow because of the many files) even more difficult. Others included managing nested folders and inconsistent audio lengths, choosing the best features, and reaching 90% accuracy through tuning.

   The main takeaways were:
   
   a. Audio contains more than one variable, including time, frequency, and amplitude. This makes it important to use MFCCs for tone and texture, spectral features for frequency distribution, and ZCR/RMSE for noisiness and energy. Combining them all helps improve accuracy.

   b. Preprocessing matters because scaling avoids bias from large-valued features, and using mean + std captures both typical behavior and variation.

   c. Domain knowledge helps: knowing how valves sound helps interpret MFCCs and spectra. For example, a worn bearing might make a higher-frequency noise, or a squeaking belt on a machine might produce an altered sound.
   So classical ML works but needs hand-crafted features, which require expertise to create, while DL learns features automatically but needs more data and is less interpretable.