# Code for Training and Evaluation

In [1]:
import os
import librosa
import numpy as np
import joblib
import logging
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Class labels; load_data also uses each one as the name of a sub-folder
# under DATA_PATH.
CLASSES = [
    "asthma",
    "Bronchial",
    "copd",
    "healthy",
    "pneumonia",
]

# Root folder of the audio dataset that load_data walks.
DATA_PATH = "D:\\Saare_Projects\\ML_project\\final_Lung_Sound_Detection\\audio_sample\\Asthma Detection Dataset Version 2"

# Show INFO-level records so the evaluation messages logged below are emitted.
logging.basicConfig(level=logging.INFO)

def extract_mfcc_features(file_path, n_mfcc=20):
    """Reduce one audio file to a fixed-length vector of averaged MFCCs.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa (default 20).

    Returns:
        The mean over axis 0 of the transposed MFCC matrix, or ``None`` when
        loading or feature computation raises; the failure is logged as a
        warning instead of being propagated.
    """
    try:
        # sr=None: let librosa keep the file's own sampling rate.
        signal, rate = librosa.load(file_path, sr=None)
        coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=n_mfcc)
        averaged = np.mean(np.transpose(coefficients), axis=0)
    except Exception as e:
        # Best-effort: one unreadable file must not abort a whole dataset scan.
        logging.warning(f"Error processing file {file_path}: {e}")
        return None
    return averaged

def load_data(data_path, classes):
    """Build the feature matrix and label vector from a folder-per-class dataset.

    Every name in ``classes`` is treated as a sub-folder of ``data_path``.
    Each WAV file found there is passed to ``extract_mfcc_features`` and, when
    extraction succeeds, contributes one feature row labelled with the
    folder name. Files whose extraction returns ``None`` are skipped.

    Args:
        data_path: Root directory that contains one sub-folder per class.
        classes: Iterable of class names (also the sub-folder names).

    Returns:
        Tuple ``(features, labels)`` of NumPy arrays with one entry per
        successfully processed file, in class order and then sorted
        file-name order.

    Raises:
        OSError: Propagated from ``os.listdir`` when a class folder is
            missing or unreadable.
    """
    features, labels = [], []
    for class_name in classes:
        class_folder = os.path.join(data_path, class_name)
        # os.listdir returns entries in arbitrary order; sorting keeps the row
        # order (and therefore any seeded split made from it) reproducible.
        for file_name in sorted(os.listdir(class_folder)):
            # Case-insensitive match so files named "*.WAV" are not dropped.
            if not file_name.lower().endswith('.wav'):
                continue
            mfcc = extract_mfcc_features(os.path.join(class_folder, file_name))
            if mfcc is None:
                continue
            features.append(mfcc)
            labels.append(class_name)
    return np.array(features), np.array(labels)

# --- Data loading and hold-out split ---------------------------------------
# 80/20 split, stratified on the label so both parts keep the class mix.
X, y = load_data(DATA_PATH, CLASSES)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# --- Oversampling -----------------------------------------------------------
# SMOTE is applied to the training part only; the test part is left untouched.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# --- Feature scaling --------------------------------------------------------
# The scaler is fitted on the (resampled) training rows and merely applied
# to the test rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Persist the fitted scaler so prediction code can apply the same transform.
joblib.dump(scaler, 'scaler.pkl')

# --- Class weights ----------------------------------------------------------
# Computed from the resampled labels.
# NOTE(review): after SMOTE these are presumably close to uniform — confirm
# whether passing them to the forest is still needed.
train_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_labels, y=y_train)
class_weight_dict = {
    label: weight for label, weight in zip(train_labels, class_weights)
}

# --- Hyperparameter search --------------------------------------------------
param_dist = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
}
rf_model = RandomForestClassifier(
    class_weight=class_weight_dict,
    random_state=42,
)
# NOTE(review): the 3-fold CV below runs on the already-resampled training
# set, so validation folds can contain synthetic rows — treat the CV score as
# a ranking signal only; the held-out test set is the unbiased estimate.
search = RandomizedSearchCV(
    rf_model,
    param_distributions=param_dist,
    n_iter=50,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
search.fit(X_train_scaled, y_train)

# --- Persist the winning model ----------------------------------------------
best_model = search.best_estimator_
joblib.dump(best_model, 'random_forest_model.pkl')
logging.info("Model and scaler saved successfully.")

# --- Evaluation on the held-out test set ------------------------------------
y_pred = best_model.predict(X_test_scaled)

test_accuracy = accuracy_score(y_test, y_pred)
logging.info(f"Accuracy: {test_accuracy}")

test_report = classification_report(y_test, y_pred)
logging.info(f"Classification Report:\n{test_report}")

test_confusion = confusion_matrix(y_test, y_pred)
logging.info(f"Confusion Matrix:\n{test_confusion}")


INFO:root:Model and scaler saved successfully.
INFO:root:Accuracy: 0.897119341563786
INFO:root:Classification Report:
              precision    recall  f1-score   support

   Bronchial       0.84      0.76      0.80        21
      asthma       0.83      0.84      0.84        58
        copd       0.95      0.94      0.94        80
     healthy       0.96      0.93      0.94        27
   pneumonia       0.88      0.93      0.91        57

    accuracy                           0.90       243
   macro avg       0.89      0.88      0.89       243
weighted avg       0.90      0.90      0.90       243

INFO:root:Confusion Matrix:
[[16  4  0  0  1]
 [ 3 49  1  1  4]
 [ 0  4 75  0  1]
 [ 0  0  1 25  1]
 [ 0  2  2  0 53]]


# Prediction Code

In [2]:
import joblib
import librosa
import numpy as np

# Class names of the dataset; not referenced by the code visible in this cell.
CLASSES = ["asthma", "Bronchial", "copd", "healthy", "pneumonia"]

# Restore the fitted estimator and scaler from the files in the working
# directory. joblib files are pickle-based, so only load files you trust.
model = joblib.load('random_forest_model.pkl')
scaler = joblib.load('scaler.pkl')

def extract_mfcc_features(file_path, n_mfcc=20):
    """Load an audio file and return its MFCCs averaged into one vector.

    Args:
        file_path: Path of the audio file handed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients requested from librosa (default 20).

    Returns:
        The mean over axis 0 of the transposed MFCC matrix, or ``None`` when
        loading or feature computation raises; the failure is logged as a
        warning instead of being propagated.
    """
    # This cell's import list has no `logging`, so on a fresh kernel the
    # except-handler below would itself fail with NameError. Importing here
    # keeps the function usable without relying on an earlier cell.
    import logging

    try:
        # sr=None: let librosa keep the file's own sampling rate.
        sample, sample_rate = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=sample, sr=sample_rate, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        logging.warning(f"Error processing file {file_path}: {e}")
        return None

# Predict
def predict(file_path):
    """Classify one audio file and return a human-readable message.

    Args:
        file_path: Path of the audio file to classify.

    Returns:
        ``"The predicted class is: <label>"`` on success, or
        ``"Error in feature extraction"`` when ``extract_mfcc_features``
        returns ``None``.
    """
    features = extract_mfcc_features(file_path)
    if features is not None:
        # Wrap the single feature vector in a list to form a one-row batch,
        # then apply the loaded scaler before asking the model.
        batch = scaler.transform([features])
        label = model.predict(batch)[0]
        return f"The predicted class is: {label}"
    return "Error in feature extraction"

# Smoke test: classify a single recording and print the resulting message.
file_path = "D:\\Saare_Projects\\ML_project\\final_Lung_Sound_Detection\\audio_sample\\Asthma Detection Dataset Version 2\\healthy\\P15Healthy61S.wav"
prediction_message = predict(file_path)
print(prediction_message)


The predicted class is: healthy


In [3]:
# Writes the scaler and the tuned model again, to the same file names the
# training cell already used.
# NOTE(review): `scaler` is whatever the most recently executed cell bound to
# that name (the training cell fits one, the prediction cell loads one) —
# confirm this re-save is intentional.
joblib.dump(scaler, 'scaler.pkl')
# Last expression of the cell, so its return value is what the notebook shows.
joblib.dump(best_model, 'random_forest_model.pkl')


['random_forest_model.pkl']

In [4]:
# Writes `scaler` to 'scaler.pkl' once more; as the cell's only expression,
# the value returned by joblib.dump is what the notebook displays.
# NOTE(review): the same file is also written by other cells — confirm this
# cell is still needed.
joblib.dump(scaler, 'scaler.pkl')


['scaler.pkl']

In [5]:
# File names under which the fitted artefacts are stored; these match the
# names the prediction cell passes to joblib.load.
scaler_path = 'scaler.pkl'
model_path = 'random_forest_model.pkl'

# Write the scaler first, then the tuned model, and log once both are on disk.
joblib.dump(scaler, scaler_path)
joblib.dump(best_model, model_path)
logging.info("Model and scaler saved successfully.")


INFO:root:Model and scaler saved successfully.
