# Training and Evaluation

In [1]:
import os
import librosa
import numpy as np
import joblib
import logging
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils.class_weight import compute_class_weight
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Set up logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Constants
DATA_PATH = "D:\\Saare_Projects\\ML_project\\final_Lung_Sound_Detection\\audio_sample\\Asthma Detection Dataset Version 2"
CLASSES = ["asthma", "Bronchial", "copd", "healthy", "pneumonia"]

# Feature extraction
def extract_mfcc_features(file_path, n_mfcc=20):
    try:
        sample, sample_rate = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=sample, sr=sample_rate, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        logging.warning(f"Error processing file {file_path}: {e}")
        return None

# Load data
def load_data(data_path, classes):
    features, labels = [], []
    for class_name in classes:
        class_folder = os.path.join(data_path, class_name)
        for file_name in os.listdir(class_folder):
            if file_name.endswith(".wav"):
                file_path = os.path.join(class_folder, file_name)
                mfcc = extract_mfcc_features(file_path)
                if mfcc is not None:
                    features.append(mfcc)
                    labels.append(class_name)
    features, labels = np.array(features), np.array(labels)
    assert len(features) == len(labels), "Mismatch between features and labels"
    return features, labels

# Load and split data
X, y = load_data(DATA_PATH, CLASSES)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Resample with SMOTE
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save scaler
joblib.dump(scaler, "scaler.pkl")
logging.info("Scaler saved successfully.")

# Compute class weights
class_weights = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weight_dict = {label: weight for label, weight in zip(np.unique(y_train), class_weights)}

# Hyperparameter tuning
param_dist = {
    "n_estimators": [100, 200, 300, 500],
    "max_depth": [None, 10, 20, 30, 50],
    "min_samples_split": [2, 5, 10, 20],
    "min_samples_leaf": [1, 2, 4, 10],
    "max_features": ["sqrt", "log2", None],
    "bootstrap": [True, False],
}
rf_model = RandomForestClassifier(class_weight=class_weight_dict, random_state=42)
search = RandomizedSearchCV(
    rf_model, param_distributions=param_dist, n_iter=50, cv=3, n_jobs=-1, random_state=42
)
search.fit(X_train_scaled, y_train)

# Save the best model
best_model = search.best_estimator_
joblib.dump(best_model, "random_forest_model.pkl")
logging.info("Model saved successfully.")

# Evaluate the model
y_pred = best_model.predict(X_test_scaled)
logging.info(f"Accuracy: {accuracy_score(y_test, y_pred)}")
logging.info(f"Classification Report:\n{classification_report(y_test, y_pred)}")
logging.info(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")


2024-11-21 12:45:31 - INFO - Scaler saved successfully.
2024-11-21 12:47:35 - INFO - Model saved successfully.
2024-11-21 12:47:35 - INFO - Accuracy: 0.9008264462809917
2024-11-21 12:47:35 - INFO - Classification Report:
              precision    recall  f1-score   support

   Bronchial       0.89      0.81      0.85        21
      asthma       0.83      0.84      0.84        58
        copd       0.95      0.94      0.94        80
     healthy       0.96      0.92      0.94        26
   pneumonia       0.88      0.93      0.91        57

    accuracy                           0.90       242
   macro avg       0.90      0.89      0.90       242
weighted avg       0.90      0.90      0.90       242

2024-11-21 12:47:35 - INFO - Confusion Matrix:
[[17  4  0  0  0]
 [ 2 49  1  1  5]
 [ 0  4 75  0  1]
 [ 0  0  1 24  1]
 [ 0  2  2  0 53]]


# Prediction

In [32]:
import joblib
import librosa
import numpy as np
import logging

# Set up logging
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    datefmt="%Y-%m-%d %H:%M:%S"
)

# Constants
CLASSES = ["asthma", "Bronchial", "copd", "healthy", "pneumonia"]
MODEL_PATH = "random_forest_model.pkl"
SCALER_PATH = "scaler.pkl"

# Load model and scaler
model = joblib.load(MODEL_PATH)
scaler = joblib.load(SCALER_PATH)

# Feature extraction
def extract_mfcc_features(file_path, n_mfcc=20):
    try:
        sample, sample_rate = librosa.load(file_path, sr=None)
        mfccs = librosa.feature.mfcc(y=sample, sr=sample_rate, n_mfcc=n_mfcc)
        return np.mean(mfccs.T, axis=0)
    except Exception as e:
        logging.warning(f"Error processing file {file_path}: {e}")
        return None

# Prediction function
def predict(file_path):
    features = extract_mfcc_features(file_path)
    if features is None:
        return "Error in feature extraction"
    
    # Ensure feature dimensions match model input
    if len(features) != scaler.mean_.shape[0]:
        return "Feature dimensions do not match the model requirements."
    
    # Scale and predict
    features_scaled = scaler.transform([features])
    prediction = model.predict(features_scaled)[0]
    return f"The predicted class is: {prediction}"

# Test prediction
file_path = "D:\\Saare_Projects\\ML_project\\Lung_Sound_Detection\\Audio Files\\BP36_pneumonia,Crep,P R M,36,F.wav"
print(predict(file_path))


The predicted class is: pneumonia
