In [None]:
# Imports
import numpy as np
import pandas as pd
import librosa
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from scipy.stats import randint

# Step 1: Load and preprocess audio paths and labels
def load_audio_metadata(csv_path):
    """Read the metadata CSV and return audio file paths with their labels.

    Args:
        csv_path: Location of a CSV file that has a "File Path" column and
            a "Classification" column.

    Returns:
        A ``(paths, labels)`` tuple of two plain Python lists, taken row by
        row from the "File Path" and "Classification" columns respectively.
    """
    metadata = pd.read_csv(csv_path)
    file_paths = metadata["File Path"].tolist()
    class_labels = metadata["Classification"].tolist()
    return file_paths, class_labels

# Step 2: Extract MFCC features from audio files
def extract_mfcc_features(paths, labels, n_mfcc=13, sr=22050, duration=3):
    """Build one averaged MFCC feature vector per readable audio file.

    Each file is loaded with ``librosa.load`` using the requested sample
    rate and duration limit, its MFCC matrix is computed, and the matrix is
    averaged along axis 0 of its transpose to give a single vector per file.
    Files that raise any exception are reported on stdout and skipped, so
    the returned labels stay aligned with the returned feature rows.

    Args:
        paths: Iterable of audio file paths.
        labels: Iterable of labels, paired positionally with ``paths``.
        n_mfcc: Number of MFCC coefficients requested from librosa.
        sr: Sample rate passed to both the loader and the MFCC computation.
        duration: Duration limit passed to ``librosa.load``.

    Returns:
        A ``(features, labels)`` tuple: a DataFrame with one row per
        successfully processed file and a Series with the matching labels.
    """
    feature_rows = []
    kept_labels = []

    for audio_path, audio_label in zip(paths, labels):
        try:
            signal, _ = librosa.load(audio_path, sr=sr, duration=duration)
            coefficients = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
            # Collapse the per-frame coefficients into a single mean vector.
            averaged = np.mean(coefficients.T, axis=0)
            feature_rows.append(averaged)
            kept_labels.append(audio_label)
        except Exception as err:
            # Best effort: report the failing file and carry on with the rest.
            print(f"Error processing {audio_path}: {err}")

    feature_frame = pd.DataFrame(feature_rows)
    label_series = pd.Series(kept_labels)
    return feature_frame, label_series

# Step 3: Encode labels and split data
def prepare_data(X, y, test_size=0.2, random_state=42):
    """Encode the labels as integers and split the data into train and test.

    Args:
        X: Feature table.
        y: Raw labels, one per row of ``X``.
        test_size: Passed through to ``train_test_split``.
        random_state: Seed passed through to ``train_test_split``.

    Returns:
        A ``(splits, encoder)`` tuple, where ``splits`` is the
        ``train_test_split`` result (X_train, X_test, y_train, y_test) and
        ``encoder`` is the fitted ``LabelEncoder``.
    """
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(y)
    splits = train_test_split(
        X,
        encoded_targets,
        test_size=test_size,
        random_state=random_state,
    )
    return splits, encoder

# Step 4: Train Random Forest with hyperparameter tuning
def train_random_forest(X_train, y_train, n_iter=100, cv=5, random_state=42):
    """Tune a class-balanced random forest with a randomized search.

    The search itself is seeded. Without a ``random_state`` on
    ``RandomizedSearchCV`` the sampled hyperparameter candidates differ from
    run to run, so the selected model is not reproducible even though the
    forest is seeded.

    Args:
        X_train: Training features.
        y_train: Training labels.
        n_iter: Number of hyperparameter candidates to sample.
        cv: Cross-validation setting passed to ``RandomizedSearchCV``.
        random_state: Seed used for both the forest and the candidate
            sampling.

    Returns:
        A ``(best_estimator, search)`` tuple: the refitted best forest and
        the fitted ``RandomizedSearchCV`` object.
    """
    model = RandomForestClassifier(
        class_weight="balanced",
        random_state=random_state,
    )
    param_dist = {
        "n_estimators": randint(50, 300),
        "max_depth": randint(5, 50),
        "min_samples_split": randint(2, 20),
        "min_samples_leaf": randint(1, 10),
        "max_features": ["sqrt", "log2", None],
    }

    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=param_dist,
        n_iter=n_iter,
        cv=cv,
        scoring="accuracy",
        verbose=1,
        n_jobs=1,
        # Seed the candidate sampling so repeated runs pick the same model.
        random_state=random_state,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search

# Step 5: Evaluate a model
def evaluate_model(model, X_test, y_test, label_encoder):
    """Print test-set accuracy and a per-class classification report.

    The report is given the full list of encoded class ids explicitly.
    Otherwise, when some class is missing from both ``y_test`` and the
    predictions (easy with a small test split), the number of
    ``target_names`` no longer matches the labels found in the data and the
    report cannot be built.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Test features.
        y_test: Encoded test labels; assumed to have been produced by
            ``label_encoder`` (ids ``0 .. n_classes - 1``).
        label_encoder: Fitted encoder whose ``classes_`` name the classes.

    Returns:
        None. The results are printed to stdout.
    """
    y_pred = model.predict(X_test)
    # Report names must be strings; the encoder may hold non-string classes.
    class_names = [str(name) for name in label_encoder.classes_]
    class_ids = list(range(len(class_names)))

    print("\nAccuracy on test set:", accuracy_score(y_test, y_pred))
    print("\nClassification Report:")
    print(
        classification_report(
            y_test,
            y_pred,
            labels=class_ids,
            target_names=class_names,
        )
    )

# Step 6: Benchmark multiple classifiers
def benchmark_models(X_train, X_test, y_train, y_test):
    """Fit a fixed set of classifiers and tabulate their accuracies.

    Args:
        X_train: Training features.
        X_test: Test features.
        y_train: Encoded training labels.
        y_test: Encoded test labels.

    Returns:
        A DataFrame with one row per classifier and the columns "Model",
        "Train Accuracy" and "Test Accuracy", in the order the classifiers
        are listed below.
    """
    candidates = [
        ("SVM", SVC(kernel="rbf", class_weight="balanced", random_state=42)),
        ("DecisionTree", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
        ("RandomForest", RandomForestClassifier(class_weight="balanced", random_state=42)),
        ("AdaBoost", AdaBoostClassifier(n_estimators=100, random_state=42)),
        ("NaiveBayes", GaussianNB()),
        ("MLP", MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=42)),
        ("XGBoost", XGBClassifier(eval_metric="mlogloss", random_state=42)),
        ("CatBoost", CatBoostClassifier(verbose=0, random_state=42)),
    ]

    def _fit_and_score(label, classifier):
        # Fit once, then score the same fitted model on both splits.
        classifier.fit(X_train, y_train)
        train_predictions = classifier.predict(X_train)
        test_predictions = classifier.predict(X_test)
        return {
            "Model": label,
            "Train Accuracy": accuracy_score(y_train, train_predictions),
            "Test Accuracy": accuracy_score(y_test, test_predictions),
        }

    rows = [_fit_and_score(label, classifier) for label, classifier in candidates]
    return pd.DataFrame(rows)

# Step 7: Run the full pipeline
def run_audio_classification_pipeline(csv_path="classified_audios.csv"):
    """Run every pipeline stage in order, printing progress and results.

    Stages: load the metadata CSV, extract MFCC features, encode and split
    the data, tune and evaluate a random forest, then benchmark the other
    classifiers on the same split.

    Args:
        csv_path: Path of the metadata CSV handed to ``load_audio_metadata``.

    Returns:
        None. All results are printed to stdout.
    """
    print("Loading metadata...")
    audio_paths, audio_labels = load_audio_metadata(csv_path)

    print("Extracting MFCC features...")
    features, targets = extract_mfcc_features(audio_paths, audio_labels)

    print("Preparing data...")
    splits, encoder = prepare_data(features, targets)
    X_train, X_test, y_train, y_test = splits

    print("Training Random Forest with hyperparameter tuning...")
    tuned_forest, fitted_search = train_random_forest(X_train, y_train)

    print("Evaluating tuned Random Forest...")
    evaluate_model(tuned_forest, X_test, y_test, encoder)
    print("\nBest Hyperparameters:")
    print(fitted_search.best_params_)

    print("\nBenchmarking other classifiers...")
    comparison = benchmark_models(X_train, X_test, y_train, y_test)
    print("\nModel Comparison:")
    print(comparison)

# Execute the pipeline only when run as a script or notebook cell, so that
# importing this module does not start feature extraction and training.
if __name__ == "__main__":
    run_audio_classification_pipeline("classified_audios.csv")

Loading metadata...
Extracting MFCC features...
Preparing data...
Training Random Forest with hyperparameter tuning...
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Evaluating tuned Random Forest...

Accuracy on test set: 0.55

Classification Report:
              precision    recall  f1-score   support

        High       0.78      0.70      0.74        10
         Low       0.67      0.33      0.44         6
      Medium       0.25      0.50      0.33         4

    accuracy                           0.55        20
   macro avg       0.56      0.51      0.50        20
weighted avg       0.64      0.55      0.57        20


Best Hyperparameters:
{'max_depth': 31, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 91}

Benchmarking other classifiers...





Model Comparison:
          Model  Train Accuracy  Test Accuracy
0           SVM        0.392405           0.60
1  DecisionTree        1.000000           0.60
2  RandomForest        1.000000           0.50
3      AdaBoost        0.835443           0.45
4    NaiveBayes        0.582278           0.50
5           MLP        0.405063           0.25
6       XGBoost        1.000000           0.60
7      CatBoost        1.000000           0.50
