In [1]:
# Audio hybrid feature bank – ESC-50

%pip install librosa soundfile transformers umap-learn --quiet

import os
import numpy as np
import pandas as pd
import librosa

import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

print("Torch version:", torch.__version__)

# Use the MPS backend when this torch build reports it as available;
# otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("Using device:", device)

Note: you may need to restart the kernel to use updated packages.
Torch version: 2.1.0
Using device: mps


In [None]:
# Dataset layout -- adjust these if your folder names differ.
DATA_ROOT = "../data"
ESC50_ROOT = os.path.join(DATA_ROOT, "esc50")

# Audio clips and the metadata CSV both live under the ESC-50 root.
AUDIO_DIR = os.path.join(ESC50_ROOT, "audio")
META_PATH = os.path.join(ESC50_ROOT, "meta", "esc50.csv")

# Report whether the expected files/folders are present before loading.
print("ESC-50 root:", ESC50_ROOT)
print("Meta file exists:", os.path.exists(META_PATH))
print("Audio dir exists:", os.path.isdir(AUDIO_DIR))

# One metadata row per clip; show the first rows and the total row count.
df = pd.read_csv(META_PATH)
(df.head(5), df.shape[0])

ESC-50 root: ../data/esc50
Meta file exists: True
Audio dir exists: True


(            filename  fold  target        category  esc10  src_file take
 0   1-100032-A-0.wav     1       0             dog   True    100032    A
 1  1-100038-A-14.wav     1      14  chirping_birds  False    100038    A
 2  1-100210-A-36.wav     1      36  vacuum_cleaner  False    100210    A
 3  1-100210-B-36.wav     1      36  vacuum_cleaner  False    100210    B
 4  1-101296-A-19.wav     1      19    thunderstorm  False    101296    A,
 2000)

In [3]:
# Fold 5 is held out for evaluation; folds numbered below 5 form the training set.
# Indices are reset so each split is numbered from 0.
test_df = df.loc[df["fold"].eq(5)].reset_index(drop=True)
train_df = df.loc[df["fold"].lt(5)].reset_index(drop=True)

train_df.shape[0], test_df.shape[0]

(1600, 400)

In [4]:
SR = 16000      # sample rate for all audio
N_MFCC = 40     # number of MFCC coefficients


def extract_mfcc(path, sr=SR, n_mfcc=N_MFCC):
    """Summarise one audio file as a fixed-length MFCC vector.

    Parameters
    ----------
    path : path of the audio file to load.
    sr : sample rate requested from ``librosa.load`` and passed on to the
        MFCC computation.
    n_mfcc : number of MFCC coefficients to compute.

    Returns
    -------
    numpy.ndarray
        The per-coefficient mean of the MFCC matrix, i.e. the matrix
        (coefficients x frames) averaged along axis 1.
    """
    # Mono waveform at the requested rate; the returned rate is not needed.
    waveform, _ = librosa.load(path, sr=sr, mono=True)
    # Averaging over the frame axis removes the dependence on clip length.
    return librosa.feature.mfcc(y=waveform, sr=sr, n_mfcc=n_mfcc).mean(axis=1)

In [5]:
def build_mfcc_features(df_subset):
    """Build the MFCC feature matrix and label vector for a metadata subset.

    Parameters
    ----------
    df_subset : DataFrame with a ``filename`` column (resolved against
        ``AUDIO_DIR``) and a ``target`` column holding the class label.

    Returns
    -------
    (X, y) : ``X`` stacks one ``extract_mfcc`` vector per row, in row order;
        ``y`` holds the matching ``target`` values.

    An empty subset makes ``np.vstack`` raise ``ValueError``.
    """
    clip_paths = [os.path.join(AUDIO_DIR, name) for name in df_subset["filename"]]
    labels = list(df_subset["target"])
    # One feature vector per clip, kept in the same order as the labels.
    features = [extract_mfcc(clip) for clip in clip_paths]
    return np.vstack(features), np.array(labels)

# Featurize the training split first, then the held-out split.
(mfcc_train_X, mfcc_train_y), (mfcc_test_X, mfcc_test_y) = map(
    build_mfcc_features, (train_df, test_df)
)

mfcc_train_X.shape, mfcc_test_X.shape

((1600, 40), (400, 40))

In [6]:
# Baseline: standardize the MFCC vectors, then fit a logistic-regression classifier.
# `multi_class` is intentionally not passed: it is deprecated since scikit-learn 1.5
# (passing it emits a FutureWarning), and with the default solver and more than two
# classes the default already fits a multinomial model.
mfcc_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(
        max_iter=2000,
        n_jobs=-1
    )
)

print("Training Logistic Regression on MFCC features...")
mfcc_clf.fit(mfcc_train_X, mfcc_train_y)

# Evaluate on the held-out split.
mfcc_y_pred = mfcc_clf.predict(mfcc_test_X)

mfcc_acc = accuracy_score(mfcc_test_y, mfcc_y_pred)
print("\nMFCC + LogisticRegression accuracy:", round(mfcc_acc, 3))

print("\nClassification report (MFCC):")
print(classification_report(mfcc_test_y, mfcc_y_pred, digits=3))

Training Logistic Regression on MFCC features...





MFCC + LogisticRegression accuracy: 0.3

Classification report (MFCC):
              precision    recall  f1-score   support

           0      0.300     0.375     0.333         8
           1      0.556     0.625     0.588         8
           2      0.125     0.125     0.125         8
           3      0.250     0.250     0.250         8
           4      0.250     0.125     0.167         8
           5      0.125     0.125     0.125         8
           6      0.364     0.500     0.421         8
           7      0.000     0.000     0.000         8
           8      0.500     0.750     0.600         8
           9      0.385     0.625     0.476         8
          10      0.750     0.750     0.750         8
          11      0.600     0.750     0.667         8
          12      0.444     0.500     0.471         8
          13      0.111     0.125     0.118         8
          14      0.083     0.125     0.100         8
          15      0.000     0.000     0.000         8
         

In [7]:
#AST Embeddings (Audio Spectrogram Transformer)
from transformers import ASTFeatureExtractor, ASTModel
import torch

# Same device choice as at the top of the notebook: MPS when available, else CPU.
if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")
print("AST using:", device)

  from .autonotebook import tqdm as notebook_tqdm


AST using: mps


In [8]:
# The encoder and its matching feature extractor are loaded from the same
# pretrained checkpoint; the encoder is then moved to the selected device.
ast_model = ASTModel.from_pretrained("MIT/ast-finetuned-audioset-10-10-0.4593")
ast_model = ast_model.to(device)

ast_processor = ASTFeatureExtractor.from_pretrained(
    "MIT/ast-finetuned-audioset-10-10-0.4593"
)

# Put the model in evaluation mode; as the cell's last expression, the
# returned module is also displayed.
ast_model.eval()

ASTModel(
  (embeddings): ASTEmbeddings(
    (patch_embeddings): ASTPatchEmbeddings(
      (projection): Conv2d(1, 768, kernel_size=(16, 16), stride=(10, 10))
    )
    (dropout): Dropout(p=0.0, inplace=False)
  )
  (encoder): ASTEncoder(
    (layer): ModuleList(
      (0-11): 12 x ASTLayer(
        (attention): ASTAttention(
          (attention): ASTSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
          (output): ASTSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (intermediate): ASTIntermediate(
          (dense): Linear(in_features=768, out_features=3072, bias=True)
          (intermediate_act_fn): GELUActivation(

In [9]:
def extract_ast_features(path):
    """Embed one audio file with the pretrained AST encoder.

    The clip is loaded as mono audio at the sample rate the feature extractor
    is configured for, run through ``ast_model`` without gradient tracking,
    and the final hidden states are averaged over the sequence axis.

    Parameters
    ----------
    path : path of the audio file to load.

    Returns
    -------
    numpy.ndarray
        1-D float vector of length ``hidden_size`` (one clip per call).
    """
    # Take the rate from the extractor instead of repeating a literal, so the
    # resampling target and the declared `sampling_rate` cannot drift apart.
    target_sr = ast_processor.sampling_rate
    waveform, _ = librosa.load(path, sr=target_sr, mono=True)

    # NOTE(review): the AST feature extractor appears to pad/truncate to its
    # own configured length regardless of `padding` -- confirm before relying
    # on this argument.
    inputs = ast_processor(
        waveform,
        sampling_rate=target_sr,
        return_tensors="pt",
        padding="longest"
    ).to(device)

    with torch.no_grad():
        # last_hidden_state: (batch, tokens, hidden); the sequence axis holds
        # the model's tokens rather than raw time frames.
        hidden = ast_model(**inputs).last_hidden_state

    # Mean over the token axis gives a fixed-length vector; batch size is 1
    # here, so flattening yields shape (hidden,).
    return hidden.mean(dim=1).cpu().numpy().flatten()

In [10]:
def build_ast_features(df_subset):
    """Build the AST embedding matrix and label vector for a metadata subset.

    Parameters
    ----------
    df_subset : DataFrame with a ``filename`` column (resolved against
        ``AUDIO_DIR``) and a ``target`` column holding the class label.

    Returns
    -------
    (X, y) : ``X`` stacks one ``extract_ast_features`` vector per row, in row
        order; ``y`` holds the matching ``target`` values.

    An empty subset makes ``np.vstack`` raise ``ValueError``.
    """
    embeddings = []
    for fname in df_subset["filename"]:
        clip_path = os.path.join(AUDIO_DIR, fname)
        embeddings.append(extract_ast_features(clip_path))
    # Labels are read in the same row order as the embeddings above.
    targets = df_subset["target"].tolist()
    return np.vstack(embeddings), np.array(targets)

# Embed the training split first, then the held-out split.
(ast_train_X, ast_train_y), (ast_test_X, ast_test_y) = map(
    build_ast_features, (train_df, test_df)
)

ast_train_X.shape, ast_test_X.shape

((1600, 768), (400, 768))

In [11]:
# Linear probe on the AST embeddings: standardize, then logistic regression.
# `multi_class` is intentionally not passed: it is deprecated since scikit-learn 1.5
# (passing it emits a FutureWarning), and with the default solver and more than two
# classes the default already fits a multinomial model.
ast_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=2000, n_jobs=-1)
)

print("Training Logistic Regression on AST features...")
ast_clf.fit(ast_train_X, ast_train_y)

# Evaluate on the held-out split.
ast_y_pred = ast_clf.predict(ast_test_X)

ast_acc = accuracy_score(ast_test_y, ast_y_pred)
print("\nAST + LogisticRegression accuracy:", round(ast_acc, 3))

print("\nClassification report (AST):")
print(classification_report(ast_test_y, ast_y_pred, digits=3))

Training Logistic Regression on AST features...





AST + LogisticRegression accuracy: 0.943

Classification report (AST):
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         8
           1      1.000     1.000     1.000         8
           2      1.000     1.000     1.000         8
           3      1.000     0.875     0.933         8
           4      1.000     1.000     1.000         8
           5      1.000     0.875     0.933         8
           6      1.000     1.000     1.000         8
           7      0.889     1.000     0.941         8
           8      1.000     1.000     1.000         8
           9      1.000     1.000     1.000         8
          10      0.889     1.000     0.941         8
          11      1.000     0.875     0.933         8
          12      0.857     0.750     0.800         8
          13      1.000     1.000     1.000         8
          14      1.000     1.000     1.000         8
          15      1.000     0.750     0.857         8
         

In [12]:
#MFCC + AST first.
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# 0) The two feature banks are fused row by row and share one label vector, so
#    verify they really describe the same clips in the same order. This guards
#    against stale kernel state (e.g. one bank rebuilt from a different split).
assert np.array_equal(mfcc_train_y, ast_train_y), "MFCC/AST train labels are misaligned"
assert np.array_equal(mfcc_test_y, ast_test_y), "MFCC/AST test labels are misaligned"

# 1) Concatenate MFCC + AST embeddings along the feature axis
audio_hybrid_train_X = np.concatenate([mfcc_train_X, ast_train_X], axis=1)
audio_hybrid_test_X  = np.concatenate([mfcc_test_X,  ast_test_X],  axis=1)

print("Audio hybrid feature shape (train, test):",
      audio_hybrid_train_X.shape, audio_hybrid_test_X.shape)

# 2) Train Logistic Regression on the hybrid features.
#    `multi_class` is intentionally not passed: it is deprecated since
#    scikit-learn 1.5 (passing it emits a FutureWarning), and with the default
#    solver and more than two classes the default already fits a multinomial model.
audio_hybrid_clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, n_jobs=-1)
)

print("\nTraining Logistic Regression on MFCC + AST fused features...")
audio_hybrid_clf.fit(audio_hybrid_train_X, mfcc_train_y)   # labels verified equal above

hybrid_audio_y_pred = audio_hybrid_clf.predict(audio_hybrid_test_X)

# 3) Evaluate on the held-out split.
audio_hybrid_acc = accuracy_score(mfcc_test_y, hybrid_audio_y_pred)
print("\nMFCC + AST (audio hybrid) accuracy:", audio_hybrid_acc)

print("\nClassification report (Audio Hybrid):")
print(classification_report(mfcc_test_y, hybrid_audio_y_pred, digits=3))

Audio hybrid feature shape (train, test): (1600, 808) (400, 808)

Training Logistic Regression on MFCC + AST fused features...





MFCC + AST (audio hybrid) accuracy: 0.9375

Classification report (Audio Hybrid):
              precision    recall  f1-score   support

           0      1.000     1.000     1.000         8
           1      1.000     1.000     1.000         8
           2      1.000     1.000     1.000         8
           3      1.000     0.875     0.933         8
           4      1.000     1.000     1.000         8
           5      1.000     0.875     0.933         8
           6      1.000     1.000     1.000         8
           7      0.889     1.000     0.941         8
           8      1.000     1.000     1.000         8
           9      1.000     1.000     1.000         8
          10      0.889     1.000     0.941         8
          11      1.000     0.875     0.933         8
          12      0.857     0.750     0.800         8
          13      1.000     1.000     1.000         8
          14      1.000     1.000     1.000         8
          15      1.000     0.750     0.857         