# core

> Audio Embeddings

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
def foo():
    """Placeholder function: takes no arguments, does nothing, returns None."""
    return None

In [None]:
import functools

import numpy as np
from fastcore.all import *
import soundfile as sf
from fasttransform import Transform, Pipeline
from transformers import AutoFeatureExtractor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def load_audio(path, sec=None, sample_rate=16_000) -> np.ndarray:
    """Read audio samples from `path` with `soundfile`.

    Args:
        path: Audio file to open (anything `sf.SoundFile` accepts).
        sec: Optional duration in seconds. When given, `int(sec * sample_rate)`
            frames are requested from the start of the file; `None` reads the
            whole file.
        sample_rate: Rate used only to convert `sec` into a frame count. The
            file's own sample rate is not consulted and nothing is resampled
            here.

    Returns:
        The array produced by `SoundFile.read`.
    """
    with sf.SoundFile(path) as f:
        # Test against None rather than truthiness so that `sec=0` means
        # "zero seconds" instead of silently reading the entire file.
        if sec is None:
            return f.read()
        return f.read(int(sec * sample_rate))

In [None]:
# Smoke-test `load_audio` on a sample recording; `audio` is reused by a later cell.
audio = load_audio('../test_files/XC119042.ogg')
print(audio.shape)
# Bare expression so the notebook displays the first five samples.
audio[:5]

(632790,)


array([-2.64216160e-05, -2.54259703e-05,  5.56615578e-06, -5.17481631e-08,
       -1.35020821e-06])

In [None]:
def pool(inputs, pooling="mean"):
    """Reduce `inputs` along axis 0 according to `pooling`.

    Args:
        inputs: Object exposing `.mean(axis=0)` / `.max(axis=0)`, e.g. a NumPy array.
        pooling: `"mean"` or `"max"` selects the reduction; any other value
            (including `None`) leaves `inputs` untouched.

    Returns:
        The reduced result, or `inputs` itself when no reduction is selected.
    """
    # Guard clauses: handle each recognised mode and fall through otherwise.
    if pooling == "mean":
        return inputs.mean(axis=0)
    if pooling == "max":
        return inputs.max(axis=0)
    return inputs

In [None]:
# Max pooling reduces along axis 0, i.e. the column-wise maximum of the two rows.
pool(np.array([[1,8,9,4,5], 
               [6,7,8,9,10]]), pooling="max")

array([ 6,  8,  9,  9, 10])

In [None]:
# Mean pooling reduces along axis 0, i.e. the column-wise average of the two rows.
pool(np.array([[1,2,3,4,5], 
               [6,7,8,9,10]]), pooling="mean")

array([3.5, 4.5, 5.5, 6.5, 7.5])

In [None]:
@functools.lru_cache(maxsize=8)
def _load_feature_extractor(model_name):
    """Return the feature extractor for `model_name`, reusing earlier loads."""
    return AutoFeatureExtractor.from_pretrained(model_name)


def extract_audio_features(audio, sample_rate=16_000, model_name="MIT/ast-finetuned-audioset-10-10-0.4593", pooling=None):
    """Run a pretrained feature extractor over `audio` and optionally pool the result.

    Args:
        audio: Raw audio samples handed to the feature extractor.
        sample_rate: Passed to the extractor as `sampling_rate`.
        model_name: Identifier given to `AutoFeatureExtractor.from_pretrained`.
        pooling: Pooling mode forwarded to `pool`; a falsy value (the default
            `None`) skips pooling.

    Returns:
        A NumPy array built from the first entry of the extractor's
        `input_values`, pooled via `pool` when `pooling` is truthy.
    """
    # The extractor is looked up through a bounded cache so repeated calls with
    # the same model name do not reload it from disk/network each time.
    feature_extractor = _load_feature_extractor(model_name)
    encoded = feature_extractor(audio, sampling_rate=sample_rate, return_tensors="pt")
    # Only the first item of `input_values` is kept and converted to NumPy.
    inputs = np.array(encoded['input_values'][0])
    return pool(inputs, pooling) if pooling else inputs

# Extract max-pooled features from the `audio` array loaded in an earlier cell.
inputs = extract_audio_features(audio, pooling="max")
print(inputs.shape)
# Bare expression so the notebook displays the first five feature values.
inputs[:5]

(128,)


array([ 0.26203126, -0.17400272,  0.20281754, -1.2775939 , -0.03474949],
      dtype=float32)

In [None]:
#| hide
# Run nbdev's export step for this notebook project.
import nbdev; nbdev.nbdev_export()