# core

> Audio Embeddings

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
def foo():
    """Placeholder function: takes no arguments, does nothing, returns None."""
    return None

In [None]:
import numpy as np
from fastcore.all import *
import soundfile as sf
from IPython.display import Audio
from fasttransform import Transform, Pipeline
from transformers import AutoFeatureExtractor

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
class AudioArray(BasicRepr):
    """Audio samples `a` bundled with their sample rate `sr`.

    Indexing, `len()` and `.shape` are forwarded to the underlying array, so
    slicing returns raw sample data rather than another `AudioArray`.
    """
    def __init__(self, a: np.ndarray, sr: int):
        # fastcore's `store_attr` saves each `__init__` argument on `self`
        # under the same name (`self.a`, `self.sr`).
        store_attr()

    @property
    def shape(self):
        """Shape of the underlying sample array."""
        return self.a.shape

    def __len__(self):
        """Length of the sample array along its first axis."""
        return len(self.a)

    def __getitem__(self, idx):
        """Index or slice the raw samples (the sample rate is not carried along)."""
        return self.a[idx]

In [None]:
# Smoke test: wrap a small integer array, then check slicing and the stored sample rate.
aa = AudioArray(np.array([1,2,3,4,5]), 32_000)
aa[:3], aa.sr

(array([1, 2, 3]), 32000)

In [None]:
def load_audio(path, sr=None):
    """Read the audio file at `path` into an `AudioArray`.

    No resampling happens here: a truthy `sr` simply replaces the sample rate
    reported by the file, otherwise the file's own rate is stored.
    """
    with sf.SoundFile(path) as snd:
        samples = snd.read()
        rate = sr or snd.samplerate
        return AudioArray(samples, rate)

In [None]:
# Load the sample recording. Passing sr=32_000 only sets the stored rate; load_audio does not resample.
audio = load_audio('../test_files/XC119042.ogg', sr=32_000)
audio[:5], audio.sr

(array([-2.64216160e-05, -2.54259703e-05,  5.56615578e-06, -5.17481631e-08,
        -1.35020821e-06]),
 32000)

In [None]:
# Inline playback widget for the loaded recording, using the sample rate stored on the AudioArray.
Audio(audio, rate=audio.sr)

In [None]:
def calc_ratio(audio: AudioArray, target_sr: int) -> float:
    """Return the factor `target_sr / audio.sr`.

    Values below 1 mean the target rate is lower than the audio's current
    rate, values above 1 mean it is higher.
    """
    source_sr = audio.sr
    return target_sr / source_sr

# 16000 / 32000 = 0.5
calc_ratio(audio, 16_000)

0.5

In [None]:
# Length of the loaded sample array.
len(audio.a)

632790

In [None]:
def new_length(audio: AudioArray, target_sr: int) -> int:
    """Return how many samples `audio` will have once converted to `target_sr`.

    The result is `len(audio.a) * target_sr / audio.sr`, rounded down.
    """
    n_samples = len(audio.a)
    # Multiply before dividing. Forming the float ratio first can land just
    # below a whole number (100 * (29 / 100) == 28.999999999999996) and the
    # truncation would then drop a sample.
    return int(n_samples * target_sr // audio.sr)

In [None]:
# 632790 * 0.5 = 316395
new_length(audio, 16_000)

316395

In [None]:
def resample(audio: AudioArray, target_sr: int) -> AudioArray:
    """Resample `audio` to `target_sr` using linear interpolation.

    The first axis of `audio.a` is treated as time. One-dimensional input is
    interpolated directly; input with extra trailing axes (e.g. one column per
    channel) is interpolated independently along the first axis. The input
    object itself is returned when it is already at `target_sr`.

    NOTE: this is plain `np.interp`; no filtering is applied before lowering
    the sample rate.
    """
    if audio.sr == target_sr: return audio
    samples = np.asarray(audio.a)
    n_in, n_out = len(samples), new_length(audio, target_sr)
    if n_in == 0 or n_out == 0:
        # `np.interp` raises on an empty set of sample points, so hand back an
        # empty float array with the same trailing shape instead.
        return AudioArray(np.empty((0,) + samples.shape[1:]), target_sr)
    # Fractional source positions: first and last output samples line up with
    # the first and last input samples.
    positions = np.linspace(0, n_in - 1, n_out)
    grid = np.arange(n_in)
    if samples.ndim == 1:
        resampled = np.interp(positions, grid, samples)
    else:
        # `np.interp` only accepts 1-D data, so run it once per column.
        resampled = np.apply_along_axis(
            lambda channel: np.interp(positions, grid, channel), 0, samples)
    return AudioArray(resampled, target_sr)

In [None]:
# Convert the recording (stored rate 32_000) to 16_000 and inspect the first samples and the new rate.
audio_16k = resample(audio, 16_000)
audio_16k[:5], audio_16k.sr

(array([-2.64216160e-05,  5.56613802e-06, -1.35020873e-06,  3.97246192e-06,
         2.80902310e-05]),
 16000)

In [None]:
# Inline playback widget for the resampled audio, using its updated sample rate.
Audio(audio_16k, rate=audio_16k.sr)

In [None]:
from functools import lru_cache


@lru_cache(maxsize=4)
def _load_feature_extractor(model_name):
    """Instantiate the pretrained feature extractor for `model_name` once and reuse it."""
    return AutoFeatureExtractor.from_pretrained(model_name)


def get_features(audio: AudioArray, model_name="MIT/ast-finetuned-audioset-10-10-0.4593"):
    """Run the pretrained feature extractor for `model_name` on `audio`.

    The extractor is called with the raw samples and `audio.sr` as the
    sampling rate, and its `'input_values'` entry is returned (NumPy output is
    requested via `return_tensors="np"`).
    """
    # Building the extractor is independent of the audio, so it is cached per
    # model name instead of being rebuilt on every call.
    extractor = _load_feature_extractor(model_name)
    # Pass the underlying array rather than the AudioArray wrapper so the
    # extractor receives a plain ndarray.
    return extractor(audio.a, sampling_rate=audio.sr, return_tensors="np")['input_values']

In [None]:
# Extract model inputs from the 16 kHz audio; show the output shape and the first five values of the first row.
inputs = get_features(audio_16k)
inputs.shape, inputs[0][0][:5]



((1, 1024, 128),
 array([-0.7693808 , -1.2774696 , -0.90064937, -1.2775939 , -1.0286174 ],
       dtype=float32))

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()