# Audio Common
This module contains the definitions common to all the other modules, such as the basic data types.

## Setup

### Prerequisites
Be sure you've run `install.sh` before running this notebook!

### Settings

In [1]:
#Export
from pathlib import Path
import mimetypes
import torch
from torchaudio import load as load_audio
from torchaudio.transforms import MelSpectrogram, PadTrim
from fastai.vision import Image, open_image

### Constants and definitions

In [2]:
#Export
# Lower-cased file suffixes that the standard `mimetypes` registry associates
# with an "audio/..." MIME type. Kept as a tuple so it can be handed straight
# to `str.endswith`.
AUDIO_EXTENSIONS = tuple(
    suffix.lower()
    for suffix, mime in mimetypes.types_map.items()
    if mime.startswith('audio/')
)

## AudioData

This is the base class of our audio data. It holds two basic pieces of information about the "sound":
* sig: the actual signal
* sr: the sample rate

**IMPORTANT:** the audio signal is expected to be one-dimensional, i.e. mono. If you have stereo recordings, you should downmix them to mono first. Later, we could handle this as a preprocessing step, and/or handle stereo files natively.

In [4]:
#Export
class SPEC2DB(object):
    """Turns a spectrogram from the power/amplitude scale to the decibel scale.

    The input is divided by its own maximum before the logarithm is taken, so
    the largest bin maps to 0 dB and every other bin is negative.

    Args:
        stype (str): scale of input spectrogram ("power" or "magnitude").  The
            power being the elementwise square of the magnitude. default: "power"
        top_db (float, optional): minimum negative cut-off in decibels.  A reasonable number
            is -80.  A positive value is negated, so ``80`` and ``-80`` are
            equivalent.  ``None`` (the default) disables the cut-off.
    """
    def __init__(self, stype="power", top_db=None):
        self.stype = stype
        # Comparing ``None > 0`` raises TypeError on Python 3, so the sign is
        # only normalised when a cut-off was actually supplied.
        if top_db is not None and top_db > 0:
            top_db = -top_db
        self.top_db = top_db
        # 10*log10 for power spectrograms, 20*log10 for anything else.
        self.multiplier = 10. if stype == "power" else 20.

    def __call__(self, spec):
        """Convert `spec` (a tensor) to decibels relative to its maximum value.

        Zero-valued bins become ``-inf`` and are only brought back to a finite
        value when a `top_db` cut-off is set.
        """
        spec_db = self.multiplier * torch.log10(spec / spec.max())  # power -> dB
        if self.top_db is not None:
            # Floor every bin at the cut-off.
            spec_db = torch.clamp(spec_db, min=self.top_db)
        return spec_db

In [None]:
#Export
def tfm_pad_or_trim(sig, mx, trim_section="end", pad_at_end=True, **kwargs):
    """Pad a signal with zeros (silence) up to `mx` samples, or trim it to `mx` samples.

    Args:
        sig: either a one-dimensional signal tensor, or an `AudioData`-like
            object exposing `.sig` and `.sr` (its sample rate is then reused).
        mx (int): target length, in samples.
        trim_section (str): which part of the clip is kept when trimming;
            one of "start", "mid" or "end".
        pad_at_end (bool): when padding, append the silence after the signal
            if True, otherwise put it in front.
        **kwargs: `sr` may be given to set the sample rate of the result when
            `sig` is a plain tensor; any other keyword is ignored.

    Returns:
        AudioData: a new object wrapping the padded/trimmed signal.

    Raises:
        ValueError: if trimming is required and `trim_section` is not one of
            the accepted values.
    """
    # The sample rate has to come from somewhere: either from the object that
    # was passed in, or from an explicit `sr` keyword. (Reading it from an
    # undefined name made every call fail with NameError.)
    if hasattr(sig, "sig") and hasattr(sig, "sr"):
        signal, sr = sig.sig, sig.sr
    else:
        signal, sr = sig, kwargs.get("sr")

    siglen = len(signal)
    if siglen < mx:
        diff = mx - siglen
        padding = signal.new_zeros(diff) # Maintain input tensor device & type params
        nsig = torch.cat((signal,padding)) if pad_at_end else torch.cat((padding,signal))
    else:
        if trim_section not in {"start","mid","end"}:
            raise ValueError(f"'trim_section' argument must be one of 'start', 'mid' or 'end', got '{trim_section}'")
        if trim_section == "mid":
            # Keep `mx` samples centred on the middle of the clip.
            nsig = signal.narrow(0, (siglen // 2) - (mx // 2), mx)
        elif trim_section == "end":
            # Keep the last `mx` samples.
            nsig = signal.narrow(0, siglen-mx, mx)
        else:
            # Keep the first `mx` samples.
            nsig = signal.narrow(0, 0, mx)

    # Without a known sample rate, fall back to AudioData's own default.
    if sr is None:
        return AudioData(sig=nsig)
    return AudioData(sig=nsig, sr=sr)

In [5]:
#Export
class AudioData:
    '''Holds basic information from audio signal

    Attributes:
        sig: the signal, flattened to a single dimension.
        sr: the sample rate.
        spectro: optional spectrogram tensor associated with the signal.
        use_spectro: True when a spectrogram was supplied.
    '''

    def __init__(self, sig, sr=16000, spectro=None):
        self.sig = sig.reshape(-1) # We want single dimension data
        self.sr = sr
        self.spectro = spectro
        self.use_spectro = spectro is not None

    @property
    def shape(self):
        '''Shape of the spectrogram when there is one, otherwise of the raw signal.'''
        return self.spectro.shape if self.spectro is not None else self.sig.shape

    @classmethod
    def load(cls, fileName, use_spectro=True, cache_spectro=True, to_db_scale=True, n_fft=1024,
                ws=None, hop=72, f_min=0.0, f_max=8000, pad=0, n_mels=224, max_to_pad=10000,
                top_db=80.0, **kwargs):
        '''Load an audio file and, optionally, its mel spectrogram.

        Args:
            fileName: path of the audio file; it must exist and carry one of
                the suffixes listed in `AUDIO_EXTENSIONS`.
            use_spectro (bool): compute (or read from cache) a mel spectrogram.
            cache_spectro (bool): read/write the spectrogram as a `.jpg` file
                next to the audio file.
            to_db_scale (bool): convert the spectrogram to decibels.
            n_fft, ws, hop, f_min, f_max, pad, n_mels: forwarded to
                `MelSpectrogram`.
            max_to_pad (int, optional): the signal is padded/trimmed to this
                many samples with `PadTrim`; `None` leaves the length untouched.
            top_db (float): decibel cut-off handed to `SPEC2DB`. This used to
                be fed from `f_max`, which is a frequency in Hz and not a level
                in dB.
            **kwargs: ignored.

        Returns:
            An instance of `cls` holding the signal, the sample rate and the
            spectrogram (`None` when `use_spectro` is False).

        Raises:
            Exception: if the file does not exist or has an unsupported suffix.
        '''
        p = Path(fileName)
        # Guard first: the notebook warns that asking torchaudio to open a
        # non-audio file can take the kernel down.
        if not (p.exists() and str(p).lower().endswith(AUDIO_EXTENSIONS)):
            raise Exception(f"Error while processing {fileName}: file not found or does not have valid extension: {AUDIO_EXTENSIONS}")

        signal, samplerate = load_audio(str(fileName))
        if max_to_pad is not None:
            # NOTE(review): assumes load_audio returns (channels, samples), which
            # is what PadTrim's default layout appears to expect — confirm
            # against the installed torchaudio version.
            signal = PadTrim(max_len=max_to_pad)(signal)
        signal = signal.squeeze()

        if not use_spectro:
            return cls(signal, samplerate, None)

        image_path = p.with_suffix('.jpg') if cache_spectro else None
        if image_path is not None and image_path.exists():
            # NOTE(review): the cached image is returned as loaded by
            # open_image; presumably its value range/channels differ from a
            # freshly computed spectrogram — confirm downstream expectations.
            return cls(signal, samplerate, open_image(image_path).data)

        mel = MelSpectrogram(sr=samplerate, n_mels=n_mels, n_fft=n_fft, ws=ws, hop=hop,
                            f_min=f_min, f_max=f_max, pad=pad)(signal.reshape(1, -1))
        mel = mel.permute(0,2,1) # swap the last two dimensions
        if to_db_scale: mel = SPEC2DB(stype='magnitude', top_db=top_db)(mel)
        if image_path is not None:
            Image(mel).save(image_path)
        return cls(signal, samplerate, mel)

## Tests

VERY rudimentary. Ideally we would have sample data that we knew would fail (e.g. non-audio data, audio data with wrong extensions, stereo samples, etc). 

### Sample data for our tests

In [4]:
from fastai.basics import url2name, datapath4file, untar_data
# Location of the sample dataset and the local path fastai associates with it.
data_url = 'http://www.openslr.org/resources/45/ST-AEDS-20180100_1-OS'
path = datapath4file(url2name(data_url))
# Presumably downloads and extracts the archive into `path` if needed — confirm against fastai docs.
untar_data(data_url, dest = path) 
# NOTE(review): the listing order of `path.ls()` is presumably filesystem-dependent,
# so index 256 may pick a different file on another machine — confirm.
good_sample = path.ls()[256] # arbitrary choice of file

### Direct torchaudio "test" of sample data

In [5]:
# The notebook never binds the name `torchaudio`; only its `load` function is
# imported, under the alias `load_audio`, so that alias is what must be called.
signal, samplerate = load_audio(good_sample)

In [6]:
# Show the shape of the loaded tensor together with the reported sample rate.
signal.shape, samplerate

(torch.Size([1, 43520]), 16000)

In [7]:
from IPython.display import Audio
# Listen to the raw sample to sanity-check what was loaded.
Audio(data=signal,rate=samplerate)

In [8]:
# Load the same file through AudioData.load using all of its default arguments.
s = AudioData.load(good_sample)
# AudioData defines no custom repr, so this only shows the default object description.
display(s)


<__main__.AudioData at 0x7f745f7bd1d0>

### More reasonable tests

In [9]:
def is_mono(a):
    """Fail unless the signal held by `a` has exactly one dimension."""
    n_dims = len(a.sig.shape)
    assert n_dims == 1, "Not single dim"


def is_16kHz(a):
    """Fail unless `a` reports a sample rate of 16000."""
    assert a.sr == 16000, "Not 16kHz"


def has_data(a):
    """Fail unless the first dimension of the signal holds more than 100 samples."""
    n_samples = a.sig.shape[0]
    assert n_samples > 100, "Not more than 100 samples"

In [10]:
def allTests(x):
    """Run every sanity check on `x`, in order, and return the list of their results."""
    checks = (is_mono, is_16kHz, has_data)
    return [check(x) for check in checks]

In [11]:
def test_AudioData_create_from_audio_file_path(f):
    """Build an AudioData from the file at `f` via `AudioData.load` and run all sanity checks on it."""
    allTests(AudioData.load(f))
    print(f"{f} passed loading from file")

In [12]:
def test_AudioData_create_from_data(f):
    """Read `f` directly with the audio loader, wrap the result in AudioData and run all sanity checks."""
    # Only `load` is imported from torchaudio (as `load_audio`); the bare
    # `torchaudio` name is never bound, so calling it raised NameError.
    signal,samplerate = load_audio(f)
    a = AudioData(signal,samplerate)
    allTests(a)
    print(f"{f} passed loading from data")

In [13]:
inps = [good_sample, "badpath"] ## Should have bad_samples too
# Run both constructors' tests on every input. The first failure for an input
# is printed rather than raised, and the remaining test for that input is skipped.
for candidate in inps:
    try:
        for run_test in (test_AudioData_create_from_audio_file_path,
                         test_AudioData_create_from_data):
            run_test(candidate)
    except Exception as err:
        print(err)

/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00199.wav passed loading from file
/home/jupyter/.fastai/data/ST-AEDS-20180100_1-OS/f0004_us_f0004_00199.wav passed loading from data
Error while processing badpath: file not found or does not have valid extension: ('.aif', '.aifc', '.aiff', '.au', '.mp2', '.mp3', '.ra', '.snd', '.wav')


<span style="color:red">**Careful - trying to torchaudio.load() a non-audio file breaks the kernel!!**</span> That's why we check the extension in `AudioData.load`.

In [14]:
# cwd = %pwd
# test_AudioData_create_from_data(cwd + "/README.md")

## Export

In [15]:
!python notebook2script.py AudioCommon.ipynb

Converted AudioCommon.ipynb to nb_AudioCommon.py
