In [2]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from typing import Optional, Any, Iterable
from pydub import AudioSegment

In [3]:
# Relative path of the WAV file that the AudioIO reader below is pointed at.
path_to_audio = "./test_data/karma police.wav"

In [6]:
class AudioIO:
    """Load a WAV file through pydub and expose its samples as a NumPy array.

    The file is decoded on every call to :meth:`read`; the constructor only
    records the configuration.

    Args:
        path: Location of the audio file. Must exist when the object is built.
        target_sample_rate: Frame rate the audio is resampled to on read.
        mono: When true, all channels are mixed down to one and ``read``
            returns a 1-D array; otherwise one row per channel is returned.
        normalize: When true, samples are scaled so the absolute peak is 1.
        chunk_size: Number of samples per chunk when ``read`` is given a
            ``chunk_index``.
        hop_size: Stride used by ``__len__`` and the number of sections a
            chunk is split into by ``_sample_chunk``.
        dtype: NumPy dtype of the returned samples.
    """

    # Name of the decoding library used by ``read``.
    backend: str = "pydub"

    def __init__(
        self,
        path: str,
        target_sample_rate: Optional[int] = 44100,
        mono: Optional[bool] = True,
        normalize: Optional[bool] = True,
        chunk_size: Optional[int] = 1024,
        hop_size: Optional[int] = 512,
        dtype: Optional[Any] = np.float32,
    ):
        assert os.path.exists(path), f"Audio file not found: {path}"
        self.path = path
        self.sample_rate = target_sample_rate
        self.mono = mono
        self.normalize = normalize
        self.chunk_size = chunk_size
        self.hop_size = hop_size
        self.dtype = dtype
        # Both are filled in by read(); zero means "nothing decoded yet".
        self.sample_length = 0
        self.num_channels = 0

    def __len__(self):
        """Return how many complete ``chunk_size`` windows fit at ``hop_size`` stride.

        The result is 0 until ``read`` has populated ``sample_length`` (or when
        the signal is shorter than one chunk), so ``len()`` never sees a
        negative value.

        NOTE(review): ``read(chunk_index=...)`` addresses chunks with a stride
        of ``chunk_size``, not ``hop_size`` - confirm which framing callers of
        ``len()`` expect.
        """
        if self.sample_length < self.chunk_size:
            return 0
        return 1 + (self.sample_length - self.chunk_size) // self.hop_size

    def __normalize(self, samples):
        """Scale ``samples`` so that the largest absolute value becomes 1.

        Non-array input and non-floating arrays are converted to ``self.dtype``
        first; floating arrays are scaled in place. Empty or all-zero input is
        returned unscaled instead of producing an error or NaNs.
        """
        if not isinstance(samples, np.ndarray):
            samples = np.asarray(samples, self.dtype)
        elif not np.issubdtype(samples.dtype, np.floating):
            # In-place true division is not defined for integer arrays.
            samples = samples.astype(self.dtype)
        if samples.size == 0:
            return samples
        peak = np.max(np.abs(samples))
        if peak > 0:
            samples /= peak
        return samples

    def _sample_chunk(self, samples, chunk_index):
        """Cut chunk number ``chunk_index`` out of ``samples`` and split it.

        The chunk covers ``chunk_size`` samples starting at
        ``chunk_index * chunk_size`` on the last axis, so it works for both
        1-D mono data and ``(channels, samples)`` data. The chunk is then
        divided into ``hop_size`` sections along that same axis and stacked,
        giving the sections on the leading axis of the result.

        A short final chunk, or a ``chunk_size`` that is not a multiple of
        ``hop_size``, yields unequal sections that cannot be stacked into a
        regular array.
        """
        samples = np.asarray(samples)
        start = chunk_index * self.chunk_size
        end = start + self.chunk_size
        chunk = samples[..., start:end]
        return np.asarray(np.array_split(chunk, self.hop_size, axis=-1))

    def read(self, chunk_index: Optional[int] = None, verbose=False, truncate: Optional[int] = None):
        """Decode the file and return ``(samples, sample_rate)``.

        Args:
            chunk_index: When an integer is given, only that chunk is returned,
                split as described in ``_sample_chunk``; otherwise the whole
                signal is returned.
            verbose: Print the sample shape, channel count and sample rate.
            truncate: When given, keep only the first ``truncate`` entries:
                along axis 0 for mono data, along axis 1 otherwise.

        Returns:
            A tuple of the sample array (1-D for mono, one row per channel
            otherwise, before any chunking) and ``self.sample_rate``.
        """
        audio: AudioSegment = AudioSegment.from_file(self.path, format="wav")
        if self.mono:
            audio = audio.set_channels(1).set_frame_rate(self.sample_rate)
            samples = np.array(audio.get_array_of_samples()).astype(self.dtype)
        else:
            audio = audio.set_frame_rate(self.sample_rate)
            # Stack every channel so mono and multi-channel files both work.
            channels = audio.split_to_mono()
            samples = np.array(
                [channel.get_array_of_samples() for channel in channels]
            ).astype(self.dtype)
        if self.normalize:
            samples = self.__normalize(samples)

        # The sample axis is always the last one; axis 0 is channels when not mono.
        self.sample_length = samples.shape[-1]
        self.num_channels = 1 if samples.ndim == 1 else samples.shape[0]

        if isinstance(chunk_index, (int, np.integer)):
            samples = self._sample_chunk(samples, chunk_index)

        if verbose:
            print(
                f"Sample shape: {samples.shape}, Audio type: {type(audio)}, Num Channels: {self.num_channels}, Sample Rate: {self.sample_rate}"
            )
        if truncate is None:
            return samples, self.sample_rate
        elif not self.mono:
            return samples[:, :truncate], self.sample_rate
        else:
            return samples[:truncate], self.sample_rate
        
# Open the test file as mono and fetch only chunk number 1, then show its shape.
reader = AudioIO(path=path_to_audio, mono=True)
audio, sample_rate = reader.read(1)
print(audio.shape)

(512, 2)


In [8]:
from spectra.fft import Real1DLogFFT

# Build the FFT operator from the reader's sample rate and apply it to the chunk
# loaded by the previous cell.
# NOTE(review): this cell currently fails with the IndexError recorded below.
# `audio` is the 2-D (512, 2) array printed by the previous cell; presumably
# Real1DLogFFT expects a 1-D signal (a real FFT of 1024 samples has 513 bins,
# matching the mask length in the error) - confirm against spectra.fft before
# changing the input shape.
fft_op = Real1DLogFFT(sample_rate)

fft_frames, freqs = fft_op(audio, apply_transforms=False)
print(fft_frames.shape, freqs.shape)

IndexError: boolean index did not match indexed array along axis 0; size of axis is 512 but size of corresponding boolean axis is 513