In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from pathlib import Path
from IPython.display import Audio, display
import librosa
import numpy as np
from scipy.signal import stft
import torch
from fastai.torch_core import to_np
from fastai_audio.audio_clip import open_audio
from fastai_audio.transform import Spectrogram, FrequencyToMel

In [3]:
DATA_PATH = Path('data/examples_16KHz')

In [4]:
def get_data():
    """Load every example clip under ``DATA_PATH`` into a single batch tensor.

    Each clip is truncated to the length of the shortest clip so that all of
    them can be stacked along a new leading (batch) dimension.

    Returns:
        tuple: ``(batch_tensor, sample_rate)`` where ``batch_tensor`` is the
        stacked clip data and ``sample_rate`` is the ``sample_rate`` attribute
        of the first clip.

    Raises:
        FileNotFoundError: if ``DATA_PATH`` contains no files.
    """
    # iterdir() yields entries in arbitrary order; sort so that xs[0] refers
    # to the same clip on every run, and skip sub-directories.
    filenames = sorted(fn for fn in DATA_PATH.iterdir() if fn.is_file())
    if not filenames:
        raise FileNotFoundError("no audio files found in {}".format(DATA_PATH))
    # load data from example files
    clips = [open_audio(fn) for fn in filenames]
    # NOTE(review): assumes every clip shares the first clip's sample rate — confirm
    sample_rate = clips[0].sample_rate
    tensors = [clip.data for clip in clips]
    # make them all the same length so they can be combined into a batch
    min_len = min(t.size(0) for t in tensors)
    tensors = [t[:min_len] for t in tensors]
    batch_tensor = torch.stack(tensors)
    return batch_tensor, sample_rate

In [5]:
xs, sr = get_data()
x = to_np(xs[0])

In [6]:
Audio(x, rate=sr)

In [7]:
def time_stretch(y, rate):
    """Time-stretch the signal ``y`` using librosa's own phase vocoder.

    This is the reference version; a later cell in this notebook redefines
    ``time_stretch`` on top of ``phase_vocoder_fast``.

    Args:
        y: audio signal as a numpy array.
        rate: stretch factor forwarded to ``librosa.phase_vocoder``.

    Returns:
        The stretched signal, with the same dtype as ``y``.
    """
    # `core` is never imported in this notebook, so go through the `librosa`
    # namespace. The local is not called `stft` to avoid shadowing the
    # `scipy.signal.stft` import.
    spec = librosa.stft(y)
    # `rate` is passed by keyword so the call works across librosa versions.
    spec_stretch = librosa.phase_vocoder(spec, rate=rate)
    y_stretch = librosa.istft(spec_stretch, dtype=y.dtype)
    return y_stretch

In [8]:
def info(arr):
    """Print the shape, dtype and value range of ``arr``, then a blank line.

    ``arr`` only needs ``shape``, ``dtype``, ``min()`` and ``max()``.
    Returns ``None``.
    """
    for label, value in (("shape:", arr.shape), ("dtype:", arr.dtype)):
        print(label, value)
    lowest, highest = arr.min(), arr.max()
    print("range: [{:.5f}, {:.5f}]".format(lowest, highest))
    print()

In [9]:
n_fft = 1024
hop_length = 256

In [10]:
info(x)

shape: (90508,)
dtype: float32
range: [-0.56581, 0.48552]



In [11]:
# Reference STFT computed with librosa, split into magnitude and phase.
D = librosa.stft(x,
                 n_fft=n_fft,
                 hop_length=hop_length,
                 window='hann')
m = np.abs(D)
p = np.angle(D)
# info() returns None, so call it as two statements; the tuple expression
# `info(m), info(p)` made the cell echo a stray `(None, None)`.
info(m)
info(p)

shape: (513, 354)
dtype: float32
range: [0.00000, 59.74660]

shape: (513, 354)
dtype: float32
range: [-3.14159, 3.14159]



(None, None)

In [13]:
# %timeit phase_vocoder(D, 1.0)

In [14]:
# Same STFT computed with PyTorch so it can be compared against librosa's.
xt = torch.from_numpy(x)
# NOTE(review): the indexing below treats the last axis of Dt as (real, imag),
# i.e. it expects torch.stft to return a real-valued tensor. Recent PyTorch
# releases ask for an explicit `return_complex` argument — confirm against the
# installed version.
Dt = torch.stft(xt,
                n_fft=n_fft,
                hop_length=hop_length,
                window=torch.hann_window(n_fft))
# magnitude = sqrt(real^2 + imag^2), phase = atan2(imag, real)
mt = Dt.pow(2.0).sum(-1).sqrt()
pt = torch.atan2(Dt[...,1], Dt[...,0])
# info() returns None, so call it as two statements instead of building a
# tuple that the cell would echo as `(None, None)`.
info(mt)
info(pt)

shape: torch.Size([513, 354])
dtype: torch.float32
range: [0.00000, 59.74660]

shape: torch.Size([513, 354])
dtype: torch.float32
range: [-3.14159, 3.14159]



(None, None)

In [15]:
np.isclose(to_np(mt), m, atol=1e-5).all()

True

In [16]:
# off around the boundaries
np.isclose(to_np(pt), p, atol=1e-3).mean()

0.9629299236792547

In [17]:
def phase_vocoder_fast(D, rate, hop_length=None):
    """Vectorised phase vocoder: resample the STFT matrix ``D`` along time.

    Args:
        D: complex 2-D array indexed ``[frequency bin, frame]``.
        rate: positive step between the sampled frame positions; the output
            has ``len(np.arange(0, D.shape[1], rate))`` frames.
        hop_length: hop length used to derive the expected per-bin phase
            advance. Defaults to ``n_fft // 4`` with
            ``n_fft = 2 * (D.shape[0] - 1)``.

    Returns:
        Complex array of shape ``(D.shape[0], n_output_frames)``.

    Raises:
        ValueError: if ``rate`` is not strictly positive.
    """
    if rate <= 0:
        raise ValueError("rate must be a positive number, got {!r}".format(rate))

    n_fft = 2 * (D.shape[0] - 1)
    if hop_length is None:
        hop_length = int(n_fft // 4)

    # `np.float` was only an alias of the builtin float and no longer exists
    # in current NumPy; np.float64 is the same dtype.
    time_steps = np.arange(0, D.shape[1], rate, dtype=np.float64)
    # Expected phase advance in each bin
    phi_advance = np.linspace(0, np.pi * hop_length, D.shape[0])
    # Phase accumulator; initialize to the first sample
    phase_acc = np.angle(D[:, 0])
    # Pad 0 column to simplify boundary logic
    D = np.pad(D, [(0, 0), (0, 1)], mode='constant')

    mags = np.abs(D)
    phases = np.angle(D)
    phase_diffs = np.diff(phases, axis=-1)
    # Compute phase advance
    dphase = (phase_diffs - phi_advance[:, np.newaxis])
    # Wrap to -pi:pi range
    dphase = dphase - 2.0 * np.pi * np.round(dphase / (2.0 * np.pi))
    # Frame to the left / right of every (possibly fractional) time step
    cols0 = np.s_[:, time_steps.astype(int)]
    cols1 = np.s_[:, (time_steps + 1).astype(int)]
    # Weighting for linear magnitude interpolation between the two frames
    alpha = np.mod(time_steps, 1.0)
    mag = ((1.0 - alpha).astype(np.float32) * mags[cols0]
           + (alpha.astype(np.float32) * mags[cols1]))

    # Per-step phase increments; the first output frame keeps the phase of
    # D[:, 0] and every later frame adds the increment of the previous step.
    dphase1 = dphase[cols0] + phi_advance[:, np.newaxis]
    # NOTE(review): the float32 cast before the cumulative sum presumably
    # explains why the result is not bit-identical to librosa's loop — confirm.
    dphase1 = np.hstack([phase_acc[:, np.newaxis],
                         dphase1[:, :-1]]).astype(np.float32)
    phase_accs = np.cumsum(dphase1, axis=-1)
    return mag * np.exp(1.j * phase_accs)

rate = 1.7

# `rate` is passed by keyword (newer librosa releases make it keyword-only),
# and both implementations get the hop length that was used to compute D.
slow = librosa.phase_vocoder(D, rate=rate, hop_length=hop_length)
fast = phase_vocoder_fast(D, rate, hop_length=hop_length)

# Fraction of exactly equal / numerically close entries
res = (slow == fast)
res_close = np.isclose(slow, fast)

(res.all(),res.mean(), res_close.all(), res_close.mean())

(False, 0.6834270684686197, False, 0.6840799500079279)

In [18]:
%timeit librosa.phase_vocoder(D, 1)

48.6 ms ± 1.62 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [19]:
%timeit phase_vocoder_fast(D, 1)

14.8 ms ± 886 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


### Time Stretch

In [20]:
def time_stretch(y, rate):
    """Time-stretch the signal ``y`` using ``phase_vocoder_fast``.

    The signal is transformed with ``librosa.stft``, its frames are resampled
    by ``phase_vocoder_fast(..., rate)``, and the result is inverted with
    ``librosa.istft`` using the dtype of ``y``.
    """
    spectrogram = librosa.stft(y)
    stretched = phase_vocoder_fast(spectrogram, rate)
    return librosa.istft(stretched, dtype=y.dtype)

In [21]:
# Reference result from librosa; `rate` is passed by keyword because newer
# librosa releases no longer accept it positionally.
x1 = librosa.effects.time_stretch(x, rate=2.0)
Audio(x1, rate=sr)

In [22]:
x2 = time_stretch(x, 2.0)
# Play the clip produced by our own implementation (this cell used to replay x1).
Audio(x2, rate=sr)

In [23]:
%timeit  librosa.effects.time_stretch(x, 2.0)

28.1 ms ± 1.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
%timeit time_stretch(x, 2.0)

21.7 ms ± 576 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


### PyTorch PhaseVocoder

In [25]:
def phase_vocoder_torch(mags, phases, rate, hop_length=None):
    """Phase vocoder on a (magnitude, phase) spectrogram pair, in PyTorch.

    All helper tensors are created on the device of ``mags`` and the frame
    lookup uses tensor indexing, so the function runs without a numpy
    round-trip and also accepts non-CPU inputs.

    Args:
        mags: 2-D tensor of magnitudes indexed ``[frequency bin, frame]``.
        phases: 2-D tensor of phases with the same layout as ``mags``.
        rate: positive step between the sampled frame positions; the output
            has one frame per element of ``torch.arange(0, n_frames, rate)``.
        hop_length: hop length used to derive the expected per-bin phase
            advance. Defaults to ``n_fft // 4`` with
            ``n_fft = 2 * (mags.size(0) - 1)``.

    Returns:
        tuple: ``(mag, phase_accs)``, the interpolated magnitudes and the
        accumulated phases, each of shape ``(mags.size(0), n_output_frames)``.

    Raises:
        ValueError: if ``rate`` is not strictly positive.
    """
    if rate <= 0:
        raise ValueError("rate must be a positive number, got {!r}".format(rate))

    device = mags.device
    n_bins, n_steps = mags.size(0), mags.size(1)
    n_fft = 2 * (n_bins - 1)
    if hop_length is None:
        hop_length = int(n_fft // 4)

    # Possibly fractional frame positions to sample at
    time_steps = torch.arange(0, n_steps, rate,
                              dtype=torch.get_default_dtype(), device=device)
    # Frame to the left / right of every time step (truncation, as before)
    idx0 = time_steps.long()
    idx1 = (time_steps + 1).long()

    # Expected phase advance in each bin, shaped (n_bins, 1) for broadcasting
    phi_advance = torch.linspace(0, np.pi * hop_length, n_bins,
                                 device=device).unsqueeze(1)
    # Phase accumulator; initialised to the first frame
    phase_acc = phases[:, 0].unsqueeze(1)
    # Pad one zero frame on the right to simplify the boundary logic
    phases = torch.nn.functional.pad(phases, [0, 1, 0, 0])
    phase_diffs = phases[:, 1:] - phases[:, :-1] - phi_advance
    # wrap to [-pi, pi] range
    phase_diffs = phase_diffs - 2.0 * np.pi * torch.round(phase_diffs / (2.0 * np.pi))

    # Weighting for linear magnitude interpolation between the two frames
    alpha = torch.remainder(time_steps, 1.0)
    mags = torch.nn.functional.pad(mags, [0, 1, 0, 0])
    mag = ((1.0 - alpha) * mags[:, idx0]
           + alpha * mags[:, idx1])

    # First output frame keeps the initial phase; each later frame adds the
    # increment of the previous step, so the last increment is dropped.
    delta_phase = torch.cat([phase_acc, phase_diffs[:, idx0] + phi_advance], dim=1)[..., :-1]
    phase_accs = torch.cumsum(delta_phase, dim=-1)
    return mag, phase_accs
    
    
mt2, pt2 = phase_vocoder_torch(mt, pt, 0.5)

In [26]:
%timeit phase_vocoder_torch(mt, pt, 1.0)

3.49 ms ± 118 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [27]:
mt2.shape, pt2.shape

(torch.Size([513, 708]), torch.Size([513, 708]))

In [28]:
# Convert the stretched (magnitude, phase) pair from phase_vocoder_torch into
# real and imaginary parts: real = m*cos(p), imag = m*sin(p).
# NOTE: this rebinds `m` and `p`, which earlier held the librosa magnitude and
# phase of D; re-running the earlier comparison cells after this one would
# compare against the stretched arrays instead.
m = to_np(mt2)
p = to_np(pt2)
real = m * np.cos(p)
imag = m * np.sin(p)

In [29]:
D3 = real + 1.j*imag

In [30]:
D3.shape

(513, 708)

In [31]:
y3 = librosa.istft(D3, dtype=x.dtype)

In [32]:
y3.shape

(180992,)

In [33]:
Audio(y3, rate=sr)