In [1]:

import librosa
import numpy as np
import scipy
import torch 

from gfm_iaif import gfm_iaif
from glottis import Glottis

import IPython.display


In [99]:
# --- Analysis: load the audio, frame it, run GFM-IAIF on every frame and
# --- overlap-add the inverse-filtered frames into `glottis`.
# Alternative test file: input, fs = librosa.load("C#2.wav")
# NOTE: `input` shadows the Python builtin; the name is kept because other
# cells of this notebook read it.
input, fs = librosa.load("0_47.wav")
print(f"Shapes: input {input.shape} len")

framelength = 1024     # samples per analysis frame
hoplength = 128        # samples between consecutive frame starts
fmin, fmax = 70, 500   # f0 search range handed to librosa.yin
ncilinders = 44        # passed to gfm_iaif as n_vt; vtcoeffs has ncilinders + 1 rows

# frames has shape (framelength, nframes): one column per analysis frame.
frames = librosa.util.frame(input, frame_length=framelength, hop_length=hoplength)
nframes = frames.shape[1]

#X = librosa.amplitude_to_db(np.abs(librosa.stft(input, n_fft=framelength))).squeeze()

glottis = np.zeros_like(input)
vtcoeffs = np.empty((ncilinders + 1, nframes))
glcoeffs = np.empty((4, nframes))
lipcoeffs = np.empty((2, nframes))

# Loop invariants: number of warm-up samples and the synthesis window.
npad = ncilinders + 1
window = scipy.signal.get_window("hamming", framelength)

for i in range(nframes):
    frame = frames[:, i]
    vtcoeffs[:, i], glcoeffs[:, i], lipcoeffs[:, i] = gfm_iaif(frame, n_vt=ncilinders)

    # Pad at the FRONT with the first sample so that the filter start-up
    # transient falls inside the padding; dropping the first `npad` output
    # samples then leaves a result that is time-aligned with `frame`.
    # (Padding at the end and dropping the front shifted every frame by
    # `npad` samples and filled its tail with filtered padding.)
    framepad = np.pad(frame, (npad, 0), mode='edge')
    residual = scipy.signal.lfilter(vtcoeffs[:, i], [1], framepad)[npad:]

    # Windowed overlap-add at the frame's position in the signal.
    start = int(librosa.frames_to_samples(i, hop_length=hoplength))
    glottis[start:start + framelength] += residual * window

#freqresp = np.empty((framelength//2, nframes), dtype=np.complex64)
#for i in range(nframes):
#    w, freqresp[:,i] = scipy.signal.freqz([1], vtcoeffs[:,i], plot=lambda w, h: plot.line(w, 20. * np.log10(np.abs(h)), alpha=0.05), fs=fs)
#    w, freqresp[:,i] = scipy.signal.freqz(vtcoeffs[:,i], [1], plot=lambda w, h: plot.line(w, 20. * np.log10(np.abs(h)), alpha=0.05, color="red"), fs=fs)

#X = librosa.amplitude_to_db(np.abs(librosa.stft(glottis, n_fft=framelength))).squeeze()

# --- Per-frame features: f0, Rd (from the H1-H2 difference), tenseness, loudness.

def _unit_peak(frame):
    """Scale `frame` to unit peak; an all-zero frame is returned unchanged.

    Dividing a silent frame by its peak (0) would produce NaNs, which would
    then be handed to librosa.yin.
    """
    peak = np.max(np.abs(frame))
    return frame / peak if peak > 0 else frame


# Each call sees exactly one frame (center=False, len == frame_length), so it
# yields a single f0 value; concatenating gives one value per analysis frame.
f0 = np.concatenate([
    librosa.yin(_unit_peak(frames[:, i]), fmin=fmin, fmax=fmax,
                frame_length=framelength, hop_length=hoplength, sr=fs,
                center=False, trough_threshold=0.1)
    for i in range(nframes)
])
gframes = librosa.util.frame(glottis, frame_length=framelength, hop_length=hoplength)
print(f"Shapes: gframes {gframes.shape} (orig), nframes {nframes}")

Rd = np.empty(nframes)

for i in range(nframes):
    # center=False makes the STFT return a single column that covers the whole
    # frame. With the default center=True the frame is padded by n_fft // 2 on
    # both sides and two columns come back, neither spanning the full frame.
    X = librosa.amplitude_to_db(np.abs(librosa.stft(
        gframes[:, i], n_fft=framelength, hop_length=framelength, center=False)))
    # FFT bins nearest to the first and second harmonic of this frame's f0.
    h1bin = int(np.round(f0[i] / fs * framelength))
    h2bin = int(np.round(2 * f0[i] / fs * framelength))
    # Map the H1-H2 level difference (dB) to Rd.
    Rd[i] = (X[h1bin, 0] - X[h2bin, 0] + 7.6) / 11.

tenseness = np.clip((1 - Rd / 3), 0, 1)
# Frame-wise RMS of the input, shape (1, n_frames), on the same frame grid.
loudness = librosa.feature.rms(y=input, frame_length=framelength, hop_length=hoplength, center=False)

# --- Synthesize a glottal source from the estimated tenseness and f0.
# NOTE: this rebinds `glottis` (previously the inverse-filtered signal array)
# to the Glottis synthesizer object.
glottis = Glottis(ncilinders, fs)
glottis_signal = glottis.get_waveform(tenseness=torch.Tensor(tenseness),
                                      freq=torch.Tensor(f0.reshape(-1, 1)),
                                      frame_len=hoplength).detach().numpy()
print(f"glottis_signal {glottis_signal.shape} original length")
# no volume adjustment
gframes = librosa.util.frame(glottis_signal, frame_length=framelength, hop_length=hoplength)
print(f"gframes {gframes.shape} pre_adjustment")

# volume adjustment
# Frames overlap (hoplength < framelength), so multiplying the signal in place
# frame by frame would scale each sample by the PRODUCT of the gains of every
# frame covering it. Instead, build one gain per sample by interpolating the
# per-frame gains at the frame centres, and apply it exactly once.
# NOTE(review): 10 ** (5 / 20) (about 1.78, i.e. 5 dB as an amplitude ratio) is
# ADDED to the RMS here, as in the original code; a multiplicative gain may
# have been intended - confirm.
frame_gain = loudness[0] + 10 ** (5 / 20)
frame_centers = (librosa.frames_to_samples(np.arange(frame_gain.shape[0]), hop_length=hoplength)
                 + framelength // 2)
gain_envelope = np.interp(np.arange(glottis_signal.shape[0]), frame_centers, frame_gain)
glottis_signal = (glottis_signal * gain_envelope).astype(glottis_signal.dtype)

# Re-frame the signal now that the volume has been applied.
gframes = librosa.util.frame(glottis_signal, frame_length=framelength, hop_length=hoplength)
print(f"gframes {gframes.shape} post_adjustment")
print(f"glottis_signal {glottis_signal.shape} new length")

# --- Resynthesis: filter the synthetic glottal frames through the per-frame
# --- all-pole vocal-tract filter (1 / vtcoeffs) and overlap-add the result.

# rng = np.random.default_rng()
# ran = rng.standard_normal(vtcoeffs.shape[1])

vtsignal = np.zeros_like(input)

# Loop invariants: number of warm-up samples and the synthesis window.
npad = ncilinders + 1
window = scipy.signal.get_window("hamming", framelength)

# The synthetic signal can be shorter than the input, so only process frames
# that exist in both.
for i in range(min(nframes, gframes.shape[1])):
    frame = gframes[:, i]
    # Pad at the FRONT so the filter warm-up happens inside the padding;
    # discarding the first `npad` output samples keeps the result aligned with
    # `frame`. (Padding at the end and dropping the front shifted each frame
    # by `npad` samples.)
    framepad = np.pad(frame, (npad, 0), mode='edge')
    start = int(librosa.frames_to_samples(i, hop_length=hoplength))
    # TODO what is this: out[idx] += np.fft.irfft(np.fft.rfft(frame * scipy.signal.get_window("hamming", framelength))[0:-1] * Hkl[:,i], n=framelength)
    filtered = scipy.signal.lfilter([1], vtcoeffs[:, i], framepad)[npad:]
    vtsignal[start:start + framelength] += filtered * window
    # TODO: where are the lip coefficients? `lipcoeffs` is not applied here.

# Replace any NaNs produced upstream with silence before playback.
out = np.nan_to_num(vtsignal, nan=0.0)
print(f"Shapes: output {out.shape} len")


Shapes: input (81585,) len
Shapes: gframes (1024, 630) (orig), nframes 630
glottis_signal (80512,) original length
gframes (1024, 622) pre_adjustment
gframes (1024, 622) post_adjustment
glottis_signal (80512,) new length
Shapes: output (81585,) len


  vtsignal[idx] += scipy.signal.lfilter([1], lipcoeffs[:,i], framepad)[ncilinders+1:] * scipy.signal.get_window("hamming", framelength)


In [107]:
# Inspect the three coefficient sets returned by gfm_iaif for frame 10.
vtcoeffs[:,10], glcoeffs[:,10], lipcoeffs[:,10]

(array([ 1.00000000e+00,  3.16928119e-01,  1.11431852e+00,  4.22056731e-01,
         1.16289147e+00, -1.24205642e-03,  4.11626601e-01, -5.82757698e-01,
        -4.07133831e-02, -1.03524332e+00, -5.92584350e-01, -1.45314379e+00,
        -6.00836569e-01, -1.22332679e+00, -2.22765363e-01, -7.07792984e-01,
         3.32018311e-01, -2.48840202e-01,  7.36478239e-01,  6.50272467e-02,
         1.03290322e+00,  3.20074352e-01,  9.73692361e-01,  4.73670121e-02,
         5.84807309e-01, -3.85534181e-01,  1.34660815e-01, -7.22014410e-01,
        -8.22403153e-02, -6.74399805e-01, -6.47668468e-02, -4.96538279e-01,
         7.63613563e-02, -1.27495211e-01,  2.97584330e-01,  5.15517741e-03,
         2.67556300e-01, -2.16472828e-02,  1.70779225e-01, -1.46314751e-01,
         8.94924942e-02, -7.35247306e-02,  1.14134798e-01, -5.90334146e-02,
         4.50615993e-02]),
 array([ 1.        , -1.49534585,  0.72836915, -0.11807258]),
 array([ 1.  , -0.99]))

In [108]:
# Play back the unmodified input recording.
print("Original...")
IPython.display.Audio(data=input, rate=fs, autoplay=True)


Original...


In [109]:

# Play back the resynthesized signal `out` at the input's sample rate.
print("Resynth...")
IPython.display.Audio(data=out, rate=fs, autoplay=True, )


Resynth...


  scaled = data / normalization_factor * 32767
  return scaled.astype("<h").tobytes(), nchan


In [30]:
# Play back the synthetic glottal source on its own (no autoplay).
print("Glottis...")
IPython.display.Audio(data=glottis_signal, rate=fs)


Glottis...


In [36]:
# Sanity check: tenseness holds one value per analysis frame.
tenseness.shape

(630,)

In [37]:
# Sanity check: librosa.feature.rms returns a (1, n_frames) array.
loudness.shape

(1, 630)

In [38]:
# Compare the input frame count with the frame count of the synthetic glottis signal.
nframes, gframes.shape

(630, (1024, 622))

In [44]:
# Input length minus one frame length minus the synthetic glottis length, in samples.
input.shape[0]-framelength-glottis_signal.shape[0]

49

In [40]:
# Length of the synthetic glottis signal in samples.
glottis_signal.shape

(80512,)

In [104]:
# Lip coefficients: 2 coefficients per analysis frame.
lipcoeffs.shape

(2, 630)

In [105]:
# Vocal-tract coefficients: ncilinders + 1 coefficients per analysis frame.
vtcoeffs.shape

(45, 630)