In [None]:
import numpy as np
from scipy import signal, fft
from scipy.linalg import toeplitz, solve
import matplotlib.pyplot as plt
import IPython.display as ipd

In [None]:
import os
import IPython
# Point numba's cache directory at IPython's cache directory.
# This assignment is deliberately placed before `import librosa` below --
# presumably because librosa relies on numba and needs a writable cache
# location when it is imported (TODO confirm). Do not reorder these lines.
os.environ['NUMBA_CACHE_DIR'] = IPython.paths.get_ipython_cache_dir()
import librosa

In [None]:
def lpc_analysis(s, p=20):
    """Compute the LPC analysis of a frame using the autocorrelation method.

    Parameters
    ----------
    s : numpy array
        windowed signal frame as a numpy 1D array.
    p : int
        model order.

    Returns
    -------
    ak : numpy array
         model coefficients (the solution of the normal equations, squeezed).
    e : numpy array
        minimum mean squared error, returned as a one-element array.
    e_norm : numpy array
             ``e`` divided by the zero-lag autocorrelation, also a one-element array.

    Raises
    ------
    numpy.linalg.LinAlgError
        Propagated from ``solve`` when the autocorrelation matrix is singular
        (for instance when the frame is all zeros).
    """
    # frame length
    N = s.shape[0]

    # autocorrelation values for lags 0..p, stored as a (p+1, 1) column vector
    r = np.zeros((p + 1, 1))
    for k in range(p + 1):
        r[k] = np.dot(s[:N - k].T, s[k:])

    # Build the Toeplitz autocorrelation matrix from lags 0..p-1.
    # The column is flattened explicitly: passing multidimensional input to
    # scipy.linalg.toeplitz is deprecated and will stop being raveled implicitly.
    R = toeplitz(r[:p].ravel())

    # solve the normal equations R a = r[1:] for the model coefficients
    ak = solve(R, r[1:]).squeeze()

    # mean squared error of the optimal predictor
    e = r[0] - np.dot(ak.T, r[1:])

    # error normalized by the frame energy r[0]
    e_norm = e / r[0]

    return ak, e, e_norm

In [None]:
def formantes(file, fs=22050, N=460, p=20, umbral1=120, umbral2=250):
    """Estimate the two lowest formant candidates of an audio segment via LPC.

    The segment is peak-normalized, a Hann-windowed frame of about ``N`` samples
    is taken around its centre, an order-``p`` LPC model is fitted with
    ``lpc_analysis`` and the model poles with positive imaginary part are
    converted to frequency / bandwidth pairs.

    Parameters
    ----------
    file : array_like
        audio segment (1D samples).
    fs : int or float
        sampling rate in Hz used to convert pole angles to frequencies.
    N : int
        analysis frame length in samples (``2 * int(N / 2)`` samples are used).
    p : int
        LPC model order.
    umbral1 : float
        first bandwidth threshold; poles with estimated bandwidth at or below
        it are kept as candidates.
    umbral2 : float
        fallback bandwidth threshold, used when fewer than two poles pass
        ``umbral1``.

    Returns
    -------
    F1, F2 : float, float
        the two lowest candidate frequencies in Hz, or ``(-1, -1)`` when the
        segment cannot be analysed: empty or (near-)silent input, segment
        shorter than the frame, all-zero central frame, singular LPC system,
        or fewer than two poles passing the bandwidth thresholds.
    """
    file = np.asarray(file)

    # peak normalization; (near-)silent or empty input yields the sentinel
    peak = np.max(np.abs(file)) if file.size else 0.0
    if not peak > 1e-8:
        return -1, -1
    x = file / peak * 0.9

    # --- framing ---
    # centre sample of the segment and half frame length
    ind_mid = int(len(x) / 2)
    half = int(N / 2)
    # The frame must fit inside the segment: a negative start index would
    # silently wrap around and produce a wrong (or empty) frame.
    if half < 1 or half > ind_mid:
        return -1, -1
    # signal frame centred on the middle sample
    s = x[ind_mid - half:ind_mid + half]
    # The window is sized from the actual frame so that an odd N (frame of
    # N - 1 samples) no longer causes a shape mismatch.
    window = signal.windows.get_window('hann', len(s))
    # windowed signal frame
    s_win = s * window

    # An all-zero frame makes the autocorrelation matrix singular.
    if not np.any(s_win):
        return -1, -1

    # --- LPC ---
    try:
        ak, _, _ = lpc_analysis(s_win, p)
    except np.linalg.LinAlgError:
        return -1, -1
    ak = np.atleast_1d(ak)
    if not np.all(np.isfinite(ak)):
        return -1, -1

    # --- formant candidates ---
    # roots of the prediction polynomial 1 - sum(ak z^-k)
    raices = np.roots(np.concatenate(([1], -ak)))
    # keep one pole of each complex-conjugate pair (angle between 0 and pi)
    polos = raices[raices.imag > 0]
    # split into magnitude and angle
    Ak = np.abs(polos)
    omegak = np.angle(polos)
    # pole frequency and estimated bandwidth, both in Hz
    fk = omegak * fs / (2 * np.pi)
    BWk = fs * np.log(1 / Ak) / np.pi
    # keep poles narrower than the first threshold, falling back to the second
    fk_sort = np.sort(fk[BWk <= umbral1], axis=0)
    if len(fk_sort) < 2:
        fk_sort = np.sort(fk[BWk <= umbral2], axis=0)
    # still fewer than two candidates: report failure instead of raising IndexError
    if len(fk_sort) < 2:
        return -1, -1

    return fk_sort[0], fk_sort[1]

In [None]:
def distancias_vocales(F1, F2):
    """Euclidean distance from an (F1, F2) pair to five reference vowel points.

    Parameters
    ----------
    F1, F2 : float
        first and second formant frequencies (same units as the reference
        table below).

    Returns
    -------
    numpy array
        1D array with five distances, one per row of the reference table.
    """
    # Reference (F1, F2) pairs, one row per vowel. A commented-out label list in
    # the original code suggests the row order is a, e, i, o, u -- TODO confirm.
    referencias = np.array([
        [800, 1170],
        [480, 2300],
        [240, 2800],
        [510, 960],
        [250, 630],
    ])
    # Alternative, larger reference table kept from the original code:
    # vocales = np.array(((240, 2400), (235, 2100), (390, 2300), (370, 1900), (610, 1900), (585, 1710), (850, 1610), (820, 1530), (750, 940), (700, 760), (600, 1170), (500, 700), (460, 1310), (360, 640), (300, 1390), (250, 595)))

    punto = (F1, F2)
    # row-wise L2 norm of the difference to every reference point
    return np.linalg.norm(referencias - punto, ord=2, axis=1)

In [None]:
def ST_distancias_vocales(s, L=2048, R=256, fs=22050, N=200, p=9, umbral1=150, umbral2=10000):
    """Short-time vowel distances: per-frame distance of (F1, F2) to reference vowels.

    The signal is cut into frames of ``L`` samples every ``R`` samples. For each
    frame the two lowest formant candidates are estimated with ``formantes`` and
    converted to distances with ``distancias_vocales``.

    Parameters
    ----------
    s : numpy array
        input audio signal (mono) as a numpy 1D array.
    L : int
        frame length in samples.
    R : int
        hop size in samples.
    fs : int or float
        sampling rate in Hz, forwarded to ``formantes``.
    N : int
        LPC analysis frame length, forwarded to ``formantes``.
    p : int
        LPC model order, forwarded to ``formantes``.
    umbral1, umbral2 : float
        bandwidth thresholds, forwarded to ``formantes``.

    Returns
    -------
    distancias : numpy array
        2D array of shape ``(5, num_frames)``; column ``i`` holds the five
        distances for frame ``i``. Columns stay at zero for frames where
        ``formantes`` reports failure (returns -1). A signal shorter than one
        frame yields an empty ``(5, 0)`` array.
    """
    s = np.asarray(s)

    # length of the input signal
    M = s.size

    # Total number of analysis frames. The formula is kept as in the original
    # implementation so the output shape does not change; it is clamped at zero
    # so that signals shorter than L no longer request a negative array size.
    num_frames = max(0, int(np.floor((M - L) / R)))

    # one column of five distances per frame, zero by default
    distancias = np.zeros((5, num_frames))

    for ind in range(num_frames):
        # initial and ending points of the frame
        n_ini = int(ind * R)
        n_end = n_ini + L

        # formant candidates of the frame (-1 signals that none were found)
        F1, F2 = formantes(s[n_ini:n_end], fs=fs, N=N, p=p,
                           umbral1=umbral1, umbral2=umbral2)
        if F1 != -1:
            distancias[:, ind] = distancias_vocales(F1, F2)

    return distancias

In [None]:
# Folder containing the two vocal recordings that are aligned in this notebook.
dir_files = './data/'

path_x = dir_files + 'Emily_Linge-vocals.wav'
path_y = dir_files + 'Sting-vocals.wav'

# Each call yields (samples, sample rate). Fs is overwritten by the second call,
# so the rest of the notebook assumes both files share the same sample rate.
X_wav, Fs = librosa.load(path_x)
Y_wav, Fs = librosa.load(path_y)

In [None]:
# import libtsm

In [None]:
# opt_chroma_shift = 2
# pitch_shift_for_audio_1 = -opt_chroma_shift % 12
# audio_1_shifted = libtsm.pitch_shift(X_wav, pitch_shift_for_audio_1 * 100, order="tsm-res")

In [None]:
# Frame length and hop size: 20 ms worth of samples (frames do not overlap).
H = int(Fs * 0.02)

# Per-frame distances to the reference vowels, rescaled to 1 - d / max(d) so
# that larger values mean "closer to that vowel". Frames for which no formants
# were found carry distance 0 and therefore map to 1 in every row.
X_dist = ST_distancias_vocales(X_wav, L=H, R=H)
X = 1 - X_dist / X_dist.max()

Y_dist = ST_distancias_vocales(Y_wav, L=H, R=H)
Y = 1 - Y_dist / Y_dist.max()

In [None]:
# Frame start positions in samples. They are derived from the actual number of
# feature columns: the previous formula based on len(signal) - 2*H only matched
# the column count when the signal length was not a multiple of H, and made
# pcolormesh fail otherwise.
tx = np.arange(X.shape[1]) * H
ty = np.arange(Y.shape[1]) * H
# row index of each reference vowel
l = np.arange(X.shape[0])

plt.close('all')
plt.figure(figsize=(20,5))
plt.pcolormesh(tx, l, X)
plt.xlabel('Frame start (samples)')
plt.ylabel('Reference vowel index');

In [None]:
# Same view for the second recording: vowel-similarity rows (index l) against
# frame start positions ty, both defined in the previous cell.
plt.figure(figsize=(20,5))
plt.pcolormesh(ty, l, Y)

In [None]:
import libfmp.c3

In [None]:
# DTW between the two vowel-similarity sequences: local cost matrix,
# accumulated cost matrix, and the optimal warping path through it.
C = libfmp.c3.compute_cost_matrix(X, Y)
D = libfmp.c3.compute_accumulated_cost_matrix(C)
P = libfmp.c3.compute_optimal_warping_path(D)

plt.close('all')
plt.figure(figsize=(15, 5))

# One panel per matrix, each overlaid with the same warping path and with the
# colour range running from 0 to that matrix's maximum.
panels = (
    (C, '$C$ with optimal warping path'),
    (D, '$D$ with optimal warping path'),
)
for position, (matrix, panel_title) in enumerate(panels, start=1):
    ax = plt.subplot(1, 2, position)
    libfmp.c3.plot_matrix_with_points(
        matrix, P, linestyle='-', marker='',
        ax=[ax], aspect='equal', clim=[0, np.max(matrix)],
        title=panel_title, xlabel='Sequence Y', ylabel='Sequence X')

plt.tight_layout()

In [None]:
from synctoolbox.dtw.utils import compute_optimal_chroma_shift, shift_chroma_vectors, make_path_strictly_monotonic

In [None]:
# P was computed above with libfmp's DTW (not MrMsDTW), so the message says so.
# The path is transposed before being passed on; the code below treats the
# result as one column per path point (it reads wp.shape[1] as the length).
print('Length of warping path obtained from DTW:', P.T.shape[1])
wp = make_path_strictly_monotonic(P.T)
print('Length of warping path made strictly monotonic:', wp.shape[1])

In [None]:
import libtsm

# pitch_shift_for_audio_1 = -opt_chroma_shift % 12
# if pitch_shift_for_audio_1 > 6:
#     pitch_shift_for_audio_1 -= 12
# audio_1_shifted = libtsm.pitch_shift(X_wav, pitch_shift_for_audio_1 * 100, order="tsm-res")

# libtsm's time-scale modification expects the warping path in audio samples,
# so the frame-index path is scaled by the hop size H. One extra anchor tying
# the last sample of X_wav to the last sample of Y_wav is appended at the end.
last_anchor = np.array([[len(X_wav)-1, len(Y_wav)-1]])
time_map = np.vstack((wp.T * H, last_anchor))

# Let libtsm validate/repair the anchor list (presumably this is where values
# that are too large get clipped -- TODO confirm against libtsm).
time_map = libtsm.ensure_validity(time_map)

# Warp X_wav onto the timeline of Y_wav and put Y_wav and the warped signal
# side by side as the two columns of one array.
y_hpstsm = libtsm.hps_tsm(X_wav, time_map)
stereo_sonification = np.concatenate((Y_wav.reshape(-1, 1), y_hpstsm), axis=1)

# Uncomment to listen to the unmodified recordings:
# print('Original signal 1', flush=True)
# ipd.display(ipd.Audio(X_wav, rate=Fs, normalize=True))
# print('Original signal 2', flush=True)
# ipd.display(ipd.Audio(Y_wav, rate=Fs, normalize=True))

print('Synchronized versions', flush=True)
ipd.display(ipd.Audio(stereo_sonification.T, rate=Fs, normalize=True))

In [None]:
def graficar_envolvente(file, fs=22050, Ndft=1024, N=460, p=20):
    """Plot the spectrum of a segment's central frame together with its LPC envelope.

    The segment is peak-normalized, a Hann-windowed frame of about ``N`` samples
    is taken around its centre, and two curves are drawn in dB over the first
    ``Ndft / 2`` frequency bins: the magnitude of the frame's DFT and the
    magnitude response of the all-pole model obtained from ``lpc_analysis``.

    Parameters
    ----------
    file : array_like
        audio segment (1D samples).
    fs : int or float
        sampling rate in Hz, used for the frequency axis.
    Ndft : int
        number of DFT points / frequency-response points.
    N : int
        analysis frame length in samples (``2 * int(N / 2)`` samples are used).
    p : int
        LPC model order.

    Returns
    -------
    None when the figure is drawn. ``(0, 0)`` when nothing can be plotted:
    empty or (near-)silent input, segment shorter than the frame, or an
    all-zero central frame.
    """
    file = np.asarray(file)

    # peak normalization; (near-)silent or empty input is not plotted
    peak = np.max(np.abs(file)) if file.size else 0.0
    if not peak > 1e-8:
        return 0, 0
    x = file / peak * 0.9

    # --- framing ---
    # centre sample of the segment and half frame length
    ind_mid = int(len(x) / 2)
    half = int(N / 2)
    # The frame must fit inside the segment: a negative start index would
    # silently wrap around and produce a wrong (or empty) frame.
    if half < 1 or half > ind_mid:
        return 0, 0
    s = x[ind_mid - half:ind_mid + half]
    # window sized from the actual frame so an odd N does not break the product
    window = signal.windows.get_window('hann', len(s))
    s_win = s * window
    # an all-zero frame makes the LPC normal equations singular
    if not np.any(s_win):
        return 0, 0

    # --- spectrum of the frame ---
    X = np.fft.fft(s_win, Ndft)
    # frequency values in Hz
    f = np.fft.fftfreq(Ndft) * fs
    magX = np.abs(X)

    # --- LPC model ---
    ak, e, _ = lpc_analysis(s_win, p)

    # denominator of the all-pole filter: 1 - sum(ak z^-k)
    U = np.concatenate([[1], -np.atleast_1d(ak)])
    # filter gain taken from the prediction error
    G = np.sqrt(e)

    # frequency response of the model over the whole unit circle, Ndft points
    _, H_lpc = signal.freqz(G, U, worN=Ndft, whole=True)
    magH = np.abs(H_lpc)

    # only the first half of the bins (non-negative frequencies) is plotted
    ind_fmx = int(Ndft / 2)

    plt.figure(figsize=(15,5))
    plt.plot(f[:ind_fmx], 20 * np.log10(magX[:ind_fmx]), 'k', label='Respuesta en frecuencia de la señal')
    plt.plot(f[:ind_fmx], 20 * np.log10(magH[:ind_fmx]), 'r', label='Respuesta en frecuencia del modelo todo-polos')
    plt.ylabel('Magnitud (dB)')
    plt.xlabel('Frecuencia (Hz)')
    plt.legend();

In [None]:
# Index of the frame to inspect, and the time in seconds at which it starts
# (frames are H samples apart at Fs samples per second).
n = 1060
t_frame = n * H / Fs
print(t_frame)

In [None]:
# Spectrum and LPC envelope of frame n of the first recording, using the same
# N and p that ST_distancias_vocales uses for its formant estimates.
frame_x = X_wav[n * H:(n + 1) * H]
graficar_envolvente(frame_x, N=200, p=9)

In [None]:
# Spectrum and LPC envelope of frame n of the second recording, using the same
# N and p that ST_distancias_vocales uses for its formant estimates.
frame_y = Y_wav[n * H:(n + 1) * H]
graficar_envolvente(frame_y, N=200, p=9)