In [None]:
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd

from scipy import io, signal
from IPython.display import Audio

In [None]:
import os
import IPython
# Point NUMBA_CACHE_DIR at IPython's cache directory *before* librosa is
# imported, so the variable is already set when the import runs.
# NOTE(review): presumably needed because librosa's numba-compiled code must
# write its cache somewhere writable -- confirm for the target environment.
os.environ['NUMBA_CACHE_DIR'] = IPython.paths.get_ipython_cache_dir()
import librosa

In [None]:
# Directory that holds the two recordings to be synchronized.
dir_files = './data/'

# Build the full file names first, then load each recording.
path_X = dir_files + 'Emily_Linge-vocals.wav'
path_Y = dir_files + 'Police-vocals-guitar.wav'

# librosa.load returns (waveform, sampling rate); the second call rebinds Fs,
# so Fs afterwards holds the rate reported for the second file.
X_wav, Fs = librosa.load(path_X)
Y_wav, Fs = librosa.load(path_Y)

In [None]:
# Estimate how far each recording deviates from the reference tuning.
# librosa.estimate_tuning reports the deviation as a fraction of a chroma bin
# (with its default of 12 bins per octave, one bin is one semitone = 100 cents).
# The raw bin fractions are kept unchanged because they are later passed to
# chroma_stft(tuning=...), which expects that same unit.
tuning_offset_1 = librosa.estimate_tuning(y=X_wav, sr=Fs)
tuning_offset_2 = librosa.estimate_tuning(y=Y_wav, sr=Fs)

# Convert to cents for display only, so the printed unit matches the values.
CENTS_PER_BIN = 100
print('Estimated tuning deviation for recording 1: %f cents, for recording 2: %f cents'
      % (tuning_offset_1 * CENTS_PER_BIN, tuning_offset_2 * CENTS_PER_BIN))

In [None]:
# First chroma pass; X and Y from this cell are the inputs to the chroma-shift
# estimation further down.
N = 2048  # passed as n_fft
H = 4096  # passed as hop_length
# NOTE(review): the hop is twice the FFT size, so samples between consecutive
# analysis windows are never looked at -- confirm this coarse setting is intended.

# Arguments shared by both chroma computations.
chroma_kwargs = dict(sr=Fs, norm=2, hop_length=H, n_fft=N)
X = librosa.feature.chroma_stft(y=X_wav, tuning=tuning_offset_1, **chroma_kwargs)
Y = librosa.feature.chroma_stft(y=Y_wav, tuning=tuning_offset_2, **chroma_kwargs)

# Rescale each chromagram so that its largest entry becomes 1.
X = X / X.max()
Y = Y / Y.max()

# One figure per sequence, identical layout for both.
for seq_title, chroma in (('Sequence $X$', X), ('Sequence $Y$', Y)):
    plt.figure(figsize=(15, 3))
    plt.title(seq_title)
    librosa.display.specshow(chroma, x_axis='frames', y_axis='chroma', cmap='gray_r', hop_length=H)
    plt.xlabel('Time (frames)')
    plt.ylabel('Chroma')
    plt.colorbar()
    plt.clim([0, 1])
    plt.tight_layout()
    plt.show()

In [None]:
from synctoolbox.dtw.utils import compute_optimal_chroma_shift, shift_chroma_vectors, make_path_strictly_monotonic

In [None]:
# Chroma shift between the two feature sequences as reported by synctoolbox;
# the result is reused later for shifting Y and for pitch-shifting the audio.
opt_chroma_shift = compute_optimal_chroma_shift(X, Y)

shift_label = 'Pitch shift between recording 1 and recording 2, determined by DTW:'
print(shift_label, opt_chroma_shift, 'bins')

In [None]:
# Second chroma pass with a much shorter hop; this rebinds X and Y.
N = 2048            # passed as n_fft
H = int(0.02 * Fs)  # hop length: 0.02 s worth of samples

# Arguments shared by both chroma computations.
chroma_kwargs = dict(sr=Fs, norm=2, hop_length=H, n_fft=N)
X = librosa.feature.chroma_stft(y=X_wav, tuning=tuning_offset_1, **chroma_kwargs)
Y = librosa.feature.chroma_stft(y=Y_wav, tuning=tuning_offset_2, **chroma_kwargs)

# Rescale each chromagram so that its largest entry becomes 1.
X = X / X.max()
Y = Y / Y.max()

# One figure per sequence, identical layout for both.
for seq_title, chroma in (('Sequence $X$', X), ('Sequence $Y$', Y)):
    plt.figure(figsize=(15, 3))
    plt.title(seq_title)
    librosa.display.specshow(chroma, x_axis='frames', y_axis='chroma', cmap='gray_r', hop_length=H)
    plt.xlabel('Time (frames)')
    plt.ylabel('Chroma')
    plt.colorbar()
    plt.clim([0, 1])
    plt.tight_layout()
    plt.show()

In [None]:
# Apply the previously computed chroma shift to Y.
# NOTE: Y is overwritten with its own shifted version, so this cell is not
# idempotent -- running it twice applies the shift twice. Re-run the cell that
# computes Y before re-running this one.
Y = shift_chroma_vectors(Y, opt_chroma_shift)

In [None]:
# Show both chromagrams again, now that Y has been shifted.
for seq_title, chroma in (('Sequence $X$', X), ('Sequence $Y$ (Shifted)', Y)):
    plt.figure(figsize=(15, 3))
    plt.title(seq_title)
    librosa.display.specshow(chroma, x_axis='frames', y_axis='chroma', cmap='gray_r', hop_length=H)
    plt.xlabel('Time (frames)')
    plt.ylabel('Chroma')
    plt.colorbar()
    plt.clim([0, 1])
    plt.tight_layout()
    plt.show()

In [None]:
import libfmp.c3

In [None]:
# Cost matrix, accumulated cost matrix and optimal warping path via libfmp.c3.
C = libfmp.c3.compute_cost_matrix(X, Y)
D = libfmp.c3.compute_accumulated_cost_matrix(C)
P = libfmp.c3.compute_optimal_warping_path(D)

# Side-by-side panels: C on the left, D on the right, each overlaid with P and
# with its own color range [0, max of that matrix].
plt.figure(figsize=(15, 5))
panels = (
    (C, '$C$ with optimal warping path'),
    (D, '$D$ with optimal warping path'),
)
for panel_idx, (matrix, panel_title) in enumerate(panels, start=1):
    ax = plt.subplot(1, 2, panel_idx)
    libfmp.c3.plot_matrix_with_points(
        matrix, P, linestyle='-', marker='',
        ax=[ax], aspect='equal', clim=[0, np.max(matrix)],
        title=panel_title, xlabel='Sequence Y', ylabel='Sequence X')

plt.tight_layout()

In [None]:
# Frame counts of both sequences; used to map path indices into [0, 1].
# NOTE: this rebinds N, which the feature cells use as the FFT size. Those
# cells assign N again before using it, so re-running them stays safe.
N = X.shape[1]
M = Y.shape[1]

plt.figure(figsize=(15, 5))

# Top panel: chromagram of X with its time axis drawn on the upper edge.
ax_X = plt.axes([0, 0.60, 1, 0.40])
librosa.display.specshow(X, ax=ax_X, x_axis='frames', y_axis='chroma', cmap='gray_r', hop_length=H)
ax_X.set_ylabel('Cromagrama de Emily')
ax_X.set_xlabel('Tiempo (frames)')
ax_X.xaxis.tick_top()
ax_X.xaxis.set_label_position('top')

# Bottom panel: chromagram of Y.
ax_Y = plt.axes([0, 0, 1, 0.40])
librosa.display.specshow(Y, ax=ax_Y, x_axis='frames', y_axis='chroma', cmap='gray_r', hop_length=H)
ax_Y.set_ylabel('Cromagrama de The Police')
ax_Y.set_xlabel('Tiempo (frames)')

# Middle strip that carries the connecting lines between the two panels.
# Its limits and (empty) ticks do not depend on the path, so they are set once
# here instead of on every loop iteration.
ax = plt.axes([0, 0.40, 1, 0.20])
ax.set_xlim(0, 1)
ax.set_ylim(-1, 1)
ax.set_xticks([])
ax.set_yticks([])

# Draw every `step`-th path element: a vertical marker in each chromagram and
# a line in the strip joining the two positions (normalized by frame count).
step = 100
y_min_X, y_max_X = ax_X.get_ylim()
y_min_Y, y_max_Y = ax_Y.get_ylim()
for frame_X, frame_Y in P[0:-1:step, :]:
    ax_X.vlines(frame_X, y_min_X, y_max_X, color='r')
    ax_Y.vlines(frame_Y, y_min_Y, y_max_Y, color='r')
    ax.plot((frame_X / N, frame_Y / M), (1, -1), color='r')

In [None]:
# P holds one path element per row, so P.T has one element per column and its
# second dimension is the path length. The path was produced by
# libfmp.c3.compute_optimal_warping_path, not by MrMsDTW, so the message says
# "DTW" to avoid misattributing the result.
print('Length of warping path obtained from DTW:', P.T.shape[1])

# Remove repeated indices so the path can be used as a time map.
wp = make_path_strictly_monotonic(P.T)
print('Length of warping path made strictly monotonic:', wp.shape[1])

In [None]:
import libtsm

# Pitch-shift recording 1 by the inverse of the chroma shift, folded into the
# range (-6, 6] semitones so the smaller of the two equivalent shifts is used.
pitch_shift_for_audio_1 = -opt_chroma_shift % 12
if pitch_shift_for_audio_1 > 6:
    pitch_shift_for_audio_1 -= 12
# The shift is passed multiplied by 100 (semitones -> cents).
audio_1_shifted = libtsm.pitch_shift(X_wav, pitch_shift_for_audio_1 * 100, order="tsm-res")

# The TSM functionality of the libtsm library expects the warping path to be
# given in audio samples: convert frame indices to samples via the hop length,
# then append the last sample of both recordings as a final anchor so the
# whole signal is covered.
time_map = wp.T * H
time_map = np.concatenate((time_map, np.array([[len(X_wav) - 1, len(Y_wav) - 1]])))

time_map = libtsm.ensure_validity(time_map)

# Time-stretch the *pitch-shifted* recording 1 onto the time axis of
# recording 2. Previously the unshifted X_wav was stretched, which left
# audio_1_shifted unused and dropped the pitch correction from the result.
y_hpstsm = libtsm.hps_tsm(audio_1_shifted, time_map)

# Recording 2 in one channel, warped recording 1 in the other.
stereo_sonification = np.hstack((Y_wav.reshape(-1, 1), y_hpstsm))

print('Synchronized versions', flush=True)
ipd.display(ipd.Audio(stereo_sonification.T, rate=Fs, normalize=True))

In [None]:
# Quick inspection of X: the chroma vector at one fixed frame, followed by the
# unique entries of a single column's shape tuple.
# NOTE(review): frame 1000 is hard-coded and raises IndexError when X has
# fewer than 1001 frames.
frame_idx = 1000
print(X[:, frame_idx])

first_column_shape = X[:, 0].shape
print(np.unique(first_column_shape))