In [1]:
# Change into the project directory so the relative paths used by later cells resolve.
# NOTE(review): this is a machine-specific absolute path; adjust it before running elsewhere.
%cd "D:\Schoolwork\TERM 3\WORK\visual_prosody"

D:\Schoolwork\TERM 3\WORK\visual_prosody


In [2]:
import os, sys, glob, shutil
import os.path as op
import argparse
import json
import yaml
import numpy as np
import pandas as pd
from pprint import pprint
import os.path as op

import torch
import torch.nn.functional as F
import torchaudio
from tqdm.notebook import tqdm

In [3]:
import argparse
import fnmatch
import logging
import multiprocessing as mp
from typing import Dict, List, Tuple

import librosa
import pysptk
import pyworld as pw
import soundfile as sf
from fastdtw import fastdtw
from scipy import spatial

In [4]:
# Validation split manifest, relative to the working directory.
split_txt_val_path = r'.\preprocessed_data\LJSpeech\val.txt'

# Every manifest line is '|'-separated; the first field is kept as the
# utterance id (later cells use it as the wav file stem).
val_uids = []
with open(split_txt_val_path) as manifest:
    val_uids.extend(row.split('|')[0] for row in manifest)

In [5]:
# Sanity check: number of utterance ids read from the validation manifest.
len(val_uids)

512

# MCD

In [6]:
# Wav directories compared by the metric cells below; each file is looked up
# as "<uid>.wav" inside these folders.
recon_folder = r'.\output\0629lj\result\LJSpeech\wav\reconstructed'
syn_folder = r'.\output\0629lj\result\LJSpeech\wav\synthesized'
gt_folder = r'.\Data\LJSpeech-1.1\wavs'

In [7]:
# Sample rate (Hz) the MFCC transform is configured for.
sr = 22050

# Mel-spectrogram settings: 50 ms FFT/window, 12.5 ms hop (both converted to
# samples at `sr`), 80 mel bins starting at 20 Hz, Hann window.
melkwargs = dict(
    n_fft=int(0.05 * sr),
    win_length=int(0.05 * sr),
    hop_length=int(0.0125 * sr),
    f_min=20,
    n_mels=80,
    window_fn=torch.hann_window,
)

# 13 MFCCs computed from log-mel energies.
mfcc_fn = torchaudio.transforms.MFCC(
    sample_rate=sr,
    n_mfcc=13,
    log_mels=True,
    melkwargs=melkwargs,
)

In [8]:
# MCD between the reconstructed and the synthesized wav of every validation
# utterance. One score per utterance is collected in `MCDs_recon`, with the
# matching id in `uids`.
uids = []
MCDs_recon = []
for uid in tqdm(val_uids):
    # Bind the file sample rates to their own names: reusing `sr` here would
    # overwrite the global that `mfcc_fn` was configured with.
    wav_1, sr_1 = torchaudio.load(op.join(recon_folder, f"{uid}.wav"))
    wav_2, sr_2 = torchaudio.load(op.join(syn_folder, f"{uid}.wav"))
    # The MFCC transform assumes a fixed sample rate; a file at another rate
    # would silently yield incomparable features, so fail loudly instead.
    if sr_1 != mfcc_fn.sample_rate or sr_2 != mfcc_fn.sample_rate:
        raise ValueError(
            f"{uid}: sample rates {sr_1}/{sr_2} do not match the MFCC "
            f"transform ({mfcc_fn.sample_rate})"
        )
    # Drop the channel dimension (assumes mono audio -- TODO confirm).
    wav_1 = wav_1.squeeze()
    wav_2 = wav_2.squeeze()
    # Transpose so that each row is one frame of MFCCs.
    mel_1 = mfcc_fn(wav_1).T.numpy()
    mel_2 = mfcc_fn(wav_2).T.numpy()
    # DTW: align the two frame sequences, then index both with the warping
    # path so they have the same length.
    _, path = fastdtw(mel_2, mel_1, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    mel_2 = mel_2[twf[0]]
    mel_1 = mel_1[twf[1]]
    # Skip coefficient 0 and take the Euclidean distance over the remaining
    # MFCCs per aligned frame, averaged over frames.
    mel_1 = mel_1[:, 1:]
    mel_2 = mel_2[:, 1:]
    result = (((mel_1 - mel_2) ** 2).sum(axis=1)**0.5).mean()
    # Record id and score together so the two lists stay aligned even if an
    # utterance fails part-way through.
    uids.append(uid)
    MCDs_recon.append(result)

  0%|          | 0/512 [00:00<?, ?it/s]

In [9]:
# MCD between the ground-truth and the synthesized wav of every validation
# utterance. One score per utterance is collected in `MCDs_gt`, with the
# matching id in `uids`.
uids = []
MCDs_gt = []
for uid in tqdm(val_uids):
    # Bind the file sample rates to their own names: reusing `sr` here would
    # overwrite the global that `mfcc_fn` was configured with.
    wav_1, sr_1 = torchaudio.load(op.join(gt_folder, f"{uid}.wav"))
    wav_2, sr_2 = torchaudio.load(op.join(syn_folder, f"{uid}.wav"))
    # The MFCC transform assumes a fixed sample rate; a file at another rate
    # would silently yield incomparable features, so fail loudly instead.
    if sr_1 != mfcc_fn.sample_rate or sr_2 != mfcc_fn.sample_rate:
        raise ValueError(
            f"{uid}: sample rates {sr_1}/{sr_2} do not match the MFCC "
            f"transform ({mfcc_fn.sample_rate})"
        )
    # Drop the channel dimension (assumes mono audio -- TODO confirm).
    wav_1 = wav_1.squeeze()
    wav_2 = wav_2.squeeze()
    # Transpose so that each row is one frame of MFCCs.
    mel_1 = mfcc_fn(wav_1).T.numpy()
    mel_2 = mfcc_fn(wav_2).T.numpy()
    # DTW: align the two frame sequences, then index both with the warping
    # path so they have the same length.
    _, path = fastdtw(mel_2, mel_1, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    mel_2 = mel_2[twf[0]]
    mel_1 = mel_1[twf[1]]
    # Skip coefficient 0 and take the Euclidean distance over the remaining
    # MFCCs per aligned frame, averaged over frames.
    mel_1 = mel_1[:, 1:]
    mel_2 = mel_2[:, 1:]
    result = (((mel_1 - mel_2) ** 2).sum(axis=1)**0.5).mean()
    # Record id and score together so the two lists stay aligned even if an
    # utterance fails part-way through.
    uids.append(uid)
    MCDs_gt.append(result)

  0%|          | 0/512 [00:00<?, ?it/s]

In [10]:
# One row per validation utterance with both MCD variants side by side.
lj_mcd_df = pd.DataFrame(
    dict(
        uid=uids,
        MCD_recon=MCDs_recon,
        MCD_gt=MCDs_gt,
    )
)

In [11]:
# Save the per-utterance MCD table as CSV.
lj_mcd_df.to_csv(r".\jupyter_walkthrough\metrics\MCD_LJ.csv")

In [12]:
# Summary statistics (mean / std over utterances) for both MCD variants.
recon_scores = torch.tensor(MCDs_recon)
gt_scores = torch.tensor(MCDs_gt)

print(f"MCDs_recon mean on LJSpeech: {recon_scores.mean()}")
print(f"MCDs_recon std on LJSpeech: {recon_scores.std()}")
print(f"MCDs_gt mean on LJSpeech: {gt_scores.mean()}")
print(f"MCDs_gt std on LJSpeech: {gt_scores.std()}")

MCDs_recon mean on LJSpeech: 5.480271339416504
MCDs_recon std on LJSpeech: 0.7322819232940674
MCDs_gt mean on LJSpeech: 7.159063339233398
MCDs_gt std on LJSpeech: 0.9118814468383789


# log f0

In [13]:
# https://github.com/espnet/espnet/blob/3e0dad524d62ccd45e067e9b36049f2214ea972a/egs2/TEMPLATE/asr1/pyscripts/utils/evaluate_f0.py

In [14]:
def world_extract(
    x: np.ndarray,
    fs: int,
    f0min: int = 40,
    f0max: int = 800,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
) -> Tuple[np.ndarray, np.ndarray]:
    """Extract WORLD-based acoustic features (mel-cepstrum and F0).

    Args:
        x (ndarray): 1D waveform array; it is cast to float64 before analysis.
        fs (int): Sampling rate of ``x`` in Hz.
        f0min (int): Minimum f0 value (default=40).
        f0max (int): Maximum f0 value (default=800).
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256); converted to a
            frame period in milliseconds using ``fs``.
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All pass filter coefficient (default=0.41).
            If either ``mcep_dim`` or ``mcep_alpha`` is None, BOTH are
            replaced by the preset for ``fs`` from ``_get_best_mcep_params``.

    Returns:
        Tuple[ndarray, ndarray]:
            Mel-cepstrum, one row per frame (per the pysptk ``sp2mc``
            documentation the size is (N, mcep_dim + 1)), and the
            F0 sequence (N,).

    Raises:
        ValueError: If a preset is requested (``mcep_dim``/``mcep_alpha`` is
            None) and ``fs`` has no preset.

    """
    # extract features
    x = x.astype(np.float64)
    f0, time_axis = pw.harvest(
        x,
        fs,
        f0_floor=f0min,
        f0_ceil=f0max,
        # n_shift is in samples; the frame period is passed in milliseconds.
        frame_period=n_shift / fs * 1000,
    )
    sp = pw.cheaptrick(x, f0, time_axis, fs, fft_size=n_fft)
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)
    mcep = pysptk.sp2mc(sp, mcep_dim, mcep_alpha)

    return mcep, f0


def _get_basename(path: str) -> str:
    return os.path.splitext(os.path.split(path)[-1])[0]


def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
    if fs == 16000:
        return 23, 0.42
    elif fs == 22050:
        return 34, 0.45
    elif fs == 24000:
        return 34, 0.46
    elif fs == 44100:
        return 39, 0.53
    elif fs == 48000:
        return 39, 0.55
    else:
        raise ValueError(f"Not found the setting for {fs}.")

In [15]:
# Log-F0 RMSE and correlation of each synthesized wav, using the
# reconstructed wav of the same utterance as the reference.
uids = []
logf0_rmses_recon = []
logf0_corrs_recon = []

# Same WORLD analysis settings for both signals; mcep_dim/mcep_alpha are left
# as None so world_extract picks the preset for the sampling rate.
world_kwargs = dict(
    f0min=40,
    f0max=800,
    n_fft=1024,
    n_shift=256,
    mcep_dim=None,
    mcep_alpha=None,
)

for uid in tqdm(val_uids):
    # load wav file as int16
    gen_x, gen_fs = sf.read(op.join(syn_folder, f"{uid}.wav"), dtype="int16")
    # NOTE: the "gt_" names hold the reference signal, which in this cell is
    # the reconstructed wav.
    gt_x, gt_fs = sf.read(op.join(recon_folder, f"{uid}.wav"), dtype="int16")
    fs = gen_fs
    if gt_fs != gen_fs:
        # Both signals are analysed at `fs`, so bring the reference to that
        # rate first (as the upstream ESPnet evaluate_f0.py script does).
        gt_x = librosa.resample(
            gt_x.astype(np.float64), orig_sr=gt_fs, target_sr=gen_fs
        )

    # extract reference and generated features
    gen_mcep, gen_f0 = world_extract(x=gen_x, fs=fs, **world_kwargs)
    gt_mcep, gt_f0 = world_extract(x=gt_x, fs=fs, **world_kwargs)

    # DTW on the mel-cepstra; the warping path is then applied to the F0 tracks
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_f0_dtw = gen_f0[twf[0]]
    gt_f0_dtw = gt_f0[twf[1]]

    # Get voiced part: keep aligned frames where both F0 values are non-zero
    nonzero_idxs = np.where((gen_f0_dtw != 0) & (gt_f0_dtw != 0))[0]
    gen_f0_dtw_voiced = np.log(gen_f0_dtw[nonzero_idxs])
    gt_f0_dtw_voiced = np.log(gt_f0_dtw[nonzero_idxs])

    # log F0 RMSE
    log_f0_rmse = np.sqrt(np.mean((gen_f0_dtw_voiced - gt_f0_dtw_voiced) ** 2))

    # log F0 corr
    log_f0_corr = np.corrcoef(gen_f0_dtw_voiced, gt_f0_dtw_voiced)[0][1]

    # Record id and scores together so the three lists stay aligned even if
    # an utterance fails part-way through.
    uids.append(uid)
    logf0_rmses_recon.append(log_f0_rmse)
    logf0_corrs_recon.append(log_f0_corr)



  0%|          | 0/512 [00:00<?, ?it/s]

In [16]:
# Log-F0 RMSE and correlation of each synthesized wav, using the ground-truth
# recording of the same utterance as the reference.
uids = []
logf0_rmses_gt = []
logf0_corrs_gt = []

# Same WORLD analysis settings for both signals; mcep_dim/mcep_alpha are left
# as None so world_extract picks the preset for the sampling rate.
world_kwargs = dict(
    f0min=40,
    f0max=800,
    n_fft=1024,
    n_shift=256,
    mcep_dim=None,
    mcep_alpha=None,
)

for uid in tqdm(val_uids):
    # load wav file as int16
    gen_x, gen_fs = sf.read(op.join(syn_folder, f"{uid}.wav"), dtype="int16")
    gt_x, gt_fs = sf.read(op.join(gt_folder, f"{uid}.wav"), dtype="int16")
    fs = gen_fs
    if gt_fs != gen_fs:
        # Both signals are analysed at `fs`, so bring the reference to that
        # rate first (as the upstream ESPnet evaluate_f0.py script does).
        gt_x = librosa.resample(
            gt_x.astype(np.float64), orig_sr=gt_fs, target_sr=gen_fs
        )

    # extract ground truth and generated features
    gen_mcep, gen_f0 = world_extract(x=gen_x, fs=fs, **world_kwargs)
    gt_mcep, gt_f0 = world_extract(x=gt_x, fs=fs, **world_kwargs)

    # DTW on the mel-cepstra; the warping path is then applied to the F0 tracks
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_f0_dtw = gen_f0[twf[0]]
    gt_f0_dtw = gt_f0[twf[1]]

    # Get voiced part: keep aligned frames where both F0 values are non-zero
    nonzero_idxs = np.where((gen_f0_dtw != 0) & (gt_f0_dtw != 0))[0]
    gen_f0_dtw_voiced = np.log(gen_f0_dtw[nonzero_idxs])
    gt_f0_dtw_voiced = np.log(gt_f0_dtw[nonzero_idxs])

    # log F0 RMSE
    log_f0_rmse = np.sqrt(np.mean((gen_f0_dtw_voiced - gt_f0_dtw_voiced) ** 2))

    # log F0 corr
    log_f0_corr = np.corrcoef(gen_f0_dtw_voiced, gt_f0_dtw_voiced)[0][1]

    # Record id and scores together so the three lists stay aligned even if
    # an utterance fails part-way through.
    uids.append(uid)
    logf0_rmses_gt.append(log_f0_rmse)
    logf0_corrs_gt.append(log_f0_corr)

  0%|          | 0/512 [00:00<?, ?it/s]

In [17]:
# One row per validation utterance with all four log-F0 scores, saved as CSV.
lj_logf0_df = pd.DataFrame(
    dict(
        uid=uids,
        logf0_rmse_recon=logf0_rmses_recon,
        logf0_corr_recon=logf0_corrs_recon,
        logf0_rmse_gt=logf0_rmses_gt,
        logf0_corr_gt=logf0_corrs_gt,
    )
)
lj_logf0_df.to_csv(r".\jupyter_walkthrough\metrics\logF0_LJ.csv")

In [18]:
# Summary statistics (mean / std over utterances) for the log-F0 metrics,
# first against the reconstructed reference, then against the ground truth.
rmse_recon_scores = torch.tensor(logf0_rmses_recon)
corr_recon_scores = torch.tensor(logf0_corrs_recon)
print(f"logf0_rmse_recon mean on LJSpeech: {rmse_recon_scores.mean()}")
print(f"logf0_rmse_recon std on LJSpeech: {rmse_recon_scores.std()}")
print(f"logf0_corr_recon mean on LJSpeech: {corr_recon_scores.mean()}")
print(f"logf0_corr_recon std on LJSpeech: {corr_recon_scores.std()}")

rmse_gt_scores = torch.tensor(logf0_rmses_gt)
corr_gt_scores = torch.tensor(logf0_corrs_gt)
print(f"logf0_rmse_gt mean on LJSpeech: {rmse_gt_scores.mean()}")
print(f"logf0_rmse_gt std on LJSpeech: {rmse_gt_scores.std()}")
print(f"logf0_corr_gt mean on LJSpeech: {corr_gt_scores.mean()}")
print(f"logf0_corr_gt std on LJSpeech: {corr_gt_scores.std()}")

logf0_rmse_recon mean on LJSpeech: 0.2128476233348171
logf0_rmse_recon std on LJSpeech: 0.07755068307229493
logf0_corr_recon mean on LJSpeech: 0.7322283435658924
logf0_corr_recon std on LJSpeech: 0.16015679184393214
logf0_rmse_gt mean on LJSpeech: 0.2078816564710602
logf0_rmse_gt std on LJSpeech: 0.08039552050371225
logf0_corr_gt mean on LJSpeech: 0.7355830529558801
logf0_corr_gt std on LJSpeech: 0.14026441509413443
