In [32]:
import os, sys, glob, shutil
import argparse
import json
import yaml
import numpy as np
from pprint import pprint

import torch
import yaml
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
from scipy.io import wavfile
from matplotlib import pyplot as plt
from tqdm.notebook import tqdm

In [33]:
%cd "D:\Schoolwork\TERM 3\WORK\visual_prosody"

D:\Schoolwork\TERM 3\WORK\visual_prosody


In [34]:
split_txt_val_path = r'.\preprocessed_data\Ego4D_final_v6\val.txt'
val_uids = []
with open(split_txt_val_path) as file:
    for line in file:
        # print(line.split('|')[0])
        val_uids.append(line.split('|')[0])

In [35]:
print(len(val_uids))

2772


In [36]:
from utils.model import get_model, get_vocoder, get_param_num, vocoder_infer
from utils.tools import to_device, log, synth_one_sample, expand, plot_mel
from model import FastSpeech2Loss
from dataset import Dataset
# from utils.auto_tqdm import tqdm
from evaluate import evaluate

In [37]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
parser = argparse.ArgumentParser()
parser.add_argument("--restore_step", type=int, default=0)
parser.add_argument(
    "-p",
    "--preprocess_config",
    type=str,
    required=True,
    help="path to preprocess.yaml",
)
parser.add_argument(
    "-m", "--model_config", type=str, required=True, help="path to model.yaml"
)
parser.add_argument(
    "-t", "--train_config", type=str, required=True, help="path to train.yaml"
)

argString = '-p ./config/Ego4D_final_v6/0703a_preprocess.yaml -m ./config/Ego4D_final_v6/0703a_model.yaml -t ./config/Ego4D_final_v6/0703a_train.yaml'
# args = parser.parse_args()
args = parser.parse_args(argString.split())

In [8]:
pprint(args)
# Read Config
preprocess_config = yaml.load(
    open(args.preprocess_config, "r"), Loader=yaml.FullLoader
)
model_config = yaml.load(open(args.model_config, "r"), Loader=yaml.FullLoader)
train_config = yaml.load(open(args.train_config, "r"), Loader=yaml.FullLoader)
configs = (preprocess_config, model_config, train_config)
print("Prepare training ...")

preprocess_config, model_config, train_config = configs

Namespace(restore_step=0, preprocess_config='./config/Ego4D_final_v6/0703a_preprocess.yaml', model_config='./config/Ego4D_final_v6/0703a_model.yaml', train_config='./config/Ego4D_final_v6/0703a_train.yaml')
Prepare training ...


In [15]:
ckpt_path = r'./output/0703a/ckpt/Ego4D_final_v6/1000000.pth.tar'
ckpt = torch.load(ckpt_path)

In [16]:
# Prepare model
model, optimizer = get_model(args, configs, device, train=True)
model.load_state_dict(ckpt["model"], strict=False)
model.to(device)
model = nn.DataParallel(model)
num_param = get_param_num(model)
Loss = FastSpeech2Loss(preprocess_config, model_config).to(device)
print("Number of FastSpeech2 Parameters:", num_param)

=> Using speaker embeddings.
True
2
=> Using VarianceAdaptorWithSpeaker.
Number of FastSpeech2 Parameters: 35165553


In [19]:
# Load vocoder
vocoder = get_vocoder(model_config, device)
step = args.restore_step + 1
model.eval()
print()
dataset = Dataset(
    "val.txt", 'val', preprocess_config, train_config, sort=False, drop_last=False
)

Removing weight norm...



In [20]:
batch_size = train_config["optimizer"]["batch_size"]
batch_size = 1

In [21]:
loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False,
        collate_fn=dataset.collate_fn,
    )
# Get loss function
Loss = FastSpeech2Loss(preprocess_config, model_config).to(device)

In [22]:
os.path.join(train_config['path']['result_path'], 'plots')

'./output/0703a/result/Ego4D_final_v6\\plots'

In [23]:
# => batch:
# return (
#     ids,
#     raw_texts,
#     speakers,
#     texts,
#     text_lens,
#     max(text_lens),
#     mels,
#     mel_lens,
#     max(mel_lens),
#     pitches,
#     energies,
#     durations,
#     speaker_embeddings,
# )
# [12] 对应speaker embedding


# => model output / prediction
# return (
#     output,
#     postnet_output,
#     p_predictions,
#     e_predictions,
#     log_d_predictions,
#     d_rounded,
#     src_masks,
#     mel_masks,
#     src_lens,
#     mel_lens,
# )
output_plot_path = os.path.join(train_config['path']['result_path'], 'plot')
output_mel_syn_path = os.path.join(train_config['path']['result_path'], 'mel', 'syn')
output_mel_gt_path = os.path.join(train_config['path']['result_path'], 'mel', 'gt')
output_wav_syn_path = os.path.join(train_config['path']['result_path'], 'wav', 'synthesized')
output_wav_rec_path = os.path.join(train_config['path']['result_path'], 'wav', 'reconstructed')
os.makedirs(output_plot_path, exist_ok=True)
os.makedirs(output_mel_syn_path, exist_ok=True)
os.makedirs(output_mel_gt_path, exist_ok=True)
os.makedirs(output_wav_syn_path, exist_ok=True)
os.makedirs(output_wav_rec_path, exist_ok=True)

In [24]:
for batchs in tqdm(loader):
    for targets in batchs:
        targets = to_device(targets, device)
        with torch.no_grad():
            predictions = model(*(targets[2:]))
        basenames = targets[0]
        for i in range(len(predictions[0])):
            basename = basenames[i]
            src_len = predictions[8][i].item()
            mel_len = predictions[9][i].item()
            mel_prediction = predictions[1][i, :mel_len].detach().transpose(0, 1)
            mel_target = targets[6][i, :mel_len].detach().transpose(0, 1)

            torch.save(mel_prediction.cpu(), os.path.join(output_mel_syn_path, f"{basename}.pt"))
            torch.save(mel_target.cpu(), os.path.join(output_mel_gt_path, f"{basename}.pt"))
            
            
            duration = predictions[5][i, :src_len].detach().cpu().numpy()
            if preprocess_config["preprocessing"]["pitch"]["feature"] == "phoneme_level":
                pitch = predictions[2][i, :src_len].detach().cpu().numpy()
                pitch = expand(pitch, duration)
            else:
                pitch = predictions[2][i, :mel_len].detach().cpu().numpy()
            if preprocess_config["preprocessing"]["energy"]["feature"] == "phoneme_level":
                energy = predictions[3][i, :src_len].detach().cpu().numpy()
                energy = expand(energy, duration)
            else:
                energy = predictions[3][i, :mel_len].detach().cpu().numpy()

            with open(os.path.join(preprocess_config["path"]["preprocessed_path"], 
                                   "stats.json")) as f:
                stats = json.load(f)
                stats = stats["pitch"] + stats["energy"][:2]
                                       
            fig = plot_mel(
                [
                    (mel_prediction.cpu().numpy(), pitch, energy),
                    (mel_target.cpu().numpy(), pitch, energy),
                ],
                stats,
                ["Synthetized Spectrogram", "Ground-Truth Spectrogram"],
            )
            ### TODO: change to svg
            plt.savefig(os.path.join(output_plot_path, f"{basename}.png"))
            plt.close()

        # from .model import vocoder_infer

        mel_predictions = predictions[1].transpose(1, 2)
        mel_targets = targets[6].transpose(1, 2)
        
        lengths = predictions[9] * preprocess_config["preprocessing"]["stft"]["hop_length"]
        wav_predictions = vocoder_infer(
            mel_predictions, vocoder, model_config, preprocess_config, lengths=lengths
        )
        wav_targets = vocoder_infer(
        mel_targets, vocoder, model_config, preprocess_config, lengths=lengths
    )
    
        sampling_rate = preprocess_config["preprocessing"]["audio"]["sampling_rate"]
        for wav, basename in zip(wav_predictions, basenames):
            wavfile.write(os.path.join(output_wav_syn_path, f"{basename}.wav"), sampling_rate, wav)
        for wav, basename in zip(wav_targets, basenames):
            wavfile.write(os.path.join(output_wav_rec_path, f"{basename}.wav"), sampling_rate, wav)

        
    #     break
    # break

  0%|          | 0/2772 [00:00<?, ?it/s]

In [38]:
import os, sys, glob, shutil
import os.path as op
import argparse
import json
import yaml
import numpy as np
import pandas as pd
from pprint import pprint
import os.path as op

import torch
import torch.nn.functional as F
import torchaudio
import argparse
import fnmatch
import logging
import multiprocessing as mp
from typing import Dict, List, Tuple

import librosa
import pysptk
import pyworld as pw
import soundfile as sf
from fastdtw import fastdtw
from scipy import spatial

In [10]:
split_txt_val_path = r'.\preprocessed_data\Ego4D_final_v6\val.txt'
val_uids = []
with open(split_txt_val_path) as file:
    for line in file:
        # print(line.split('|')[0])
        val_uids.append(line.split('|')[0])

In [11]:
len(val_uids)

2772

In [12]:
recon_folder = r'.\output\0703a\result\Ego4D_final_v6\wav\reconstructed'
syn_folder = r'.\output\0703a\result\Ego4D_final_v6\wav\synthesized'
gt_folder = r'.\raw_data\Ego4D_final_v6\val\Ego4D_final_v6'

In [29]:
sr = 22050
melkwargs = {
    "n_fft": int(0.05 * sr), "win_length": int(0.05 * sr),
    "hop_length": int(0.0125 * sr), "f_min": 20,
    "n_mels": 80, "window_fn": torch.hann_window
}
mfcc_fn = torchaudio.transforms.MFCC(
    sr, n_mfcc=13, log_mels=True, melkwargs=melkwargs
)

In [30]:
%%time
uids = []
MCDs_recon = []
for uid in tqdm(val_uids):
    uids.append(uid)
    wav_1, sr = torchaudio.load(op.join(recon_folder, f"{uid}.wav"))
    wav_2, sr = torchaudio.load(op.join(syn_folder, f"{uid}.wav"))
    wav_1 = wav_1.squeeze()
    wav_2 = wav_2.squeeze()
    mel_1 = mfcc_fn(wav_1).T.numpy()
    mel_2 = mfcc_fn(wav_2).T.numpy()
    # DTW
    _, path = fastdtw(mel_2, mel_1, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    mel_2 = mel_2[twf[0]]
    mel_1 = mel_1[twf[1]]
    # We sum the squared differences over the first K MFCCs, skipping ct,0
    mel_1 = mel_1[:, 1:]
    mel_2 = mel_2[:, 1:]
    result = (((mel_1 - mel_2) ** 2).sum(axis=1)**0.5).mean()
    MCDs_recon.append(result)

  0%|          | 0/2772 [00:00<?, ?it/s]

CPU times: total: 8min 29s
Wall time: 2min 13s


In [34]:
%%time
uids = []
MCDs_gt = []
for uid in tqdm(val_uids):
    uids.append(uid)
    wav_1, sr = torchaudio.load(op.join(gt_folder, f"{uid}.wav"))
    wav_2, sr = torchaudio.load(op.join(syn_folder, f"{uid}.wav"))
    wav_1 = wav_1.squeeze()
    wav_2 = wav_2.squeeze()
    mel_1 = mfcc_fn(wav_1).T.numpy()
    mel_2 = mfcc_fn(wav_2).T.numpy()
    # DTW
    _, path = fastdtw(mel_2, mel_1, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    mel_2 = mel_2[twf[0]]
    mel_1 = mel_1[twf[1]]
    # We sum the squared differences over the first K MFCCs, skipping ct,0
    mel_1 = mel_1[:, 1:]
    mel_2 = mel_2[:, 1:]
    result = (((mel_1 - mel_2) ** 2).sum(axis=1)**0.5).mean()
    MCDs_gt.append(result)

  0%|          | 0/2772 [00:00<?, ?it/s]

CPU times: total: 9min 14s
Wall time: 2min 12s


In [35]:
v6_mcd_df = pd.DataFrame({
    'uid': uids,
    'MCD_recon': MCDs_recon,
    'MCD_gt': MCDs_gt,
})

In [36]:
v6_mcd_df.to_csv(r".\jupyter_walkthrough\metrics\MCD_0703a_1M.csv")

In [37]:
print(f"MCDs_recon mean on 0703a_1M: {torch.tensor(MCDs_recon).mean()}")
print(f"MCDs_recon std on 0703a_1M: {torch.tensor(MCDs_recon).std()}")

print(f"MCDs_gt mean on 0703a_1M: {torch.tensor(MCDs_gt).mean()}")
print(f"MCDs_gt std on 0703a_1M: {torch.tensor(MCDs_gt).std()}")

MCDs_recon mean on 0703a_1M: 12.919245719909668
MCDs_recon std on 0703a_1M: 2.471064805984497
MCDs_gt mean on 0703a_1M: 13.95725154876709
MCDs_gt std on 0703a_1M: 2.489434003829956


# log f0

In [15]:
# https://github.com/espnet/espnet/blob/3e0dad524d62ccd45e067e9b36049f2214ea972a/egs2/TEMPLATE/asr1/pyscripts/utils/evaluate_f0.py

In [16]:
def world_extract(
    x: np.ndarray,
    fs: int,
    f0min: int = 40,
    f0max: int = 800,
    n_fft: int = 512,
    n_shift: int = 256,
    mcep_dim: int = 25,
    mcep_alpha: float = 0.41,
) -> np.ndarray:
    """Extract World-based acoustic features.

    Args:
        x (ndarray): 1D waveform array.
        fs (int): Minimum f0 value (default=40).
        f0 (int): Maximum f0 value (default=800).
        n_shift (int): Shift length in point (default=256).
        n_fft (int): FFT length in point (default=512).
        n_shift (int): Shift length in point (default=256).
        mcep_dim (int): Dimension of mel-cepstrum (default=25).
        mcep_alpha (float): All pass filter coefficient (default=0.41).

    Returns:
        ndarray: Mel-cepstrum with the size (N, n_fft).
        ndarray: F0 sequence (N,).

    """
    # extract features
    x = x.astype(np.float64)
    f0, time_axis = pw.harvest(
        x,
        fs,
        f0_floor=f0min,
        f0_ceil=f0max,
        frame_period=n_shift / fs * 1000,
    )
    sp = pw.cheaptrick(x, f0, time_axis, fs, fft_size=n_fft)
    if mcep_dim is None or mcep_alpha is None:
        mcep_dim, mcep_alpha = _get_best_mcep_params(fs)
    mcep = pysptk.sp2mc(sp, mcep_dim, mcep_alpha)

    return mcep, f0


def _get_basename(path: str) -> str:
    return os.path.splitext(os.path.split(path)[-1])[0]


def _get_best_mcep_params(fs: int) -> Tuple[int, float]:
    if fs == 16000:
        return 23, 0.42
    elif fs == 22050:
        return 34, 0.45
    elif fs == 24000:
        return 34, 0.46
    elif fs == 44100:
        return 39, 0.53
    elif fs == 48000:
        return 39, 0.55
    else:
        raise ValueError(f"Not found the setting for {fs}.")

In [17]:
uids = []
logf0_rmses_recon = []
logf0_corrs_recon = []

for uid in tqdm(val_uids):
    
    # load wav file as int16
    gen_x, gen_fs = sf.read(op.join(syn_folder, f"{uid}.wav"), dtype="int16")
    gt_x, gt_fs = sf.read(op.join(recon_folder, f"{uid}.wav"), dtype="int16")
    fs = gen_fs
    # extract ground truth and converted features
    gen_mcep, gen_f0 = world_extract(
        x=gen_x,
        fs=fs,
        f0min=40,
        f0max=800,
        n_fft=1024,
        n_shift=256,
        mcep_dim=None,
        mcep_alpha=None,
    )
    gt_mcep, gt_f0 = world_extract(
        x=gt_x,
        fs=fs,
        f0min=40,
        f0max=800,
        n_fft=1024,
        n_shift=256,
        mcep_dim=None,
        mcep_alpha=None,
    )
    
    # DTW
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_f0_dtw = gen_f0[twf[0]]
    gt_f0_dtw = gt_f0[twf[1]]
    
    # Get voiced part
    nonzero_idxs = np.where((gen_f0_dtw != 0) & (gt_f0_dtw != 0))[0]
    eps = 1e-7
    gen_f0_dtw_voiced = np.log(gen_f0_dtw[nonzero_idxs] + eps)
    gt_f0_dtw_voiced = np.log(gt_f0_dtw[nonzero_idxs] + eps)

    if len(gen_f0_dtw_voiced) == 0 or len(gt_f0_dtw_voiced) == 0:
        print(f"Skip uid {uid}. len == 0.")
        continue

    # log F0 RMSE
    log_f0_rmse = np.sqrt(np.mean((gen_f0_dtw_voiced - gt_f0_dtw_voiced) ** 2))
    # print(f"{uid} {log_f0_rmse:.4f}")

    # log F0 corr
    log_f0_corr = np.corrcoef(gen_f0_dtw_voiced, gt_f0_dtw_voiced)[0][1]
    # print(f"{uid} {log_f0_corr:.4f}")
    uids.append(uid)
    logf0_rmses_recon.append(log_f0_rmse)
    logf0_corrs_recon.append(log_f0_corr)

  0%|          | 0/2772 [00:00<?, ?it/s]

Skip uid b06ee775-fd4e-4d88-a2e5-c1eeeb96d7a8. len == 0.
Skip uid 342edcf2-1150-42a0-af79-030571c0996d. len == 0.
Skip uid f8173e76-1b4d-4e91-bfe9-62ea41129dc6. len == 0.
Skip uid 2ff974ac-b0f1-4611-8c9f-61c216a8ba96. len == 0.


  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)


Skip uid 959dcb2b-4291-45b4-aa24-dc144de6fc1a. len == 0.
Skip uid 40aab340-3072-4621-8706-6294bb4869ec. len == 0.
Skip uid 94ab3c42-86a9-427c-9dc7-9d3583836f89. len == 0.
Skip uid 3375a843-9145-4d5b-8861-7777412fb43f. len == 0.
Skip uid 774fb2b7-15af-4028-bb0f-17725e2cfb7f. len == 0.
Skip uid 7444a0db-41ce-4779-a17c-1b5ec92b1a54. len == 0.
Skip uid d7c942a6-bfd3-4ff2-9926-88fafe78781b. len == 0.
Skip uid 9f56d2e9-8536-4ce4-878e-e31f5c64345a. len == 0.
Skip uid 0d885f9d-8786-40d2-ab4b-d6874b5c6507. len == 0.
Skip uid cff28f93-e826-46a8-b57b-4959b4f485ae. len == 0.


In [18]:
v6_logf0_df = pd.DataFrame({
    'uid': uids,
    'logf0_rmse_recon': logf0_rmses_recon,
    'logf0_corr_recon': logf0_corrs_recon,
    # 'logf0_rmse_gt': logf0_rmses_gt,
    # 'logf0_corr_gt': logf0_corrs_gt,
})
v6_logf0_df.to_csv(r".\jupyter_walkthrough\metrics\logF0_0703a_1M.csv")

In [20]:
v6_logf0_df = v6_logf0_df.dropna()

In [22]:
print(f"logf0_rmse_recon mean on 0703a_1M: {v6_logf0_df['logf0_rmse_recon'].values.mean()}")
print(f"logf0_rmse_recon std on 0703a_1M: {v6_logf0_df['logf0_rmse_recon'].values.std()}")
print(f"logf0_corr_recon mean on 0703a_1M: {v6_logf0_df['logf0_corr_recon'].values.mean()}")
print(f"logf0_corr_recon std on 0703a_1M: {v6_logf0_df['logf0_corr_recon'].values.std()}")

logf0_rmse_recon mean on 0703a_1M: 0.2760883042142412
logf0_rmse_recon std on 0703a_1M: 0.20170691839225147
logf0_corr_recon mean on 0703a_1M: 0.4130407813583691
logf0_corr_recon std on 0703a_1M: 0.3914612579380411


In [1]:
uids = []
logf0_rmses_gt = []
logf0_corrs_gt = []

for uid in tqdm(val_uids):
    
    # load wav file as int16
    gen_x, gen_fs = sf.read(op.join(syn_folder, f"{uid}.wav"), dtype="int16")
    gt_x, gt_fs = sf.read(op.join(gt_folder, f"{uid}.wav"), dtype="int16")
    fs = gen_fs
    # extract ground truth and converted features
    gen_mcep, gen_f0 = world_extract(
        x=gen_x,
        fs=fs,
        f0min=40,
        f0max=800,
        n_fft=1024,
        n_shift=256,
        mcep_dim=None,
        mcep_alpha=None,
    )
    gt_mcep, gt_f0 = world_extract(
        x=gt_x,
        fs=fs,
        f0min=40,
        f0max=800,
        n_fft=1024,
        n_shift=256,
        mcep_dim=None,
        mcep_alpha=None,
    )
    
    # DTW
    _, path = fastdtw(gen_mcep, gt_mcep, dist=spatial.distance.euclidean)
    twf = np.array(path).T
    gen_f0_dtw = gen_f0[twf[0]]
    gt_f0_dtw = gt_f0[twf[1]]
    
    # Get voiced part
    nonzero_idxs = np.where((gen_f0_dtw != 0) & (gt_f0_dtw != 0))[0]
    eps = 1e-7
    gen_f0_dtw_voiced = np.log(gen_f0_dtw[nonzero_idxs] + eps)
    gt_f0_dtw_voiced = np.log(gt_f0_dtw[nonzero_idxs] + eps)

    if len(gen_f0_dtw_voiced) == 0 or len(gt_f0_dtw_voiced) == 0:
        print(f"Skip uid {uid}. len == 0.")
        continue

    # log F0 RMSE
    log_f0_rmse = np.sqrt(np.mean((gen_f0_dtw_voiced - gt_f0_dtw_voiced) ** 2))
    # print(f"{uid} {log_f0_rmse:.4f}")

    # log F0 corr
    log_f0_corr = np.corrcoef(gen_f0_dtw_voiced, gt_f0_dtw_voiced)[0][1]
    # print(f"{uid} {log_f0_corr:.4f}")
    uids.append(uid)
    logf0_rmses_gt.append(log_f0_rmse)
    logf0_corrs_gt.append(log_f0_corr)

NameError: name 'tqdm' is not defined

In [None]:
v6_logf0_df = pd.DataFrame({
    'uid': uids,
    'logf0_rmse_recon': logf0_rmses_recon,
    'logf0_corr_recon': logf0_corrs_recon,
    'logf0_rmse_gt': logf0_rmses_gt,
    'logf0_corr_gt': logf0_corrs_gt,
})
v6_logf0_df.to_csv(r".\jupyter_walkthrough\metrics\logF0_0703a_1M.csv")

In [None]:
v6_logf0_df = v6_logf0_df.dropna()

In [None]:
print(f"logf0_rmse_recon mean on 0703a_1M: {v6_logf0_df['logf0_rmse_recon'].values.mean()}")
print(f"logf0_rmse_recon std on 0703a_1M: {v6_logf0_df['logf0_rmse_recon'].values.std()}")
print(f"logf0_corr_recon mean on 0703a_1M: {v6_logf0_df['logf0_corr_recon'].values.mean()}")
print(f"logf0_corr_recon std on 0703a_1M: {v6_logf0_df['logf0_corr_recon'].values.std()}")

print(f"logf0_rmse_gt mean on 0703a_1M: {v6_logf0_df['logf0_rmse_gt'].values.mean()}")
print(f"logf0_rmse_gt std on 0703a_1M: {v6_logf0_df['logf0_rmse_gt'].values.std()}")
print(f"logf0_corr_gt mean on 0703a_1M: {v6_logf0_df['logf0_corr_gt'].values.mean()}")
print(f"logf0_corr_gt std on 0703a_1M: {v6_logf0_df['logf0_corr_gt'].values.std()}")

In [39]:
# from werpy_weighted import werpy
import werpy
import whisper
from whisper.tokenizer import get_tokenizer

In [40]:
# https://github.com/openai/whisper/discussions/1041#discussioncomment-5224837
tokenizer = get_tokenizer(multilingual=False)  # use multilingual=True if using multilingual model
number_tokens = [
    i 
    for i in range(tokenizer.eot)
    if all(c in "0123456789" for c in tokenizer.decode([i]).removeprefix(" "))
]

In [41]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

whisper_model = whisper.load_model('base.en').to(device)

In [42]:
split_txt_val_path = r'.\preprocessed_data\Ego4D_final_v6\val.txt'
val_uids = []
gt_transcripts_dict = {}
with open(split_txt_val_path) as file:
    for line in file:
        # print(line.split('|')[0])
        val_uids.append(line.split('|')[0])
        gt_transcripts_dict[line.split('|')[0]] = line.split('|')[-1]

In [28]:
syn_folder = r'.\output\0703a\result\Ego4D_final_v6\wav\synthesized'

In [29]:
%%time
uids = []
whisper_transcripts = []
gt_transcripts = []

for uid in tqdm(val_uids):
    uids.append(uid)
    output = whisper_model.transcribe(
            op.join(syn_folder, f"{uid}.wav"),
            suppress_tokens=[-1] + number_tokens,
    )
    whisper_transcript = output["text"]
    gt_transcript = gt_transcripts_dict[uid]

    whisper_transcripts.append(whisper_transcript)
    gt_transcripts.append(gt_transcript)


  0%|          | 0/2772 [00:00<?, ?it/s]

CPU times: total: 1h 20min 18s
Wall time: 43min 55s


In [30]:
whisper_df_0703a_1M = pd.DataFrame({
    'uid': uids,
    'whisper_transcripts': whisper_transcripts,
    'gt_transcripts': gt_transcripts,
})

In [31]:
whisper_df_0703a_1M.head()

Unnamed: 0,uid,whisper_transcripts,gt_transcripts
0,9d58583c-20de-439d-b1cd-9c2265bdedd8,What?,what \n
1,4a506707-52ea-493e-98c4-f667e3222d44,...and ship you and,pretty shit to be honest \n
2,0b67f942-6ebf-403d-aff0-f4e5d62d3140,I want to be able to cover it all and care fo...,oh when you nail them together they won't be ...
3,c237c6d5-f413-4e25-82e2-af7b408d390a,This style is too hard because I play in Hell.,no not necessary cuz if they're warped right ...
4,b8ed5cf9-b8ff-4da3-a764-a59020b85277,The area of sugar.,or we could just use the \n


In [32]:
whisper_df_0703a_1M.to_csv(r".\jupyter_walkthrough\metrics\whisper_df_0703a_1M.csv", index=False)

In [43]:
import pandas as pd

In [45]:
gt_folder = r'.\raw_data\Ego4D_final_v6\val\Ego4D_final_v6'

In [46]:
%%time
uids = []
whisper_transcripts = []
gt_transcripts = []

for uid in tqdm(val_uids):
    uids.append(uid)
    output = whisper_model.transcribe(
            op.join(gt_folder, f"{uid}.wav"),
            suppress_tokens=[-1] + number_tokens,
    )
    whisper_transcript = output["text"]
    gt_transcript = gt_transcripts_dict[uid]

    whisper_transcripts.append(whisper_transcript)
    gt_transcripts.append(gt_transcript)


  0%|          | 0/2772 [00:00<?, ?it/s]

CPU times: total: 1h 15min 34s
Wall time: 30min 12s


In [50]:
whisper_df_0703a_1M = pd.DataFrame({
    'uid': uids,
    'whisper_transcripts': whisper_transcripts,
    'gt_transcripts': gt_transcripts,
})

In [51]:
whisper_df_0703a_1M.head()

Unnamed: 0,uid,whisper_transcripts,gt_transcripts
0,9d58583c-20de-439d-b1cd-9c2265bdedd8,What?,what \n
1,4a506707-52ea-493e-98c4-f667e3222d44,"Pretty shit, to be honest.",pretty shit to be honest \n
2,0b67f942-6ebf-403d-aff0-f4e5d62d3140,One of them together.,oh when you nail them together they won't be ...
3,c237c6d5-f413-4e25-82e2-af7b408d390a,"No, no, I'm certainly... As if they're warped...",no not necessary cuz if they're warped right ...
4,b8ed5cf9-b8ff-4da3-a764-a59020b85277,See the dots and there.,or we could just use the \n


In [52]:
whisper_df_0703a_1M.to_csv(r".\jupyter_walkthrough\metrics\whisper_df_0703a_1M_gt.csv", index=False)

In [71]:
split_txt_val_path = r'.\preprocessed_data\Ego4D_final_v5\val.txt'
val_uids = []
gt_transcripts_dict = {}
with open(split_txt_val_path) as file:
    for line in file:
        # print(line.split('|')[0])
        val_uids.append(line.split('|')[0])
        gt_transcripts_dict[line.split('|')[0]] = line.split('|')[-1]

In [72]:
len(val_uids)

2715

In [73]:
import pandas as pd

In [74]:
gt_folder = r'.\raw_data\Ego4D_final_v5\val\Ego4D_final_v5'

In [75]:
%%time
uids = []
whisper_transcripts = []
gt_transcripts = []

for uid in tqdm(val_uids):
    uids.append(uid)
    output = whisper_model.transcribe(
            op.join(gt_folder, f"{uid}.wav"),
            suppress_tokens=[-1] + number_tokens,
    )
    whisper_transcript = output["text"]
    gt_transcript = gt_transcripts_dict[uid]

    whisper_transcripts.append(whisper_transcript)
    gt_transcripts.append(gt_transcript)


  0%|          | 0/2715 [00:00<?, ?it/s]

CPU times: total: 1h 10min 36s
Wall time: 29min 22s


In [76]:
whisper_df_0703a_1M = pd.DataFrame({
    'uid': uids,
    'whisper_transcripts': whisper_transcripts,
    'gt_transcripts': gt_transcripts,
})

In [77]:
whisper_df_0703a_1M.head()

Unnamed: 0,uid,whisper_transcripts,gt_transcripts
0,9d58583c-20de-439d-b1cd-9c2265bdedd8,What's...,what \n
1,4a506707-52ea-493e-98c4-f667e3222d44,"Pretty shit, to be honest.",pretty shit to be honest \n
2,0b67f942-6ebf-403d-aff0-f4e5d62d3140,this money worked out well you,oh when you nail them together they won't be ...
3,c237c6d5-f413-4e25-82e2-af7b408d390a,"Not necessarily, because if they're warped ri...",no not necessary cuz if they're warped right ...
4,22a8a78a-caea-4aa2-be9c-4e02e0d88009,a full time PLAY rings,stand them all down flat \n


In [78]:
whisper_df_0703a_1M.to_csv(r".\jupyter_walkthrough\metrics\whisper_df_0703a_1M_gt5.csv", index=False)

In [5]:
import pandas as pd
whisper_df_0703a_1M = pd.read_csv(r".\jupyter_walkthrough\metrics\whisper_df_0703a_1M.csv")

In [15]:
whisper_df_0703a_1M.shape

(2772, 3)

In [16]:
whisper_df_0703a_1M.dropna().shape

(2676, 3)

In [17]:
whisper_df_0703a_1M = whisper_df_0703a_1M.dropna()

In [18]:
whisper_df_0703a_1M['gt_transcripts'].values.tolist(),

([' what \n',
  ' pretty shit to be honest \n',
  " oh when you nail them together they won't be as hard right \n",
  " no not necessary cuz if they're warped right now you're going to nail them into a warped position \n",
  ' or we could just use the \n',
  ' stand them all down flat \n',
  ' right now \n',
  " no what's our only \n",
  ' maybe \n',
  ' i mean if we just do it in two positions \n',
  " it's harder to get done \n",
  ' is the biscuit thing up cuz then it might hold it in more position \n',
  ' no \n',
  " we're not doing the biscuit it's redundant \n",
  " if you want to get it done today then we're not doing the biscuit \n",
  ' can we do something that \n',
  ' so just we need to rearrange and play with the boards to figure out what orientation is the best \n',
  ' okay \n',
  " this'll probably be the front side \n",
  ' anything towards the back will seem is less likely to the seen \n',
  ' alright \n',
  ' is there anyway\n',
  " they're a different thing i should

In [19]:
normalized_ref = werpy.normalize(
    whisper_df_0703a_1M['gt_transcripts'].values.tolist(),
)

In [20]:
# for x in whisper_df_0703a_1M['whisper_transcripts'].values.tolist():
#     if type(x)!=type('str'):
#         print(x)

In [21]:
# normalized_hypo = werpy.normalize(
#     [str(s) for s in whisper_df_0703a_1M['whisper_transcripts'].values.tolist()],
# )
normalized_hypo = werpy.normalize(
    whisper_df_0703a_1M['whisper_transcripts'].values.tolist(),
)

In [22]:
normalized_ref[1]

'pretty shit to be honest'

In [23]:
normalized_hypo[1]

'and ship you and'

In [24]:
%%time
summary = werpy.summary(normalized_ref, normalized_hypo)
summary

CPU times: total: 62.5 ms
Wall time: 133 ms


Unnamed: 0,wer,ld,m,insertions,deletions,substitutions,inserted_words,deleted_words,substituted_words
0,0.000000,0,1,0,0,0,[],[],[]
1,1.000000,5,5,0,1,4,[],[pretty],"[(shit, and), (to, ship), (be, you), (honest, ..."
2,1.083333,13,12,1,0,12,[i],[],"[(oh, want), (when, to), (you, be), (nail, abl..."
3,1.000000,18,18,0,8,10,[],"[no, not, necessary, cuz, if, theyre, warped, ...","[(now, this), (youre, style), (going, is), (to..."
4,1.000000,6,6,0,2,4,[],"[or, we]","[(could, the), (just, area), (use, of), (the, ..."
...,...,...,...,...,...,...,...,...,...
2671,1.000000,42,42,0,34,8,[],"[face, down, in, a, pile, and, announces, the,...","[(be, thanks), (the, for), (same, having), (ra..."
2672,0.962264,51,53,0,14,37,[],"[id, read, the, instructions, to, you, right, ...","[(now, as), (pick, well), (up, as), (it, first..."
2673,0.944444,51,54,0,20,31,[],"[be, the, truth, or, a, lie, then, the, player...","[(places, its), (discard, tool), (pile, like),..."
2674,1.000000,40,40,0,39,1,[],"[but, lets, go, lets, go, with, this, and, may...","[(fly, look)]"


In [25]:
I = summary['insertions'].sum()
D = summary['deletions'].sum()
S = summary['substitutions'].sum()
M = summary['m'].sum()

In [26]:
summary.to_csv(r".\jupyter_walkthrough\metrics\0703a_1M_wer_summary.csv", index=False)

In [27]:
wer = (I + D + S) / M

In [28]:
I + D + S

15562

In [29]:
M

14730

In [30]:
I, D, S

(1674, 6132, 7756)

In [31]:
wer

1.0564833672776646

In [53]:
import pandas as pd
whisper_df_0703a_1M = pd.read_csv(r".\jupyter_walkthrough\metrics\whisper_df_0703a_1M_gt.csv")

In [54]:
whisper_df_0703a_1M.shape

(2772, 3)

In [55]:
whisper_df_0703a_1M.dropna().shape

(2733, 3)

In [56]:
whisper_df_0703a_1M = whisper_df_0703a_1M.dropna()

In [57]:
normalized_ref = werpy.normalize(
    whisper_df_0703a_1M['gt_transcripts'].values.tolist(),
)

In [58]:
normalized_hypo = werpy.normalize(
    whisper_df_0703a_1M['whisper_transcripts'].values.tolist(),
)

In [59]:
normalized_ref[1]

'pretty shit to be honest'

In [60]:
normalized_hypo[1]

'pretty shit to be honest'

In [61]:
%%time
summary = werpy.summary(normalized_ref, normalized_hypo)
# summary

CPU times: total: 93.8 ms
Wall time: 188 ms


In [62]:
I = summary['insertions'].sum()
D = summary['deletions'].sum()
S = summary['substitutions'].sum()
M = summary['m'].sum()

In [63]:
summary.to_csv(r".\jupyter_walkthrough\metrics\0703a_1M_wer_summary.csv", index=False)

In [64]:
wer = (I + D + S) / M

In [65]:
I + D + S

10461

In [66]:
M

14900

In [67]:
I, D, S

(2910, 2124, 5427)

In [68]:
wer

0.7020805369127516

In [79]:
import pandas as pd
whisper_df_0703a_1M = pd.read_csv(r".\jupyter_walkthrough\metrics\whisper_df_0703a_1M_gt5.csv")

In [80]:
whisper_df_0703a_1M.shape

(2715, 3)

In [81]:
whisper_df_0703a_1M.dropna().shape

(2673, 3)

In [82]:
whisper_df_0703a_1M = whisper_df_0703a_1M.dropna()

In [83]:
normalized_ref = werpy.normalize(
    whisper_df_0703a_1M['gt_transcripts'].values.tolist(),
)

In [84]:
normalized_hypo = werpy.normalize(
    whisper_df_0703a_1M['whisper_transcripts'].values.tolist(),
)

In [85]:
normalized_ref[1]

'pretty shit to be honest'

In [86]:
normalized_hypo[1]

'pretty shit to be honest'

In [87]:
%%time
summary = werpy.summary(normalized_ref, normalized_hypo)
# summary

CPU times: total: 125 ms
Wall time: 200 ms


In [88]:
I = summary['insertions'].sum()
D = summary['deletions'].sum()
S = summary['substitutions'].sum()
M = summary['m'].sum()

In [89]:
summary.to_csv(r".\jupyter_walkthrough\metrics\0703a_1M_wer_summary_v5.csv", index=False)

In [1]:
import pandas as pd
%cd "D:\Schoolwork\TERM 3\WORK\visual_prosody"

D:\Schoolwork\TERM 3\WORK\visual_prosody


In [2]:
summary = pd.read_csv(r".\jupyter_walkthrough\metrics\0703a_1M_wer_summary_v5.csv")

In [4]:
I = summary['insertions'].sum()
D = summary['deletions'].sum()
S = summary['substitutions'].sum()
M = summary['m'].sum()

In [5]:
wer = (I + D + S) / M

In [6]:
I + D + S

10139

In [7]:
M

14780

In [8]:
I, D, S, M

(3044, 2070, 5025, 14780)

In [9]:
I/M, D/M, S/M

(0.20595399188092017, 0.14005412719891747, 0.3399864682002706)

In [10]:
wer

0.6859945872801082