In [6]:
from diffwave.inference import predict as diffwave_predict
import torch
import numpy as np
import IPython.display as ipd
import torchaudio


import soundfile as sf
import librosa
import warnings
from pymcd.mcd import Calculate_MCD
from pystoi import stoi
from pesq import pesq

import time
import GPUtil

import os
# Expose only GPU index 1 to CUDA-based libraries started from this kernel.
# This has to run before any later cell makes its first CUDA call.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})



In [7]:
# Vocode a saved mel spectrogram with DiffWave, time the synthesis, then play
# and save the result.
#
# `model_dir` is passed straight to diffwave_predict; here it names a single
# checkpoint file rather than a directory.
model_dir = 'diffwave-ljspeech-22kHz-1000578.pt'

# unsqueeze(0) adds a leading axis (presumably the batch axis diffwave expects
# -- TODO confirm) before the tensor is moved to the GPU.
spectrogram_data = torch.from_numpy(np.load('mel_spectrogram_data.npy')).float().unsqueeze(0).to('cuda')
spectrogram = spectrogram_data

# Time the synthesis call so the real-time factor (RTF) is measured here
# instead of having to time the cell by hand. CUDA kernels run asynchronously,
# so wait for the device before reading the clock.
# NOTE(review): the measurement includes whatever one-off setup (e.g. loading
# the checkpoint) diffwave_predict performs on its first call.
inference_start = time.perf_counter()
audio, sample_rate = diffwave_predict(spectrogram, model_dir, fast_sampling=True)
if torch.cuda.is_available():
    torch.cuda.synchronize()
inference_seconds = time.perf_counter() - inference_start

# Drop singleton axes and move to host memory for playback and saving.
audio_for_playback = audio.squeeze().cpu()
ipd.display(ipd.Audio(audio_for_playback, rate=sample_rate))
# torchaudio.save is given a 2-D tensor, hence the unsqueeze(0).
torchaudio.save("original_diffwave.wav", audio_for_playback.unsqueeze(0), sample_rate)

# RTF = synthesis wall time / duration of the generated audio.
generated_seconds = audio_for_playback.shape[-1] / sample_rate
print(f"Inference time: {inference_seconds:.3f} s for {generated_seconds:.3f} s of audio")
print(f"RTF: {inference_seconds / generated_seconds:.3f}")

In [8]:
# Objective quality benchmarks (MCD, STOI, WB-PESQ) comparing the reference
# recording against the vocoder output.

# 3. Define the audio file paths
original_file = "example.wav"
generated_file = "original_diffwave.wav"


def _read_mono(path):
    """Read an audio file and return ``(samples, sample_rate)`` with 1-D samples.

    ``sf.read`` returns a 2-D ``(frames, channels)`` array for multi-channel
    files. The STOI and PESQ calls in this cell are fed 1-D signals, so any
    extra channels are averaged into one.
    """
    samples, rate = sf.read(path)
    if samples.ndim > 1:
        samples = samples.mean(axis=1)
    return samples, rate


print("--- Running Vocoder Benchmarks ---")
print(f"Original: {original_file}")
print(f"Generated: {generated_file}\n")


# --- Metric 1: Mel-Cepstral Distortion (MCD) ↓ ---
# Lower is better. Checks spectral accuracy.
# pymcd reads both files itself, so it is given paths rather than arrays.
mcd_toolbox = Calculate_MCD(MCD_mode="dtw")
mcd_value = mcd_toolbox.calculate_mcd(original_file, generated_file)
print(f"MCD↓:  {mcd_value:.2f} dB")


# --- Metric 2: Short-Time Objective Intelligibility (STOI) ↑ ---
original_audio, sr = _read_mono(original_file)
generated_audio, sr_gen = _read_mono(generated_file)
if sr != sr_gen:
    # Bring the generated audio to the reference sample rate before comparing.
    generated_audio = librosa.resample(y=generated_audio, orig_sr=sr_gen, target_sr=sr)

# Both signals must have the same number of samples, so trim each to the
# length of the shorter one.
min_len = min(len(original_audio), len(generated_audio))
original_audio = original_audio[:min_len]
generated_audio = generated_audio[:min_len]

stoi_score = stoi(original_audio, generated_audio, sr, extended=False)
print(f"STOI↑:  {stoi_score:.4f}")


# --- Metric 3: Wideband PESQ (WB-PESQ) ↑ ---
# Higher is better (~1-4.5). Checks overall perceptual quality.
# Note: WB-PESQ MUST use 16kHz audio. We resample to meet this requirement.
sr_pesq = 16000
original_16k = librosa.resample(y=original_audio, orig_sr=sr, target_sr=sr_pesq)
generated_16k = librosa.resample(y=generated_audio, orig_sr=sr, target_sr=sr_pesq)
pesq_score = pesq(sr_pesq, original_16k, generated_16k, 'wb')
print(f"PESQ↑:  {pesq_score:.2f}")


--- Running Vocoder Benchmarks ---
Original: example.wav
Generated: original_diffwave.wav



MCD↓:  2.62 dB
STOI↑:  0.9214
PESQ↑:  2.30


In [9]:
# Performance metrics: parameter count of the checkpoint, duration of the audio
# generated by the inference cell, and a snapshot of GPU load.
print("\n--- Performance Metrics ---")

# Get model size/parameters
# NOTE(review): torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints from a trusted source.
checkpoint = torch.load(model_dir, map_location='cpu')
if isinstance(checkpoint, dict):
    # The weights may sit under 'model' or 'state_dict'; otherwise the dict
    # itself is treated as the state dict.
    model_state = next(
        (checkpoint[key] for key in ('model', 'state_dict') if key in checkpoint),
        checkpoint,
    )
    # Skip non-tensor entries (e.g. step counters or config values) so that
    # bookkeeping fields stored next to the weights do not break the count.
    param_count = sum(
        value.numel() for value in model_state.values() if torch.is_tensor(value)
    )
    # model_state can alias data inside checkpoint; drop it too so that the
    # `del checkpoint` below actually releases the tensors.
    del model_state
else:
    param_count = sum(p.numel() for p in checkpoint.parameters())

print(f"Model parameters: {param_count:,} ({param_count/1e6:.2f}M)")
del checkpoint
if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Use the audio you ALREADY generated in cell 1
audio_duration = len(audio_for_playback) / sample_rate
print(f"Audio duration: {audio_duration:.3f} seconds")

# Note about timing
print("Note: To get accurate RTF, measure the execution time of cell 1")
print("(Time cell 1 takes to run / audio duration = RTF)")

# GPU utilization (current)
if torch.cuda.is_available():
    try:
        gpus = GPUtil.getGPUs()
        if gpus:
            # GPUtil lists every GPU nvidia-smi reports and ignores
            # CUDA_VISIBLE_DEVICES, so gpus[0] is not necessarily the device
            # this notebook runs on. Pick the first device named in
            # CUDA_VISIBLE_DEVICES (by index or UUID) and fall back to gpus[0]
            # when it is unset or unmatched.
            # NOTE(review): index matching assumes CUDA enumerates devices in
            # the same order as nvidia-smi (CUDA_DEVICE_ORDER=PCI_BUS_ID) --
            # confirm on multi-GPU hosts with mixed cards.
            visible = os.environ.get("CUDA_VISIBLE_DEVICES", "").split(",")[0].strip()
            gpu = next(
                (g for g in gpus if visible and visible in (str(g.id), g.uuid)),
                gpus[0],
            )
            print(f"Current GPU utilization: {gpu.load*100:.1f}%, Memory: {gpu.memoryUtil*100:.1f}%")
        else:
            print("GPU available but no GPUs found by GPUtil")
    except Exception as exc:
        # Best-effort diagnostics: report the failure instead of hiding it,
        # and let KeyboardInterrupt / SystemExit propagate.
        print(f"Could not get GPU utilization: {exc}")
else:
    print("No GPU available")


--- Performance Metrics ---
Model parameters: 2,619,971 (2.62M)
Audio duration: 17.821 seconds
Note: To get accurate RTF, measure the execution time of cell 1
(Time cell 1 takes to run / audio duration = RTF)
Current GPU utilization: 0.0%, Memory: 0.2%


In [10]:
# Sanity-check the numbers that feed the RTF calculation: shape, sample rate
# and duration of the generated audio, plus the duration of the reference file.
print("=== Debugging RTF calculation ===")

# Details of the generated audio produced by the inference cell.
print(f"Audio shape: {audio.shape}")
print(f"Sample rate: {sample_rate}")
print(f"Audio length (samples): {len(audio_for_playback)}")

# Calculate duration manually
audio_duration_debug = len(audio_for_playback) / sample_rate
print(f"Audio duration: {audio_duration_debug:.3f} seconds")

# Note about timing
print("To get RTF: Time how long cell 1 takes to run, then divide by audio duration")
print(f"RTF = (cell 1 execution time) / {audio_duration_debug:.3f}")

# Also check the original audio for comparison.
# Use the size of the last axis as the sample count: len() of a squeezed
# multi-channel tensor would return the number of channels instead.
# (Assumes torchaudio's default channels-first layout.)
original_audio_check, original_sr = torchaudio.load("example.wav")
original_duration = original_audio_check.shape[-1] / original_sr
print(f"Original audio duration: {original_duration:.3f} seconds")

=== Debugging RTF calculation ===
Audio shape: torch.Size([1, 392960])
Sample rate: 22050
Audio length (samples): 392960
Audio duration: 17.821 seconds
To get RTF: Time how long cell 1 takes to run, then divide by audio duration
RTF = (cell 1 execution time) / 17.821
Original audio duration: 17.820 seconds


In [None]:
# Scratch cell: prints a fixed greeting to confirm that newly added cells run.
_greeting = "Hello from new cell"
print(_greeting)
