In [1]:
# import gi
# import os
import numpy as np
import librosa
import pandas as pd
from aquatk.metrics.PEAQ.peaq import PEAQ
from aquatk.metrics.PEAQ.peaq_basic import process_audio_files
import torch
# os.environ["GST_PLUGIN_PATH"] = "/usr/local/lib/gstreamer-1.0"

# gi.require_version('Gst', '1.0')

# from gi.repository import Gst

# Gst.init(None)

from torchmetrics.audio import SignalNoiseRatio
from torchmetrics.audio import ScaleInvariantSignalDistortionRatio

In [2]:
import gi
import os

os.environ["GST_PLUGIN_PATH"] = "/usr/local/lib/gstreamer-1.0"

gi.require_version('Gst', '1.0')
from gi.repository import Gst

Gst.init(None)

registry = Gst.Registry.get()
feature = registry.lookup_feature("peaq")

In [3]:
def peaq_score(ref, test):
    pipeline_str = (
        f"filesrc location=\"{os.path.abspath(ref)}\" ! decodebin ! audioconvert ! audioresample ! "
        f"audio/x-raw,format=F32LE,rate=48000,channels=1 ! queue ! peaq name=p "
        f"filesrc location=\"{os.path.abspath(test)}\" ! decodebin ! audioconvert ! audioresample ! "
        f"audio/x-raw,format=F32LE,rate=48000,channels=1 ! queue ! p.test"
    )
    pipeline = Gst.parse_launch(pipeline_str)
    peaq = pipeline.get_by_name("p")
    bus = pipeline.get_bus()
    pipeline.set_state(Gst.State.PLAYING)
    bus.timed_pop_filtered(Gst.CLOCK_TIME_NONE, Gst.MessageType.EOS | Gst.MessageType.ERROR)
    odg = peaq.get_property("odg")
    pipeline.set_state(Gst.State.NULL)
    return odg

In [4]:
peaq = PEAQ()

In [5]:
si_sdr = ScaleInvariantSignalDistortionRatio()
snr = SignalNoiseRatio()

In [6]:
def lsd(ref, test):
    s1 = librosa.stft(ref, n_fft=2048, hop_length=512)
    s2 = librosa.stft(test, n_fft=2048, hop_length=512)
    
    p1 = np.abs(s1)**2
    p2 = np.abs(s2)**2
    
    log_diff = 10 * np.log10(p1 + 1e-12) - 10 * np.log10(p2 + 1e-12)
    
    return np.mean(np.sqrt(np.mean(log_diff**2, axis=0)))

In [7]:
# noisy_path = "data/AhRep_smol.wav"
# denoised_path = "data/AhRep_smol_pred.wav"
# dolby_path = "data/AhRep_smol_dolby.wav"
# clean_path = "data/AhRep_clean.wav"
noisy_path = "data/DS_smol.wav"
denoised_path = "data/DS_smol_pred.wav"
dolby_path = "data/DS_smol_dolby.wav"
clean_path = "data/DS_clean.wav"

sr = 48000
clean, _ = librosa.load(clean_path, sr=sr, res_type='soxr_hq')
noisy, _ = librosa.load(noisy_path, sr=sr, res_type='soxr_hq')
pred, _ = librosa.load(denoised_path, sr=sr, res_type='soxr_hq')
dolby, _ = librosa.load(dolby_path, sr=sr, res_type='soxr_hq')

min_len = min(len(clean), len(noisy), len(pred), len(dolby))
clean, noisy, pred, dolby = clean[:min_len], noisy[:min_len], pred[:min_len], dolby[:min_len]

# peaq.analyze_files(clean_path, noisy_path)
# peaq.analyze_files(clean_path, denoised_path)
# peaq.analyze_files(clean_path, dolby_path)


results = {
    "Metric": ["PEAQ(GST)","PEAQ", "PEAQbasic", "SI-SDR (dB)", "SNR (dB)", "LSD"],
    "Original Noisy": [
        peaq_score(clean_path, noisy_path),
        peaq.analyze_files(clean_path, noisy_path).odg,
        process_audio_files(clean_path, noisy_path)["ODG_list"][-1],
        si_sdr(torch.from_numpy(clean), torch.from_numpy(noisy)),
        snr(torch.from_numpy(clean), torch.from_numpy(noisy)),
        lsd(clean, noisy),
    ],
    "Model": [
        peaq_score(clean_path, denoised_path),
        peaq.analyze_files(clean_path, denoised_path).odg,
        process_audio_files(clean_path, denoised_path)["ODG_list"][-1],
        si_sdr(torch.from_numpy(clean), torch.from_numpy(pred)),
        snr(torch.from_numpy(clean), torch.from_numpy(pred)),
        lsd(clean, pred),
    ],
    "Dolby": [
        peaq_score(clean_path, dolby_path),
        peaq.analyze_files(clean_path, dolby_path).odg,
        process_audio_files(clean_path, dolby_path)["ODG_list"][-1],
        si_sdr(torch.from_numpy(clean), torch.from_numpy(dolby)),
        snr(torch.from_numpy(clean), torch.from_numpy(dolby)),
        lsd(clean, dolby),
    ],
}

   BandwidthRefB: 905.984034
  BandwidthTestB: 905.513445
      Total NMRB: 7.228015
    WinModDiff1B: 24.355310
            ADBB: 2.134196
            EHSB: 1.651176
    AvgModDiff1B: 18.652351
    AvgModDiff2B: 774.813850
   RmsNoiseLoudB: 0.375269
           MFPDB: 1.000000
  RelDistFramesB: 1.000000
Objective Difference Grade: -3.722
   BandwidthRefB: 905.984034
  BandwidthTestB: 905.513445
      Total NMRB: 7.225276
    WinModDiff1B: 24.345964
            ADBB: 2.134005
            EHSB: 1.651236
    AvgModDiff1B: 18.642007
    AvgModDiff2B: 774.087059
   RmsNoiseLoudB: 0.375136
           MFPDB: 1.000000
  RelDistFramesB: 1.000000
Objective Difference Grade: -3.721


Channel 1/2: 100%|██████████| 1094/1094 [00:41<00:00, 26.64it/s]
Channel 2/2: 100%|██████████| 1094/1094 [00:41<00:00, 26.35it/s]


int16
9108 -11012
int16
9185 -11154


100%|██████████| 1094/1094 [00:41<00:00, 26.38it/s]


Distortion Index: -1.5691365941723532, Objective Difference Grade: -3.2333163217145415
   BandwidthRefB: 919.087322
  BandwidthTestB: 912.035264
      Total NMRB: 3.825056
    WinModDiff1B: 16.084964
            ADBB: 1.964296
            EHSB: 14.142104
    AvgModDiff1B: 11.889239
    AvgModDiff2B: 234.945406
   RmsNoiseLoudB: 0.375683
           MFPDB: 1.000000
  RelDistFramesB: 1.000000
Objective Difference Grade: -2.846
   BandwidthRefB: 919.088926
  BandwidthTestB: 912.040268
      Total NMRB: 3.822257
    WinModDiff1B: 16.082160
            ADBB: 1.964164
            EHSB: 14.132085
    AvgModDiff1B: 11.911689
    AvgModDiff2B: 234.837986
   RmsNoiseLoudB: 0.376055
           MFPDB: 1.000000
  RelDistFramesB: 1.000000
Objective Difference Grade: -2.848


Channel 1/2: 100%|██████████| 1094/1094 [00:41<00:00, 26.38it/s]
Channel 2/2: 100%|██████████| 1094/1094 [00:41<00:00, 26.55it/s]


int16
9108 -11012
int16
9161 -11146


100%|██████████| 1094/1094 [00:41<00:00, 26.66it/s]


Distortion Index: -0.4007517022025223, Objective Difference Grade: -2.294732023407698
   BandwidthRefB: 919.147775
  BandwidthTestB: 914.941226
      Total NMRB: -0.808709
    WinModDiff1B: 13.538673
            ADBB: 1.803438
            EHSB: 3.075342
    AvgModDiff1B: 11.478913
    AvgModDiff2B: 270.358391
   RmsNoiseLoudB: 0.272867
           MFPDB: 1.000000
  RelDistFramesB: 0.987406
Objective Difference Grade: -2.880
   BandwidthRefB: 919.149329
  BandwidthTestB: 914.942953
      Total NMRB: -0.809423
    WinModDiff1B: 13.536052
            ADBB: 1.803777
            EHSB: 3.079519
    AvgModDiff1B: 11.500885
    AvgModDiff2B: 270.237117
   RmsNoiseLoudB: 0.273288
           MFPDB: 1.000000
  RelDistFramesB: 0.987416
Objective Difference Grade: -2.882


Channel 1/2: 100%|██████████| 1094/1094 [00:41<00:00, 26.44it/s]
Channel 2/2: 100%|██████████| 1094/1094 [00:41<00:00, 26.28it/s]


int16
9108 -11012
int16
9190 -11232


100%|██████████| 1094/1094 [00:41<00:00, 26.28it/s]


Distortion Index: -0.4330821000324772, Objective Difference Grade: -2.322270244044949


In [8]:
df = pd.DataFrame(results)
print(df.to_string(index=False))

     Metric  Original Noisy           Model           Dolby
  PEAQ(GST)       -3.721741       -2.845786       -2.879942
       PEAQ       -3.360937       -2.279436       -3.517137
  PEAQbasic       -2.963845       -2.234199       -3.052231
SI-SDR (dB) tensor(24.9931) tensor(23.4466) tensor(16.6954)
   SNR (dB) tensor(24.9787) tensor(23.4481) tensor(16.7832)
        LSD       19.959263       12.139639       13.510159
