# Test VCC2018

In [1]:
import models
import dataset
import torch    
from collections import OrderedDict

def extract_prefix(prefix, weights):
    """Return the entries of ``weights`` whose key begins with ``prefix``.

    The prefix is removed from every retained key; entries whose key does not
    start with ``prefix`` are dropped. The iteration order of ``weights`` is
    preserved.

    Args:
        prefix: String a key must start with to be kept.
        weights: Mapping from string keys to values.

    Returns:
        OrderedDict mapping the stripped keys to the original values.
    """
    cut = len(prefix)
    return OrderedDict(
        (name[cut:], weights[name])
        for name in weights
        if name.startswith(prefix)
    )


# VCC2018 data; both paths are relative to the notebook's working directory.
ds = dataset.VCC2018DatasetWav2Vec2(
    list_path='../MOSNet/data/mos_list.txt',
    data_path='../MOSNet/data/wav/',
)
model = models.Wav2Vec2MOS().cuda()

# Read the checkpoint onto the CPU first so that loading does not depend on the
# device layout it was saved with; load_state_dict then copies the values into
# the model's (CUDA) parameters.
checkpoint = torch.load('wav2vec2.ckpt', map_location='cpu')
# Keep only the 'model.'-prefixed entries and strip the prefix before loading.
model.load_state_dict(extract_prefix('model.', checkpoint['state_dict']))
_ = model.eval()  # bind the return value so the module repr is not echoed as cell output

comet_ml is installed but `COMET_API_KEY` is not set.
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

  '"sox" backend is being deprecated. '
[NeMo W 2021-05-28 19:42:11 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [2]:
from transformers import Wav2Vec2Model, Wav2Vec2Processor
# Input pre-processor used to turn raw waveforms into model inputs below.
pretrained_name = "facebook/wav2vec2-base"
processor = Wav2Vec2Processor.from_pretrained(pretrained_name)

In [3]:
import tqdm

# Run the model on every item of the 'test' split, keeping each prediction
# next to the reference value returned by the dataset.
gt_mos = []
pred_mos = []
n_test = ds.getlen('test')
for idx in tqdm.trange(n_test):
    waveform, target = ds.getitem('test', idx)
    encoded = processor(waveform, return_tensors="pt", padding=True, sampling_rate=16000)
    with torch.no_grad():
        # The model output is averaged down to a single scalar per utterance.
        prediction = model(encoded.input_values.cuda()).mean()

    gt_mos.append(target.item())
    pred_mos.append(prediction.item())

100%|██████████| 4000/4000 [01:20<00:00, 49.82it/s]


In [4]:
import numpy as np
import scipy.stats as ss

# Utterance-level agreement between predictions and reference scores:
# mean squared error, Pearson correlation (LCC) and Spearman correlation (SRCC).
pred_arr = np.array(pred_mos)
gt_arr = np.array(gt_mos)

mse = np.mean((pred_arr - gt_arr) ** 2)
lcc = np.corrcoef(gt_arr, pred_arr)[0, 1]
srcc = ss.spearmanr(gt_arr, pred_arr)[0]

In [5]:
# Work on a copy so the dataset's own frame keeps its original column names;
# renaming it in place would alter state that `ds` may still rely on when the
# earlier cells are re-run.
df = ds.samples['test'].copy()
df.columns = ['audio', 'true_mos']
# One prediction per row, in the order the inference loop visited the split.
df['predict_mos'] = np.array(pred_mos)

In [6]:
import pandas as pd

sys_df = pd.read_csv('../MOSNet/data/vcc2018_system.csv')

# Build the system key from the file name: the last '_'-separated token
# (without its extension), then '_', then the first token.
name_parts = df['audio'].str.split('_')
last_token = name_parts.str[-1].str.split('.').str[0]
first_token = name_parts.str[0]
df['system_ID'] = last_token + '_' + first_token

# Average the predictions per system, then attach the reference table.
result_mean = df.groupby(['system_ID'])[['predict_mos']].mean()
mer_df = pd.merge(result_mean, sys_df, on='system_ID')

sys_true = mer_df['mean']
sys_predicted = mer_df['predict_mos']

In [7]:
# System-level agreement, computed on the per-system means.
sys_err = sys_true - sys_predicted
sys_mse = (sys_err ** 2).mean()
sys_lcc = np.corrcoef(sys_true, sys_predicted)[0, 1]
sys_srcc = ss.spearmanr(sys_true, sys_predicted)[0]

In [8]:
# Format both summary rows first, then emit them one per line.
utterance_line = '[Utterance] MSE = {:.4f} LCC = {:.4f} SRCC = {:.4f}'.format(mse, lcc, srcc)
system_line = '[System   ] MSE = {:.4f} LCC = {:.4f} SRCC = {:.4f}'.format(sys_mse, sys_lcc, sys_srcc)
print(utterance_line, system_line, sep='\n')

[Utterance] MSE = 0.7079 LCC = 0.6554 SRCC = 0.6244
[System   ] MSE = 0.2302 LCC = 0.9673 SRCC = 0.9337


# Test VCC2016

In [9]:
from io import StringIO

def _read_mos_table(path):
    """Read a tab-separated MOS table, removing every ':MOS' marker first.

    Each line is stripped of surrounding whitespace before parsing. The file
    is opened in a ``with`` block so its handle is closed as soon as the
    lines have been read.

    Args:
        path: Path of the text file to read.

    Returns:
        pandas.DataFrame parsed from the cleaned text.
    """
    with open(path, 'r') as handle:
        cleaned = [line.strip().replace(':MOS', '') for line in handle]
    return pd.read_csv(StringIO('\n'.join(cleaned)), sep='\t')


eh1 = _read_mos_table('mos_EH1.txt')
eh2 = _read_mos_table('mos_EH2.txt')

# Column-wise mean over the rows of both tables: one value per column,
# stored as a single-column frame indexed by the original column labels.
mos_vcc2016 = pd.DataFrame(pd.concat([eh1, eh2]).mean())
mos_vcc2016.columns = ['system_mos']

In [10]:
import glob

# One row per submitted wav; the system label is the wav's parent directory name.
vcc2016 = glob.glob('../vcc2016_submissions/*/*.wav')
parent_dirs = [wav_path.split('/')[-2] for wav_path in vcc2016]

df = pd.DataFrame({'audio': vcc2016, 'system': parent_dirs})

In [11]:
import librosa
import scipy.signal

def load(path):
    """Read the audio file at ``path`` with librosa, requesting a 16 kHz rate.

    Args:
        path: Location of the audio file.

    Returns:
        The sample array only; the sample rate librosa returns is discarded.
    """
    waveform, _ = librosa.load(path, sr=16000)
    return waveform

    
# Score every VCC2016 file, in the row order of `df`.
pred_mos = []
audio_paths = df['audio']

for row in tqdm.trange(len(df)):
    waveform = load(audio_paths.iloc[row])
    encoded = processor(waveform, return_tensors="pt", padding=True, sampling_rate=16000)
    with torch.no_grad():
        # The model output is averaged down to a single scalar per file.
        mos_estimate = model(encoded.input_values.cuda()).mean()

    pred_mos.append(mos_estimate.item())

100%|██████████| 26028/26028 [09:03<00:00, 47.85it/s]


In [12]:
# Attach the per-file predictions as a new column, aligned by row position.
df['pred_mos'] = np.asarray(pred_mos)

In [13]:
# Reference MOS per system, ordered by index label.
gt_mos = np.array(mos_vcc2016.sort_index()['system_mos'])
# Select the numeric column before averaging: calling .mean() on the whole
# grouped frame also tries to average the string 'audio' column, which raises
# a TypeError on pandas >= 2.0. groupby sorts its keys, so the result is
# ordered by system label as well.
# NOTE(review): the two arrays are matched by position only; this assumes the
# MOS table and the submission folders contain the same set of system labels.
pred_mos = np.array(df.groupby('system')['pred_mos'].mean())

In [14]:
def score(gt, pred):
    """Print MSE, Pearson (LCC) and Spearman (SRCC) agreement of ``pred`` with ``gt``.

    Args:
        gt: Reference values; must support elementwise arithmetic with ``pred``.
        pred: Predicted values, same length as ``gt``.

    Returns:
        None. The three metrics are printed on one line with four decimals.
    """
    squared_error = (gt - pred) ** 2
    pearson = np.corrcoef(gt, pred)[0, 1]
    spearman = ss.spearmanr(gt, pred)[0]
    print('MSE = {:.4f} LCC = {:.4f} SRCC = {:.4f}'.format(squared_error.mean(), pearson, spearman))
    
# Print the agreement metrics for the per-system VCC2016 arrays built above.
score(gt_mos, pred_mos)

MSE = 0.9769 LCC = 0.9384 SRCC = 0.8571


# Test on main (score the audio folders under `to_score/`)

In [15]:
import os
# Predicted score of every file in each folder under to_score/, keyed by folder name.
model_scores = {}

for model_name in ['waveglow', 'hifigan_v1', 'hifigan_v2', 'hifigan_v3', 'melgan', 'val_dataset']:
    print(model_name, '...')
    model_scores[model_name] = []
    for f in os.listdir('to_score/' + model_name):
        signal = load(os.path.join('to_score', model_name, f))
        x = processor(signal, return_tensors="pt", padding=True, sampling_rate=16000).input_values
        with torch.no_grad():
            res = model(x.cuda()).mean()
        # Keep a plain Python float, as the earlier evaluation loops do, rather
        # than holding one CUDA tensor per file until the summary below.
        model_scores[model_name].append(res.item())

# Report the mean predicted score per folder.
for model_name in model_scores:
    print(model_name, torch.tensor(model_scores[model_name]).mean().item())

waveglow ...
hifigan_v1 ...
hifigan_v2 ...
hifigan_v3 ...
melgan ...
val_dataset ...
waveglow 3.9508957862854004
hifigan_v1 4.0383405685424805
hifigan_v2 4.030723571777344
hifigan_v3 3.937237501144409
melgan 3.645268201828003
val_dataset 4.23707389831543
