# Test VCC2018

In [1]:
import warnings; warnings.simplefilter('ignore')
import models
import dataset
import torch    
from collections import OrderedDict

def extract_prefix(prefix, weights):
    """Select the entries of ``weights`` whose key starts with ``prefix``.

    The prefix is removed from every kept key. In this notebook it is used to
    strip the leading ``'model.'`` from checkpoint ``state_dict`` keys before
    they are handed to ``model.load_state_dict``.

    Args:
        prefix: String a key must start with in order to be kept.
        weights: Mapping from string keys to values.

    Returns:
        An ``OrderedDict`` holding the kept values, in the iteration order of
        ``weights``, keyed by the part of each key that follows ``prefix``.
    """
    # `startswith` states the intent directly and, unlike `find(...) == 0`,
    # does not scan the rest of the key when the prefix is absent.
    return OrderedDict(
        (key[len(prefix):], value)
        for key, value in weights.items()
        if key.startswith(prefix)
    )

comet_ml is installed but `COMET_API_KEY` is not set.
################################################################################
###          (please add 'export KALDI_ROOT=<your_path>' in your $HOME/.profile)
###          (or run as: KALDI_ROOT=<your_path> python <your_script>.py)
################################################################################

[NeMo W 2021-05-29 21:31:09 experimental:28] Module <class 'nemo.collections.asr.data.audio_to_text_dali.AudioToCharDALIDataset'> is experimental, not ready for production and is not fully supported. Use at your own risk.


In [3]:
# Evaluation settings: set the checkpoint path and the model type here.
# Checkpoint file read with torch.load; its 'state_dict' entry supplies the weights.
CHECKPOINT_PATH = '../../my_mosnet/mbblock_ep40.ckpt'
# Passed as `data_path` to dataset.VCC2018Dataset.
VCC2018_PATH = '../../MOSNet/data/wav/'
# Root directory searched with the pattern '*/*.wav' in the VCC2016 section.
VCC2016_PATH = '../../vcc2016_submissions/'
# Model class; it is instantiated with no arguments before the weights are loaded.
MODEL_TYPE = models.MOSNetBatchNorm

In [4]:
# VCC2018 utterance listing; the 'test' split is evaluated in the cells that follow.
ds = dataset.VCC2018Dataset(list_path='data/mos_list.txt', data_path=VCC2018_PATH)

# Deserialize the checkpoint onto the CPU so loading does not depend on the
# device it was saved from and does not hold a second copy of the weights on
# the GPU; load_state_dict copies the values into the CUDA model.
state_dict = torch.load(CHECKPOINT_PATH, map_location='cpu')['state_dict']

model = MODEL_TYPE().cuda()
# Only the checkpoint entries whose key starts with 'model.' are used, with
# that prefix stripped.
model.load_state_dict(extract_prefix('model.', state_dict))
# Switch to inference mode; the trailing semicolon hides the module repr
# without overwriting IPython's `_`.
model.eval();

In [5]:
import tqdm

# Collect one predicted and one ground-truth score per VCC2018 test utterance.
pred_mos, gt_mos = [], []

n_test = ds.getlen('test')
for idx in tqdm.trange(n_test):
    spectrogram, target = ds.getitem('test', idx)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # The model output is averaged into a single scalar per utterance.
        utterance_score = model(spectrogram.cuda()).mean()

    pred_mos.append(utterance_score.item())
    gt_mos.append(target.item())

100%|██████████| 4000/4000 [00:26<00:00, 149.89it/s]


In [6]:
import numpy as np
import scipy.stats as ss

# Utterance-level agreement between predicted and ground-truth MOS.
utt_pred = np.array(pred_mos)
utt_true = np.array(gt_mos)

mse = np.mean((utt_pred - utt_true) ** 2)      # mean squared error
lcc = np.corrcoef(utt_true, utt_pred)[0, 1]    # Pearson (linear) correlation
srcc = ss.spearmanr(utt_true, utt_pred)[0]     # Spearman rank correlation

In [7]:
# Work on a private copy of the test listing so the frame still held by `ds`
# keeps its original column names; renaming it in place would silently change
# the dataset's own state.
df = ds.samples['test'].copy()
df.columns = ['audio', 'true_mos']
# One prediction per test utterance, in the order the loop produced them.
# NOTE(review): assumes row i of the listing is the item returned by
# ds.getitem('test', i) — confirm against the dataset class.
df['predict_mos'] = np.array(pred_mos)

In [8]:
import pandas as pd

# Per-system reference scores; the 'system_ID' and 'mean' columns are used below.
sys_df = pd.read_csv('data/vcc2018_system.csv')

# Build the system identifier from the file name: the last '_'-separated token
# (up to its first '.') followed by '_' and the first '_'-separated token.
name_parts = df['audio'].str.split('_')
last_token_stem = name_parts.str[-1].str.split('.').str[0]
df['system_ID'] = last_token_stem + '_' + name_parts.str[0]

# Mean predicted MOS per system, joined with the reference table on the system identifier.
result_mean = df.groupby(['system_ID'])[['predict_mos']].mean()
mer_df = pd.merge(result_mean, sys_df, on='system_ID')

sys_true, sys_predicted = mer_df['mean'], mer_df['predict_mos']

In [9]:
# System-level agreement between reference and mean predicted MOS.
sys_residual = sys_true - sys_predicted
sys_mse = sys_residual.pow(2).mean()                     # mean squared error
sys_lcc = np.corrcoef(sys_true, sys_predicted)[0, 1]     # Pearson (linear) correlation
sys_srcc = ss.spearmanr(sys_true, sys_predicted)[0]      # Spearman rank correlation

In [10]:
# Print the VCC2018 metrics, one line per aggregation level.
report_rows = (
    ('[Utterance] MSE = {:.4f} LCC = {:.4f} SRCC = {:.4f}', (mse, lcc, srcc)),
    ('[System   ] MSE = {:.4f} LCC = {:.4f} SRCC = {:.4f}', (sys_mse, sys_lcc, sys_srcc)),
)
for template, metrics in report_rows:
    print(template.format(*metrics))

[Utterance] MSE = 0.4349 LCC = 0.6908 SRCC = 0.6611
[System   ] MSE = 0.0381 LCC = 0.9796 SRCC = 0.9381


# Test VCC2016

In [13]:
from io import StringIO

def _read_mos_table(path):
    """Read a tab-separated VCC2016 MOS file into a DataFrame.

    Every line is stripped of surrounding whitespace and has each ':MOS'
    occurrence removed before the text is parsed.

    Args:
        path: Path of the text file to read.

    Returns:
        The DataFrame produced by ``pd.read_csv(..., sep='\\t')`` on the
        cleaned text.
    """
    # The context manager closes the file as soon as it has been read.
    with open(path, 'r') as handle:
        cleaned = '\n'.join(line.strip().replace(':MOS', '') for line in handle)
    return pd.read_csv(StringIO(cleaned), sep='\t')


eh1 = _read_mos_table('data/vcc16/mos_EH1.txt')
eh2 = _read_mos_table('data/vcc16/mos_EH2.txt')

# Column-wise mean over the rows of both files, stored as a one-column frame
# indexed by the original column labels.
mos_vcc2016 = pd.concat([eh1, eh2]).mean().to_frame('system_mos')

In [16]:
import glob
import os

# All wav files exactly one directory below VCC2016_PATH. glob returns matches
# in arbitrary order, so sort to make the listing reproducible across runs.
vcc2016 = sorted(glob.glob(os.path.join(VCC2016_PATH, '*/*.wav')))

df = pd.DataFrame({
    'audio': vcc2016,
    # The system label is the name of the directory containing the file;
    # os.path handles whichever path separator the platform uses.
    'system': [os.path.basename(os.path.dirname(path)) for path in vcc2016],
})

In [17]:
import librosa
import scipy.signal

def load(path):
    """Load an audio file and return its magnitude spectrogram as a 4-D tensor.

    The audio is read with ``librosa.load`` at a 16 kHz sampling rate and
    transformed with an STFT (``n_fft=512``, ``hop_length=256``,
    ``win_length=512``, Hamming window).

    Args:
        path: Path of the audio file.

    Returns:
        A float32 ``torch`` tensor of shape ``[1, 1, time, 257]``.
    """
    signal, _ = librosa.load(path, sr=16000)

    # The window is passed as a callable. `scipy.signal.windows.hamming` is the
    # same function as the old `scipy.signal.hamming` alias, which was removed
    # in SciPy 1.13.
    stft = librosa.stft(
        signal,
        n_fft=512,
        hop_length=256,
        win_length=512,
        window=scipy.signal.windows.hamming,
    )
    spec = np.abs(stft.T).astype(np.float32)  # [time, 257]

    # Add two leading singleton axes: [time, 257] -> [1, 1, time, 257].
    return torch.as_tensor(spec).unsqueeze(0).unsqueeze(1)

    
# One predicted score per VCC2016 file, in the row order of `df`.
pred_mos = []

n_files = df.shape[0]
for idx in tqdm.trange(n_files):
    wav_path = df['audio'].iloc[idx]
    spectrogram = load(wav_path)

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        # The model output is averaged into a single scalar per file.
        file_score = model(spectrogram.cuda()).mean()

    pred_mos.append(file_score.item())

100%|██████████| 26028/26028 [02:50<00:00, 152.51it/s]


In [18]:
# Attach the per-file predictions to the file listing. This expects `pred_mos`
# to still be the per-file list from the inference loop; a later cell rebinds
# the name to per-system means.
df['pred_mos'] = np.asarray(pred_mos)

In [19]:
# Ground-truth MOS per entry of `mos_vcc2016`, ordered by the frame's index.
gt_mos = np.array(mos_vcc2016.sort_index()['system_mos'])
# Mean predicted MOS per system, ordered by system label. Only the numeric
# 'pred_mos' column is averaged: averaging the whole grouped frame would also
# try to average the string 'audio' column, which pandas >= 2.0 rejects with a
# TypeError.
# NOTE(review): the two arrays are paired purely by sorted order; this assumes
# the index of `mos_vcc2016` and the 'system' labels are the same set of names
# — confirm.
pred_mos = np.array(df.groupby('system')['pred_mos'].mean())

In [20]:
def score(gt, pred):
    """Print MSE, Pearson correlation (LCC) and Spearman correlation (SRCC).

    Args:
        gt: Array of reference values.
        pred: Array of predicted values, same length as ``gt``.

    Returns:
        None. The three metrics are printed on one line with four decimals.
    """
    squared_error = np.square(gt - pred)
    pearson = np.corrcoef(gt, pred)[0, 1]
    spearman = ss.spearmanr(gt, pred)[0]
    print('MSE = {:.4f} LCC = {:.4f} SRCC = {:.4f}'.format(squared_error.mean(), pearson, spearman))
    
# Report system-level agreement on VCC2016: `gt_mos` against `pred_mos`, the per-system mean predictions.
score(gt_mos, pred_mos)

MSE = 0.4226 LCC = 0.9347 SRCC = 0.8872
