In [1]:
from inaGVAD.vad_metrics import VadEval
from inaGVAD.metadata import fileid2metadata
import glob
import pandas as pd

In [2]:
# Build the metadata table: one record per .trs annotation file found on disk.
trs_paths = glob.glob('../../annotations/trs/*.trs')
df = pd.DataFrame.from_records(list(map(fileid2metadata, trs_paths)))
df

Unnamed: 0,fileid,media,channel_code,channel_name,channel_category,broadcast_hour
0,tv-C+_-141814,tv,C+_,Canal+,generalist_tv,14
1,tv-TF1-194413,tv,TF1,TF1,generalist_tv,19
2,radio-FIF-103502,radio,FIF,France Info,generalist_radio,10
3,radio-FBL-012717,radio,FBL,France Bleu,music_radio,1
4,radio-FBL-011904,radio,FBL,France Bleu,music_radio,1
...,...,...,...,...,...,...
272,radio-FMU-000842,radio,FMU,France Musique,music_radio,0
273,tv-C+N-184644,tv,C+N,CNEWS,news_tv,18
274,tv-LCI-200527,tv,LCI,LCI,news_tv,20
275,radio-MUV-031209,radio,MUV,Mouv',music_radio,3


In [3]:
# Baseline systems to evaluate. Each name is appended to
# '../../automatic_baselines/' by the evaluation loop and is also used as the
# name of that system's accuracy column.
lsystems = [
    'inaspeechsegmenter',
    'liumspkdiarization_csv',
    'rvad_csv',
    'silero_vad',
    'pyannote_vad',
    'speechbrain_vad',
]

# Single evaluator shared by all systems (collar=0.3; unit as defined by VadEval).
ve = VadEval(collar=0.3)

In [4]:
# Score every baseline against the reference annotations, print its overall
# summary, and attach its per-file accuracy to `df` as a column named after
# the system (rows are matched on basename == fileid).
for system in lsystems:
    per_file, summary = ve.compare_directories('../../annotations/vad/', '../../automatic_baselines/' + system)
    print(system, summary)
    system_acc = (
        per_file[['basename', 'accuracy']]
        .rename(columns={'accuracy': system})
        .set_index('basename')
    )
    # Drop a column left over from a previous execution of this cell first:
    # joining a column that already exists raises "columns overlap but no
    # suffix specified", which made the cell fail when run twice in a row.
    df = df.drop(columns=[system], errors='ignore').join(system_acc, on='fileid')

inaspeechsegmenter {'accuracy': 0.9328096686186261, 'precision': 0.9216109208956147, 'recall': 0.9724278249623988, 'fmeasure': 0.9463376662467702}
liumspkdiarization_csv {'accuracy': 0.8412262485689355, 'precision': 0.8139075360910878, 'recall': 0.9585612039848338, 'fmeasure': 0.8803316753494729}
rvad_csv {'accuracy': 0.6974821088462414, 'precision': 0.6778461662612372, 'recall': 0.9594465372805209, 'fmeasure': 0.7944299215667192}
silero_vad {'accuracy': 0.8484739332910665, 'precision': 0.8117290461870477, 'recall': 0.9781648446105309, 'fmeasure': 0.8872088121107543}
pyannote_vad {'accuracy': 0.8905697128181659, 'precision': 0.8541998983114646, 'recall': 0.989233983070536, 'fmeasure': 0.9167712238332169}
speechbrain_vad {'accuracy': 0.9104895674326066, 'precision': 0.8865004181586101, 'recall': 0.9783388505035628, 'fmeasure': 0.930158233630953}


In [5]:
# Per-file mean accuracy over inaspeechsegmenter, pyannote_vad and speechbrain_vad.
acc_sum = df['inaspeechsegmenter'] + df['pyannote_vad'] + df['speechbrain_vad']
df['best3acc'] = acc_sum / 3.

In [6]:
def difficulty(x, hard_below=.85, easy_from=.95):
    """Map a score to a coarse difficulty label.

    Parameters
    ----------
    x : float
        Score to classify (in this notebook: a per-file accuracy).
    hard_below : float, default 0.85
        Scores strictly below this value are labelled 'HARD'.
    easy_from : float, default 0.95
        Scores greater than or equal to this value are labelled 'EASY'.

    Returns
    -------
    str
        'HARD', 'EASY' or 'MEDIUM'. 'MEDIUM' is the fall-through label, so a
        NaN score (which compares false against both thresholds) is also
        labelled 'MEDIUM'.
    """
    if x < hard_below:
        return 'HARD'
    if x >= easy_from:
        return 'EASY'
    return 'MEDIUM'

# Label every file with the difficulty class of its averaged accuracy.
df['vad_difficulty'] = df['best3acc'].map(difficulty)

In [7]:
# Remove the per-system accuracy columns and their average from the table.
df = df.drop(columns=lsystems + ['best3acc'])

In [13]:
# Dev/test split stratified by channel category: within every category a fixed
# number of randomly chosen files goes to the dev set, the rest to the test set.
DEV_FILES_PER_CATEGORY = 15
# Fixed seed so that the split can be regenerated exactly. NOTE: a split
# produced by an earlier, unseeded run of this cell cannot be reproduced and
# will differ from the one obtained here.
SPLIT_SEED = 42

ldev = []
ltest = []

for _, category_df in df.groupby('channel_category'):
    # Sort before shuffling: the row order of `df` comes from glob.glob, whose
    # ordering depends on the file system, and a seeded shuffle is only
    # reproducible when its input order is fixed.
    shuffled = (
        category_df
        .sort_values(by='fileid')
        .sample(frac=1, random_state=SPLIT_SEED)
        .reset_index(drop=True)
    )
    ldev.append(shuffled[:DEV_FILES_PER_CATEGORY])
    ltest.append(shuffled[DEV_FILES_PER_CATEGORY:])

In [14]:
# Merge the per-category pieces of each split, order the rows by file id and
# renumber the index from 0.
dfdev, dftest = (
    pd.concat(parts).sort_values(by='fileid').reset_index(drop=True)
    for parts in (ldev, ltest)
)


In [15]:
# Write both splits to CSV, leaving out the positional index column.
split_files = {
    '../../annotations/filesplit/devset.csv': dfdev,
    '../../annotations/filesplit/testset.csv': dftest,
}
for csv_path, split_df in split_files.items():
    split_df.to_csv(csv_path, index=False)

In [18]:
# Dev set: number of files per channel category, then the count and the
# percentage of each difficulty level inside that category.
for category, group in dfdev.groupby('channel_category'):
    total = len(group)
    print(category, total)
    for level in ('EASY', 'MEDIUM', 'HARD'):
        nb = (group['vad_difficulty'] == level).sum()
        print('  ', level, nb, 100 * nb / total)

generalist_radio 15
   EASY 10 66.66666666666667
   MEDIUM 5 33.333333333333336
   HARD 0 0.0
generalist_tv 15
   EASY 2 13.333333333333334
   MEDIUM 9 60.0
   HARD 4 26.666666666666668
music_radio 15
   EASY 7 46.666666666666664
   MEDIUM 4 26.666666666666668
   HARD 4 26.666666666666668
news_tv 15
   EASY 9 60.0
   MEDIUM 5 33.333333333333336
   HARD 1 6.666666666666667


In [19]:
# Test set: number of files per channel category, then the count and the
# percentage of each difficulty level inside that category.
for category, group in dftest.groupby('channel_category'):
    total = len(group)
    print(category, total)
    for level in ('EASY', 'MEDIUM', 'HARD'):
        nb = (group['vad_difficulty'] == level).sum()
        print('  ', level, nb, 100 * nb / total)

generalist_radio 34
   EASY 25 73.52941176470588
   MEDIUM 6 17.647058823529413
   HARD 3 8.823529411764707
generalist_tv 93
   EASY 17 18.27956989247312
   MEDIUM 49 52.68817204301075
   HARD 27 29.032258064516128
music_radio 46
   EASY 25 54.34782608695652
   MEDIUM 8 17.391304347826086
   HARD 13 28.26086956521739
news_tv 44
   EASY 28 63.63636363636363
   MEDIUM 15 34.09090909090909
   HARD 1 2.272727272727273
