forked from igormq/asr-study
-
Notifications
You must be signed in to change notification settings - Fork 0
/
brsd.py
102 lines (77 loc) · 3.45 KB
/
brsd.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from datasets import DatasetParser
from datasets import CSLU, VoxForge, Sid, LapsBM
import os
import re
import librosa
import codecs
from utils.generic_utils import get_from_module
class BRSD(DatasetParser):
""" Brazilian Portuguese Speech dataset reader and parser
This dataset is a combination of four smaller datasets (voxforge, lapsbm,
sid, and cslu spoltech port). The dataset was divided in the following
way:
* Train: voxforge, sid, and cslu spoltech port
* Valid: 5 women and 15 men from LaspBM
* Test: 5 women 10 men from LapsBM (without overlapping with valid set
either in speaker and utterance spoken)
After cleaning (removing label with zero length, label with numeric
digits, e.g., 4 instead of four) the training set contains 11702
utterances with 425 speakers.
"""
def __init__(self, dataset_dir=None, name='brsp', **kwargs):
dataset_dir = dataset_dir or {'lapsbm': None,
'voxforge': None,
'sid': None,
'cslu': None}
super(BRSD, self).__init__(dataset_dir, name, **kwargs)
@property
def dataset_dir(self):
"""Filepath to the dataset directory"""
return self._dataset_dir
@dataset_dir.setter
def dataset_dir(self, value):
"""Filepath to the dataset directory"""
if value is None:
raise ValueError("You must set the variable dataset_dir (the location of dataset) before continue")
if not isinstance(value, dict):
raise ValueError("dataset_dir must be a dictionary")
for key in ('lapsbm', 'voxforge', 'sid'):
if key not in value:
raise KeyError("dataset_dir must have the key %s" % key)
if 'cslu' not in value:
self._logger.warning('CSLU not found. Ignoring it.')
self._dataset_dir = value
def _iter(self):
for name, path in self.dataset_dir.items():
if name == 'lapsbm':
continue
try:
dataset_cls = get_from_module('datasets*', name, regex=True)
dataset = dataset_cls(dataset_dir=path)
for d in dataset._iter():
yield {'duration': d['duration'],
'input': d['input'],
'label': d['label'],
'speaker': '%s_%s' % (str(dataset), d['speaker']),
'dataset': 'train'}
except ValueError, e:
self._logger.warning('Skipping dataset %s: %s' % (name, e.message))
# Test and valid set
lapsbm = LapsBM(dataset_dir=self.dataset_dir['lapsbm'], split=True)
for d in lapsbm._iter():
yield {'duration': d['duration'],
'input': d['input'],
'label': d['label'],
'speaker': '%s_%s' % (str(dataset), d['speaker']),
'dataset': d['dataset']}
def _report(self, dl):
report = '''General information:
Number of utterances: %d
Total size (in seconds) of utterances: %.f
Number of speakers: %d''' % (len(dl['input']),
sum(dl['duration']),
len(set(dl['speaker'])))
return report