code/pylearn2/datasets/timit.py

"""
Pylearn2 wrapper for the TIMIT dataset
"""
__authors__ = ["Vincent Dumoulin"]
__copyright__ = "Copyright 2014, Universite de Montreal"
__credits__ = ["Laurent Dinh", "Vincent Dumoulin"]
__license__ = "3-clause BSD"
__maintainer__ = "Vincent Dumoulin"
__email__ = "dumouliv@iro"


import os.path
import functools
import numpy
from pylearn2.utils.iteration import resolve_iterator_class
from pylearn2.datasets.dense_design_matrix import DenseDesignMatrix
from pylearn2.datasets.dataset import Dataset
from pylearn2.space import CompositeSpace, VectorSpace, IndexSpace, Conv2DSpace
from research.code.pylearn2.space import (
    VectorSequenceSpace,
    IndexSequenceSpace,
)
from pylearn2.utils import serial
from pylearn2.utils import safe_zip
from research.code.scripts.segmentaxis import segment_axis
from research.code.pylearn2.utils.iteration import FiniteDatasetIterator
import scipy.stats


def index_from_one_hot(one_hot):
    return numpy.where(one_hot == 1.0)[0][0]

class TIMIT(Dataset):
    """
    Frame-based TIMIT dataset
    """
    _default_seed = (17, 2, 946)

    # Mean and standard deviation of the acoustic samples from the whole
    # dataset (train, valid, test).
    _mean = 0.0035805809921434142
    _std = 542.48824133746177

    def __init__(self, which_set, frame_length, overlap=0,
                 frames_per_example=1, start=0, stop=None, audio_only=False,
                 rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in a frame
        overlap : int, optional
            Number of overlapping acoustic samples for two consecutive frames.
            Defaults to 0, meaning frames don't overlap.
        frames_per_example : int, optional
            Number of frames in a training example. Defaults to 1.
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.overlap = overlap
        self.frames_per_example = frames_per_example
        self.offset = self.frame_length - self.overlap
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                         in self.phones]) + 1
            self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                           in self.phonemes]) + 1
            self.num_words = numpy.max([numpy.max(sequence) for sequence
                                        in self.words]) + 1
            # The following is hard coded. However, the way it is done above
            # could be problematic if a max value (the max over the whole
            # dataset (train + valid + test)) is not present in at least one
            # one of the three subsets. This is the case for speakers. This is
            # not the case for phones.
            self.num_speakers = 630

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
                self.speaker_id = self.speaker_id[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]
                self.speaker_id = self.speaker_id[start:]

        examples_per_sequence = [0]

        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            if not self.audio_only:
                # Phones segmentation
                phones_sequence = self.phones[sequence_id]
                phones_segmented_sequence = segment_axis(phones_sequence,
                                                         frame_length,
                                                         overlap)
                self.phones[sequence_id] = phones_segmented_sequence
                # phones_segmented_sequence = scipy.stats.mode(
                #     phones_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phones_segmented_sequence = numpy.asarray(
                #     phones_segmented_sequence,
                #     dtype='int'
                # )
                # phones_sequence_list.append(phones_segmented_sequence)
                # Phonemes segmentation
                phonemes_sequence = self.phonemes[sequence_id]
                phonemes_segmented_sequence = segment_axis(phonemes_sequence,
                                                           frame_length,
                                                           overlap)
                self.phonemes[sequence_id] = phonemes_segmented_sequence
                # phonemes_segmented_sequence = scipy.stats.mode(
                #     phonemes_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # phonemes_segmented_sequence = numpy.asarray(
                #     phonemes_segmented_sequence,
                #     dtype='int'
                # )
                # phonemes_sequence_list.append(phonemes_segmented_sequence)
                # Words segmentation
                words_sequence = self.words[sequence_id]
                words_segmented_sequence = segment_axis(words_sequence,
                                                        frame_length,
                                                        overlap)
                self.words[sequence_id] = words_segmented_sequence
                # words_segmented_sequence = scipy.stats.mode(
                #     words_segmented_sequence,
                #     axis=1
                # )[0].flatten()
                # words_segmented_sequence = numpy.asarray(words_segmented_sequence,
                #                                          dtype='int')
                # words_sequence_list.append(words_segmented_sequence)

            # TODO: look at this, does it force copying the data?
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      overlap)
            self.raw_wav[sequence_id] = samples_segmented_sequence

            # TODO: change me
            # Generate features/targets/phones/phonemes/words map
            num_frames = samples_segmented_sequence.shape[0]
            num_examples = num_frames - self.frames_per_example
            examples_per_sequence.append(num_examples)

        self.cumulative_example_indexes = numpy.cumsum(examples_per_sequence)
        self.samples_sequences = self.raw_wav
        if not self.audio_only:
            self.phones_sequences = self.phones
            self.phonemes_sequences = self.phonemes
            self.words_sequences = self.words
        self.num_examples = self.cumulative_example_indexes[-1]

        # DataSpecs
        features_space = VectorSpace(
            dim=self.frame_length * self.frames_per_example
        )
        features_source = 'features'
        def features_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index:example_index
                    + self.frames_per_example].ravel())
            return rval

        targets_space = VectorSpace(dim=self.frame_length)
        targets_source = 'targets'
        def targets_map_fn(indexes):
            rval = []
            for sequence_index, example_index in self._fetch_index(indexes):
                rval.append(self.samples_sequences[sequence_index][example_index
                    + self.frames_per_example].ravel())
            return rval

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        map_fn_components = [features_map_fn, targets_map_fn]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSpace(max_labels=self.num_phones, dim=1,
                                      dtype=str(self.phones_sequences[0].dtype))
            phones_source = 'phones'
            def phones_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phones_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            phonemes_space = IndexSpace(max_labels=self.num_phonemes, dim=1,
                                        dtype=str(self.phonemes_sequences[0].dtype))
            phonemes_source = 'phonemes'
            def phonemes_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.phonemes_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            words_space = IndexSpace(max_labels=self.num_words, dim=1,
                                     dtype=str(self.words_sequences[0].dtype))
            words_source = 'words'
            def words_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.words_sequences[sequence_index][example_index
                        + self.frames_per_example].ravel())
                return rval

            speaker_id_space = IndexSpace(max_labels=self.num_speakers, dim=1,
                                          dtype=str(self.speaker_id.dtype))
            speaker_id_source = 'speaker_id'
            def speaker_id_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    rval.append(self.speaker_id[sequence_index].ravel())
                return rval

            dialect_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            dialect_source = 'dialect'
            def dialect_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[1:9]))
                return rval

            education_space = IndexSpace(max_labels=6, dim=1, dtype='int32')
            education_source = 'education'
            def education_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[9:15]))
                return rval

            race_space = IndexSpace(max_labels=8, dim=1, dtype='int32')
            race_source = 'race'
            def race_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[16:24]))
                return rval
              
            gender_space = IndexSpace(max_labels=2, dim=1, dtype='int32')
            gender_source = 'gender'
            def gender_map_fn(indexes):
                rval = []
                for sequence_index, example_index in self._fetch_index(indexes):
                    info = self.speaker_info_list[self.speaker_id[sequence_index]]
                    rval.append(index_from_one_hot(info[24:]))
                return rval

            space_components.extend([phones_space, phonemes_space,
                                     words_space, speaker_id_space,
                                     dialect_space, education_space,
                                     race_space, gender_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source, speaker_id_source,
                                     dialect_source, education_source,
                                     race_source, gender_source])
            map_fn_components.extend([phones_map_fn, phonemes_map_fn,
                                     words_map_fn, speaker_id_map_fn,
                                     dialect_map_fn, education_map_fn,
                                     race_map_fn, gender_map_fn])
            batch_components.extend([None, None, None, None, None, None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.map_functions = tuple(map_fn_components)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))

    def _fetch_index(self, indexes):
        digit = numpy.digitize(indexes, self.cumulative_example_indexes) - 1
        return zip(digit,
                   numpy.array(indexes) - self.cumulative_example_indexes[digit])

    def _load_data(self, which_set):
        """
        Load the TIMIT data from disk.

        Parameters
        ----------
        which_set : str
            Subset of the dataset to use (either "train", "valid" or "test")
        """
        # Check which_set
        if which_set not in ['train', 'valid', 'test']:
            raise ValueError(which_set + " is not a recognized value. " +
                             "Valid values are ['train', 'valid', 'test'].")

        # Create file paths
        timit_base_path = os.path.join(os.environ["PYLEARN2_DATA_PATH"],
                                       "timit/readable")
        speaker_info_list_path = os.path.join(timit_base_path, "spkrinfo.npy")
        phonemes_list_path = os.path.join(timit_base_path,
                                          "reduced_phonemes.pkl")
        words_list_path = os.path.join(timit_base_path, "words.pkl")
        speaker_features_list_path = os.path.join(timit_base_path,
                                                  "spkr_feature_names.pkl")
        speaker_id_list_path = os.path.join(timit_base_path,
                                            "speakers_ids.pkl")
        raw_wav_path = os.path.join(timit_base_path, which_set + "_x_raw.npy")
        phonemes_path = os.path.join(timit_base_path,
                                     which_set + "_x_phonemes.npy")
        phones_path = os.path.join(timit_base_path,
                                     which_set + "_x_phones.npy")
        words_path = os.path.join(timit_base_path, which_set + "_x_words.npy")
        speaker_path = os.path.join(timit_base_path,
                                    which_set + "_spkr.npy")

        # Load data. For now most of it is not used, as only the acoustic
        # samples are provided, but this is bound to change eventually.
        # Global data
        if not self.audio_only:
            self.speaker_info_list = serial.load(
                speaker_info_list_path
            ).tolist().toarray()
            self.speaker_id_list = serial.load(speaker_id_list_path)
            self.speaker_features_list = serial.load(speaker_features_list_path)
            self.words_list = serial.load(words_list_path)
            self.phonemes_list = serial.load(phonemes_list_path)
        # Set-related data
        self.raw_wav = serial.load(raw_wav_path)
        if not self.audio_only:
            self.phonemes = serial.load(phonemes_path)
            self.phones = serial.load(phones_path)
            self.words = serial.load(words_path)
            self.speaker_id = numpy.asarray(serial.load(speaker_path), 'int')

    def _validate_source(self, source):
        """
        Verify that all sources in the source tuple are provided by the
        dataset. Raise an error if some requested source is not available.

        Parameters
        ----------
        source : `tuple` of `str`
            Requested sources
        """
        for s in source:
            try:
                self.data_specs[1].index(s)
            except ValueError:
                raise ValueError("the requested source named '" + s + "' " +
                                 "is not provided by the dataset")

    def get_data_specs(self):
        """
        Returns the data_specs specifying how the data is internally stored.

        This is the format the data returned by `self.get_data()` will be.

        .. note::

            Once again, this is very hacky, as the data is not stored that way
            internally. However, the data that's returned by `TIMIT.get()`
            _does_ respect those data specs.
        """
        return self.data_specs

    def get(self, source, indexes):
        """
        .. todo::

            WRITEME
        """
        if type(indexes) is slice:
            indexes = numpy.arange(indexes.start, indexes.stop)
        self._validate_source(source)
        rval = []
        for so in source:
            batch = self.map_functions[self.data_specs[1].index(so)](indexes)
            batch_buffer = self.batch_buffers[self.data_specs[1].index(so)]
            dim = self.data_specs[0].components[self.data_specs[1].index(so)].dim
            if batch_buffer is None or batch_buffer.shape != (len(batch), dim):
                batch_buffer = numpy.zeros((len(batch), dim),
                                           dtype=batch[0].dtype)
            for i, example in enumerate(batch):
                batch_buffer[i] = example
            rval.append(batch_buffer)
        return tuple(rval)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            convert.append(None)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return FiniteDatasetIterator(self,
                                     mode(self.num_examples, batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)


class TIMITSequences(Dataset):
    """
    Sequence-based TIMIT dataset
    """
    _default_seed = (17, 2, 946)

    # Mean and standard deviation of the acoustic samples from the whole
    # dataset (train, valid, test).
    _mean = 0.0035805809921434142
    _std = 542.48824133746177

    def __init__(self, which_set, frame_length, start=0, stop=None,
                 audio_only=False, rng=_default_seed):
        """
        Parameters
        ----------
        which_set : str
            Either "train", "valid" or "test"
        frame_length : int
            Number of acoustic samples contained in the sliding window
        start : int, optional
            Starting index of the sequences to use. Defaults to 0.
        stop : int, optional
            Ending index of the sequences to use. Defaults to `None`, meaning
            sequences are selected all the way to the end of the array.
        audio_only : bool, optional
            Whether to load only the raw audio and no auxiliary information.
            Defaults to `False`.
        rng : object, optional
            A random number generator used for picking random indices into the
            design matrix when choosing minibatches.
        """
        self.frame_length = frame_length
        self.audio_only = audio_only

        # RNG initialization
        if hasattr(rng, 'random_integers'):
            self.rng = rng
        else:
            self.rng = numpy.random.RandomState(rng)

        # Load data from disk
        self._load_data(which_set)
        # Standardize data
        for i, sequence in enumerate(self.raw_wav):
            self.raw_wav[i] = (sequence - TIMIT._mean) / TIMIT._std

        if not self.audio_only:
            self.num_phones = numpy.max([numpy.max(sequence) for sequence
                                         in self.phones]) + 1
            self.num_phonemes = numpy.max([numpy.max(sequence) for sequence
                                           in self.phonemes]) + 1
            self.num_words = numpy.max([numpy.max(sequence) for sequence
                                        in self.words]) + 1

        # Slice data
        if stop is not None:
            self.raw_wav = self.raw_wav[start:stop]
            if not self.audio_only:
                self.phones = self.phones[start:stop]
                self.phonemes = self.phonemes[start:stop]
                self.words = self.words[start:stop]
        else:
            self.raw_wav = self.raw_wav[start:]
            if not self.audio_only:
                self.phones = self.phones[start:]
                self.phonemes = self.phonemes[start:]
                self.words = self.words[start:]

        samples_sequences = []
        targets_sequences = []
        phones_sequences = []
        phonemes_sequences = []
        words_sequences = []
        for sequence_id, samples_sequence in enumerate(self.raw_wav):
            # Sequence segmentation
            samples_segmented_sequence = segment_axis(samples_sequence,
                                                      frame_length,
                                                      frame_length - 1)[:-1]
            samples_sequences.append(samples_segmented_sequence)
            targets_sequences.append(samples_sequence[frame_length:].reshape(
                (samples_sequence[frame_length:].shape[0], 1)
            ))
            if not self.audio_only:
                target_phones = self.phones[sequence_id][frame_length:]
                phones_sequences.append(target_phones.reshape(
                    (target_phones.shape[0], 1)
                ))
                target_phonemes = self.phonemes[sequence_id][frame_length:]
                phonemes_sequences.append(target_phonemes.reshape(
                    (target_phonemes.shape[0], 1)
                ))
                target_words = self.words[sequence_id][frame_length:]
                words_sequences.append(target_words.reshape(
                    (target_words.shape[0], 1)
                ))

        del self.raw_wav
        self.samples_sequences = samples_sequences
        self.targets_sequences = targets_sequences
        self.data = [samples_sequences, targets_sequences]
        if not self.audio_only:
            del self.phones
            del self.phonemes
            del self.words
            self.phones_sequences = phones_sequences
            self.phonemes_sequences = phonemes_sequences
            self.words_sequences = words_sequences
            self.data.extend([phones_sequences, phonemes_sequences,
                              words_sequences])
        self.num_examples = len(samples_sequences)

        # DataSpecs
        features_space = VectorSequenceSpace(dim=self.frame_length)
        features_source = 'features'

        targets_space = VectorSequenceSpace(dim=1)
        targets_source = 'targets'

        space_components = [features_space, targets_space]
        source_components = [features_source, targets_source]
        batch_components = [None, None]

        if not self.audio_only:
            phones_space = IndexSequenceSpace(
                max_labels=self.num_phones,
                dim=1,
                dtype=str(self.phones_sequences[0].dtype)
            )
            phones_source = 'phones'

            phonemes_space = IndexSequenceSpace(
                max_labels=self.num_phonemes,
                dim=1,
                dtype=str(self.phonemes_sequences[0].dtype)
            )
            phonemes_source = 'phonemes'

            words_space = IndexSequenceSpace(
                max_labels=self.num_words,
                dim=1,
                dtype=str(self.words_sequences[0].dtype)
            )
            words_source = 'words'

            space_components.extend([phones_space, phonemes_space,
                                     words_space])
            source_components.extend([phones_source, phonemes_source,
                                     words_source])
            batch_components.extend([None, None, None])

        space = CompositeSpace(space_components)
        source = tuple(source_components)
        self.data_specs = (space, source)
        self.batch_buffers = batch_components

        # Defaults for iterators
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space,
                                                 targets_space)),
                                 (features_source, targets_source))

    def _fetch_index(self, indexes):
        digit = numpy.digitize(indexes, self.cumulative_example_indexes) - 1
        return zip(digit,
                   numpy.array(indexes) - self.cumulative_example_indexes[digit])

    def _load_data(self, which_set):
        """
        Load the TIMIT data from disk.

        Parameters
        ----------
        which_set : str
            Subset of the dataset to use (either "train", "valid" or "test")
        """
        # Check which_set
        if which_set not in ['train', 'valid', 'test']:
            raise ValueError(which_set + " is not a recognized value. " +
                             "Valid values are ['train', 'valid', 'test'].")

        # Create file paths
        timit_base_path = os.path.join(os.environ["PYLEARN2_DATA_PATH"],
                                       "timit/readable")
        speaker_info_list_path = os.path.join(timit_base_path, "spkrinfo.npy")
        phonemes_list_path = os.path.join(timit_base_path,
                                          "reduced_phonemes.pkl")
        words_list_path = os.path.join(timit_base_path, "words.pkl")
        speaker_features_list_path = os.path.join(timit_base_path,
                                                  "spkr_feature_names.pkl")
        speaker_id_list_path = os.path.join(timit_base_path,
                                            "speakers_ids.pkl")
        raw_wav_path = os.path.join(timit_base_path, which_set + "_x_raw.npy")
        phonemes_path = os.path.join(timit_base_path,
                                     which_set + "_x_phonemes.npy")
        phones_path = os.path.join(timit_base_path,
                                     which_set + "_x_phones.npy")
        words_path = os.path.join(timit_base_path, which_set + "_x_words.npy")
        speaker_path = os.path.join(timit_base_path,
                                    which_set + "_spkr.npy")

        # Load data. For now most of it is not used, as only the acoustic
        # samples are provided, but this is bound to change eventually.
        # Global data
        if not self.audio_only:
            self.speaker_info_list = serial.load(
                speaker_info_list_path
            ).tolist().toarray()
            self.speaker_id_list = serial.load(speaker_id_list_path)
            self.speaker_features_list = serial.load(speaker_features_list_path)
            self.words_list = serial.load(words_list_path)
            self.phonemes_list = serial.load(phonemes_list_path)
        # Set-related data
        self.raw_wav = serial.load(raw_wav_path)
        if not self.audio_only:
            self.phonemes = serial.load(phonemes_path)
            self.phones = serial.load(phones_path)
            self.words = serial.load(words_path)
            self.speaker_id = numpy.asarray(serial.load(speaker_path), 'int')

    def _validate_source(self, source):
        """
        Verify that all sources in the source tuple are provided by the
        dataset. Raise an error if some requested source is not available.

        Parameters
        ----------
        source : `tuple` of `str`
            Requested sources
        """
        for s in source:
            try:
                self.data_specs[1].index(s)
            except ValueError:
                raise ValueError("the requested source named '" + s + "' " +
                                 "is not provided by the dataset")

    def get_data_specs(self):
        """
        Returns the data_specs specifying how the data is internally stored.

        This is the format the data returned by `self.get_data()` will be.

        .. note::

            Once again, this is very hacky, as the data is not stored that way
            internally. However, the data that's returned by `TIMIT.get()`
            _does_ respect those data specs.
        """
        return self.data_specs

    def get(self, source, indexes):
        """
        .. todo::

            WRITEME
        """
        if type(indexes) is slice:
            indexes = numpy.arange(indexes.start, indexes.stop)
        assert indexes.shape == (1,)
        self._validate_source(source)
        rval = []
        for so in source:
            rval.append(
                self.data[self.data_specs[1].index(so)][indexes]
            )
        return tuple(rval)

    @functools.wraps(Dataset.iterator)
    def iterator(self, mode=None, batch_size=None, num_batches=None,
                 rng=None, data_specs=None, return_tuple=False):
        """
        .. todo::

            WRITEME
        """
        if data_specs is None:
            data_specs = self._iter_data_specs

        # If there is a view_converter, we have to use it to convert
        # the stored data for "features" into one that the iterator
        # can return.
        space, source = data_specs
        if isinstance(space, CompositeSpace):
            sub_spaces = space.components
            sub_sources = source
        else:
            sub_spaces = (space,)
            sub_sources = (source,)

        convert = []
        for sp, src in safe_zip(sub_spaces, sub_sources):
            convert.append(None)

        # TODO: Refactor
        if mode is None:
            if hasattr(self, '_iter_subset_class'):
                mode = self._iter_subset_class
            else:
                raise ValueError('iteration mode not provided and no default '
                                 'mode set for %s' % str(self))
        else:
            mode = resolve_iterator_class(mode)

        if batch_size is None:
            batch_size = getattr(self, '_iter_batch_size', None)
        if num_batches is None:
            num_batches = getattr(self, '_iter_num_batches', None)
        if rng is None and mode.stochastic:
            rng = self.rng
        return FiniteDatasetIterator(self,
                                     mode(self.num_examples, batch_size,
                                          num_batches, rng),
                                     data_specs=data_specs,
                                     return_tuple=return_tuple,
                                     convert=convert)


class TIMITPerPhone(DenseDesignMatrix):
    """
    Loads specified dataset created from the TIMIT dataset by Laurent Dinh
    into a matrix for time series prediction and generation.
    """
    _default_seed = 1
    _data_dir = '/data/lisa/data/timit/readable/per_phone'

    def __init__(self,
                 phone,
                 frame_length,
                 target_width=1,
                 max_examples=None,
                 example_list=None,
                 random_examples=False,
                 which_set='train',
                 unit_norm=False,
                 standardize=False,
                 mean=None,
                 std=None,
                 rng=None):
        """
        Parameters
        ----------
        phone : string
            The phone to be loaded.
        max_examples : int
            The maximum number of examples to load.
        example_list : list
            Specify examples to generate.
        random_examples: boolean
            Whether to select the examples from the data set at random if they
            are not all to be used (e.g. when max_examples is less than the
            total number examples).
        which_set : string
            Which set to load: 'train', 'validate', or 'test'.
        unit_norm : bool
            Normalize individual signal with it's L2 norm.
        standardize : bool
            Normalize all examples.
        mean : float
            Mean of training set. Only used if standarize flag is on.
        std : float
            Standard deviation of training set. Only used if standarize flag is
            on.
        rng : int
            Seed for random number generator.
        """
        # Validate parameters and set member variables
        file = 'wav_' + phone + '.npy'
        files = os.listdir(self._data_dir)
        assert(file in files)
        self.phone = phone
        self.file = file

        if example_list is not None:
            assert(
                numpy.asarray(example_list).mean() >= 0 and
                isinstance(example_list, list)
            )
        self.example_list = example_list

        self.sets = ['test', 'validate', 'train']
        assert(which_set in self.sets)
        self.which_set = which_set

        assert(type(unit_norm) is bool)
        self.unit_norm = unit_norm

        assert(type(standardize) is bool)
        self.standardize = standardize

        if (self.which_set != 'train' and self.standardize):
            assert(mean is not None and std is not None)

        self._mean = mean

        if std is not None:
            assert(std > 0)
        self._std = std

        self._mean_norm = 0

        assert(frame_length > 0)
        self.frame_length = frame_length

        assert(target_width > 0)
        self.target_width = target_width

        self.max_examples = None
        if (max_examples is not None):
            assert(max_examples > 0)
            self.max_examples = max_examples

        assert(type(random_examples) == bool)
        self.random_examples = random_examples

        # Initialize RNG
        if rng is None:
            self.rng = numpy.random.RandomState(self._default_seed)
        else:
            self.rng = numpy.random.RandomState(rng)

        (X, y) = self._load_data()

        super(TIMITPerPhone, self).__init__(X=X, y=y)

    def _load_data(self):
        data = serial.load(os.path.join(self._data_dir, self.file))

        if self.example_list is not None:
            idxs = self.example_list
        else:
            data = {
                'train': data[:-1000],
                'validate': data[-1000:-500],
                'test': data[-500:]
            }[self.which_set]
            idxs = numpy.arange(len(data))

            if self.random_examples:
                numpy.random.shuffle(idxs)

            if self.max_examples is not None:
                idxs = idxs[:self.max_examples]

        data = data[idxs]
        # TODO - Remove this
        self.data = data

        if self.unit_norm is True:
            for i in range(data.shape[0]):
                exp_euclidean_norm = numpy.sqrt(numpy.square(data[i]).sum())
                data[i] /= exp_euclidean_norm
                self._mean_norm += exp_euclidean_norm
            self._mean_norm /= data.shape[0]

        if self.standardize is True:
            if self._mean is None or self._std is None:
                exp_sum = 0
                exp_var = 0
                exp_cnt = 0
                for i in range(data.shape[0]):
                    exp_sum += data[i].sum()
                    exp_var += (numpy.square(data[i])).sum()
                    exp_cnt += len(data[i])
                self._mean = exp_sum / exp_cnt
                exp_var = exp_var/exp_cnt - self._mean**2
                self._std = numpy.sqrt(exp_var)

            for i in range(data.shape[0]):
                data[i] = (data[i] - self._mean) / self._std

        # Do math to determine how many samples there will be and make space
        total_rows = 0
        record_len = self.frame_length + self.target_width
        for i in range(len(data)):
            total_rows += len(data[i]) - record_len

        X = numpy.zeros((total_rows, self.frame_length))
        y = numpy.zeros((total_rows, self.target_width))

        count = 0
        for i in range(len(data)):
            current_phone = data[i]
            current_phone_len = len(current_phone)
            for j in range(current_phone_len - record_len):
                frame_end = j + self.frame_length
                target_end = frame_end + self.target_width
                X[count, :] = current_phone[j:frame_end]
                y[count, :] = current_phone[frame_end:target_end]
                count += 1

        return (X, y)


if __name__ == "__main__":
    valid_timit = TIMIT("valid", frame_length=1, frames_per_example=100,
                        audio_only=False)
    data_specs = (Conv2DSpace(shape=[100, 1], num_channels=1, axes=('b', 0, 1, 'c')),
                  'features')
    it = valid_timit.iterator(mode='sequential', data_specs=data_specs,
                              num_batches=1, batch_size=10)
    for rval in it:
        import pdb; pdb.set_trace()
        print [val.shape for val in rval]