# LibriSpeech dataset basic stats


In [2]:
import os
import sys
from collections import Counter

import IPython.display as ipd
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Make the project helper modules in the parent directory importable.
sys.path.append(os.path.abspath('../'))
from audio_utils import load_raw_audio,wave2ampl_spectrogram,get_duration_in_s
from tfrecords_utils import load_data_by_split
from transcription_utils import create_vocab_id2transcript,get_ctc_char2ids,get_id2encoded_transcriptions

Create the input data the same way as for training. TODO: I should probably refactor this so that a single function call is enough.

In [3]:
# Location of the dataset and the sampling rate used whenever audio is loaded.
DATA_PATH = '../LibriSpeech/'
SAMPLE_RATE = 16000

# Character set and transcription lookup built from the dataset directory.
chars_set, ids2trans = create_vocab_id2transcript(DATA_PATH)

# Character -> id lookup (the helper logs that it is adjusted for CTC loss),
# plus its inverse (id -> character) for decoding labels back to text.
chars2ids = get_ctc_char2ids(chars_set)
ids2chars = dict(zip(chars2ids.values(), chars2ids.keys()))

# Transcriptions encoded with the character -> id lookup.
encoded_transcriptions = get_id2encoded_transcriptions(ids2trans, chars2ids)

2018-05-05 16:46:47,191 : INFO : transcription_utils: Created character set of size 28
2018-05-05 16:46:47,192 : INFO : transcription_utils: Created transcriptions lookup
2018-05-05 16:46:47,194 : INFO : transcription_utils: Modified characters id lookup for compatibility with CTC loss
2018-05-05 16:46:47,241 : INFO : transcription_utils: Encoding transcription with character to indices lookup


This is the set of possible characters (labels), each with its corresponding lookup id.

In [4]:
# Show the character -> id lookup created in the setup cell.
print(chars2ids)

{'T': 11, 'H': 0, 'I': 12, 'E': 1, 'V': 8, 'X': 13, ' ': 27, 'Y': 18, 'W': 2, 'C': 14, 'Z': 15, 'N': 16, 'G': 3, 'U': 4, 'L': 5, 'Q': 17, 'A': 6, 'J': 19, 'O': 20, "'": 7, 'S': 9, 'D': 21, 'F': 10, 'P': 22, 'B': 23, 'M': 24, 'R': 25, 'K': 26}


Load data splits:

In [None]:
# Load every split from its OWN subset. Previously all three variables were
# loaded with split='dev', so the "Train" and "Test" statistics further down
# silently described the dev data.
# NOTE(review): 'train' and 'test' are assumed to be the split names accepted by
# load_data_by_split (by analogy with 'dev') -- confirm against tfrecords_utils.
# limit=None presumably means "no cap on the number of examples" -- TODO confirm.
data_train = load_data_by_split(data_path=DATA_PATH, split='train',
                                id2encoded_transc=encoded_transcriptions, limit=None)
data_dev = load_data_by_split(data_path=DATA_PATH, split='dev',
                              id2encoded_transc=encoded_transcriptions, limit=None)
data_test = load_data_by_split(data_path=DATA_PATH, split='test',
                               id2encoded_transc=encoded_transcriptions, limit=None)

In [None]:
# Pick a single entry of the dev split to inspect in the following cells.
audio_example = data_dev[2]
print(audio_example)

Listen to one example and plot its amplitude spectrogram.

In [None]:
# Load the example's audio from its file path at the configured sample rate.
audio = load_raw_audio(audio_example.audio_path, sample_rate = SAMPLE_RATE)

In [None]:
# Keep this as the last expression of the cell so the audio player is rendered.
ipd.Audio(audio, rate=SAMPLE_RATE)

In [None]:
# Map the example's encoded transcription back to characters, one per label id.
decoded_chars = [ids2chars[label_id] for label_id in audio_example.transcription]
print(decoded_chars)

In [None]:
# Amplitude spectrogram of the example, converted to dB relative to its peak.
hop_length = 128
spec = wave2ampl_spectrogram(audio, fft_window = 512, hop_length = hop_length)
D = librosa.amplitude_to_db(spec, ref=np.max)

plt.figure(figsize=(14,8))
# Plot the dB spectrogram `D` computed above; the name used here before
# (`audio_ex_spect`) was never defined and raised a NameError on a fresh kernel.
# sr is passed so the frequency axis is labelled for our sample rate instead of
# specshow's default.
# NOTE(review): the time axis assumes wave2ampl_spectrogram's hop_length is the
# STFT hop in samples -- confirm against audio_utils.
librosa.display.specshow(D, sr=SAMPLE_RATE, hop_length=hop_length,
                         x_axis='time', y_axis='linear', cmap = 'inferno')
plt.colorbar(format='%+02.0f dB')
plt.title("Amplitude spectrogram (dB)")
plt.tight_layout()
plt.show()

And now some basic stats



In [None]:
def histogram_from_counts(audio_lengths, width, split):
    """Draw a bar chart of how often each distinct audio length occurs.

    Args:
        audio_lengths: iterable of hashable, mutually comparable length values,
            one per recording. Every distinct value gets its own bar.
            NOTE(review): this is only readable if the values are already
            discretised (e.g. rounded) -- confirm the precision callers pass.
        width: width of each bar; bars are placed at x = 0, 1, 2, ...
        split: name of the data split, used in the figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``. Nothing is drawn
        when ``audio_lengths`` is empty.
    """
    counts = Counter(audio_lengths)
    if not counts:
        # The zip(*...) unpacking below cannot handle an empty input.
        print("No audio lengths to plot for {}".format(split))
        return

    # Sort by length so the x-axis reads in increasing order instead of in
    # the order the values happened to be encountered.
    labels, values = zip(*sorted(counts.items()))
    indexes = np.arange(len(labels))

    plt.figure(figsize=(14,8))
    # Centre the bars on their index and put the ticks at the same positions,
    # so every label sits directly under its bar.
    plt.bar(indexes, values, width, align='center')
    plt.title("{} audio lengths distribution".format(split))
    plt.xlabel("Audio length")
    plt.ylabel("Number of recordings")
    plt.xticks(indexes, labels)
    plt.show()


In [None]:
# For every split: measure each recording, print summary statistics and plot
# the distribution of lengths.
to_display = [('Train', data_train), ('Dev', data_dev), ('Test', data_test)]
for (name, split) in to_display:
    # NOTE(review): every file is loaded just to measure it, which may be slow
    # on the larger splits.
    audio_lengths = [get_duration_in_s(load_raw_audio(a.audio_path, SAMPLE_RATE), SAMPLE_RATE)
                     for a in split]
    if not audio_lengths:
        # max()/min() raise on an empty sequence; report and move on instead.
        print("No audio found in {}".format(name))
        continue
    # Summarise instead of dumping the full list of durations into the output.
    print("Number of recordings in {} : {}".format(name, len(audio_lengths)))
    print("Max audio length in {} : {}".format(name, max(audio_lengths)))
    print("Min audio length in {} : {}".format(name, min(audio_lengths)))
    histogram_from_counts(audio_lengths=audio_lengths, width = 0.1, split = name)
    
    
    