In [25]:
import os
import json
import shutil
import logging
import speechbrain as sb
from pathlib import Path
from speechbrain.utils.data_utils import get_all_files, download_file
from speechbrain.dataio.dataio import read_audio

In [18]:
ICANLE_AUDIO_DIR = "./ICNALE_SM_2.0_A\ICNALE_Spoken_Monologue_2.0_Audio/ICNALE_SM_ENS_N600"
ICANLE_TRANS_DIR = "./ICNALE_SM_2.0_T/ICNALE_Spoken_Monologue_2.0_Transcripts/Unmerged_classified/ICNALE_SM_ENS_XXX_NX00"
DATA_DIR = "./data"
TRAIN_FILE = DATA_DIR+"/training.json"
TEST_FILE = DATA_DIR+"/training.json"
AUDIO_EXT = [".mp3"]
TRANS_EXT = [".txt"]

In [19]:
def skip(*filenames):
    """
    Detects if the data preparation has been already done.
    If the preparation has been done, we can skip it.
    Returns
    -------
    bool
        if True, the preparation phase can be skipped.
        if False, it must be done.
    """
    for filename in filenames:
        if not os.path.isfile(filename):
            return False
    return True

In [31]:
def get_transcription(trans_list):
    """
    Returns a dictionary with the transcription of each sentence in the dataset.
    Arguments
    ---------
    trans_list : list of str
        The list of transcription files.
    """
    # Processing all the transcription files in the list
    trans_dict = {}
    for trans_file in trans_list:
        # Reading the text file
        with open(trans_file, encoding='utf-8-sig') as f:
            text = f.read()
            f.close()
            name = Path(trans_file).stem
            trans_dict[name] = text

    return trans_dict

In [35]:
def create_json(wav_list, trans_dict, json_file):
    """
    Creates the json file given a list of mp3 files and their transcriptions.
    Arguments
    ---------
    wav_list : list of str
        The list of mp3 files.
    trans_dict : dict
        Dictionary of sentence ids and word transcriptions.
    json_file : str
        The path of the output json file
    """
    # Processing all the wav files in the list
    json_dict = {}
    for wav_file in wav_list:

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Manipulate path to get relative path and uttid
        path_parts = wav_file.split(os.path.sep)
        uttid, _ = os.path.splitext(path_parts[-1])
        relative_path = os.path.join("{data_root}", *path_parts[-5:])

        # Create entry for this utterance
        json_dict[uttid] = {
            "wav": relative_path,
            "length": duration,
            "words": trans_dict[uttid],
        }

    # Writing the dictionary to the json file
    with open(json_file, mode="w") as json_f:
        json.dump(json_dict, json_f, indent=2)

    logger.info(f"{json_file} successfully created!")


In [36]:
def prepare_mini_librispeech(save_json_train, save_json_test):
    if skip(save_json_train, save_json_test):
            print("Preparation completed in previous run, skipping.")
            return

    audio_files = get_all_files(ICANLE_AUDIO_DIR,match_and=AUDIO_EXT)
    trans_files = get_all_files(ICANLE_TRANS_DIR,match_and=TRANS_EXT)
    trans_dict = get_transcription(trans_files)
    create_json(audio_files,trans_dict,TRAIN_FILE)

prepare_mini_librispeech(TRAIN_FILE, TEST_FILE,)

RuntimeError: Error opening './ICNALE_SM_2.0_A\\ICNALE_Spoken_Monologue_2.0_Audio/ICNALE_SM_ENS_N600\\SM_ENS_PTJ1_001_XX_3.mp3': File contains data in an unknown format.