In [47]:
import os
import random
import json
import shutil
from logging import Logger
import speechbrain as sb
from pathlib import Path
from speechbrain.utils.data_utils import get_all_files, download_file
from speechbrain.dataio.dataio import read_audio

In [58]:
ICANLE_AUDIO_DIR = "./ICNALE_Spoken_Monologue_2.0_Audio/ICNALE_SM_ENS_N600"
ICANLE_TRANS_DIR = "./ICNALE_Spoken_Monologue_2.0_Transcripts/Unmerged_classified/ICNALE_SM_ENS_XXX_NX00"
DATA_DIR = "./data"
TRAIN_FILE = DATA_DIR+"/train.json"
VALID_FILE = DATA_DIR+"/valid.json"
TEST_FILE = DATA_DIR+"/test.json"
AUDIO_EXT = [".mp3"]
TRANS_EXT = [".txt"]

In [49]:
def skip(*filenames):
    """
    Detects if the data preparation has been already done.
    If the preparation has been done, we can skip it.
    Returns
    -------
    bool
        if True, the preparation phase can be skipped.
        if False, it must be done.
    """
    for filename in filenames:
        if not os.path.isfile(filename):
            return False
    return True

In [50]:
def get_transcription(trans_list):
    """
    Returns a dictionary with the transcription of each sentence in the dataset.
    Arguments
    ---------
    trans_list : list of str
        The list of transcription files.
    """
    # Processing all the transcription files in the list
    trans_dict = {}
    for trans_file in trans_list:
        # Reading the text file
        with open(trans_file, encoding='utf-8-sig') as f:
            text = f.read()
            f.close()
            name = Path(trans_file).stem
            trans_dict[name] = text

    return trans_dict

In [51]:
def create_json(wav_list, trans_dict):
    """
    Creates the json file given a list of mp3 files and their transcriptions.
    Arguments
    ---------
    wav_list : list of str
        The list of mp3 files.
    trans_dict : dict
        Dictionary of sentence ids and word transcriptions.
    json_file : str
        The path of the output json file
    """
    # Processing all the wav files in the list
    SAMPLERATE = 44100

    json_dict = {}
    for wav_file in wav_list:

        # Reading the signal (to retrieve duration in seconds)
        signal = read_audio(wav_file)
        duration = signal.shape[0] / SAMPLERATE

        # Manipulate path to get relative path and uttid
        path_parts = wav_file.split(os.path.sep)
        uttid, _ = os.path.splitext(path_parts[-1])
        relative_path = os.path.join("{data_root}", *path_parts[-5:])

        # Create entry for this utterance
        json_dict[uttid] = {
            "wav": relative_path,
            "length": duration,
            "words": trans_dict[uttid],
        }
        key_list = list(json_dict)

        random.shuffle(key_list)

        data = {}

        for key in key_list:
                # print(key,json_dict[key])
                data[key] = json_dict[key]
    return data
   

In [53]:

# if skip(save_json_train, save_json_test):
#         print("Preparation completed in previous run, skipping.")
#         return

audio_files = get_all_files(ICANLE_AUDIO_DIR,match_and=AUDIO_EXT)
trans_files = get_all_files(ICANLE_TRANS_DIR,match_and=TRANS_EXT)
trans_dict = get_transcription(trans_files)
data = create_json(audio_files,trans_dict)




In [59]:
def split_and_store_data(data, training_json, valid_json, test_json):
    end_idx_training = int(len(data)*(2/3))
    end_idx_validation = int(len(data)*(2.5/3))
    print("training,validation",end_idx_training,end_idx_validation)
    
    print(end_idx_training)
    training_dict = dict(list(data.items())[0:end_idx_training])
    validation_dict = dict(list(data.items())[end_idx_training+1:end_idx_validation])
    test_dict = dict(list(data.items())[end_idx_validation+1:-1])
    print("length of training dict: ",len(training_dict))
    print("length of valid dict: ",len(validation_dict))
    print("length of test dict: ",len(test_dict))

    # Writing the dictionary to the json file
    with open(training_json, mode="w") as json_f:
        json.dump(training_dict, json_f, indent=2)
    with open(valid_json, mode="w") as json_f:
        json.dump(validation_dict, json_f, indent=2)
    with open(test_json, mode="w") as json_f:
        json.dump(test_dict, json_f, indent=2)


In [60]:
split_and_store_data(data,TRAIN_FILE,VALID_FILE,TEST_FILE)

training,validation 400 500
400
length of training dict:  400
length of valid dict:  99
length of test dict:  98


In [None]:
import torch
print(torch.cuda.is_available())
torch.tensor([1.0, 2.0])
torch.tensor([1.0, 2.0]).cuda()

: 