# Imports et variables globales

In [1]:
import os
import scipy.io.wavfile as wav
from sklearn.model_selection import train_test_split
import numpy as np
import json
import pickle
import random


# Project root. The default is the path below; set the TAP_PROJECT_ROOT
# environment variable to run the notebook from another checkout without
# editing this cell. An unset or empty variable falls back to the default.
PATH_PROJECT_ROBIN = "/home/robin/Bureau/ETUDES/M2/S2/TAP/"
PATH_PROJECT_QUENTIN = "..."  # placeholder value, not a usable path
PATH_PROJECT_PERSONNE = os.environ.get("TAP_PROJECT_ROOT") or PATH_PROJECT_ROBIN
# Dataset folder, relative to the project root.
PATH_RAW_DATASET = "Projet-Traitement-Automatique-de-la-Parole/Dataset/"
GLOBAL_PATH_DATASET = os.path.join(PATH_PROJECT_PERSONNE, PATH_RAW_DATASET)
# Directory holding the raw audio folders and the partition list files.
GLOBAL_PATH_RAW_DATASET = os.path.join(GLOBAL_PATH_DATASET, "speech_commands_v0.02/")
# NOTE(review): not referenced by any cell visible in this notebook section.
GLOBAL_PATH_BASIC_DATASET = os.path.join(GLOBAL_PATH_DATASET, "Dataset_basique/")

# Get filenames for partition

In [38]:
# Load the two partition lists stored next to the raw audio. Each file is
# split on newlines as-is, so a trailing newline leaves an empty last entry.
with open(os.path.join(GLOBAL_PATH_RAW_DATASET, "testing_list.txt"), 'r') as f:
    testing_list = f.read().split("\n")
with open(os.path.join(GLOBAL_PATH_RAW_DATASET, "validation_list.txt"), 'r') as f:
    validation_list = f.read().split("\n")

# Every sub-directory of the raw dataset root.
sound_dirs = [
    entry
    for entry in os.listdir(GLOBAL_PATH_RAW_DATASET)
    if os.path.isdir(os.path.join(GLOBAL_PATH_RAW_DATASET, entry))
]

# Class vocabulary and its inverse mapping.
label2id = {
    "up" : 0,
    "down" : 1,
    "left" : 2,
    "right" : 3,
    "unknown" : 4,
    "silence" : 5,
}
id2label = dict((idx, name) for name, idx in label2id.items())
labels = label2id.keys()

# Containers filled by the partitioning loop.
dataset_train, dataset_validation, dataset_test = [], [], []

In [39]:
# Assign every kept clip to train / validation / test as (label, relative path).
# The lists are converted to sets once so each membership test is O(1) instead
# of a linear scan of the whole list for every file.
validation_files = set(validation_list)
testing_files = set(testing_list)

for sound_dir in sound_dirs:
    # Directories whose name contains "background" are skipped entirely.
    if "background" in sound_dir:
        continue
    path_to_dir = os.path.join(GLOBAL_PATH_RAW_DATASET, sound_dir)
    # Directory names outside the label vocabulary are folded into "unknown".
    label = sound_dir if sound_dir in labels else "unknown"
    for sound_file in os.listdir(path_to_dir):
        # Only files whose name ends in "0.wav" or "1.wav" are kept.
        if not sound_file.endswith(('0.wav', '1.wav')):
            continue
        # The list files are matched against "<directory>/<file>" paths.
        total_filename = os.path.join(sound_dir, sound_file)
        if total_filename in validation_files:
            dataset_validation.append((label, total_filename))
        elif total_filename in testing_files:
            dataset_test.append((label, total_filename))
        else:
            # Anything not listed as validation or test is training data.
            dataset_train.append((label, total_filename))
            

In [40]:

def check_size_dataset(lists, list_names):
    """Print per-label counts for each partition and index filenames by label.

    Parameters
    ----------
    lists : sequence of iterables
        One iterable of records per partition. Each record is a tuple whose
        last two items are ``(label, filename)``; any leading items are
        ignored, so both ``(label, filename)`` and
        ``(sig, fs, duree, label, filename)`` records are accepted.
    list_names : sequence of str
        Partition name for each entry of ``lists``, at the same position.

    Returns
    -------
    tuple of dict
        ``(train, validation, test)`` where each dict maps a label to the list
        of filenames carrying it, in encounter order.

    Raises
    ------
    KeyError
        If ``list_names`` does not contain "train", "validation" and "test".
    """
    dict_save = {}
    # ``records`` rather than ``list`` so the builtin is not shadowed.
    for k, records in enumerate(lists):
        name = list_names[k]
        files_by_label = dict_save[name] = {}
        for *_, label, filename in records:
            files_by_label.setdefault(label, []).append(filename)
        # Counts follow first-appearance order of the labels.
        dict_aff = {label: len(files) for label, files in files_by_label.items()}
        print(name, " : ", dict_aff)
    return dict_save["train"], dict_save["validation"], dict_save["test"]
# Report the size of each partition and keep the label -> filenames indexes.
train_names, validation_names, test_names = check_size_dataset(
    [dataset_train, dataset_validation, dataset_test],
    ["train", "validation", "test"],
)

train  :  {'unknown': 51040, 'down': 2043, 'up': 1937, 'left': 1979, 'right': 1994}
validation  :  {'unknown': 6116, 'down': 262, 'up': 241, 'left': 237, 'right': 249}
test  :  {'unknown': 6335, 'down': 262, 'up': 257, 'left': 264, 'right': 260}


In [41]:
# Cap the "unknown" class in each partition to a size comparable to the other
# classes. A dedicated seeded generator makes the selection reproducible, and
# the pool is sorted first because the filenames were collected in
# os.listdir() order, which is arbitrary.
UNKNOWN_SUBSAMPLE_SEED = 42
unknown_rng = random.Random(UNKNOWN_SUBSAMPLE_SEED)
for names_by_label, unknown_cap in (
    (train_names, 2000),
    (validation_names, 250),
    (test_names, 250),
):
    unknown_pool = sorted(names_by_label['unknown'])
    unknown_rng.shuffle(unknown_pool)
    names_by_label['unknown'] = unknown_pool[:unknown_cap]

In [42]:
def save_dict_to_txt(dict_data, output_filenames):
    """Write the filenames of each label -> filenames mapping to a text file.

    ``dict_data[i]`` is written to ``output_filenames[i]`` (UTF-8, overwriting
    any existing file), one filename per line, labels in mapping order. The
    label itself is not written. A summary line is printed for each file.
    """
    for position, files_by_label in enumerate(dict_data):
        with open(output_filenames[position], "w", encoding="utf-8") as handle:
            lines = [
                f"{filename}\n"
                for file_list in files_by_label.values()
                for filename in file_list
            ]
            handle.writelines(lines)
        count = len(lines)
        print(f"Succès : {count} fichiers ont été sauvegardés dans '{output_filenames[position]}'.")
# Persist the three subsampled partitions next to the raw dataset.
dict_save_list = [train_names, validation_names, test_names]
filename_save_list = [
    os.path.join(GLOBAL_PATH_RAW_DATASET, list_name)
    for list_name in (
        "train_list_class.txt",
        "validation_list_class.txt",
        "test_list_class.txt",
    )
]
save_dict_to_txt(dict_save_list, filename_save_list)

Succès : 9953 fichiers ont été sauvegardés dans '/home/robin/Bureau/ETUDES/M2/S2/TAP/Projet-Traitement-Automatique-de-la-Parole/Dataset/speech_commands_v0.02/train_list_class.txt'.
Succès : 1239 fichiers ont été sauvegardés dans '/home/robin/Bureau/ETUDES/M2/S2/TAP/Projet-Traitement-Automatique-de-la-Parole/Dataset/speech_commands_v0.02/validation_list_class.txt'.
Succès : 1293 fichiers ont été sauvegardés dans '/home/robin/Bureau/ETUDES/M2/S2/TAP/Projet-Traitement-Automatique-de-la-Parole/Dataset/speech_commands_v0.02/test_list_class.txt'.


# Create base dataset

In [44]:
def _read_list_file(list_name):
    """Return the newline-split content of a list file in the raw dataset dir."""
    with open(os.path.join(GLOBAL_PATH_RAW_DATASET, list_name), 'r') as f:
        return f.read().split("\n")


# Class partitions first, in the same read order as before.
testing_list = _read_list_file("test_list_class.txt")
validation_list = _read_list_file("validation_list_class.txt")
train_list = _read_list_file("train_list_class.txt")

# Then append the silence clips to each partition.
# NOTE(review): no cell visible in this section writes the *_silence_list.txt
# files; they must already exist on disk.
testing_list.extend(_read_list_file("test_silence_list.txt"))
validation_list.extend(_read_list_file("val_silence_list.txt"))
train_list.extend(_read_list_file("train_silence_list.txt"))

In [45]:
# Every sub-directory of the raw dataset root.
sound_dirs = [
    entry
    for entry in os.listdir(GLOBAL_PATH_RAW_DATASET)
    if os.path.isdir(os.path.join(GLOBAL_PATH_RAW_DATASET, entry))
]

# Class vocabulary and its inverse mapping.
label2id = {
    "up" : 0,
    "down" : 1,
    "left" : 2,
    "right" : 3,
    "unknown" : 4,
    "silence" : 5,
}
id2label = dict((idx, name) for name, idx in label2id.items())
labels = label2id.keys()

# Fresh containers for the records built by the next cell.
dataset_train, dataset_validation, dataset_test = [], [], []

In [46]:
# Fixed clip length in samples (one second at a 16 kHz sampling rate).
TARGET_NUM_SAMPLES = 16000

# Build (signal, fs, duration, label id, relative path) records per partition.
# Membership is resolved with sets (O(1) per lookup) BEFORE touching the disk,
# so only the clips that were actually selected are read. The tuple order keeps
# the original priority: validation, then test, then train.
split_targets = (
    (set(validation_list), dataset_validation),
    (set(testing_list), dataset_test),
    (set(train_list), dataset_train),
)

for sound_dir in sound_dirs:
    # Directories whose name contains "background" are skipped entirely.
    if "background" in sound_dir:
        continue
    path_to_dir = os.path.join(GLOBAL_PATH_RAW_DATASET, sound_dir)
    # Directory names outside the label vocabulary map to the "unknown" id.
    label = label2id[sound_dir] if sound_dir in labels else label2id["unknown"]
    for sound_file in os.listdir(path_to_dir):
        # Keep files ending in "0.wav"/"1.wav", and every file of a directory
        # whose name contains "silence".
        if not (sound_file.endswith(('0.wav', '1.wav')) or "silence" in sound_dir):
            continue
        total_filename = os.path.join(sound_dir, sound_file)
        target = next(
            (dest for names, dest in split_targets if total_filename in names),
            None,
        )
        if target is None:
            # Not selected for any partition: do not read the file at all.
            continue
        path_to_file = os.path.join(path_to_dir, sound_file)
        fs, sig = wav.read(path_to_file)
        duree = len(sig) / fs
        if duree != 1:
            missing = TARGET_NUM_SAMPLES - len(sig)
            if missing >= 0:
                # Short clip: zero-pad at the end up to the target length.
                sig = np.pad(sig, (0, missing), mode='constant')
            else:
                # Long clip: np.pad rejects negative widths, so crop instead.
                sig = sig[:TARGET_NUM_SAMPLES]
        # Duration is recomputed after the length adjustment.
        target.append((sig, fs, len(sig) / fs, label, total_filename))
            

In [47]:

def check_size_dataset(lists, list_names):
    """Print per-label counts for each partition and index filenames by label.

    Parameters
    ----------
    lists : sequence of iterables
        One iterable of records per partition. Each record is a tuple whose
        last two items are ``(label, filename)``; any leading items (signal,
        sampling rate, duration, ...) are ignored, so shorter
        ``(label, filename)`` records are accepted as well.
    list_names : sequence of str
        Partition name for each entry of ``lists``, at the same position.

    Returns
    -------
    tuple of dict
        ``(train, validation, test)`` where each dict maps a label to the list
        of filenames carrying it, in encounter order.

    Raises
    ------
    KeyError
        If ``list_names`` does not contain "train", "validation" and "test".
    """
    dict_save = {}
    # ``records`` rather than ``list`` so the builtin is not shadowed.
    for k, records in enumerate(lists):
        name = list_names[k]
        files_by_label = dict_save[name] = {}
        for *_, label, filename in records:
            files_by_label.setdefault(label, []).append(filename)
        # Counts follow first-appearance order of the labels.
        dict_aff = {label: len(files) for label, files in files_by_label.items()}
        print(name, " : ", dict_aff)
    return dict_save["train"], dict_save["validation"], dict_save["test"]
# Report the size of each partition and keep the label id -> filenames indexes.
train_names, validation_names, test_names = check_size_dataset(
    [dataset_train, dataset_validation, dataset_test],
    ["train", "validation", "test"],
)

train  :  {4: 2000, 1: 2043, 0: 1937, 5: 2000, 2: 1979, 3: 1994}
validation  :  {4: 250, 1: 262, 0: 241, 5: 250, 2: 237, 3: 249}
test  :  {4: 250, 1: 262, 0: 257, 5: 250, 2: 264, 3: 260}


In [48]:
def get_data(dataset_list):
    """Split dataset records into parallel signal, label and metadata lists.

    Each record is indexed positionally: item 0 is the signal, item 1 the
    sampling rate, item 2 the duration, item 3 the label and item 4 the
    filename.

    Returns
    -------
    tuple
        ``(X, Y, metadata)`` where ``X`` holds the signals, ``Y`` the labels
        and ``metadata`` one ``{"fs", "duree", "filename"}`` dict per record,
        all in input order.
    """
    # Materialise once so the input is traversed a single time.
    records = list(dataset_list)
    X = [record[0] for record in records]
    Y = [record[3] for record in records]
    metadata = [
        {"fs": record[1], "duree": record[2], "filename" : record[4]}
        for record in records
    ]
    return X, Y, metadata

# Split each partition into parallel signal / label / metadata lists.
X_train, Y_train, meta_train = get_data(dataset_train)
X_val, Y_val, meta_val = get_data(dataset_validation)
X_test, Y_test, meta_test = get_data(dataset_test)

# One entry per partition, plus the label -> id mapping under "mapping".
dataset_dict = {
    split_name: {"signals": signals, "labels": targets, "metadata": infos}
    for split_name, (signals, targets, infos) in (
        ("train", (X_train, Y_train, meta_train)),
        ("val", (X_val, Y_val, meta_val)),
        ("test", (X_test, Y_test, meta_test)),
    )
}
dataset_dict["mapping"] = label2id

# Serialise the whole structure to a single pickle in the dataset directory.
filename = os.path.join(GLOBAL_PATH_DATASET, "dataset_basique.pkl")
with open(filename, "wb") as f:
    pickle.dump(dataset_dict, f)