# Training and Testing Data Preparation Notebook:

In [3]:
import sys
sys.path.append('../')

import os
import random
from functions import processing_funcs, utils
import librosa
import pandas as pd
from pydub.utils import mediainfo
import math
from pydub import AudioSegment

### Collect all the files from our processed directory:

In [4]:
voice_dir = os.path.join('..', 'voice_data')
mono_dir = 'mono_channels'
silence_dir = 'silence_removed'

sil_rmvd_dir = os.path.join(voice_dir, mono_dir, silence_dir)

voice_files = utils.read_dir_files(dir_path = sil_rmvd_dir, file_regex = r'[0-9]+\_(?:L|R)_sil_rmvd\.wav')

print('Total Files:', len(voice_files))

Collecting all files in ../voice_data/mono_channels/silence_removed matching regular expression [0-9]+\_(?:L|R)_sil_rmvd\.wav.

Total Files: 62


### Create a dictionary of features and load into a dataframe:

In [5]:
loaded_audio_dict = {}
voice_id = 0
for file_name in voice_files:
    file_path = os.path.join(sil_rmvd_dir, file_name)
    
    file_short_name = utils.get_file_name(file_name)[0:6]
    # Use [0] index to only return the array
    audio_array = librosa.load(path = file_path, sr = 8000, mono = True)[0]
    
    array_len = len(audio_array)
    
    file_duration_secs = utils.get_file_duration(file_path)
    file_duration_mins = utils.get_file_duration(file_path, minutes=True)
    
    loaded_audio_dict[file_short_name] = [audio_array,
                                          array_len,
                                          file_duration_secs,
                                          file_duration_mins,
                                          file_path,
                                          file_name,
                                          voice_id]
    voice_id = voice_id + 1

In [6]:
audio_data_df = pd.DataFrame.from_dict(data = loaded_audio_dict,
                                       orient = 'index',
                                       columns = ['time_series_array',
                                                  'array_len',
                                                  'duration (sec)',
                                                  'duration (min)',
                                                  'file_path',
                                                  'file_name',
                                                  'voice_id'])

In [7]:
audio_data_df.head()

Unnamed: 0,time_series_array,array_len,duration (sec),duration (min),file_path,file_name,voice_id
4175_L,"[-0.0005187988, -0.00048828125, -0.0004272461,...",5664256,708.032,11.8,../voice_data/mono_channels/silence_removed/41...,4175_L_sil_rmvd.wav,0
4175_R,"[0.0010070801, 0.0015563965, 0.0019226074, 0.0...",2532352,316.544,5.28,../voice_data/mono_channels/silence_removed/41...,4175_R_sil_rmvd.wav,1
4504_L,"[-0.0006713867, -0.0010375977, -0.0017089844, ...",1338368,167.296,2.79,../voice_data/mono_channels/silence_removed/45...,4504_L_sil_rmvd.wav,2
4504_R,"[-3.0517578e-05, 0.0, -3.0517578e-05, 0.0, -3....",932352,116.544,1.94,../voice_data/mono_channels/silence_removed/45...,4504_R_sil_rmvd.wav,3
4708_L,"[-0.000579834, -0.0005493164, -0.00048828125, ...",6243328,780.416,13.01,../voice_data/mono_channels/silence_removed/47...,4708_L_sil_rmvd.wav,4


### Create directories for each of the unique voice sources for training and testing:

In [8]:
train_top_dir = os.path.join(voice_dir, 'training')
test_top_dir = os.path.join(voice_dir, 'testing')
utils.make_dir(train_top_dir)
utils.make_dir(test_top_dir)

for file_name in audio_data_df.index:
    train_file_dir = os.path.join(train_top_dir, file_name)
    test_file_dir = os.path.join(test_top_dir, file_name)
    utils.make_dir(train_file_dir)
    utils.make_dir(test_file_dir)

Creating directory at ../voice_data/training...

Creating directory at ../voice_data/testing...

Creating directory at ../voice_data/training/4175_L...

Creating directory at ../voice_data/testing/4175_L...

Creating directory at ../voice_data/training/4175_R...

Creating directory at ../voice_data/testing/4175_R...

Creating directory at ../voice_data/training/4504_L...

Creating directory at ../voice_data/testing/4504_L...

Creating directory at ../voice_data/training/4504_R...

Creating directory at ../voice_data/testing/4504_R...

Creating directory at ../voice_data/training/4708_L...

Creating directory at ../voice_data/testing/4708_L...

Creating directory at ../voice_data/training/4708_R...

Creating directory at ../voice_data/testing/4708_R...

Creating directory at ../voice_data/training/4745_L...

Creating directory at ../voice_data/testing/4745_L...

Creating directory at ../voice_data/training/4745_R...

Creating directory at ../voice_data/testing/4745_R...

Creating direct

### Build a function to split audio files into smaller segments and put into individual training directories:

In [9]:
# Code based on: https://stackoverflow.com/questions/37999150/how-to-split-a-wav-file-into-multiple-wav-files

class SplitWavAudio():
    def __init__(self, folder, filename, save_folder):
        self.folder = folder
        self.filename = filename
        self.short_filename = filename[0:6]
        self.filepath = os.path.join(folder, filename)
        self.save_folder = save_folder
        
        self.audio = AudioSegment.from_wav(self.filepath)
    
    def get_duration(self):
        return self.audio.duration_seconds
    
    def single_split(self, from_sec, to_sec, split_filename):
        t1 = from_sec * 1000
        t2 = to_sec * 1000
        split_audio = self.audio[t1:t2]
        splt_aud_sav_nm = os.path.join(self.save_folder, split_filename)
        split_audio.export(splt_aud_sav_nm, format="wav")
        
    def multiple_split(self, sec_per_split, verbose=False):
        total_secs = math.ceil(self.get_duration())
        for i in range(0, total_secs, sec_per_split):
            if sec_per_split > total_secs - i:
                break
            split_fn = self.short_filename + '_' + str(i) + '.wav'
            self.single_split(i, i + sec_per_split, split_fn)
            if verbose == True:
                print(str(i) + ' Done')
            if i == total_secs - sec_per_split:
                print('All split successfully')

In [206]:
# Just for testing purposes, delete everything in training and testing files:

def clear_train_test_dirs():
    
    train_path = '../voice_data/training/'
    test_path = '../voice_data/testing/'
    
    training_dirs = os.listdir(train_path)
    testing_dirs = os.listdir(test_path)
    
    for train_dir in training_dirs:
        utils.del_dir_files(dir_path=train_path + train_dir, check=False, verbose=False)
        
    for testing_dir in testing_dirs:
        utils.del_dir_files(dir_path=test_path + testing_dir, check=False, verbose=False)
        
#clear_train_test_dirs()

In [207]:
%%time

# Split the files into segments of 5 seconds in length and move into their respective training directory:
for file in file_list:
    save_path = '../voice_data/training/' + file[0:6]
    aud_split = SplitWavAudio(sil_rmvd_dir, filename = file, save_folder = save_path)
    aud_split.multiple_split(5)

All split successfully
All split successfully
All split successfully
All split successfully
All split successfully
All split successfully
All split successfully
All split successfully
All split successfully
Wall time: 6.35 s


### Randomly move 15% of each of the files to the testing directory:

In [208]:
# Set random seed:
random.seed(8675309)

# Based on code here: https://stackoverflow.com/questions/59952200/move-a-random-sample-of-files-from-one-folder-to-another

def move_data(source_dir, dest_dir, perc_move=0.15, verbose=True):
    
    source_files = [f for f in os.listdir(source_dir) if os.path.isfile(os.path.join(source_dir, f))]
    num_source_files = len(source_files)
    num_files_to_move = math.floor(num_source_files * float(perc_move))
    
    if verbose:
        print(f'Number of files in {source_dir} = {num_source_files}.')
        print(f'Randomly moving {perc_move}% ({num_files_to_move}) of files from {source_dir} to {dest_dir}.')
        
    for i in range(num_files_to_move):
        # Choose a file randomly from the source directory:
        random_file = random.choice(os.listdir(source_dir))
        
        # Get path of the randomly-chosen file:
        source_file = "%s/%s"%(source_dir, random_file)
        
        # Create filepath for the destination:
        dest_file = dest_dir + '/' + random_file
        
        os.replace(source_file, dest_file)
        
    print(f'{num_files_to_move} files moved from {source_dir} to {dest_dir}.')

In [209]:
source = '../voice_data/training/'
destination = '../voice_data/testing/'

train_dirs = utils.read_dir_files(dir_path = '../voice_data/training/', file_regex=r'[0-9]+_(L|R)')

for train_dir in train_dirs:
    
    train_file = source + train_dir
    test_file = destination + train_dir
    
    move_data(train_file, test_file)

Collecting all files in ../voice_data/training/ matching regular expression [0-9]+_(L|R).

Number of files in ../voice_data/training/4175_L = 141.
Randomly moving 0.15% (21) of files from ../voice_data/training/4175_L to ../voice_data/testing/4175_L.
21 files moved from ../voice_data/training/4175_L to ../voice_data/testing/4175_L.
Number of files in ../voice_data/training/4175_R = 63.
Randomly moving 0.15% (9) of files from ../voice_data/training/4175_R to ../voice_data/testing/4175_R.
9 files moved from ../voice_data/training/4175_R to ../voice_data/testing/4175_R.
Number of files in ../voice_data/training/4504_L = 33.
Randomly moving 0.15% (4) of files from ../voice_data/training/4504_L to ../voice_data/testing/4504_L.
4 files moved from ../voice_data/training/4504_L to ../voice_data/testing/4504_L.
Number of files in ../voice_data/training/4504_R = 23.
Randomly moving 0.15% (3) of files from ../voice_data/training/4504_R to ../voice_data/testing/4504_R.
3 files moved from ../voice_

### Create training and testing dataframes with file_path to each audio file and class label:

In [222]:
audio_data_df.head(1)

Unnamed: 0,time_series_array,array_len,duration (sec),duration (min),file_path,file_name,voice_id
4175_L,"[-0.0005187988, -0.00048828125, -0.0004272461,...",5664256,708.032,11.8,../voice_data/mono_channels/silence_removed/41...,4175_L_sil_rmvd.wav,0


In [243]:
train_data_path = '../voice_data/training/'
test_data_path = '../voice_data/testing/'

train_dirs = utils.read_dir_files(dir_path = train_data_path, file_regex=r'[0-9]+_(L|R)')
test_dirs = utils.read_dir_files(dir_path = test_data_path, file_regex=r'[0-9]+_(L|R)')

training_df_dict = {}
testing_df_dict = {}

train_index = 0
test_index = 0

for train_dir in train_dirs:
    dir_path = train_data_path + train_dir
    files = utils.read_dir_files(dir_path, file_regex=None)
    
    for file in files:
        file_path = dir_path + '/' + file
        voice_id = audio_data_df.loc[train_dir].voice_id
        
        training_df_dict[train_index] = [file_path, voice_id]
        
        train_index = train_index + 1
        
for test_dir in test_dirs:
    dir_path = test_data_path + test_dir
    files = utils.read_dir_files(dir_path, file_regex=None)
    
    for file in files:
        file_path = dir_path + '/' + file
        voice_id = audio_data_df.loc[test_dir].voice_id
        
        testing_df_dict[test_index] = [file_path, voice_id]
        
        test_index = test_index + 1

Collecting all files in ../voice_data/training/ matching regular expression [0-9]+_(L|R).

Collecting all files in ../voice_data/testing/ matching regular expression [0-9]+_(L|R).

Collecting all files in ../voice_data/training/4175_L matching regular expression None.

Collecting all files in ../voice_data/training/4175_R matching regular expression None.

Collecting all files in ../voice_data/training/4504_L matching regular expression None.

Collecting all files in ../voice_data/training/4504_R matching regular expression None.

Collecting all files in ../voice_data/training/4708_L matching regular expression None.

Collecting all files in ../voice_data/training/4708_R matching regular expression None.

Collecting all files in ../voice_data/training/4745_L matching regular expression None.

Collecting all files in ../voice_data/training/4745_R matching regular expression None.

Collecting all files in ../voice_data/training/4823_L matching regular expression None.

Collecting all fil

In [247]:
training_data_df = pd.DataFrame.from_dict(data = training_df_dict,
                                          orient = 'index',
                                          columns = ['file_path', 'voice_id'])

testing_data_df = pd.DataFrame.from_dict(data = testing_df_dict,
                                         orient = 'index',
                                         columns = ['file_path', 'voice_id'])

In [248]:
training_data_df.head()

Unnamed: 0,file_path,voice_id
0,../voice_data/training/4175_L/4175_L_0.wav,0
1,../voice_data/training/4175_L/4175_L_10.wav,0
2,../voice_data/training/4175_L/4175_L_100.wav,0
3,../voice_data/training/4175_L/4175_L_110.wav,0
4,../voice_data/training/4175_L/4175_L_115.wav,0


In [249]:
testing_data_df.head()

Unnamed: 0,file_path,voice_id
0,../voice_data/testing/4175_L/4175_L_105.wav,0
1,../voice_data/testing/4175_L/4175_L_150.wav,0
2,../voice_data/testing/4175_L/4175_L_185.wav,0
3,../voice_data/testing/4175_L/4175_L_20.wav,0
4,../voice_data/testing/4175_L/4175_L_305.wav,0


In [250]:
training_data_df.to_csv('../meta_data/training_files.csv')

In [251]:
testing_data_df.to_csv('../meta_data/testing_files.csv')