In [2]:
from glob import iglob, glob
import os
import sys
from python_speech_features import mfcc
import numpy as np
import random
import progressbar
import librosa

def data_split(folder, partition_dict=None, seed=78):
    """Split VCTK data into train, dev, test sets.
        Args:
            folder: the folder path to the data (string)
            partition_dict: dictionary for train/dev/test split (default 0.8/0.1/0.1)
        Returns:
            None
    """
    if partition_dict is None:
        partition_dict = {'train':0.8, 'dev':0.1, 'test':0.1}
    assert sum(partition_dict.values()) == 1
    speaker_folders = glob(os.path.join(folder,'*'))
    for speaker_folder in speaker_folders:
        #print(speaker_folder)
        wav_files = glob(os.path.join(speaker_folder, '*.wav' ))
        #print(len(wav_files))
        random.seed(seed)
        random.shuffle(wav_files)
        quantities = [(name, round(ratio*len(wav_files))) for (name, ratio) in partition_dict.items()]
        for name, quantity in quantities:
            #print(quantity)
            for _ in range(quantity):
                try:
                    audio = wav_files.pop()
                    new_path_wav = os.path.join(folder, name, 'wav', speaker_folder.split('/')[-1], os.path.basename(audio))
                    os.renames(audio, new_path_wav)
                    old_path_txt = audio.replace("wav","txt").replace("wav","txt")
                    new_path_txt = new_path_wav.replace("wav","txt").replace("wav","txt")
                    os.renames(old_path_txt, new_path_txt)
                except IndexError as e:
                    pass

def find_files(directory, pattern='**/*.wav'):
    """Recursively finds all files matching the pattern."""
    return sorted(iglob(os.path.join(directory, pattern), recursive=True))

def read_audio_from_filename(filename, sample_rate):
    """Load a wav file and transpose the array."""
    audio, _ = librosa.load(filename, sr=sample_rate, mono=True)
    audio = audio.reshape(-1, 1)
    return audio

def convert_txt_index(target_txt):
    """Turn text into index."""
    original = ' '.join(target_txt.strip().lower().split(' ')).replace('.', '').replace('?', '').replace(',', '').replace("'", '').replace('!', '').replace('-', '').replace('\t', '').replace(')', '').replace('"', '')
    targets = original.replace(' ', '  ')
    targets = targets.split(' ')
    
    # Adding blank label
    targets = np.hstack([SPACE_TOKEN if x == '' else list(x) for x in targets])
    # Transform char into index
    targets = np.asarray([SPACE_INDEX if x == SPACE_TOKEN else ord(x) - FIRST_INDEX
                          for x in targets])
    return targets, original

def return_txt_path(wav_path):
    """Return the corresponding txt location for VCTK data set."""
    return wav_path.replace("wav","txt").replace("wav","txt")

def find_speaker_ID(wav_path):
    """Find speaker ID from the path of a wav file."""
    return wav_path.split('.')[0].split('/')[-1]

def pack_data_npz(DIR, input_mfcc, target, speaker_wav_ID, original):
    """Pickle data into npz files."""
    np.savez(os.path.join(DIR, speaker_wav_ID),\
             data_in=input_mfcc, target=target, seq_len=np.array([len(input_mfcc)]), original=np.array([original]))

def convert_wav_mfcc(file, fs):
    """Turn raw audio data into MFCC with sample rate=fs."""
    inputs = mfcc(read_audio_from_filename(file, fs),samplerate=16000,winlen=0.025,winstep=0.01,numcep=39,
                 nfilt=40)
    return inputs

In [4]:
#Corpus location
data_dir = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/data_cmds'

In [9]:
train_f = open(os.path.join(data_dir,"train.txt"),"r")
counter = 0
content = train_f.read()
colist = content.split("\n")
for i in colist:
    if i:
        counter += 1

print(counter)
train_f.close()

1207


In [3]:
# create a list of all commands
train_f = open(os.path.join(data_dir,"train.txt"),"r")

#delete whatever was in the file
cmd_list_f = open(os.path.join(data_dir,"cmd_list.txt"),"w")
cmd_list_f.close()

cmd_list_f = open(os.path.join(data_dir,"cmd_list.txt"),"a")

cmd = train_f.readline()
prev_cmd = ''
while cmd != '':
    if cmd != prev_cmd:
        cmd_list_f.write(cmd)
        prev_cmd = cmd
    cmd = train_f.readline()
train_f.close()
cmd_list_f.close()

In [4]:
#create a text file corresponding to every single records
cmd_list_f = open(os.path.join(data_dir,"cmd_list.txt"), "r")

directory = 0

while directory <= 70:
    text = cmd_list_f.readline()
    
    wavs = [f for f in os.listdir(os.path.join(data_dir, str(directory))) 
            if os.path.isfile(os.path.join(data_dir, str(directory), f)) 
            and f.endswith('.wav')]
    for wav in wavs:
        wavpath = os.path.join(data_dir, str(directory), wav)
        wavpath = wavpath.replace('wav','txt')
        
        text_f = open(os.path.join(wavpath), "w")
        text_f.write(text)
        text_f.close()
    directory = directory + 1
cmd_list_f.close()


In [5]:
#Split VCTK data into trian/test/dev set
data_split(data_dir)

In [6]:
#find mean, and varience of training data for normalization
directory = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/data_cmds/train/wav'

bar = progressbar.ProgressBar()
n =0
sum_mfcc = np.zeros(39) 
sumsq_mfcc = np.zeros(39)
total_len = 0
for file in bar(find_files(directory, pattern='**/*.wav')): 
    audio = mfcc(read_audio_from_filename(file, 16000),samplerate=16000,winlen=0.025,winstep=0.01,numcep=39,
                 nfilt=40)

    sum_mfcc += np.sum(audio, axis = 0)
    sumsq_mfcc +=np.sum(audio*audio, axis = 0)
    total_len += len(audio)
    n += 1

m = sum_mfcc/total_len
v = sumsq_mfcc/(total_len-1) - m*m
s = np.sqrt(v)

print(m)
print(v)
print(s)


100% |########################################################################|

[ -4.7142037    2.67463059  -6.26692545  -0.86938414 -17.76231234
  -5.00257211  -5.39799636 -16.52314351  -2.42566297 -15.41543841
  -6.1217356  -11.66405996 -15.78104423  -5.33257431  -7.94131976
  -2.43818405  -2.55286844  -0.92384073  -2.74482856  -0.65408305
  -1.35580234  -0.29269695  -0.23433919   0.12245257   0.50280009
  -0.39649036   0.81109185  -1.29818507   0.08517973  -0.58516622
   0.53700888   0.02567575   0.4447999   -0.70513672  -0.12575338
  -1.02424198  -0.78371256  -0.70379923  -0.52070044]
[8.43151570e+00 2.69039695e+02 3.08331083e+02 3.05043797e+02
 6.88538956e+02 3.97221865e+02 3.63112248e+02 3.24360884e+02
 3.15167410e+02 3.57302126e+02 2.48696067e+02 2.90939072e+02
 2.78930546e+02 1.83713492e+02 1.36820044e+02 8.69230245e+01
 7.86322820e+01 5.17433923e+01 2.86640510e+01 1.86508130e+01
 9.42721648e+00 3.33434792e+00 4.73556895e-01 1.29365346e-01
 1.68558517e+00 4.46272142e+00 7.74387472e+00 1.21501602e+01
 1.76504581e+01 1.96237808e+01 2.24384471e+01 2.27667003e




In [7]:
print(total_len)

158963


In [8]:
SPACE_TOKEN = '<space>'
SPACE_INDEX = 0
FIRST_INDEX = ord('a') - 1 

In [9]:
Train_DIR = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/MFCC_39_16khz/train/'
directory = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/data_cmds/train/'

In [10]:
# Pickle Training data

if not os.path.exists(Train_DIR):
    os.makedirs(Train_DIR)
    
bar = progressbar.ProgressBar()
for wav_path in bar(find_files(directory, pattern='**/*.wav')):
    #print(wav_path)
    speaker_ID = find_speaker_ID(wav_path)
    #print(speaker_ID)
    txt_path = return_txt_path(wav_path)
    target, original = convert_txt_index(open(txt_path).read().strip())
    inputs = convert_wav_mfcc(wav_path, 16000)
    normalize_inputs = (inputs - m)/s
    pack_data_npz(Train_DIR, normalize_inputs, target, speaker_ID, original)


100% |########################################################################|


In [11]:
# Pickle Dev data

DIR = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/MFCC_39_16khz/dev/'
directory = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/data_cmds/dev/'

if not os.path.exists(DIR):
    os.makedirs(DIR)
    
bar = progressbar.ProgressBar()
for wav_path in bar(find_files(directory, pattern='**/*.wav')):
    #print(wav_path)
    speaker_ID = find_speaker_ID(wav_path)
    #print(speaker_ID)
    txt_path = return_txt_path(wav_path)
    target, original = convert_txt_index(open(txt_path).read().strip())
    inputs = convert_wav_mfcc(wav_path, 16000)
    normalize_inputs = (inputs - m)/s
    pack_data_npz(DIR, normalize_inputs, target, speaker_ID, original)

100% |########################################################################|


In [12]:
# Pickle Test data

DIR = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/MFCC_39_16khz/test/'
directory = '/home/minhhieu/My Projects/Compressed Speech Data/command_aHieu/data_cmds/test/'

if not os.path.exists(DIR):
    os.makedirs(DIR)
    
bar = progressbar.ProgressBar()
for wav_path in bar(find_files(directory, pattern='**/*.wav')):
    #print(wav_path)
    speaker_ID = find_speaker_ID(wav_path)
    #print(speaker_ID)
    txt_path = return_txt_path(wav_path)
    target, original = convert_txt_index(open(txt_path).read().strip())
    inputs = convert_wav_mfcc(wav_path, 16000)
    normalize_inputs = (inputs - m)/s
    pack_data_npz(DIR, normalize_inputs, target, speaker_ID, original)

100% |########################################################################|
