<a href="https://colab.research.google.com/github/Freireg/SpeechRecognition/blob/main/VoiceRecognition_RPI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [56]:
!pip install python_speech_features




In [57]:
from os import listdir
from os.path import isdir, join
import librosa
import random
import numpy as np
import matplotlib.pyplot as plt
import python_speech_features
from google.colab import drive

In [58]:
#drive.mount('Machine_Learning_Dataset')
dataset_path = '/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets'

for name in listdir(dataset_path):
  if isdir(join(dataset_path, name)):
    print(name)

follow
_background_noise_
backward
bird
cat
dog
down


In [59]:
#Create an all targets list
all_targets = [name for name in listdir(dataset_path) if isdir(join(dataset_path, name))]
all_targets.remove('_background_noise_')
print(all_targets)

['follow', 'backward', 'bird', 'cat', 'dog', 'down']


In [60]:
# Settings
target_list = all_targets
feature_sets_file = '/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/all_targets_mfcc_sets.npz'
perc_keep_samples = 1.0 # 1.0 is keep all samples
val_ratio = 0.1
test_ratio = 0.1
sample_rate = 8000
num_mfcc = 16
len_mfcc = 16

In [61]:
# Create list of filenames along with ground truth vector (y)
filenames = []
y = []
for index, target in enumerate(target_list):
    print(join(dataset_path, target))
    filenames.append(listdir(join(dataset_path, target)))
    y.append(np.ones(len(filenames[index])) * index)

/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets/follow
/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets/backward
/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets/bird
/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets/cat
/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets/dog
/content/Machine_Learning_Dataset/MyDrive/Machine_Learning_Dataset/Datasets/down


In [62]:
# Check ground truth Y vector
print(y)
for item in y:
    print(len(item))

[array([0., 0., 0., ..., 0., 0., 0.]), array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.

In [63]:
# Flatten filename and y vectors
filenames = [item for sublist in filenames for item in sublist]
y = [item for sublist in y for item in sublist]

In [64]:
# Associate filenames with true output and shuffle
filenames_y = list(zip(filenames, y))
random.shuffle(filenames_y)
filenames, y = zip(*filenames_y)

In [65]:
# Only keep the specified number of samples (shorter extraction/training)
print(len(filenames))
filenames = filenames[:int(len(filenames) * perc_keep_samples)]
print(len(filenames))

9969
9969


In [66]:
# Calculate validation and test set sizes
val_set_size = int(len(filenames) * val_ratio)
test_set_size = int(len(filenames) * test_ratio)

In [67]:
# Break dataset apart into train, validation, and test sets
filenames_val = filenames[:val_set_size]
filenames_test = filenames[val_set_size:(val_set_size + test_set_size)]
filenames_train = filenames[(val_set_size + test_set_size):]

In [68]:
# Break y apart into train, validation, and test sets
y_orig_val = y[:val_set_size]
y_orig_test = y[val_set_size:(val_set_size + test_set_size)]
y_orig_train = y[(val_set_size + test_set_size):]

In [69]:
# Function: Create MFCC from given path
def calc_mfcc(path):
    
    # Load wavefile
    signal, fs = librosa.load(path, sr=sample_rate)
    
    # Create MFCCs from sound clip
    mfccs = python_speech_features.base.mfcc(signal, 
                                            samplerate=fs,
                                            winlen=0.256,
                                            winstep=0.050,
                                            numcep=num_mfcc,
                                            nfilt=26,
                                            nfft=2048,
                                            preemph=0.0,
                                            ceplifter=0,
                                            appendEnergy=False,
                                            winfunc=np.hanning)
    return mfccs.transpose()

In [70]:
# TEST: Construct test set by computing MFCC of each WAV file
prob_cnt = 0
x_test = []
y_test = []
for index, filename in enumerate(filenames_train):
    
    # Stop after 500
    if index >= 500:
        break
    
    # Create path from given filename and target item
    path = join(dataset_path, target_list[int(y_orig_train[index])], 
                filename)
    
    # Create MFCCs
    mfccs = calc_mfcc(path)
    
    if mfccs.shape[1] == len_mfcc:
        x_test.append(mfccs)
        y_test.append(y_orig_train[index])
    else:
        print('Dropped:', index, mfccs.shape)
        prob_cnt += 1
        

Dropped: 6 (16, 11)
Dropped: 11 (16, 9)
Dropped: 21 (16, 13)
Dropped: 30 (16, 9)
Dropped: 42 (16, 10)
Dropped: 50 (16, 7)
Dropped: 68 (16, 14)
Dropped: 82 (16, 12)
Dropped: 90 (16, 13)
Dropped: 102 (16, 11)
Dropped: 108 (16, 11)
Dropped: 127 (16, 15)
Dropped: 129 (16, 10)
Dropped: 140 (16, 14)
Dropped: 165 (16, 13)
Dropped: 171 (16, 14)
Dropped: 178 (16, 8)
Dropped: 183 (16, 11)
Dropped: 195 (16, 13)
Dropped: 199 (16, 11)
Dropped: 203 (16, 13)
Dropped: 209 (16, 13)
Dropped: 213 (16, 9)
Dropped: 223 (16, 14)
Dropped: 224 (16, 15)
Dropped: 230 (16, 15)
Dropped: 241 (16, 11)
Dropped: 279 (16, 15)
Dropped: 287 (16, 14)
Dropped: 289 (16, 12)
Dropped: 303 (16, 13)
Dropped: 329 (16, 12)
Dropped: 346 (16, 12)
Dropped: 352 (16, 15)
Dropped: 366 (16, 12)
Dropped: 375 (16, 12)
Dropped: 377 (16, 11)
Dropped: 380 (16, 15)
Dropped: 383 (16, 13)
Dropped: 384 (16, 8)
Dropped: 386 (16, 14)
Dropped: 391 (16, 10)
Dropped: 395 (16, 11)
Dropped: 398 (16, 7)
Dropped: 419 (16, 10)
Dropped: 438 (16, 11)
Dropp

In [71]:
print('% of problematic samples:', prob_cnt / 500)

% of problematic samples: 0.096


In [72]:
# TEST: Test shorter MFCC
#!pip install playsound
#from playsound import playsound

#idx = 13

# Create path from given filename and target item
#path = join(dataset_path, target_list[int(y_orig_train[idx])], 
#            filenames_train[idx])

# Create MFCCs
#mfccs = calc_mfcc(path)
#print("MFCCs:", mfccs)

# Plot MFCC
#fig = plt.figure()
#plt.imshow(mfccs, cmap='inferno', origin='lower')

# TEST: Play problem sounds
#print(target_list[int(y_orig_train[idx])])
#playsound(path)

In [73]:
# Function: Create MFCCs, keeping only ones of desired length
def extract_features(in_files, in_y):
    prob_cnt = 0
    out_x = []
    out_y = []
        
    for index, filename in enumerate(in_files):
    
        # Create path from given filename and target item
        path = join(dataset_path, target_list[int(in_y[index])], 
                    filename)
        
        # Check to make sure we're reading a .wav file
        if not path.endswith('.wav'):
            continue

        # Create MFCCs
        mfccs = calc_mfcc(path)

        # Only keep MFCCs with given length
        if mfccs.shape[1] == len_mfcc:
            out_x.append(mfccs)
            out_y.append(in_y[index])
        else:
            print('Dropped:', index, mfccs.shape)
            prob_cnt += 1
            
    return out_x, out_y, prob_cnt

In [None]:
# Create train, validation, and test sets
x_train, y_train, prob = extract_features(filenames_train, 
                                          y_orig_train)
print('Removed percentage:', prob / len(y_orig_train))
x_val, y_val, prob = extract_features(filenames_val, y_orig_val)
print('Removed percentage:', prob / len(y_orig_val))
x_test, y_test, prob = extract_features(filenames_test, y_orig_test)
print('Removed percentage:', prob / len(y_orig_test))

In [75]:
# Save features and truth vector (y) sets to disk
np.savez(feature_sets_file, 
         x_train=x_train, 
         y_train=y_train, 
         x_val=x_val, 
         y_val=y_val, 
         x_test=x_test, 
         y_test=y_test)

In [76]:
# TEST: Load features
feature_sets = np.load(feature_sets_file)
feature_sets.files

['x_train', 'y_train', 'x_val', 'y_val', 'x_test', 'y_test']

In [77]:
len(feature_sets['x_train'])

7107

In [78]:
print(feature_sets['y_val'])

[5. 0. 2. 2. 4. 4. 5. 0. 4. 3. 0. 1. 5. 5. 4. 3. 5. 2. 0. 4. 1. 4. 5. 3.
 4. 3. 0. 4. 2. 2. 2. 4. 3. 5. 0. 3. 2. 5. 2. 5. 2. 3. 0. 2. 5. 3. 5. 0.
 5. 5. 4. 0. 3. 2. 0. 5. 4. 3. 0. 5. 4. 0. 1. 5. 4. 0. 2. 0. 4. 3. 0. 4.
 0. 4. 4. 2. 0. 3. 0. 0. 2. 5. 2. 3. 4. 5. 3. 2. 4. 5. 2. 4. 0. 2. 4. 0.
 5. 5. 2. 5. 0. 5. 4. 2. 5. 5. 0. 5. 4. 5. 1. 4. 2. 5. 5. 4. 4. 4. 0. 4.
 3. 5. 5. 3. 2. 3. 3. 5. 4. 0. 3. 0. 5. 0. 4. 5. 2. 0. 5. 5. 0. 4. 5. 5.
 3. 0. 5. 2. 2. 0. 0. 0. 0. 2. 4. 5. 2. 5. 2. 2. 4. 0. 3. 3. 4. 5. 2. 2.
 2. 3. 0. 3. 0. 2. 5. 2. 5. 4. 0. 3. 0. 4. 0. 4. 3. 2. 2. 3. 3. 5. 5. 4.
 3. 5. 3. 4. 3. 2. 3. 1. 4. 0. 3. 0. 4. 4. 3. 5. 0. 2. 5. 0. 0. 4. 0. 0.
 5. 5. 4. 4. 3. 0. 2. 0. 2. 5. 3. 2. 2. 4. 1. 4. 0. 4. 5. 5. 3. 3. 4. 5.
 4. 2. 4. 3. 0. 0. 0. 3. 4. 3. 1. 4. 0. 3. 5. 2. 4. 5. 4. 0. 2. 4. 4. 5.
 4. 4. 5. 3. 5. 2. 2. 4. 3. 0. 0. 3. 3. 3. 3. 2. 4. 3. 1. 3. 4. 3. 4. 3.
 0. 0. 0. 2. 4. 2. 3. 1. 4. 0. 2. 0. 4. 4. 5. 3. 5. 5. 2. 0. 4. 2. 2. 1.
 2. 4. 5. 3. 4. 3. 0. 4. 5. 2. 5. 5. 4. 3. 4. 2. 0.