# Voice Control Model

## Importing Libraries

In [21]:
# General math and plotting libs
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Audio processing lib
import librosa

import os

Link to online pinball game for simulation:<br>
http://www.gamesxl.com/action/vanilla-pinball<br>
Another game:<br>
http://www.coolmath-games.com/0-jumpingarrows
<br><br>
Link to convert recorded audio to .wav format<br>
https://online-audio-converter.com

## Getting the Data

In [22]:
dataset = pd.DataFrame()
directory = os.fsencode('../DirectionAudio/')

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.startswith('left') and filename.endswith('.wav'):
        print(os.path.join(directory,file))
        data_left, sampling_rate_left = librosa.load(os.path.join(directory,file))
        mfccs_mean_left = np.mean(librosa.feature.mfcc(y=data_left, sr=sampling_rate_left, n_mfcc=40).T,axis=0)
        dataset = dataset.append([np.append(mfccs_mean_left,[1,0,0])])
    elif filename.startswith('right') and filename.endswith('.wav'):
        data_right, sampling_rate_right = librosa.load(os.path.join(directory,file))
        mfccs_mean_right = np.mean(librosa.feature.mfcc(y=data_right, sr=sampling_rate_right, n_mfcc=40).T,axis=0)
        dataset = dataset.append([np.append(mfccs_mean_right,[0,1,0])])
        print(os.path.join(directory,file))
    elif filename.startswith('quiet') and filename.endswith('.wav'):
        data_quiet, sampling_rate_quiet = librosa.load(os.path.join(directory,file))
        mfccs_mean_quiet = np.mean(librosa.feature.mfcc(y=data_quiet, sr=sampling_rate_quiet, n_mfcc=40).T,axis=0)
        dataset = dataset.append([np.append(mfccs_mean_quiet,[0,0,1])])
        print(os.path.join(directory,file))

b'../DirectionAudio/left1.wav'
b'../DirectionAudio/left10.wav'
b'../DirectionAudio/left11.wav'
b'../DirectionAudio/left12.wav'
b'../DirectionAudio/left13.wav'
b'../DirectionAudio/left14.wav'
b'../DirectionAudio/left15.wav'
b'../DirectionAudio/left16.wav'
b'../DirectionAudio/left17.wav'
b'../DirectionAudio/left18.wav'
b'../DirectionAudio/left19.wav'
b'../DirectionAudio/left2.wav'
b'../DirectionAudio/left20.wav'
b'../DirectionAudio/left21.wav'
b'../DirectionAudio/left22.wav'
b'../DirectionAudio/left23.wav'
b'../DirectionAudio/left24.wav'
b'../DirectionAudio/left25.wav'
b'../DirectionAudio/left3.wav'
b'../DirectionAudio/left4.wav'
b'../DirectionAudio/left5.wav'
b'../DirectionAudio/left6.wav'
b'../DirectionAudio/left7.wav'
b'../DirectionAudio/left8.wav'
b'../DirectionAudio/left9.wav'
b'../DirectionAudio/left_le_1.wav'
b'../DirectionAudio/left_le_2.wav'
b'../DirectionAudio/left_le_3.wav'
b'../DirectionAudio/left_le_4.wav'
b'../DirectionAudio/left_le_5.wav'
b'../DirectionAudio/left_n_1.wav'


In [23]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,-288.726619,93.892599,-15.389933,10.274669,-3.393598,7.298333,-1.401164,-19.125795,-17.020796,-12.49538,...,-1.851036,-3.123573,-2.055568,-2.017058,-3.753701,-4.313261,-3.216378,1.0,0.0,0.0
0,-429.301798,64.128178,-40.126702,7.746713,20.463285,5.440499,3.448309,-7.001011,-10.469566,-3.095748,...,-4.201741,-4.504744,-3.13581,-4.396649,-3.705075,-4.985896,-1.98104,1.0,0.0,0.0
0,-410.04522,117.487378,-28.330071,21.231326,18.668342,13.03376,7.315677,-11.429619,-15.532885,-8.375415,...,-3.895937,-3.000967,-5.143781,-3.346596,-4.273768,-6.199298,-4.568073,1.0,0.0,0.0
0,-477.020967,82.562815,-11.715563,8.372501,19.072891,10.195475,12.291642,-8.970161,-16.830773,-7.055139,...,-7.092507,-2.952451,-7.355587,-4.218381,-4.573473,-5.709709,-3.624939,1.0,0.0,0.0
0,-446.279339,96.153835,-27.320466,21.262665,3.398589,1.349921,12.96321,-12.153073,-19.508947,-0.186075,...,-3.99571,-2.954876,-8.280106,-4.333872,-5.477717,-8.561925,-5.28621,1.0,0.0,0.0


In [24]:
X = dataset.iloc[:,0:40].values
y = dataset.iloc[:,-3:].values

### Splitting data into training and testing sets

In [44]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1,random_state=0)

## Creating the Deep Learning Model

In [45]:
import keras
from keras.models import Sequential
from keras.layers import Dense

In [46]:
model = Sequential()

In [47]:
model.add(Dense(256, input_shape=(40,), activation='relu', kernel_initializer='uniform'))

In [48]:
model.add(Dense(256, activation='relu',kernel_initializer='uniform'))

In [49]:
model.add(Dense(3, activation='softmax',kernel_initializer='uniform'))

### Compiling the Model

In [50]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

### Fitting the Data to the Model

In [51]:
model.fit(X_train,y_train, batch_size=1, epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


<keras.callbacks.History at 0x1c1f2a9128>

### Predicting the value using the Test set

In [52]:
y_pred = model.predict(X_test)
y_pred

array([[1.0000000e+00, 7.5693520e-14, 2.4941692e-15],
       [4.1082367e-01, 5.8915257e-01, 2.3717013e-05],
       [9.9998939e-01, 1.0548004e-05, 1.0551082e-07],
       [9.9999976e-01, 1.8717735e-07, 1.2042297e-08],
       [2.6246462e-02, 9.7339880e-01, 3.5475325e-04],
       [4.5116607e-13, 1.0000000e+00, 2.6394362e-16],
       [9.9999881e-01, 1.1745111e-06, 4.7419566e-09],
       [2.0584170e-07, 9.9999976e-01, 7.0219153e-10],
       [3.9304785e-11, 5.0066834e-10, 1.0000000e+00],
       [7.0070417e-04, 9.9929559e-01, 3.6364361e-06],
       [4.0658113e-10, 9.1372424e-11, 1.0000000e+00]], dtype=float32)

In [53]:
y_test

array([[1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [54]:
y_pred[y_pred>0.7]=1
y_pred[y_pred<=0.7]=0

In [55]:
y_pred

array([[1., 0., 0.],
       [0., 0., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 0., 1.]], dtype=float32)

In [56]:
acc_array = y_test == y_pred
acc_array

array([[ True,  True,  True],
       [ True, False,  True],
       [ True,  True,  True],
       [False, False,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

In [57]:
acc_array.all(axis=1)

array([ True, False,  True, False,  True,  True,  True,  True,  True,
        True,  True])

In [58]:
accuracy_test = (np.count_nonzero(acc_array == True)/acc_array.size)*100
accuracy_test

90.9090909090909

## Real Time Voice Processing

In [59]:
from pyaudio import PyAudio
import pyaudio
import wave

In [60]:
FORMAT = pyaudio.paInt16
CHANNELS = 1
# RATE = 44100
# Rate unit: Hz: Samples/seccond
RATE = 30000
# CHUNK = 1024
CHUNK = 1024*8
# Chunk unit: bytes
RECORD_SECONDS = 0.8
WAVE_OUTPUT_FILENAME = "direction.wav"

In [63]:
from pyautogui import press, typewrite, hotkey
# Had to pip install pyobjc before installing pyautogui
import time
time.sleep(15)

while(True):
    pa = PyAudio()
    # Start Recording
    stream = pa.open(format=FORMAT, channels=CHANNELS,input=True, rate = RATE, frames_per_buffer = CHUNK)
    print("recording...")
    frames = []
 
    for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
        data = stream.read(CHUNK)
        frames.append(data)
    print("finished recording")
    # stop Recording
    stream.stop_stream()
    stream.close()
    pa.terminate()
    waveFile = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    waveFile.setnchannels(CHANNELS)
    waveFile.setsampwidth(pa.get_sample_size(FORMAT))
    waveFile.setframerate(RATE)
    waveFile.writeframes(b''.join(frames))
    waveFile.close()
    
    temp_df = pd.DataFrame()
    data, sampling_rate = librosa.load('direction.wav')
    mfccs_mean = np.mean(librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=40).T,axis=0)
    temp_df = temp_df.append([mfccs_mean])
    pred = model.predict(temp_df)
    print(pred)
    
#     Alternate way of detecting silence using onset_strength
#     onset_env = librosa.onset.onset_strength(y=data, sr=sampling_rate,
#                                           aggregate=np.median,
#                                           fmax=8000, n_mels=256)
#     if(onset_env.max()<3):
#         continue
    
    if(pred[0][0]>0.7):
#        press('left')
        typewrite('left\n')
    elif(pred[0][1]>0.7) :
#      press('right')
        typewrite('right\n')
#     elif(pred[0][2]>0.7):
#         typewrite('quiet\n')

recording...
finished recording
[[9.719468e-01 5.618622e-09 2.805331e-02]]
recording...
finished recording
[[1.0206233e-13 7.5534370e-16 1.0000000e+00]]
recording...
finished recording
[[1.0000000e+00 1.3459071e-10 5.8350974e-08]]
recording...
finished recording
[[1.1212316e-03 9.9883503e-01 4.3647018e-05]]
recording...
finished recording
[[9.8195076e-01 1.7946778e-02 1.0245768e-04]]
recording...
finished recording
[[1.0000000e+00 2.1415134e-15 4.0209937e-15]]
recording...
finished recording
[[1.0000000e+00 1.7950098e-15 1.3935951e-14]]
recording...
finished recording
[[1.0000000e+00 3.5070116e-08 8.8630853e-10]]
recording...
finished recording
[[9.9924743e-01 7.0557010e-04 4.7098354e-05]]
recording...
finished recording
[[9.6079042e-05 9.9990380e-01 1.7312253e-07]]
recording...
finished recording
[[4.5030797e-04 1.6913737e-07 9.9954945e-01]]
recording...
finished recording
[[9.5341015e-01 5.0386132e-07 4.6589371e-02]]
recording...
finished recording
[[1.0000000e+00 1.9690033e-13 2.708

finished recording
[[2.3001335e-06 1.8831271e-12 9.9999774e-01]]
recording...
finished recording
[[3.1958769e-11 2.6796822e-16 1.0000000e+00]]
recording...
finished recording
[[1.5826132e-03 2.5996937e-12 9.9841738e-01]]
recording...
finished recording
[[2.0757946e-09 2.3551873e-16 1.0000000e+00]]
recording...
finished recording
[[7.69582661e-11 1.20526495e-14 1.00000000e+00]]
recording...
finished recording
[[7.294546e-01 1.120632e-09 2.705454e-01]]
recording...
finished recording
[[2.0661716e-07 1.4637901e-07 9.9999964e-01]]
recording...
finished recording
[[1.3277202e-08 9.7854608e-01 2.1453870e-02]]
recording...
finished recording
[[2.1253582e-08 1.0000000e+00 3.2995366e-09]]
recording...
finished recording
[[3.5805630e-03 5.2257783e-05 9.9636722e-01]]
recording...
finished recording
[[1.3847995e-05 4.0169543e-06 9.9998212e-01]]
recording...
finished recording
[[9.9999988e-01 2.3428351e-12 1.2109007e-07]]
recording...
finished recording
[[1.000000e+00 5.614371e-09 5.891612e-10]]
re

KeyboardInterrupt: 

### Using Onset Strength to detect periods of silence

In [None]:
y, sr = librosa.load('DirectionAudio/left5.wav')
onset_env = librosa.onset.onset_strength(y=y, sr=sr,
                                          aggregate=np.median,
                                          fmax=8000, n_mels=256)
onset_env

In [None]:
onset_env.max()

## Trying different ML models

### SVM

In [None]:
from sklearn import svm

In [None]:
data_svm = pd.DataFrame()
directory = os.fsencode('../DirectionAudio/')

for file in os.listdir(directory):
    filename = os.fsdecode(file)
    if filename.startswith('left') and filename.endswith('.wav'):
        print(os.path.join(directory,file))
        data_left, sampling_rate_left = librosa.load(os.path.join(directory,file))
        mfccs_mean_left = np.mean(librosa.feature.mfcc(y=data_left, sr=sampling_rate_left, n_mfcc=40).T,axis=0)
        data_svm = data_svm.append([np.append(mfccs_mean_left,[0])])
    elif filename.startswith('right') and filename.endswith('.wav'):
        data_right, sampling_rate_right = librosa.load(os.path.join(directory,file))
        mfccs_mean_right = np.mean(librosa.feature.mfcc(y=data_right, sr=sampling_rate_right, n_mfcc=40).T,axis=0)
        data_svm = data_svm.append([np.append(mfccs_mean_right,[1])])
        print(os.path.join(directory,file))
    elif filename.startswith('quiet') and filename.endswith('.wav'):
        data_quiet, sampling_rate_quiet = librosa.load(os.path.join(directory,file))
        mfccs_mean_quiet = np.mean(librosa.feature.mfcc(y=data_quiet, sr=sampling_rate_quiet, n_mfcc=40).T,axis=0)
        data_svm = data_svm.append([np.append(mfccs_mean_quiet,[2])])
        print(os.path.join(directory,file))

In [None]:
X = data_svm.iloc[:,0:40].values
y = data_svm.iloc[:,-1].values

In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train, y_test = train_test_split(X,y,test_size = 0.1,random_state=0)

In [None]:
clf = svm.SVC(decision_function_shape='ovo')

In [None]:
clf.fit(X_train,y_train)

In [None]:
clf.predict(X_test)

In [None]:
y_test

### K-Means

In [None]:
from sklearn.cluster import KMeans

In [None]:
clf  = KMeans(n_clusters=3,init='k-means++',max_iter=500)

In [None]:
clf.fit(X_train)

In [None]:
clf.predict(X_test)

In [None]:
y_test