In [9]:
import numpy as np
import python_speech_features as mfcc
from sklearn import preprocessing
import os
import pyaudio
import time
import wave
from scipy.io.wavfile import read
from sklearn.mixture import GaussianMixture as GMM
import pickle
import glob
from IPython.display import Audio, display, clear_output

Audio Processing

In [2]:
# calculate and return the delta of given feature vector matrix

def calculate_delta(array):
    """Return the delta (first-order regression) features of a feature matrix.

    For every frame ``t`` the delta is::

        (a[t+1] - a[t-1] + 2 * (a[t+2] - a[t-2])) / 10

    i.e. the standard regression formula with a window of N = 2 frames on
    each side, where 10 = 2 * (1**2 + 2**2).  Frame indices that fall
    outside the matrix are clamped to the first / last frame.

    Parameters
    ----------
    array : numpy.ndarray
        2-D matrix with one row per frame and one column per coefficient.

    Returns
    -------
    numpy.ndarray
        Float matrix of the same shape as ``array``.  The input is not
        modified.
    """
    N = 2
    # Work in floating point so integer input neither wraps (unsigned types)
    # nor breaks the final division.
    frames = np.asarray(array, dtype=float)
    rows, _ = frames.shape

    frame_idx = np.arange(rows)
    numerator = np.zeros(frames.shape, dtype=float)
    for n in range(1, N + 1):
        # Clamp neighbours at the edges so the first/last frames are repeated.
        ahead = np.minimum(frame_idx + n, rows - 1)
        behind = np.maximum(frame_idx - n, 0)
        numerator += n * (frames[ahead] - frames[behind])

    denominator = 2 * sum(n * n for n in range(1, N + 1))
    return numerator / denominator

In [3]:
# convert audio to mfcc features

def extract_features(audio, rate):
    """Convert an audio signal into per-frame MFCC + delta features.

    Parameters
    ----------
    audio : array-like
        Audio samples, passed unchanged to ``python_speech_features.mfcc``.
    rate : int
        Sample rate of ``audio`` in Hz.

    Returns
    -------
    numpy.ndarray
        One row per analysis frame: the 20 scaled cepstral coefficients
        followed by their 20 deltas (40 columns in total).
    """
    win_len = 0.025   # analysis window length in seconds
    win_step = 0.01   # hop between successive windows in seconds
    num_cep = 20      # number of cepstral coefficients to keep

    # 1103 is the FFT size this notebook has always used (a 25 ms window at
    # 44.1 kHz is 1102.5 samples).  For higher sample rates the window is
    # longer than that and would be truncated, so grow the FFT size to fit.
    nfft = max(1103, int(np.ceil(win_len * rate)))

    mfcc_features = mfcc.mfcc(audio, rate, win_len, win_step, num_cep,
                              appendEnergy=True, nfft=nfft)
    # Standardise each coefficient column before computing deltas.
    mfcc_features = preprocessing.scale(mfcc_features)
    delta = calculate_delta(mfcc_features)
    return np.hstack((mfcc_features, delta))

Adding a new user

In [34]:
def _record_wav(path, seconds, rate, channels, chunk, sample_format):
    """Record ``seconds`` of microphone input and save it to ``path`` as a WAV file."""
    recorder = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = recorder.get_sample_size(sample_format)
        stream = recorder.open(format=sample_format, channels=channels, rate=rate,
                               input=True, frames_per_buffer=chunk)
        try:
            print("recording...")
            frames = [stream.read(chunk) for _ in range(int(rate / chunk * seconds))]
        finally:
            # Always release the input device, even if a read fails.
            stream.stop_stream()
            stream.close()
    finally:
        recorder.terminate()

    with wave.open(path, 'wb') as wave_file:
        wave_file.setnchannels(channels)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(rate)
        wave_file.writeframes(b''.join(frames))


def add_user():
    """Enroll a new speaker: record three clips and train a GMM on all of them.

    Prompts for a username, records three short clips into
    ``./voice_database/<name>/{1,2,3}.wav``, extracts features from every
    recorded clip, fits one Gaussian mixture model on the stacked features
    and pickles it to ``./gmm_models/<name>.gmm``.

    Returns ``None``.  Prints a message and returns early when the username
    is invalid or already enrolled.
    """
    name = input("Enter your username: ").strip()
    # The name becomes a directory and file name, so refuse anything that is
    # empty or could point outside the database directories.
    if not name or name in ('.', '..') or name != os.path.basename(name):
        print("Invalid username")
        return
    if os.path.exists('./voice_database/' + name):
        print("User already exists \n Try again with another username")
        return

    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 3 # TODO: 5
    NUM_CLIPS = 3

    source = "./voice_database/" + name
    dest = "./gmm_models/"
    # Create missing parent directories too, so a fresh checkout works.
    os.makedirs(source)
    os.makedirs(dest, exist_ok=True)

    for i in range(NUM_CLIPS):
        if i == 0:
            # Countdown before the first recording.
            for seconds_left in range(3, -1, -1):
                time.sleep(1.0)
                print("Speak your name in {} seconds".format(seconds_left))
                clear_output(wait = True)
        elif i == 1:
            print("Speak your name one more time")
            time.sleep(0.5)
        else:
            print("Speak your name one last time")
            time.sleep(0.5)

        _record_wav(source + '/' + str(i + 1) + '.wav', RECORD_SECONDS,
                    RATE, CHANNELS, CHUNK, FORMAT)
        print("Done")

    # Gather features from every recorded clip.  The accumulator must live
    # outside the loop, otherwise only the last clip would reach the model.
    wav_paths = sorted(
        os.path.join(source, fname)
        for fname in os.listdir(source)
        if fname.lower().endswith('.wav')
    )
    feature_blocks = []
    for wav_path in wav_paths:
        # reading audio files of speaker
        sample_rate, samples = read(wav_path)
        # extract 40 dimensional MFCC & delta MFCC features
        feature_blocks.append(extract_features(samples, sample_rate))
    features = np.vstack(feature_blocks)

    gmm = GMM(n_components = 16, max_iter = 200, covariance_type='diag', n_init = 3)
    gmm.fit(features)

    # saving the trained gaussian model
    with open(dest + name + '.gmm', 'wb') as file:
        pickle.dump(gmm, file)
    print(name + ' added successfully')



Delete User

In [5]:
def delete_user():
    """Delete an enrolled speaker's recordings and trained model.

    Prompts for a username, then removes ``./voice_database/<name>/`` (with
    the files inside it) and ``./gmm_models/<name>.gmm``.  Prints a message
    and returns without touching anything when the name is invalid or no
    such user exists.  Returns ``None``.
    """
    name = input("Enter the name of the user you want to delete: ").strip()
    # An empty name would address the database root itself, and a name with
    # path separators could reach outside it - refuse both.
    if not name or name in ('.', '..') or name != os.path.basename(name):
        print("Invalid username")
        return

    user_dir = './voice_database/' + name
    model_path = './gmm_models/' + name + '.gmm'
    has_recordings = os.path.isdir(user_dir)
    has_model = os.path.isfile(model_path)
    if not has_recordings and not has_model:
        print("User doesn't exist!")
        return

    if has_recordings:
        # listdir (unlike a glob pattern) treats the name literally and also
        # sees hidden files, so the rmdir below cannot fail on leftovers.
        for entry in os.listdir(user_dir):
            os.remove(os.path.join(user_dir, entry))
        os.rmdir(user_dir)
    if has_model:
        os.remove(model_path)
    print(name + ' deleted successfully')

Voice Authentication

In [60]:
def recognize():
    """Record a short clip and print which enrolled speaker matches it best.

    Loads every ``*.gmm`` model from ``./gmm_models/``, records a clip to
    ``./test.wav``, scores its features against each model and prints, in
    order: the array of per-model scores, the best-scoring speaker name and
    the list of all speaker names.

    Returns ``None``.  If no models are enrolled a message is printed and
    nothing is recorded.

    TODO: there is no rejection threshold, so the best-scoring enrolled
    speaker is always reported, even for an unknown voice.
    """
    FORMAT = pyaudio.paInt16
    CHANNELS = 1
    RATE = 44100
    CHUNK = 1024
    RECORD_SECONDS = 3
    FILENAME = "./test.wav"

    # Look for enrolled models first: recording is pointless without any.
    modelpath = "./gmm_models/"
    gmm_files = []
    if os.path.isdir(modelpath):
        gmm_files = [os.path.join(modelpath, fname)
                     for fname in os.listdir(modelpath) if fname.endswith('.gmm')]
    if len(gmm_files) == 0:
        print("No Users in the Database!")
        return

    recorder = pyaudio.PyAudio()
    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = recorder.get_sample_size(FORMAT)

        # start Recording
        stream = recorder.open(format=FORMAT, channels=CHANNELS, rate=RATE,
                               input=True, frames_per_buffer=CHUNK)
        try:
            print("recording...")
            frames = [stream.read(CHUNK)
                      for _ in range(int(RATE / CHUNK * RECORD_SECONDS))]
            print("finished recording")
        finally:
            # stop Recording - always release the input device
            stream.stop_stream()
            stream.close()
    finally:
        recorder.terminate()

    # saving wav file
    with wave.open(FILENAME, 'wb') as wave_file:
        wave_file.setnchannels(CHANNELS)
        wave_file.setsampwidth(sample_width)
        wave_file.setframerate(RATE)
        wave_file.writeframes(b''.join(frames))

    # NOTE: unpickling executes arbitrary code; only load model files that
    # were written by this notebook.
    models = []
    for fname in gmm_files:
        with open(fname, 'rb') as model_file:
            models.append(pickle.load(model_file))
    # Every path ends in '.gmm' (filtered above), so strip exactly that suffix.
    speakers = [os.path.basename(fname)[:-len('.gmm')] for fname in gmm_files]

    # read test file
    sample_rate, samples = read(FILENAME)

    # extract mfcc features
    vector = extract_features(samples, sample_rate)

    # score() yields one scalar per model; the highest value wins.
    log_likelihood = np.array([float(gmm.score(vector)) for gmm in models])

    print(log_likelihood)

    pred = np.argmax(log_likelihood)
    identity = speakers[pred]
    print(identity)
    print(speakers)


In [71]:
# Run one identification attempt. recognize() prints its results and has no
# return statement, so `like` is None after this cell.
like = recognize()

recording...
finished recording
[-24.70263553 -30.5747202  -26.82657463]
test2
['test2', 'python', 'vishal']


In [55]:
# NOTE(review): recognize() as defined in this notebook returns nothing, so
# `like` is presumably None here - confirm what this cell is meant to inspect.
np.argmax(like)

0

In [31]:
add_user()

recording...
Done
Speak your name one more time
recording...
Done
Speak your name one last time
recording...
Done
vishal added successfully


In [36]:
# NOTE(review): the stored output below (a long array of small integers) is
# not what the recognize() definition in this notebook prints - presumably it
# came from an earlier version of the function; re-run to refresh it.
log = recognize()

recording...
finished recording
[12  6  6  6  6 12 12 12 12 12 12 12 12 12 12 10  8 10 10 12 12 12 10 12
  8 12 12 12 10 12 12 12  1 13  4  1 14 14 11  4 13 13 11 14 14  8  4 13
 11 13  8  4 13  1 12 12  8 12 10 12  8  8 12 12 12 14 12  1  8 12  8 12
 12  8 12 12  8  8 12 12 12 12 12 14  8  4  4  4  1 13  1  9  8 12  1  8
  0  8  8 13 12  4 13 12 12 12 12  8  1  8 12 12 12 12  8  1  8 13 13 12
 12 12 13  1  1 13 13 11  4 13 13  0  8 13  8 13 13  1 14 14 13  8  1 13
 13 14 11 11  1 13 13 12  8  8 13 13 13  8  8 13  4 13 12 12 12 12  8 12
  8 11  8 12 12 12 12  8 12 13 12  8  4 13  8  8 14 14 12 11 11 12 14  1
  0  8 13  8 14 13 12 12 12 14 11 14 12  8 12 11  1 12 14 12  9 12 12 12
 12 11 12  8  1  4 13 12  1 13  9  1  4 13 13  8 11 12 14  1 14 12 12 12
 12 12 11 11  1 11 11  4 12 12  8  8 12 12 12 12  8  8 12 12 12 12  8 12
 12 12 12 12 12 12 12 12 12 12 11 12 12 12 12 12 12 12 12 12 14 14  8 13
 12 12 12 12 12 14  1  9 13 13 12]


In [37]:
log2 = recognize()

recording...
finished recording
[13 13  5  0  0  0  5  5  0  5  5 13  5 13  5 13 13 13 13 13 13  0 13  5
  5 13  5  5  5  5  5  5  5  5  5  5  5  5  5 13  5  5  5  5  5  5  5 13
  5  5  5  5 13 13 13  5  5  5  5  5  5  5  5 13  5  5  5  5 13  5  5  5
 13  5  5  5  5  5  5  5  5  5  5  5 13  5  5  5  5 13 13 13  0 13  0  5
  5  5  5 13  5 13  5  5 13  5  5  9  5 13 13  5 13  5  5  5  5  5  5  5
  5  5  5  5  5  5 13 13 13 13  5  5  5  5  5  5  5  5  5  5 13  5  5 13
 13 13 13 13 13 13  5  5  5 13  5  5  5  0  5 13  5  5 13 13  5  5  5  5
  5  5  5  0  0  5 13  5  0  5  0  5  5  5 13  5  5 13  5 13 13 13  5  5
  5  5  5 13  5  5  5  5  5  5  5  5 13 13  5  5 13 13 13  5 13  5  5  5
  5  5  5  5  5  5  5  5  0  5  0  5 13  5  0 13 13 13  5  5  5 13  5  5
  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  5  0 13  0  5  5  5
 13  5  5  5  5  5  5  5  5  5  5  5  5 13  5  5  5  5  5 13  5  5  5 13
  5  0  5  5  0  5  5  5  5  5  5]


In [38]:
log3 = recognize()

recording...
finished recording
[12 12 12 12 12 12 12 12 11 11 11 14 12 14  8 14  8 14  8 14 14  8 14 14
 14 11 14 14 14 14 11 14 13  1 11 14 11 11 11 13 11  1  8 14  8 14 14 11
 11 11 14 14 14 14 14 11 14 14 14 14  1 14 14 11 14 14 13 13 14 11 11 14
 13 13 14  8 14 14 14 11 11 14 12 14  8 12 14 14 12 12 14 12 12 12 12 12
 14 12 12 12  1 14 14 14 12 13  0  0  8 13 14 14  8 12 12 12 13 12 12  8
 12 12 12 12 12 12 12 12 12 12 12 12  8  8 12  8 12 12 12 12 12 12 12 12
 12 12 12 12 13 12 13 12 13 12 12  8 12 12 12 12 12  8  8 12 12 12 12 12
 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
 12 12 12 12 12 12 12 12 12 12 12 13 13 12 12 12 12 12  8 13 12  9 13 14
 13 11 12 12 12 12 12 12 12 12 12  3  3 12 12 12 12 12 12 12 12 12 12 12
 12  8 13  4 13  9 13 13 13 13 13 11 13  8 14 13 14 14 11 11 14 14 13 11
  0 11 14 13  8  0 13  1 11 11 13 14 14 11 11 14 14 11  1 14  1 14 13 14
 14 11  1 14 14  8 13 12  8  8 11]


In [39]:
log.sum()

3030

In [40]:
log2.sum()

1964

In [41]:
log3.sum()

3454

In [42]:
log4 = recognize()

recording...
finished recording
[13 13  0  0  0  0  0  0  0  0  0  0 13  4  5  4  7  7  7  4  7  4  7  4
  7  7  7 13  7  0  4  7  4  5  0  0  0  0  5  5  0  5  0  5  0  0  0  5
  5  0  7  5  7  7  7  0  5  0  0  5  5  0  7  5  0  5 13  5  0  0  0  4
  7  7  4  4  4  7  7  7  4  7  7  4  4  4  7  4  4  7  4  7  7  7  7  7
  7  7  7  4  7  7  7  4  7  4 13  7 13 13 13  4  7  0  4  4  4  0  0  0
  0  7 13  5  5 13  6  6  6  2  5  5  5  5  2  8  5  8  8  8  5  5 11  6
  6  6  6  6  5  6  6  5  8  8  8  6  8  8  6 11  5  5  5  5  5  5  5 11
 11 11  5 11 11 11  5  5  5  5  5  5  5  5  5  5  5  5  5  8 11  5  5  5
  5  5  5  5  5  5  5 11  5 11 11  2  2  2  2  2  5  5  5  2  5  5  5  2
  2  5  5  5  5  5  5 13 13 13 13 13  1  1  1  1  1  1  1  1  5  5  5  8
  8  8 11 11 11  8  6  5  6  6  6  6  6  6  6  5  5  5  7  4  5 13 13 13
 13 13 13 13 13 13  0  0  0  0  0  4 13 13  4  0  0  0  0  0  0  0  0  4
  4  0  0  0  0  0  0  4  7 13  0]


In [43]:
log4.sum()

1590

In [29]:
type(log)

numpy.float64

In [30]:
log.sum()

-22.797398997781972

In [44]:
add_user()

recording...
Done
Speak your name one more time
recording...
Done
Speak your name one last time
recording...
Done
python added successfully


In [48]:
recognize()

recording...
finished recording
-30.05756359386624


-30.05756359386624