In [169]:
import librosa
import soundfile as sf
import os
from IPython.display import Audio
import matplotlib.pyplot as plt
import numpy as np
from scipy.fftpack import rfft, irfft, fftfreq, fft, rfftfreq, ifft
from scipy import signal
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
import seaborn as sb

from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import silhouette_score
from sklearn.mixture import GaussianMixture as GMM
import python_speech_features as mfcc


In [170]:
def create_dataframe():
    # dataframe header
    header = 'rmse zero_crossing_rate'
    # insert the 21 mfcc feature column headers
    for i in range(1, 21):
        header += f' mfcc{i}'
    header = header.split()                             # make the header string -> array
    features_df = pd.DataFrame(columns = header)        # create the dataframe
    return features_df


In [176]:
def calculate_delta(array):
    rows, cols = array.shape
    deltas = np.zeros((rows, 20))
    N = 2
    for i in range(rows):
        index = []
        j = 1
        while j <= N:
            if i - j < 0:
                first = 0
            else:
                first = i - j
            if i + j > rows - 1:
                second = rows - 1
            else:
                second = i + j
            index.append((second, first))
            j += 1
        deltas[i] = (array[index[0][0]] - array[index[0][1]] + (2 * (array[index[1][0]] - array[index[1][1]]))) / 10
    return deltas

In [177]:
# function to extract features from folder of sounds and turn it into dataframe

def extractWavFeatures(soundFilesFolder):
    # create the dataframe with the needed headers
    features_df = create_dataframe()

    for filename in os.listdir(soundFilesFolder):
        number = f'{soundFilesFolder}/{filename}'
        audio, sr = librosa.load(number, mono=True, duration=3)

        # remove leading and trailing silence
        audio, index = librosa.effects.trim(audio)
        rmse = librosa.feature.rms(y=audio)
        zcr = librosa.feature.zero_crossing_rate(audio)
        mfcc_feat = librosa.feature.mfcc(y=audio, sr=sr)
        # dataframe row
        row_data = [float(np.mean(rmse)), float(np.mean(zcr))]

        # add mfcc features
        for e in mfcc_feat:
            row_data += [float(np.mean(e))]

        mfcc_feature = mfcc.mfcc(audio, sr, 0.025, 0.01, 20, nfft=1200, appendEnergy=True)
        mfcc_feature = preprocessing.scale(mfcc_feature)
        delta = calculate_delta(mfcc_feature)
        # append the row to the dataframe

        combined = np.hstack((mfcc_feature, delta))
        print(combined)
    #     features_df.loc[len(features_df)] = row_data
    # return features_df


In [179]:
Amr = extractWavFeatures('D:/My PC/Projects/DSP/Voice-Recognition-System/Model/Website Data/Amr')
Ibrahim = extractWavFeatures('D:/My PC/Projects/DSP/Voice-Recognition-System/Model/Website Data/Ibrahim')
Momen = extractWavFeatures('D:/My PC/Projects/DSP/Voice-Recognition-System/Model/Website Data/Momen')
Mariam = extractWavFeatures('D:/My PC/Projects/DSP/Voice-Recognition-System/Model/Website Data/Mariam')


[[-1.62883597e+00  4.47194401e-01 -5.59733106e-01 ... -3.15153859e-01
  -9.03537502e-02  4.88138913e-01]
 [-6.35932177e-01 -4.81034007e-02 -9.83303765e-01 ... -3.51165497e-01
  -6.57089017e-02  6.74294679e-01]
 [-4.00274476e-01  5.43938252e-02 -9.21092008e-01 ... -3.37757168e-01
  -3.84950629e-02  7.33522041e-01]
 ...
 [ 9.55504180e-01  6.08074039e-01 -9.34987812e-01 ... -2.33544178e-01
  -9.66075242e-02  1.10115327e-01]
 [ 4.80039307e-01  2.71225827e-01 -8.74218100e-01 ...  3.47118721e-03
  -1.01469170e-01  2.92842444e-01]
 [-5.37281759e+00 -2.01806767e+00  2.58623897e-01 ...  2.08010459e-01
  -5.07075921e-02  3.09873080e-01]]
[[-0.39003544  0.02216659 -0.09611547 ... -0.14799708 -0.05176217
   0.03431266]
 [ 0.26636652  0.86454214  0.30349739 ... -0.08364375 -0.15983466
   0.05503523]
 [ 0.48759041  0.95815269  0.07904524 ... -0.06968509 -0.33703141
   0.06349421]
 ...
 [-0.87697307 -0.26715481  0.12286115 ... -0.32943972 -0.30187842
  -0.06828058]
 [-0.93015996 -0.34516818  0.052784

In [78]:
Amr.dtypes

rmse                  float64
zero_crossing_rate    float64
mfcc1                 float64
mfcc2                 float64
mfcc3                 float64
mfcc4                 float64
mfcc5                 float64
mfcc6                 float64
mfcc7                 float64
mfcc8                 float64
mfcc9                 float64
mfcc10                float64
mfcc11                float64
mfcc12                float64
mfcc13                float64
mfcc14                float64
mfcc15                float64
mfcc16                float64
mfcc17                float64
mfcc18                float64
mfcc19                float64
mfcc20                float64
dtype: object

In [134]:
def train(dataframes):
    models = list()
    for df in dataframes:
        X = df
        cols = X.columns
        min_max_scaler = preprocessing.MinMaxScaler()
        np_scaled = min_max_scaler.fit_transform(X)
        # new data frame with the new scaled data.
        X = pd.DataFrame(np_scaled, columns=cols)
        gmm = GMM(n_components = 4, max_iter = 200, covariance_type='diag',n_init = 3).fit(X)
        models.append(gmm)

    return models

In [135]:
def score(models, audio_path):
        audio, sr = librosa.load(audio_path, mono=True, duration=3)

        # remove leading and trailing silence
        audio, index = librosa.effects.trim(audio)
        rmse = librosa.feature.rms(y=audio)
        zcr = librosa.feature.zero_crossing_rate(audio)
        mfcc = librosa.feature.mfcc(y=audio, sr=sr)
        # dataframe row
        row_data = [np.mean(rmse), np.mean(zcr)]

        # add mfcc features
        for e in mfcc:
            row_data += [np.mean(e)]

        score_list = np.zeros(len(models))
        for i in range(len(models)):
            out = np.array(models[i].score([row_data]))
            score_list[i] = out.sum()

        # score_list.append(model.score([features]))
        return score_list

In [136]:
models = train([Ibrahim,Momen,Amr,Mariam])

In [146]:
score_list = score(models, 'D:/My PC/Projects/DSP/Voice-Recognition-System/Model/Website Data/Mariam/9.wav')



In [147]:
winner = np.argmax(score_list)
print(winner)

1


In [148]:
print(score_list)

[-2745818.93847767 -1388825.84944711 -3768674.67082859 -2315493.70899307]
