In [8]:
# basic packages

import numpy as np
import librosa
import os
import pandas as pd
import pickle

In [9]:
# machine learning & preprocessing packages

from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

In [17]:
def create_dataframe():
    # dataframe header
    header = 'filename rmse zero_crossing_rate'
    # insert the 21 mfcc feature column headers
    for i in range(1, 21):
        header += f' mfcc{i}'
    header += ' label'                                  # the label header
    header = header.split()                             # make the header string -> array
    features_df = pd.DataFrame(columns = header)        # create the dataframe
    return features_df


In [18]:
# function to extract features from folder of sounds and turn it into dataframe

def extractWavFeatures(soundFilesFolder):
    # create the dataframe with the needed headers
    features_df = create_dataframe()

    for filename in os.listdir(soundFilesFolder):
        number = f'{soundFilesFolder}/{filename}'
        audio, sr = librosa.load(number, mono=True, duration=3)


    # remove leading and trailing silence
        audio, index = librosa.effects.trim(audio)

        chroma_shift_mean       = librosa.feature.chroma_stft(y=audio, sr=sr).mean(),
        chroma_shift_var        = librosa.feature.chroma_stft(y=audio, sr=sr).var(),
        rms_mean                = librosa.feature.rms(y=audio).mean(),
        rms_var                 = librosa.feature.rms(y=audio).var(),
        spectral_cent_mean      = librosa.feature.spectral_centroid(y=audio).mean(),
        spectral_cent_var       = librosa.feature.spectral_centroid(y=audio).var(),
        spectral_bandwidth_mean = librosa.feature.spectral_bandwidth(y=audio).mean(),
        spectral_bandwidth_var  = librosa.feature.spectral_bandwidth(y=audio).var(),
        spectral_rolloff_mean   = librosa.feature.spectral_rolloff(y=audio).mean(),
        spectral_rolloff_var    = librosa.feature.spectral_rolloff(y=audio).var()

        # dataframe row
        #row_data = f'{chroma_shift_mean} {chroma_shift_var} {rms_mean} {rms_var} {spectral_cent_mean} {spectral_cent_var} {spectral_bandwidth_mean} {spectral_bandwidth_var} {spectral_rolloff_mean} {spectral_rolloff_var}'
        row_data = [librosa.feature.chroma_stft(y=audio, sr=sr).mean(), librosa.feature.chroma_stft(y=audio, sr=sr).var(),
                        librosa.feature.rms(y=audio).mean(), librosa.feature.rms(y=audio).var(),
                        librosa.feature.spectral_centroid(y=audio).mean(),librosa.feature.spectral_centroid(y=audio).var(),
                        librosa.feature.spectral_bandwidth(y=audio).mean(),librosa.feature.spectral_bandwidth(y=audio).var(),
                        librosa.feature.spectral_rolloff(y=audio).mean(),librosa.feature.spectral_rolloff(y=audio).var()]


        mfcc = librosa.feature.mfcc(y=audio)

        # add mfcc features
        for e in mfcc:
            row_data = row_data + [e.mean()]

        # add the labels for the dataframe
        if 'ibrahim' in filename:
            row_data = row_data + [1]
        elif 'Amr' in filename:
            row_data = row_data + [2]
        elif 'mariam' in filename:
            row_data = row_data + [3]
        elif 'momen' in filename:
            row_data = row_data + [4]
        else:
            row_data = row_data + [0]

        print(row_data)

        # append the row to the dataframe
        features_df.loc[len(features_df)] = row_data

    return features_df

In [36]:
my_df = extractWavFeatures("D:/My PC/Projects/DSP/Voice-Recognition-System/vrs-server/apis/Website Data/all-other")

['chroma_shift_mean', 'chroma_shift_var', 'rms_mean', 'rms_var', 'spectral_cent_mean', 'spectral_cent_var', 'spectral_bandwidth_mean', 'spectral_bandwidth_var', 'spectral_rolloff_mean', 'spectral_rolloff_var', 'mfcc1', 'mfcc2', 'mfcc3', 'mfcc4', 'mfcc5', 'mfcc6', 'mfcc7', 'mfcc8', 'mfcc9', 'mfcc10', 'mfcc11', 'mfcc12', 'mfcc13', 'mfcc14', 'mfcc15', 'mfcc16', 'mfcc17', 'mfcc18', 'mfcc19', 'mfcc20', 'label']
[0.3374428, 0.09618429, 0.03819224, 0.0039691315, 1327.1612759155637, 125345.96617884764, 1710.3009959304954, 163288.3452199652, 2494.546962020421, 1222380.7767073018, -425.10632, 113.61859, 19.532011, 4.25592, -2.7544334, -5.9358263, 1.0273383, -11.377585, -15.920447, -9.069541, -7.09402, -1.2167988, -1.4892529, -3.1446557, 2.9542098, 2.9513304, -6.149427, 1.2425432, -2.3654845, -7.030895, 0]
[0.37467796, 0.09584106, 0.044004973, 0.007976218, 1842.900280575327, 1014501.9847974137, 1959.9180624161254, 406412.6593843397, 3672.9889968345906, 4562966.009270838, -402.6135, 71.45221, 19.7

  return f(*args, **kwargs)


[0.35163966, 0.094838336, 0.13547368, 0.006909913, 1452.8301908036203, 93121.58424327009, 1386.9505982363996, 48592.31944194239, 2779.7632677801726, 517352.4035809297, -153.91016, 155.75984, -57.20391, 29.671213, -11.50448, -3.9980826, -15.345577, -12.749097, -23.455542, 2.727752, -7.6402345, -8.948462, 2.6184392, -12.236117, 0.14805365, 0.49169168, -9.972308, -6.0098934, -1.3978369, -6.112736, 0]
[0.419606, 0.09063352, 0.01591849, 0.00013049654, 1556.2446232618709, 483051.8891756937, 1804.8954543425014, 227476.56057877347, 3406.329977101293, 2666564.3373808595, -401.62833, 137.55945, -6.5015755, 52.86453, -10.056593, 13.638168, -0.8160071, 1.6358126, -16.43949, 5.557709, -17.524132, -7.567836, -3.9858103, -0.7378466, 1.2691176, -2.9382632, -8.436907, -1.5513288, -3.619797, -8.634042, 0]
[0.3415224, 0.09911839, 0.031701893, 0.0002039302, 935.9244792704034, 402683.22139887733, 981.3546530771591, 135834.6077117464, 1534.5501077586207, 1189071.9703587568, -370.78046, 202.65993, -24.16371,

In [37]:
my_df

Unnamed: 0,chroma_shift_mean,chroma_shift_var,rms_mean,rms_var,spectral_cent_mean,spectral_cent_var,spectral_bandwidth_mean,spectral_bandwidth_var,spectral_rolloff_mean,spectral_rolloff_var,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,0.337443,0.096184,0.038192,0.003969,1327.161276,1.253460e+05,1710.300996,163288.345220,2494.546962,1.222381e+06,...,-1.216799,-1.489253,-3.144656,2.954210,2.951330,-6.149427,1.242543,-2.365484,-7.030895,0.0
1,0.374678,0.095841,0.044005,0.007976,1842.900281,1.014502e+06,1959.918062,406412.659384,3672.988997,4.562966e+06,...,0.129431,-3.286428,-1.928210,-5.382076,1.589995,-0.220456,0.738965,-3.223547,-2.810089,0.0
2,0.410613,0.093117,0.040251,0.006355,1784.015186,9.860222e+05,2048.003297,358752.248928,3689.231715,4.807664e+06,...,-1.851334,-2.043721,1.549013,-1.923059,0.825812,0.068985,0.657550,-4.563477,-0.867913,0.0
3,0.331085,0.098558,0.023443,0.001997,1784.807185,7.701224e+05,1855.793091,340334.346252,3453.340229,3.281483e+06,...,5.708575,-2.120415,1.420492,-0.819602,-4.160927,-3.729533,1.026227,-3.137887,-1.790695,0.0
4,0.419685,0.090898,0.023399,0.002732,1755.381442,1.086911e+06,1977.238090,219046.530711,3358.622794,3.092799e+06,...,-1.016256,-4.475920,0.344913,-1.175172,-0.623551,-4.891524,0.578958,2.032105,0.795735,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
626,0.335542,0.097081,0.082318,0.002301,1566.181910,7.314636e+05,1594.470392,224267.030846,2941.160122,2.585929e+06,...,7.201744,-4.329013,-5.788375,5.112070,-4.896547,-3.312038,-3.867249,-5.339969,-2.423097,0.0
627,0.332230,0.091645,0.091341,0.004895,1046.169286,2.560589e+05,994.014818,169754.619131,1843.320968,1.103990e+06,...,-6.745724,2.590726,-6.284797,-2.629619,-2.952410,-4.904474,-7.318274,-8.095371,-3.430597,0.0
628,0.428845,0.105933,0.053220,0.002423,1390.512926,1.485705e+05,1605.829229,99878.957371,2962.880257,1.111701e+06,...,-15.023808,6.951869,-2.415708,-4.868324,-1.669878,0.404921,-4.545482,-5.168434,3.255368,0.0
629,0.460332,0.094499,0.054564,0.003126,1805.220564,1.044535e+06,1777.195738,185662.458364,3606.004028,2.709373e+06,...,-3.942811,-5.146810,-1.610069,5.273541,-2.639288,-8.748569,-0.068908,-6.334131,-10.009932,0.0


In [38]:
my_df.to_csv('./others.csv')

In [28]:
others_df = extractWavFeatures('./otherdata')

In [30]:
others_df.dtypes

chroma_shift_mean          object
chroma_shift_var           object
rms_mean                   object
rms_var                    object
spectral_cent_mean         object
spectral_cent_var          object
spectral_bandwidth_mean    object
spectral_bandwidth_var     object
spectral_rolloff_mean      object
spectral_rolloff_var       object
mfcc1                      object
mfcc2                      object
mfcc3                      object
mfcc4                      object
mfcc5                      object
mfcc6                      object
mfcc7                      object
mfcc8                      object
mfcc9                      object
mfcc10                     object
mfcc11                     object
mfcc12                     object
mfcc13                     object
mfcc14                     object
mfcc15                     object
mfcc16                     object
mfcc17                     object
mfcc18                     object
mfcc19                     object
mfcc20        

In [32]:
pd.to_numeric(my_df, downcast='signed')

TypeError: arg must be a list, tuple, 1-d array, or Series

In [62]:
train_df = pd.concat([team_df, others_df])
# train_df = team_df

In [63]:
train_df

Unnamed: 0,filename,rmse,zero_crossing_rate,mfcc1,mfcc2,mfcc3,mfcc4,mfcc5,mfcc6,mfcc7,...,mfcc12,mfcc13,mfcc14,mfcc15,mfcc16,mfcc17,mfcc18,mfcc19,mfcc20,label
0,Amr1.wav,0.03124353289604187,0.09524574762658228,-398.46075439453125,138.5895538330078,3.0195839405059814,-12.14544677734375,-12.920365333557129,13.24248218536377,-16.564815521240234,...,0.2325524240732193,0.9407077431678772,0.8140118718147278,-4.634973049163818,5.3455023765563965,3.7176096439361572,-7.20994758605957,-3.473836660385132,0.6744386553764343,2
1,Amr10.wav,0.021408159285783768,0.08141243811881188,-434.5036315917969,152.6798095703125,12.030377388000488,0.9768251180648804,0.0641842857003212,11.69894790649414,-16.82280158996582,...,2.804825782775879,3.359466314315796,-1.487720251083374,2.9492483139038086,8.075194358825684,-1.483323335647583,0.37750381231307983,-0.8345574140548706,1.6290479898452759,2
2,Amr11.wav,0.052045322954654694,0.09086834016393443,-389.0693664550781,140.72108459472656,22.436880111694336,7.38924503326416,-14.420411109924316,10.467955589294434,-14.195365905761719,...,3.77705454826355,4.0779266357421875,-2.4903037548065186,1.615167260169983,2.439157247543335,3.506373405456543,-4.329512119293213,-0.24602235853672028,-3.07216215133667,2
3,Amr12.wav,0.03662358224391937,0.08905666977611941,-430.4794921875,148.5320281982422,19.754179000854492,2.6437618732452393,-15.569217681884766,9.518011093139648,-15.909305572509766,...,4.775101661682129,7.445225238800049,-6.382566452026367,0.664188802242279,6.244306564331055,0.35481828451156616,-3.0987207889556885,-0.8284878134727478,-3.9141740798950195,2
4,Amr13.wav,0.035578180104494095,0.08059939822635136,-429.6603088378906,149.5933380126953,16.58245849609375,-3.1334006786346436,-18.459609985351562,8.883918762207031,-14.992230415344238,...,7.0627827644348145,6.48460578918457,-6.427166938781738,0.4854406416416168,4.968094348907471,0.7781060934066772,-3.97288179397583,-0.9344853758811951,-2.878880262374878,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15,Sama_Mostafa14.wav,0.09802084416151047,0.08207334321120689,-309.14495849609375,145.9494171142578,0.4912550747394562,0.4897775650024414,-1.1724460124969482,-11.389527320861816,-12.105578422546387,...,4.070107936859131,-4.355195045471191,0.548127293586731,4.8445820808410645,-2.4607954025268555,-6.609767436981201,2.2119452953338623,-8.701488494873047,-6.439525127410889,0
16,sara_amgad10.wav,0.05758098512887955,0.09900173611111111,-325.34326171875,144.0577850341797,-2.0796611309051514,11.415250778198242,9.431700706481934,4.165267467498779,-6.603484630584717,...,3.7682690620422363,0.14088685810565948,-4.578253746032715,1.0967668294906616,-0.04059265926480293,-8.100237846374512,-0.19826380908489227,-6.887458801269531,-1.2006165981292725,0
17,Shirouq7.wav,0.07740089297294617,0.08069349315068493,-275.23614501953125,122.78518676757812,16.025510787963867,25.77071762084961,4.772244453430176,-3.9702308177948,-11.76247787475586,...,-0.8033259510993958,-0.9933399558067322,7.550411701202393,-4.166332721710205,1.9667266607284546,-5.11379861831665,-5.6719560623168945,-3.9446661472320557,-3.22495174407959,0
18,Sohaila_Mohamed7.wav,0.1270662099123001,0.1521782309322034,-243.84642028808594,105.77610778808594,15.568729400634766,11.314079284667969,-2.7036733627319336,3.599822998046875,-14.891018867492676,...,2.8019936084747314,-3.50246262550354,4.430895805358887,1.3781228065490723,-0.36693599820137024,-1.8721753358840942,4.330615520477295,-5.755814552307129,-0.4267715513706207,0


In [64]:
# drop the unnecessary(label and filename) columns
X_Global = train_df.drop(columns=['label', 'filename'], axis=1)
# get the label in new df
Y_Global = train_df['label']

In [65]:
scaler = StandardScaler()
scaler.fit(X_Global)
standardized_data = scaler.transform(X_Global)

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X_Global, Y_Global, test_size=0.3, random_state=42, shuffle=True)

In [67]:
def model_assess(model, title="Default"):
    model.fit(X_train, y_train)
    predicts = model.predict(X_test)
    print('Accuracy', title, ':', round(accuracy_score(y_test, predicts), 5), '\n')

In [68]:
# Naive Bayes
nb = GaussianNB()
model_assess(nb, "Naive Bayes")

# Stochastic Gradient Descent
sgd = SGDClassifier(max_iter=5000, random_state=0)
model_assess(sgd, "Stochastic Gradient Descent")

# KNN
knn = KNeighborsClassifier(n_neighbors=19)
model_assess(knn, "KNN")

# Decision trees
tree = DecisionTreeClassifier()
model_assess(tree, "Decision trees")

# Random Forest
rforest = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0)
model_assess(rforest, "Random Forest")

# Support Vector Machine
svm = SVC(decision_function_shape="ovo")
model_assess(svm, "Support Vector Machine")

# Logistic Regression
lg = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')
model_assess(lg, "Logistic Regression")

# Neural Nets
nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5000, 10), random_state=1)
model_assess(nn, "Neural Nets")

Accuracy Naive Bayes : 0.96296 

Accuracy Stochastic Gradient Descent : 0.66667 

Accuracy KNN : 0.81481 

Accuracy Decision trees : 0.88889 

Accuracy Random Forest : 1.0 

Accuracy Support Vector Machine : 0.37037 

Accuracy Logistic Regression : 0.96296 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy Neural Nets : 0.74074 



In [69]:
# start using Random Forest Model

y = train_df['label']  # label
X = train_df.drop(columns=['filename', 'label'], axis=1)  # data without label

cols = X.columns
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(X)

# new data frame with the new scaled data.
X = pd.DataFrame(np_scaled, columns=cols)

In [70]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

In [71]:
# create instance of the model
model = RandomForestClassifier(n_estimators=1000, max_depth=50, random_state=0)
model.fit(X_train, y_train)  # model fitting

predictions = model.predict(X_train)
X_train_prediction = model.predict(X_train)
accuracy_score(X_train_prediction, y_train)

X_test_prediction = model.predict(X_test)
new_score = accuracy_score(X_test_prediction, y_test)
print("accuracy is:")
accuracy_score(X_test_prediction, y_test)

accuracy is:


1.0

In [72]:
# naive bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)

predictions_nb = nb_model.predict(X_train)
X_train_prediction_nb = nb_model.predict(X_train)
accuracy_score(X_train_prediction_nb, y_train)

X_test_prediction_nb = nb_model.predict(X_test)
new_score = accuracy_score(X_test_prediction_nb, y_test)
print("accuracy is:")
accuracy_score(X_test_prediction_nb, y_test)

accuracy is:


0.9629629629629629

In [73]:
def predict(x):
    scaled = min_max_scaler.transform(x)
    return nb_model.predict(scaled)

In [74]:
# function for testing manually
def get_result(path):
    y, sr = librosa.load(path, mono=True, duration=3)

    # remove leading and trailing silence
    y, index = librosa.effects.trim(y)
    rmse = librosa.feature.rms(y=y)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    row_data = f'{np.mean(rmse)} {np.mean(zcr)}'
    for e in mfcc:
        row_data += f' {np.mean(e)}'

    print(predict([row_data.split()]))

In [75]:
get_result("C:/Users/I1bra/OneDrive/Documents/Sound Recordings/Recording (7).wav")
# Ibrahim

['1']




In [76]:
get_result("D:/My PC/Projects/DSP/Voice-Recognition-System/Model/data/Amr/Voice 014.wav")
# Amr

['2']




In [77]:
get_result("D:/My PC/Projects/DSP/Voice-Recognition-System/Model/data/Mariam_Wael_close/mariam_close14.wav")
# Mariam

['3']




In [78]:
get_result("D:/My PC/Projects/DSP/Voice-Recognition-System/Model/data/Naira_Youssif/Naira_Youssif3.wav")
# Naira

['0']




In [79]:
get_result("C:/Users/I1bra/Downloads/Music/WhatsApp Ptt 2022-12-07 at 15.05.52.wav")
# Stranger

['3']




In [80]:
get_result("C:/Users/I1bra/Downloads/Music/WhatsApp Ptt 2022-12-06 at 13.25.44.wav")
# Stranger

['3']




In [88]:
pickle.dump(nb_model, open('model.pkl', 'wb'))