In [None]:
import numpy as np
import librosa as lb
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.mixture import GaussianMixture
from spafe.features.rplp import plp

In [None]:
# Load the tab-separated segment listing into a list of per-segment dicts.
def func_load_segments(name):
    """Parse a tab-separated segment file into a list of dictionaries.

    Each non-blank line is split on tabs and the fields are paired, in order,
    with the keys: filename, channel, start, end, speaker, speakerType,
    speakerDialect, transcript, section, segment. All values stay strings.
    A line with fewer than ten fields yields a dict with only the leading
    keys; fields beyond the tenth are dropped (``zip`` truncation).

    Parameters
    ----------
    name : str or path-like
        Path of the segment file to read.

    Returns
    -------
    list of dict
        One dict per non-blank line, in file order.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    keys = ("filename", "channel", "start", "end", "speaker", "speakerType",
            "speakerDialect", "transcript", "section", "segment")
    loaded_data = []
    # The context manager closes the handle even if parsing fails part-way.
    with open(name) as segments_file:
        for line in segments_file:
            line = line.rstrip('\n')
            # Skip blank lines (e.g. a trailing newline at end of file); they
            # would otherwise become records without a 'speaker' field.
            if not line.strip():
                continue
            loaded_data.append(dict(zip(keys, line.split('\t'))))
    return loaded_data

In [None]:
# Parse the segment listing "test.txt" (path relative to the working directory)
# into a list of per-segment dicts.
Loaded_Data = func_load_segments("test.txt")

In [None]:
# Load the audio slice described by each segment record, plus its speaker label.
def func_extract_loaded_data(Data):
    """Load one audio excerpt per segment record.

    Parameters
    ----------
    Data : sequence of dict
        Segment records providing the keys 'speaker', 'filename', 'start'
        and 'end'. 'start' and 'end' must be convertible with ``float``.

    Returns
    -------
    (audios, speakers) : tuple of two lists
        ``audios[k]`` is whatever ``lb.load`` returns for record ``k``
        (unpacked elsewhere in this notebook as ``(y, sr)``), loaded from
        'filename' starting at offset 'start' for a duration of
        'end' - 'start'. ``speakers[k]`` is the record's 'speaker' value.
    """
    # All speaker labels are collected before any audio is read.
    speakers = [segment['speaker'] for segment in Data]

    audios = []
    for segment in Data:
        path = segment['filename']
        begin = float(segment['start'])
        finish = float(segment['end'])
        audios.append(lb.load(path, offset=begin, duration=finish - begin))

    return audios, speakers

In [None]:
# Load every listed segment from disk; the two lists are index-aligned
# (extracted_speakers[k] is the label of extracted_audios[k]).
extracted_audios, extracted_speakers = func_extract_loaded_data(Loaded_Data)

In [None]:
# Randomly hold out 37.5% (3/8) of the segments for testing and keep the
# remaining 62.5% (5/8) for training. The split is by number of segments, so
# it only approximates a 5-minute / 3-minute split if segments have similar
# durations.
# random_state is fixed so that Restart & Run All reproduces the same
# partition and therefore the same accuracy figures.
train_audio,test_audio,train_speakers,test_speakers = train_test_split(
    extracted_audios,
    extracted_speakers,
    test_size=0.375,
    random_state=42,
)

In [None]:
def func_plp(Data):
    """Compute one averaged PLP feature vector per audio item.

    Parameters
    ----------
    Data : iterable of (y, sr) pairs
        Audio signal and its sampling rate; ``sr`` is passed to ``plp`` as
        ``fs``.

    Returns
    -------
    list
        For each item, the ``plp`` output averaged over axis 0.
        Presumably axis 0 is the frame axis, giving one value per
        coefficient -- confirm against the spafe version in use.
    """
    return [np.mean(plp(signal, fs=rate), axis=0) for signal, rate in Data]


def func_mfcc(Data):
    """Compute the MFCC matrix of every audio item.

    Parameters
    ----------
    Data : iterable of (y, sr) pairs
        Audio signal and its sampling rate.

    Returns
    -------
    list
        One ``lb.feature.mfcc(y=y, sr=sr)`` result per item, in input order
        (library defaults are used for every other setting).
    """
    return [lb.feature.mfcc(y=signal, sr=rate) for signal, rate in Data]

def func_pca(Data):
    """Reduce each feature matrix with a 10-component PCA and flatten it.

    A single ``PCA(n_components=10)`` object is created, but it is re-fitted
    on every item via ``fit_transform``. Each item is therefore projected
    onto its own principal components rather than onto a shared basis.

    Parameters
    ----------
    Data : iterable of 2-D arrays
        Feature matrices (in this notebook, the MFCC matrices).

    Returns
    -------
    list of 1-D arrays
        The flattened PCA projection of each item, in input order.
    """
    reducer = PCA(n_components=10)
    return [np.ndarray.flatten(reducer.fit_transform(matrix)) for matrix in Data]

In [None]:
# Averaged PLP feature vectors for the training and test segments.
train_plp, test_plp = func_plp(train_audio), func_plp(test_audio)

In [None]:
# MFCC matrices for both partitions ...
train_mfccs, test_mfccs = func_mfcc(train_audio), func_mfcc(test_audio)
# ... then the flattened per-segment PCA projection of each MFCC matrix.
train_pca, test_pca = func_pca(train_mfccs), func_pca(test_mfccs)

In [None]:
# Train one support-vector classifier per feature type, both against the
# training speaker labels. SVC() is used with scikit-learn's default settings.
svm_mfcc = SVC()
svm_mfcc.fit(train_pca,train_speakers)

svm_plp = SVC()
# Last expression of the cell: the fitted estimator is shown as the cell output.
svm_plp.fit(train_plp,train_speakers)

In [None]:
# Number of mixture components (presumably the number of distinct speakers
# in the data -- confirm against the segment file).
clusters = 7

# A diagonal covariance matrix keeps only the per-feature variances and ignores
# the covariances between features, i.e. features are treated as independent.
# random_state is fixed because GaussianMixture initialisation is random:
# without a seed the cluster ids, and every score derived from them, change
# from run to run.
GMM_mfcc = GaussianMixture(n_components = clusters, covariance_type='diag', random_state=42)
labels_mfcc = GMM_mfcc.fit_predict(train_pca)

GMM_plp = GaussianMixture(n_components = clusters, covariance_type='diag', random_state=42)
labels_plp = GMM_plp.fit_predict(train_plp)

In [None]:
# Giving speaker labels to their corresponding labels generated by GMM
def func_map_clusters_to_speakers(cluster_labels, speakers):
    """Name each GMM cluster after the speaker that occurs most often in it.

    Taking the majority speaker, instead of the speaker of whichever sample
    happens to come first, makes the mapping independent of the (shuffled)
    sample order.

    Parameters
    ----------
    cluster_labels : sequence
        Cluster id assigned to each training sample.
    speakers : sequence
        True speaker of each training sample, index-aligned with
        ``cluster_labels``.

    Returns
    -------
    dict
        ``{cluster id: speaker}``. Only clusters that received at least one
        training sample appear as keys. Ties go to the speaker seen first.
    """
    votes = {}
    for cluster, speaker in zip(cluster_labels, speakers):
        per_cluster = votes.setdefault(cluster, {})
        per_cluster[speaker] = per_cluster.get(speaker, 0) + 1
    # max() returns the first maximal key in insertion order, so ties are
    # resolved deterministically in favour of the first speaker encountered.
    return {cluster: max(counts, key=counts.get) for cluster, counts in votes.items()}


speaker_mfcc = func_map_clusters_to_speakers(labels_mfcc, train_speakers)
speaker_plp = func_map_clusters_to_speakers(labels_plp, train_speakers)

In [None]:
# Finding Accuracy
# accuracy_score is called as (y_true, y_pred) throughout.

# Label used when the GMM predicts a component that received no training
# sample and therefore has no speaker assigned. It never equals a real
# speaker, so such predictions count as errors instead of raising KeyError.
UNMAPPED_CLUSTER = "<unmapped cluster>"

predicted_test = svm_mfcc.predict(test_pca)
print("SVM MFCC:",accuracy_score(test_speakers, predicted_test))

predicted_test = svm_plp.predict(test_plp)
print("SVM PLP:",accuracy_score(test_speakers, predicted_test))

# GMM predictions are cluster ids; translate them to speaker names first.
prediction_mfcc = GMM_mfcc.predict(test_pca)
predicted_speakers_mfcc = [speaker_mfcc.get(cluster, UNMAPPED_CLUSTER) for cluster in prediction_mfcc]
print("GMM MFCC:",accuracy_score(test_speakers, predicted_speakers_mfcc) )

prediction_plp = GMM_plp.predict(test_plp)
predicted_speakers_plp = [speaker_plp.get(cluster, UNMAPPED_CLUSTER) for cluster in prediction_plp]
print("GMM PLP:",accuracy_score(test_speakers, predicted_speakers_plp) )

In [None]:
# Live Sound: record a clip from the audio input device and save it as 'live.wav'.
import sounddevice
from scipy.io.wavfile import write

duration = 10        # length of the recording, in seconds
sample_rate = 8000   # sampling rate used for both recording and the WAV file
n_frames = int(duration * sample_rate)

# Two-channel recording; wait() is called before the buffer is written out.
data = sounddevice.rec(n_frames, samplerate=sample_rate, channels=2)
sounddevice.wait()
write('live.wav', sample_rate, data)

In [None]:
# Run the recorded clip through the same feature pipeline as the training data
# and print what each of the four models predicts.
live_audio = [ lb.load( 'live.wav', offset=0, duration = duration) ]
live_mfccs = func_mfcc(live_audio)
live_plp = func_plp(live_audio)
live_pca = func_pca(live_mfccs)
print("SVM Predicted label MFCC:", svm_mfcc.predict(live_pca))
print("SVM Predicted label PLP:", svm_plp.predict(live_plp))
# A GMM component that received no training sample has no speaker assigned;
# report that explicitly instead of failing with KeyError.
print("GMM Predicted label MFCC:", speaker_mfcc.get(GMM_mfcc.predict(live_pca)[0], "<unmapped cluster>"))
print("GMM Predicted label PLP:", speaker_plp.get(GMM_plp.predict(live_plp)[0], "<unmapped cluster>"))