In [None]:
pip install python_speech_features



Extract Features

In [None]:
import numpy as np
from python_speech_features import mfcc as MFCC
from python_speech_features import delta
from sklearn import preprocessing
from scipy.io import wavfile
import os

def extract_featrue(audio,fs):
    """Build the frame-level feature matrix for a single utterance.

    Parameters
    ----------
    audio : array-like
        Samples of the utterance, passed straight to ``MFCC``.
    fs : int
        Sampling rate of ``audio``.

    Returns
    -------
    numpy.ndarray
        Column-wise concatenation of the standardised MFCCs, their first-order
        deltas and their second-order deltas (one row per frame).
        Presumably 13 + 13 + 13 = 39 columns with the library defaults —
        TODO confirm against ``n_feature``.
    """
    raw_cepstra = MFCC(audio, fs)

    # NOTE(review): both delta streams are derived from the *unscaled* MFCCs,
    # whereas the static part is standardised below — confirm this is intended.
    first_order = delta(raw_cepstra, 2)
    second_order = delta(first_order, 2)

    # Zero-mean / unit-variance normalisation of the static coefficients only.
    scaled_cepstra = preprocessing.scale(raw_cepstra)

    return np.hstack([scaled_cepstra, first_order, second_order])


def get_feature(path,task):
    """Extract features from every ``.WAV`` file found below ``path``.

    Directories and files are visited in sorted order so that the utterance
    order is the same on every run and every filesystem.

    Parameters
    ----------
    path : str
        Root directory; it is searched recursively with ``os.walk``.
    task : str
        ``'train'`` returns all frames of all files stacked into one 2-D array.
        Any other value returns a list with one feature matrix per file.

    Returns
    -------
    numpy.ndarray or list of numpy.ndarray
        Stacked frames for ``task == 'train'`` (an empty array when no file
        was found), otherwise the per-file feature matrices.
    """
    file_paths = []
    # Collect the wav files of each directory (the match on ".WAV" is
    # case-sensitive, exactly as before).
    for root, sub, files in os.walk(path):
        # Sorting ``sub`` in place makes os.walk descend in a fixed order.
        sub.sort()
        speaker_file_paths = [
            os.path.join(root, name)
            for name in sorted(files)
            if name.endswith(".WAV")
        ]
        if speaker_file_paths:
            file_paths.append(speaker_file_paths)

    # One feature matrix per file, in traversal order.
    per_file_features = []
    for speaker_file in file_paths:
        for file in speaker_file:
            fs, audio = wavfile.read(file)
            audio = remove_silence(audio)
            per_file_features.append(extract_featrue(audio, fs))

    if task == 'train':
        if not per_file_features:
            return np.array([])
        # Stack once at the end instead of re-copying the array per file.
        return np.vstack(per_file_features)
    return per_file_features

Remove Silence

In [None]:
import os
import numpy as np

def remove_silence(audio, threshold=10):
  """Drop every sample whose magnitude does not exceed ``threshold``.

  Parameters
  ----------
  audio : array-like
      One-dimensional (mono) signal.
  threshold : number, optional
      Samples with ``|x| <= threshold`` are treated as silence and removed.
      Defaults to 10, the value that used to be hard-coded.

  Returns
  -------
  numpy.ndarray
      The remaining samples in their original order and dtype.

  Raises
  ------
  ValueError
      If ``audio`` is not one-dimensional.
  """
  samples = np.asarray(audio)
  if samples.ndim != 1:
    raise ValueError("remove_silence expects a 1-D (mono) signal")
  # Two comparisons instead of abs(): abs() of the most negative int16 value
  # overflows and would make the loudest negative sample look like silence.
  keep = (samples > threshold) | (samples < -threshold)
  return samples[keep]

Compute Baum-Welch (BW) statistics

In [None]:
def get_BW_stat(ubm,spk_feature,n_feature,n_clusters,n_utterance):
  """Accumulate per-utterance statistics of ``spk_feature`` against ``ubm``.

  Parameters
  ----------
  ubm : object
      Fitted mixture model exposing ``predict_proba``, ``means_`` and
      ``covariances_``. Each ``covariances_[c]`` must be a 1-D vector
      (diagonal covariance), because the vectors are concatenated and placed
      on the diagonal of the returned matrix.
  spk_feature : sequence of numpy.ndarray
      One (frames x n_feature) matrix per utterance.
  n_feature : int
      Feature dimension.
  n_clusters : int
      Number of mixture components to use.
  n_utterance : int
      Number of leading entries of ``spk_feature`` to process.

  Returns
  -------
  N_list : list of numpy.ndarray
      Per utterance, a diagonal matrix holding each component's summed
      posterior repeated ``n_feature`` times.
  F_list : list of numpy.ndarray
      Per utterance, a 1-D vector of posterior-weighted sums of the frames
      centred on the component means (component-major order).
  S_list : list of numpy.ndarray
      Per utterance, a diagonal matrix of posterior-weighted sums of the
      squared centred frames.
  N_c_list : list of numpy.ndarray
      Per utterance, the summed posterior of each component.
  cov : numpy.ndarray
      Diagonal matrix built from the concatenated ``ubm.covariances_``.
  """
  means = ubm.means_
  # Depends on the model only, so build it once; this also keeps it defined
  # when there is no utterance to process.
  cov = np.diag(np.hstack([ubm.covariances_[c] for c in range(n_clusters)]))

  N_c_list = []
  N_list = []
  S_list = []
  F_list = []

  for u in range(n_utterance):
    utterance = np.asarray(spk_feature[u])
    posterior = ubm.predict_proba(utterance)
    n_c = posterior.sum(axis=0)
    N_c_list.append(n_c)

    # Row c holds the statistics of component c.
    f = np.zeros((n_clusters, n_feature))
    s = np.zeros((n_clusters, n_feature))
    for c in range(n_clusters):
      centered = utterance - means[c]
      weights = posterior[:, c]
      f[c] = np.dot(weights, centered)
      # Only the diagonal of the weighted outer product is needed, i.e. the
      # weighted sum of element-wise squares.
      s[c] = np.dot(weights, centered * centered)

    # Flatten component-major: [component 0 features, component 1 features, ...]
    F_list.append(f.ravel())
    N_list.append(np.diag(np.repeat(n_c[:n_clusters], n_feature)))
    S_list.append(np.diag(s.ravel()))
  return N_list,F_list,S_list,N_c_list,cov

In [None]:
import numpy as np
from sklearn.mixture import GaussianMixture
import copy
import math
import seaborn as sns;sns.set()
import matplotlib.pyplot as plt
from numpy.linalg import multi_dot
from scipy.linalg import eig
import seaborn as sns;sns.set()

# ---- Configuration ----
n_feature = 39      # feature dimension handed to get_BW_stat (presumably 13 MFCC + deltas + double deltas — confirm)
n_clusters = 64     # number of UBM mixture components
R = 100             # number of columns of T
n_iteration = 5     # EM iterations
n_spk = 10          # number of speakers in the training set
SEED = 0            # fixed seed so the UBM fit is reproducible across runs

print("Generating UBM")
train_path = 'drive/My Drive/Colab Notebooks/SIP Project/TIMIT_modified/TRAIN/Train_data'
# 'test' mode is used on purpose: it returns one feature matrix per utterance,
# which is what the per-utterance statistics below need.
spk_feature = get_feature(train_path,'test')
n_utterance = len(spk_feature)
if n_utterance == 0:
  raise ValueError(f'No .WAV files found under {train_path!r}')

# The UBM is trained on the frames of all utterances pooled together.
feature = np.vstack(spk_feature)
ubm = GaussianMixture(n_components=n_clusters,covariance_type='diag',max_iter=100,random_state=SEED)
ubm.fit(feature)
del feature  # the pooled copy is no longer needed; free the memory

#Get BW Statistics
N_list,F_list,S_list,N_c_list,cov = get_BW_stat(ubm,spk_feature,n_feature,n_clusters,n_utterance)

EM algorithm to estimate the total-variability matrix T and the residual covariance Sigma

In [None]:
n_super = n_clusters*n_feature   # number of rows of T / size of cov

# Initialise EVERY row of T with small random values scaled by the matching
# diagonal entry of cov (previously only the first n_clusters rows were set).
# A seeded generator keeps the run reproducible.
T_INIT_SEED = 0
rng = np.random.default_rng(T_INIT_SEED)
init_scale = 0.1*np.diag(cov)
T = rng.uniform(-1.0, 1.0, size=(n_super, R))*init_scale[:, None]

print("Starting EM iterations")
for itr in range(n_iteration):
  print(f'Iteration :{itr}')

  # ---- E - step: posterior mean and second moment of w per utterance ----
  # cov and T are fixed during the E-step, so invert / multiply only once.
  inv_cov = np.linalg.inv(cov)
  Tt_inv_cov = np.dot(T.T, inv_cov)
  E_wwT_list = []
  E_w_list = []
  for u in range(n_utterance):
    l = np.eye(R) + multi_dot([Tt_inv_cov, N_list[u], T])
    inv_l = np.linalg.inv(l)
    E_w = multi_dot([inv_l, Tt_inv_cov, F_list[u]])
    E_w_list.append(E_w)
    # Second moment = posterior covariance + outer product of the mean.
    # E_w is 1-D, so np.dot(E_w, E_w.T) would only give a scalar.
    E_wwT_list.append(inv_l + np.outer(E_w, E_w))

  # ---- M - step: accumulate over utterances ----
  C_m = np.zeros((n_super, R))
  N_m = np.zeros((n_super, n_super))
  A_m = np.zeros((n_clusters, R, R))
  S_m = np.zeros((n_super, n_super))
  for u in range(n_utterance):
    C_m += np.outer(F_list[u], E_w_list[u])
    N_m += N_list[u]
    S_m += S_list[u]
    # One (R x R) accumulator per component, weighted by its occupancy.
    A_m += N_c_list[u][:n_clusters, None, None]*E_wwT_list[u]

  # Update T block by block; the inverse of A_m[c] is shared by all
  # n_feature rows of component c, so compute it once per component.
  for c in range(n_clusters):
    rows = slice(c*n_feature, (c+1)*n_feature)
    T[rows, :] = np.dot(C_m[rows, :], np.linalg.inv(A_m[c]))

  # Update the covariance with the re-estimated T (it was previously computed
  # from the T of the preceding iteration). Only the diagonal of C_m T^T is
  # needed, which is the row-wise sum of C_m * T.
  cov = np.dot(np.linalg.inv(N_m), S_m - np.diag((C_m*T).sum(axis=1)))


inv_cov = np.linalg.inv(cov)
Tt_inv_cov = np.dot(T.T, inv_cov)   # shared by every utterance below

# Utterances are assumed to be ordered speaker by speaker with the same number
# of utterances per speaker; fail early with a clear message otherwise.
if n_utterance % n_spk != 0:
  raise ValueError(f'n_utterance ({n_utterance}) must be a multiple of n_spk ({n_spk})')
utt_per_spk = n_utterance // n_spk

#For projection matrix A
# Reuse the statistics already stored in N_list / F_list instead of
# recomputing them per utterance (which also overwrote those two lists).
w_list = []
for u in range(n_utterance):
  l = np.eye(R) + multi_dot([Tt_inv_cov, N_list[u], T])
  inv_l = np.linalg.inv(l)
  w_list.append(multi_dot([inv_l, Tt_inv_cov, F_list[u]]))

w_all = np.asarray(w_list)
w_mean = np.mean(w_all, axis=0)
# Mean vector of each speaker (consecutive groups of utt_per_spk rows).
ws_mean = [np.mean(w_all[s*utt_per_spk:(s+1)*utt_per_spk], axis=0) for s in range(n_spk)]

sb = np.zeros((R,R))   # between-speaker scatter
sw = np.zeros((R,R))   # within-speaker scatter
for s in range(n_spk):
  between = ws_mean[s] - w_mean
  sb += np.outer(between, between)
  within = w_all[s*utt_per_spk:(s+1)*utt_per_spk] - ws_mean[s]
  # within.T @ within is the sum of the outer products of the rows.
  sw += np.dot(within.T, within)/utt_per_spk

# NOTE(review): scipy.linalg.eig returns the eigenpairs unsorted and may return
# complex values; all R directions are kept here — confirm whether only the
# leading (n_spk - 1) real eigenvectors should be used for the projection.
eigen,A = eig(sb,sw)

In [None]:
# Trial counters and accumulated score per (test speaker, target speaker) pair.
TP = 0
FP = 0
FN = 0
TN = 0
score_mat = np.zeros((n_spk,n_spk))
w_target_list=[]
#For target utterance
target_spk_path = 'drive/My Drive/Colab Notebooks/SIP Project/TIMIT_modified/TRAIN/target'
target_spk_feature = get_feature(target_spk_path,'test')
n_target_spk = len(target_spk_feature)
Tt_inv_cov = np.dot(T.T, inv_cov)   # identical for every utterance
for i in range(n_target_spk):
  # Dedicated names: the training statistics in N_list / F_list stay intact,
  # so the EM cell can still be re-run afterwards.
  target_N, target_F = get_BW_stat(ubm,[target_spk_feature[i]],n_feature,n_clusters,n_utterance=1)[:2]
  l = np.eye(R) + multi_dot([Tt_inv_cov, target_N[0], T])
  inv_l = np.linalg.inv(l)
  w_target_list.append(multi_dot([inv_l, Tt_inv_cov, target_F[0]]))
del target_spk_feature


#For test utterance
test_spk_path = 'drive/My Drive/Colab Notebooks/SIP Project/TIMIT_modified/TRAIN/test'
test_spk_feature = get_feature(test_spk_path,'test')
n_test_spk = len(test_spk_feature)   # number of test utterances

utt_per_test_spk = 5    # test utterance i belongs to speaker i // utt_per_test_spk
score_threshold = 0.6   # accept a trial when the score is above this value

Tt_inv_cov = np.dot(T.T, inv_cov)
# Project the target vectors once; they do not depend on the test utterance.
# NOTE(review): if A is complex (scipy.linalg.eig can return complex
# eigenvectors) the threshold comparisons below will fail — confirm A is real.
target_proj = [np.dot(A.T, w_target_list[j]) for j in range(n_spk)]
target_norm = [np.linalg.norm(p) for p in target_proj]

for i in range(n_test_spk):
  # The vector of a test utterance is the same for every target speaker, so
  # it is extracted once per utterance instead of once per (target, test) pair.
  test_N, test_F = get_BW_stat(ubm,[test_spk_feature[i]],n_feature,n_clusters,n_utterance=1)[:2]
  l = np.eye(R) + multi_dot([Tt_inv_cov, test_N[0], T])
  inv_l = np.linalg.inv(l)
  w_test = multi_dot([inv_l, Tt_inv_cov, test_F[0]])
  test_proj = np.dot(A.T, w_test)
  test_norm = np.linalg.norm(test_proj)
  true_spk = i//utt_per_test_spk

  for j in range(n_spk):
    #Cosine score
    #score = np.dot(w_target_list[j],w_test)/(np.linalg.norm(w_target_list[j])*np.linalg.norm(w_test))
    #Cosine score with LDA
    score = np.dot(target_proj[j], test_proj)/(target_norm[j]*test_norm)
    #norm = np.linalg.norm(np.dot(A.T,w_target_list[j])-np.dot(A.T,w_test))
    #Power Kernel
    #score = norm**5
    #Multiquadric Kernel
    #score = np.math.sqrt(norm**2+1)-1
    #Rational quadratic Kernel
    #score = 1-((norm**2+1)/norm**2)
    score_mat[true_spk,j] += score
    if j == true_spk:
      # Genuine trial
      if score > score_threshold:
        TP+=1
      else:
        FN+=1
    else:
      # Impostor trial
      if score < score_threshold:
        TN+=1
      else:
        FP+=1

# False-acceptance rate is relative to the impostor trials, and
# false-rejection rate is relative to the genuine trials.
n_impostor_trials = FP + TN
n_genuine_trials = TP + FN
FAR = FP/n_impostor_trials if n_impostor_trials else float('nan')
FRR = FN/n_genuine_trials if n_genuine_trials else float('nan')

print(f'TP:{TP}')
print(f'FP:{FP}')
print(f'FN:{FN}')
print(f'TN:{TN}')
print(f'FAR:{FAR}')
print(f'FRR:{FRR}')

cnf_mat = np.zeros((2,2))
cnf_mat[0,0]=TP
cnf_mat[0,1]=FP
cnf_mat[1,0]=FN
cnf_mat[1,1]=TN

# Average score per (test speaker, target speaker) pair.
score_mat = score_mat/utt_per_test_spk
spk_labels = [str(k) for k in range(1, n_spk+1)]
ax = sns.heatmap(score_mat,cmap="YlGnBu",annot=False,xticklabels=spk_labels,yticklabels=spk_labels)
ax.set(xlabel='Target speaker', ylabel='Test speaker', title='Mean score per speaker pair');
