# Baseline code for Online Speaker Diarization
# -------------
# Team Name - RuntimeTerror

In [None]:
!pip install auditok

Collecting auditok
[?25l  Downloading https://files.pythonhosted.org/packages/49/3a/8b5579063cfb7ae3e89d40d495f4eff6e9cdefa14096ec0654d6aac52617/auditok-0.2.0-py3-none-any.whl (1.5MB)
[K     |▏                               | 10kB 12.4MB/s eta 0:00:01[K     |▍                               | 20kB 17.9MB/s eta 0:00:01[K     |▋                               | 30kB 10.2MB/s eta 0:00:01[K     |▉                               | 40kB 9.7MB/s eta 0:00:01[K     |█                               | 51kB 7.9MB/s eta 0:00:01[K     |█▎                              | 61kB 8.6MB/s eta 0:00:01[K     |█▌                              | 71kB 8.5MB/s eta 0:00:01[K     |█▊                              | 81kB 8.1MB/s eta 0:00:01[K     |██                              | 92kB 8.1MB/s eta 0:00:01[K     |██▏                             | 102kB 7.8MB/s eta 0:00:01[K     |██▍                             | 112kB 7.8MB/s eta 0:00:01[K     |██▋                             | 122kB 7.8MB/s eta 

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import auditok
import librosa
from os import listdir
from os.path import isfile, join

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


## Ground Truth creation

In [None]:
# for reading segmentation times and speaker labels from ICSI meeting corpus
import os
import re

icsi_audio_dir = "/content/drive/MyDrive/icsimeetingcorpus/Signals"
icsi_segments_dir = "/content/drive/MyDrive/icsimeetingcorpus/ICSIplus/Segments"


segs = [[]]
def icsi_labels(icsi_audio_dir, icsi_segments_dir):
    """Read the reference segments and speaker labels of every meeting.

    Args:
        icsi_audio_dir: directory with one entry per meeting (e.g. ``Bed005``).
        icsi_segments_dir: directory with the segment XML files; a file
            belongs to the meeting whose id equals the part of the file name
            before the first ``.``.

    Returns:
        dict mapping ``"<audio entry>.interaction.wav"`` to a list of
        ``[start_time, end_time, participant]`` rows sorted by start time.
        Meetings without any segment file map to an empty list.
    """
    # Attributes are looked up by name, so their order inside the tag does not
    # matter (some files put starttime first and nite:id last).
    attr_pattern = re.compile(r'\b(starttime|endtime|participant)="([^"]*)"')

    # Group the segment files per meeting id up front; this does not rely on
    # the two directory listings being perfectly aligned.
    seg_files_by_meeting = {}
    for segment_fn in sorted(os.listdir(icsi_segments_dir)):
        seg_files_by_meeting.setdefault(segment_fn.split('.')[0], []).append(segment_fn)

    truth = {}
    for audio_fn in sorted(os.listdir(icsi_audio_dir)):
        key = audio_fn + ".interaction.wav"
        meeting_id = audio_fn.split('.')[0]
        labeled_segs = []

        for segment_fn in seg_files_by_meeting.get(meeting_id, []):
            with open(os.path.join(icsi_segments_dir, segment_fn), 'r') as seg_f:
                data = seg_f.read()

            for tag in re.findall("<segment.*", data):
                if "subsegment" in tag:
                    continue
                attrs = dict(attr_pattern.findall(tag))
                try:
                    labeled_segs.append([
                        float(attrs["starttime"]),
                        float(attrs["endtime"]),
                        attrs["participant"],
                    ])
                except (KeyError, ValueError):
                    # Report tags that lack an attribute or carry a
                    # non-numeric time instead of silently dropping them.
                    print(tag, "\n", attrs, "\n")

        truth[key] = sorted(labeled_segs)
    return truth

truth = icsi_labels(icsi_audio_dir, icsi_segments_dir)

<segment starttime="4019.644" endtime="4023.521" participant="me032" closemic="false" nite:id="Buw001.segment.2,333"> 
 ['endtime="4023.521"', 'participant="me032"', 'closemic="false"'] 

<segment starttime="4025.420" endtime="4027.950" participant="me032" closemic="false" nite:id="Buw001.segment.2,343"> 
 ['endtime="4027.950"', 'participant="me032"', 'closemic="false"'] 

<segment starttime="4075.880" endtime="4077.761" participant="me032" closemic="false" nite:id="Buw001.segment.2,388"> 
 ['endtime="4077.761"', 'participant="me032"', 'closemic="false"'] 



In [None]:
# returns speaker given the audio file name and start and end time
from bisect import bisect_left
from bisect import bisect_right

def speaker(filename, start, end):
  """Return the reference speaker of the window [start, end] of one recording.

  The time every participant speaks inside the window is accumulated from the
  rows ``[seg_start, seg_end, participant]`` stored in the module-level
  ``truth[filename]`` (rows are expected to be sorted by start time).

  Args:
    filename: key into ``truth``.
    start: window start in seconds.
    end: window end in seconds.

  Returns:
    * ``-1`` when more than one participant covers over half of the window
      (overlapped speech),
    * ``None`` when no reference segment of ``filename`` intersects the window,
    * otherwise the participant with the largest speaking time in the window.
  """
  threshold = 0.5  # fraction of the window a speaker must cover to count as dominant

  contr = {}
  for seg_start, seg_end, label in truth[filename]:
    if seg_start >= end:
      break  # rows are sorted by start time: nothing later can intersect
    overlap = min(end, seg_end) - max(start, seg_start)
    if overlap > 0:
      contr[label] = contr.get(label, 0.0) + overlap

  if not contr:
    # The window lies outside the annotation of *this* file or in a gap.
    return None

  duration = end - start
  dominant = sum(1 for spoken in contr.values() if spoken / duration > threshold)
  if dominant > 1:
    return -1 # overlap
  return max(contr, key=contr.get)


In [None]:
!pip install simpleder
import simpleder

def DER(ground_truth, predictions):
    """Diarization error rate between two aligned segment lists.

    Args:
        ground_truth: list of ``[label, start, end]`` rows.
        predictions: list of ``[label, start, end]`` rows; row ``i`` is scored
            against ``ground_truth[i]``.

    Returns:
        The value returned by ``simpleder.DER`` for the kept rows.

    Rows whose label is ``-1`` (overlapped speech) on either side are left out
    of the score. The label is checked on both sides because the call sites
    in this notebook pass the two lists in either order. All labels are
    converted to ``str`` before they are handed to ``simpleder``.
    """
    ref = []
    hyp = []
    for i in range(len(ground_truth)):
        truth_seg = ground_truth[i]
        pred_seg = predictions[i]
        # The label lives at index 0; index 2 is the end time.
        if truth_seg[0] == -1 or pred_seg[0] == -1:
            continue
        ref.append(( str(truth_seg[0]), truth_seg[1], truth_seg[2] ))
        hyp.append(( str(pred_seg[0]), pred_seg[1], pred_seg[2] ))

    return simpleder.DER(ref, hyp)

# Online EM

In [None]:
import numpy as np

# each speaker is modeled via a single GMM

def get_llks(X_mfcc, array_of_Cs, array_of_gaussian_means, array_of_gaussian_covs):
    """Log-likelihood of a feature sequence under each of N GMMs.

        X_mfcc = matrix of shape TxD

        array_of_Cs: shape NxGx1 (mixture weights)
        array_of_gaussian_means: shape NxGxD
        array_of_gaussian_covs: shape NxGxDxD

        G = number of gaussians in speaker GMM
        N = number of speaker GMMs

        returns:
            array of shape (N,) holding log p(X_mfcc|theta_i) for every GMM i

        NOTE: the Gaussian density is now normalised by
        1/sqrt((2*pi)^D * det(cov)); thresholds tuned against the previous
        (inverted) normalisation need to be re-tuned.
    """

    # P(x_k | theta ) = summation_i[  c_i * p_i( x_k | mean_i, cov_i) ]
    # p(X | theta) = product_k[ P(x_k | theta) ]
    # log p(X | theta) = summation_k[ log(P(x_k | theta)) ]

    covs = np.asarray(array_of_gaussian_covs)                    # N x G x D x D
    means = np.expand_dims(array_of_gaussian_means, axis=2)      # N x G x 1 x D
    dim = covs.shape[-1]

    Z = X_mfcc - means                                           # N x G x T x D
    # Mahalanobis term; the inverse keeps its explicit N x G batch axes so
    # that N == 1 or G == 1 cannot change how matmul broadcasts.
    exponent_terms = -0.5*np.multiply(np.matmul(Z, np.linalg.inv(covs)), Z).sum(axis=-1)
    cov_dets = np.linalg.det(covs)[..., np.newaxis]              # N x G x 1

    pi_x = np.exp(exponent_terms)/np.sqrt(pow(2*np.pi, dim)*cov_dets)   # N x G x T
    p_x = (array_of_Cs*pi_x).sum(axis=1)                         # N x T
    log_P_X = np.log(p_x).sum(axis=1)                            # N

    return log_P_X


def calculate_probs(x_mfcc, Cs, means, covs):
    """Posterior probability of each of N GMMs given the observation(s).

        x_mfcc: an vector of shape D / 1xD or 2D matrix of shape TxD

        Cs: shape NxGx1
        means: shape NxGxD
        covs: shape NxGxDxD

        returns: array of shape Nx1 with P(z = i| X, theta) for each GMM i,
                 where all T rows of x_mfcc are treated as one observation

        P(z = i| X, theta) = P_i(X| theta_i)/summation_i[P_i(X| theta_i)]

    """
    covs = np.asarray(covs)                     # N x G x D x D
    means = np.expand_dims(means, axis=2)       # N x G x 1 x D
    dim = covs.shape[-1]

    z = x_mfcc - means                          # N x G x T x D
    # Each model is paired with its own inverse covariance; the explicit
    # N x G batch axes are kept so N == 1 or G == 1 cannot alter broadcasting.
    exponent_terms = -0.5*np.multiply(np.matmul(z, np.linalg.inv(covs)), z).sum(axis=-1)
    cov_dets = np.linalg.det(covs)[..., np.newaxis]     # N x G x 1

    pi_x = np.exp(exponent_terms)/np.sqrt(pow(2*np.pi, dim)*cov_dets)
    p_x = (Cs * pi_x).sum(axis=1)               # N x T
    log_P_X = np.log(p_x).sum(axis=1, keepdims=True)    # N x 1

    # Subtract the largest log-likelihood before exponentiating so that long
    # sequences do not underflow to 0/0; the ratio is unchanged.
    P_X = np.exp(log_P_X - np.max(log_P_X))
    probs = P_X/np.sum(P_X, keepdims=True)

    return probs


def eta(t, a=0.999, b=1000):
    """Step size of the online update at step ``t``: ``1/(a*t + b)``."""
    denominator = b + a * t
    return 1 / denominator


def online_adaptation(X_mfcc, single_spk_gmm_params, globaltime, anb = (0.999, 1000)):
    """ Adapt one speaker GMM to a segment with an online EM update.

        X_mfcc: a 2D matrix of shape TxD
        single_spk_gmm_params: tupple or list (Cs, means, covs) whose first
            axis has length 1 (a single GMM); only element [0] of each is used
        globaltime: running frame counter; increased by one per row of X_mfcc
        anb: (a, b) passed to eta() to obtain the step size

        returns: [Cs, means, covs] of the adapted GMM (each with a leading
            axis of length 1), updated globaltime

        Every statistic follows
            <<f>>(t) = <<f>>(t-1) + eta(t)*[f(t)Pi(t) - <<f>>(t-1)]
        where t is the frame index inside this segment. Only frames with
        index 0..LT are used for adaptation; later frames just advance
        globaltime.
    """

    LT = 300
    rate_a = anb[0]
    rate_b = anb[1]

    Cs = single_spk_gmm_params[0][0]
    means = single_spk_gmm_params[1][0]
    covs = single_spk_gmm_params[2][0]

    # Initial sufficient statistics <<1>>, <<x>>, <<x x^T>> from the current model.
    mean_outer = np.matmul(means[:,:,np.newaxis], means[:, np.newaxis, :])
    stat_1s = Cs
    stat_x = means*Cs
    stat_x2 = (covs + mean_outer)*Cs[:, np.newaxis, :]

    for frame_idx, x_mfcc in enumerate(X_mfcc):
        globaltime = globaltime+1

        if frame_idx > LT:
            continue

        eta_t = eta(frame_idx, rate_a, rate_b)

        # Responsibility of every Gaussian for this frame; each Gaussian is
        # handed to calculate_probs as its own one-component model.
        pi = calculate_probs(x_mfcc, Cs[:, None], means[:, None], covs[:, None])

        frame_outer = np.matmul(x_mfcc[:, None], x_mfcc[None,:])
        sph = frame_outer * pi[:, None, :]

        stat_1s = stat_1s + eta_t*(pi - stat_1s)
        stat_x = stat_x + eta_t*(x_mfcc*pi - stat_x)
        stat_x2 = stat_x2 + eta_t*(sph - stat_x2)

        # Re-derive the model parameters from the updated statistics.
        Cs = stat_1s
        means = stat_x/stat_1s
        covs = stat_x2/stat_1s[:,:, np.newaxis] - np.matmul(means[:,:,np.newaxis], means[:, np.newaxis, :])

    adapted = [Cs[None, :], means[None, :], covs[None, :]]
    return adapted, globaltime


def novelity_detection(X_mfcc, threshold, spk_gmm_params, gen_gmm_params):
    """Decide whether a segment belongs to an enrolled speaker or a new one.

        X_mfcc: a 2D matrix of shape TxD
        threshold: the segment is a new speaker when
            (best speaker log-likelihood - best gender log-likelihood) < threshold

        spk_gmm_params and gen_gmm_params denote a set of GMMs
        spk_gmm_params: tupple or list (array_of_Cs, array_of_gaussian_means, array_of_gaussian_covs)
        gen_gmm_params: tupple or list (gender_Cs, gender_means, gender_covs)

        array_of_Cs / gender_Cs: shape NxGx1
        array_of_gaussian_means / gender_means: shape NxGxD
        array_of_gaussian_covs / gender_covs: shape NxGxDxD

        returns a 5-tuple:
            gmm params   -- copy of the best gender GMM for a new speaker,
                            otherwise the params of the best enrolled speaker
                            (leading axis of length 1 in both cases)
            gmm index    -- number of enrolled speakers for a new speaker,
                            otherwise the index of the best enrolled speaker
            gender index -- argmax over the gender GMMs
            best enrolled speaker index (-1 when nobody is enrolled)
            log-likelihood ratio (-1 when nobody is enrolled)
    """

    gen_Cs, gen_means, gen_covs = gen_gmm_params
    gender_llks = get_llks(X_mfcc, gen_Cs, gen_means, gen_covs)
    male_or_female = np.argmax(gender_llks) # 0 -> male

    def gender_prototype():
        # Fresh copies, so later in-place updates never touch the gender models.
        return [np.array([part[male_or_female]]) for part in (gen_Cs, gen_means, gen_covs)]

    nobody_enrolled = len(spk_gmm_params[0]) == 0
    if nobody_enrolled: # first speaker
        return gender_prototype(), 0, male_or_female, -1, -1

    spk_Cs, spk_means, spk_covs = spk_gmm_params
    speaker_llks = get_llks(X_mfcc, spk_Cs, spk_means, spk_covs)

    best_spk = np.argmax(speaker_llks)
    log_of_Liklihood_ratio = speaker_llks[best_spk] - gender_llks[male_or_female]

    if not (log_of_Liklihood_ratio < threshold):
        # close enough to an enrolled speaker
        matched = [
            spk_Cs[best_spk][None, :],
            spk_means[best_spk][None, :],
            spk_covs[best_spk][None, :],
        ]
        return matched, best_spk, male_or_female, best_spk, log_of_Liklihood_ratio

    # enroll new speaker
    return gender_prototype(), len(speaker_llks), male_or_female, best_spk, log_of_Liklihood_ratio


def update_params(adapted_gmm, gmm_index, spk_gmm_params, label_for_gmm, last_seen_spk, current_time):
    """
        Store an adapted GMM in the speaker bookkeeping (all arguments are modified in place).

        gmm_index == number of enrolled speakers -> enroll a new speaker:
            its label is 0 for the very first slot, otherwise last label + 1
        otherwise -> overwrite the parameters of the existing speaker

        In both cases last_seen_spk[gmm_index] is set to current_time.
    """
    enrolled = len(spk_gmm_params[0])

    if gmm_index != enrolled:
        # known speaker: replace its three parameter blocks
        for part in range(3):
            spk_gmm_params[part][gmm_index] = adapted_gmm[part]
    else:
        last_seen_spk.append(0)
        is_first_slot = gmm_index == 0
        label_for_gmm.append(0 if is_first_slot else label_for_gmm[-1]+1)
        for part in range(3):
            if is_first_slot:
                spk_gmm_params[part] = adapted_gmm[part]
            else:
                spk_gmm_params[part] = np.concatenate([spk_gmm_params[part], adapted_gmm[part]])

    # gmm_index is in range now
    last_seen_spk[gmm_index] = current_time


def remove_dormant_speakers(spk_gmm_params, label_for_gmm, last_seen_spk, current_time, threshold = 100000):
    """
        Drop (in place) every speaker whose last_seen_spk entry is more than
        `threshold` behind current_time, together with its label and its row
        in each of the three parameter arrays.
    """
    dormant = [
        idx for idx, seen_at in enumerate(last_seen_spk)
        if current_time - seen_at > threshold
    ]

    # Delete from the back so the remaining indices stay valid.
    for idx in reversed(dormant):
        for part in range(3):
            spk_gmm_params[part] = np.delete(spk_gmm_params[part], idx, 0)
        label_for_gmm.pop(idx)
        last_seen_spk.pop(idx)



In [None]:
%cd /content/drive/MyDrive/'Colab Notebooks'/
%ls

/content/drive/MyDrive/Colab Notebooks
'Copy of vad.ipynb'       Gender-GMM-training.ipynb   onlineEM.py
 covs.npy                 means.npy                   Slicing.ipynb
 Cs.npy                   [0m[01;36mModel[0m@                      vad.ipynb
 Dataset-download.ipynb  'onlineEM (1).ipynb'
 DER.ipynb                onlineEM.ipynb


# Simple GMM online adaptation baseline

In [None]:
# diarization experiment per show
input_path = "/content/drive/MyDrive/ICSI-Dataset/"
wavfiles = [f for f in listdir(input_path) if isfile(join(input_path, f))]

# NOTE(review): listdir() returns names in arbitrary order, so index 9 is not
# guaranteed to select the same show on every machine/run.
audio_fn = wavfiles[9]


# Voice activity detection: speech regions of 2-10 s, converted to sample indices.
audio_regions = auditok.split(input_path + audio_fn, min_dur=2, max_dur=10, max_silence=0.0, energy_threshold=50, strict_min_dur=True)
wav, sr = librosa.load(input_path + audio_fn, sr=16000)
vad_start_end = [[int(r.meta.start *sr), int(r.meta.end *sr)] for r in audio_regions]




time = 0  # running count of processed feature frames
threshold = -120 # experimental
gender_gmm_params = (np.load("Model/full/Cs.npy"), np.load("Model/full//means.npy"), np.load("Model/full//covs.npy"))
# gender_gmm_params = (np.load("Cs.npy"), np.load("means.npy"), np.load("covs.npy"))


spk_gmm_params = [ [],[],[] ]; label_for_gmm = []; last_seen_spk = []; num_times_pudated = []
initial_means = initial_vars = None; bool_flag = True
adapted = {}; est_labels = []; ground_t = []

for start, end in vad_start_end:

    assert start < len(wav)

    # 12 MFCCs + their deltas -> one 24-dimensional row per frame.
    # The signal is passed as y=...: it is keyword-only in librosa >= 0.10.
    mfccs = librosa.feature.mfcc(y=wav[start:end], n_mfcc=12, sr=16000)
    if mfccs.shape[1] < 10: continue
    delta_mfccs = librosa.feature.delta(mfccs)
    X_mfcc = np.concatenate((mfccs, delta_mfccs)).T

    # Running (exponentially smoothed) per-dimension mean and variance.
    if bool_flag:
        initial_means = np.mean(X_mfcc, axis=0)
        initial_vars = np.var(X_mfcc, axis=0)
        bool_flag = False
    else:
        initial_means = initial_means + 0.01*(np.mean(X_mfcc, axis=0) - initial_means)
        initial_vars = initial_vars + 0.01*(np.var(X_mfcc, axis=0) - initial_vars)

    # NOTE(review): this divides by the variance, not the standard deviation;
    # confirm it matches the normalisation used to train the gender GMMs.
    X_mfcc = (X_mfcc - initial_means)/initial_vars

    #======================================

    # novelity
    selected_gmm, gmm_index, gender, spkn, lratio = novelity_detection(X_mfcc, threshold, spk_gmm_params, gender_gmm_params)

    # continual learning: adapt a model the first time it is selected, and
    # afterwards only when the segment matches it well (lratio > -50)
    if not adapted.get(gmm_index, False) or lratio > -50:
        adapted_gmm, time = online_adaptation(X_mfcc, selected_gmm, time, (0.999, 100))
        adapted[gmm_index] = True
    else:
        adapted_gmm = selected_gmm
        time += len(X_mfcc)

    # enroll if new speaker
    update_params(adapted_gmm, gmm_index, spk_gmm_params, label_for_gmm, last_seen_spk, time)

    # append estimate
    est_labels.append([str(label_for_gmm[gmm_index]), start/sr, end/sr])

    # remove a speaker if dormant
    remove_dormant_speakers(spk_gmm_params, label_for_gmm, last_seen_spk, time)

    identity = speaker(audio_fn, start/sr, end/sr)
    ground_t.append([identity, start/sr, end/sr])

    print(identity, f"{gender}", f"{spkn}", lratio, "\t", est_labels[-1], len(label_for_gmm), "duration: {:.2f}".format(end/sr - start/sr))
    #======================================

# NOTE(review): the estimates are passed in DER()'s `ground_truth` slot and the
# reference in `predictions`; kept as is because the reference labels are not
# all strings (-1 / None) — confirm the intended order.
print(DER(est_labels, ground_t)*100, audio_fn)


-1 0 -1 -1 	 ['0', 3.8, 6.0] 1 duration: 2.20
mn015 0 0 -42.31681786410229 	 ['0', 15.85, 18.4] 1 duration: 2.55
fn050 0 0 -114.37966969791955 	 ['0', 44.35, 47.6] 1 duration: 3.25
me045 0 0 -79.61722106423849 	 ['0', 53.4, 55.65] 1 duration: 2.25
fe004 0 0 -77.37448244271036 	 ['0', 59.3, 61.5] 1 duration: 2.20
-1 0 0 -96.43776072569699 	 ['0', 62.2, 64.95] 1 duration: 2.75
fe004 0 0 -79.61722106423838 	 ['0', 79.5, 81.75] 1 duration: 2.25
fe004 0 0 -77.37448244271047 	 ['0', 92.8, 95.0] 1 duration: 2.20
mn015 0 0 -70.64626657812664 	 ['0', 96.0, 98.0] 1 duration: 2.00
mn015 0 0 -79.61722106423827 	 ['0', 124.85, 127.1] 1 duration: 2.25
mn015 0 0 -82.9813289965299 	 ['0', 151.2, 153.55] 1 duration: 2.35
fe004 0 0 -74.0103745104185 	 ['0', 205.8, 207.9] 1 duration: 2.10
fe004 0 0 -80.7385903750021 	 ['0', 242.75, 245.05] 1 duration: 2.30
fe004 0 0 -77.37448244271059 	 ['0', 286.8, 289.0] 1 duration: 2.20
-1 0 0 -85.2240676180578 	 ['0', 292.9, 295.3] 1 duration: 2.40
fe004 0 0 -77.3744

In [None]:
# diarization experiment per show
input_path = "/content/drive/MyDrive/ICSI-Dataset/"
wavfiles = [f for f in listdir(input_path) if isfile(join(input_path, f))]

# audio_fn = wavfiles[0]

def diarization(audio_fn):
    """Run the online GMM diarization on one recording and score it.

    Args:
        audio_fn: file name of the recording inside ``input_path``; it is also
            the key used to look up the reference labels via ``speaker()``.

    Returns:
        The diarization error rate in percent (also printed).
    """

    # Voice activity detection: speech regions of 2-10 s, as sample indices.
    audio_regions = auditok.split(input_path + audio_fn, min_dur=2, max_dur=10, max_silence=0.0, energy_threshold=50, strict_min_dur=True)
    wav, sr = librosa.load(input_path + audio_fn, sr=16000)
    vad_start_end = [[int(r.meta.start *sr), int(r.meta.end *sr)] for r in audio_regions]




    time = 0  # running count of processed feature frames
    threshold = 100 # experimental
    gender_gmm_params = (np.load("Model/full/Cs.npy"), np.load("Model/full//means.npy"), np.load("Model/full//covs.npy"))
    # gender_gmm_params = (np.load("Cs.npy"), np.load("means.npy"), np.load("covs.npy"))


    spk_gmm_params = [ [],[],[] ]; label_for_gmm = []; last_seen_spk = []; num_times_pudated = []
    initial_means = initial_vars = None; bool_flag = True
    adapted = {}; est_labels = []; ground_t = []

    for start, end in vad_start_end:

        assert start < len(wav)

        # 12 MFCCs + their deltas -> one 24-dimensional row per frame.
        # The signal is passed as y=...: it is keyword-only in librosa >= 0.10.
        mfccs = librosa.feature.mfcc(y=wav[start:end], n_mfcc=12, sr=16000)
        if mfccs.shape[1] < 10: continue
        delta_mfccs = librosa.feature.delta(mfccs)
        X_mfcc = np.concatenate((mfccs, delta_mfccs)).T

        # Running (exponentially smoothed) per-dimension mean and variance.
        if bool_flag:
            initial_means = np.mean(X_mfcc, axis=0)
            initial_vars = np.var(X_mfcc, axis=0)
            bool_flag = False
        else:
            initial_means = initial_means + 0.1*(np.mean(X_mfcc, axis=0) - initial_means)
            initial_vars = initial_vars + 0.1*(np.var(X_mfcc, axis=0) - initial_vars)

        # NOTE(review): this divides by the variance, not the standard
        # deviation; confirm it matches how the gender GMMs were trained.
        X_mfcc = (X_mfcc - initial_means)/initial_vars

        #======================================

        # novelity
        selected_gmm, gmm_index, gender, spkn, lratio = novelity_detection(X_mfcc, threshold, spk_gmm_params, gender_gmm_params)

        # continual learning
        if not adapted.get(gmm_index, False):
            adapted_gmm, time = online_adaptation(X_mfcc, selected_gmm, time, (0.999, 1000))
            # adapted[gmm_index] = True
        else:
            # Keep the selected model unchanged so adapted_gmm is always bound.
            adapted_gmm = selected_gmm
            time += len(X_mfcc)

        # enroll if new speaker (update_params requires the current time)
        update_params(adapted_gmm, gmm_index, spk_gmm_params, label_for_gmm, last_seen_spk, time)

        # append estimate
        est_labels.append([str(label_for_gmm[gmm_index]), start/sr, end/sr])

        # remove a speaker if dormant
        remove_dormant_speakers(spk_gmm_params, label_for_gmm, last_seen_spk, time)

        identity = speaker(audio_fn, start/sr, end/sr)
        ground_t.append([identity, start/sr, end/sr])

        # print(identity, f"{gender}", f"{spkn}", lratio, "\t", est_labels[-1], len(label_for_gmm), "duration: {:.2f}".format(end/sr - start/sr))
        #======================================
    # NOTE(review): estimates go into DER()'s `ground_truth` slot and the
    # reference into `predictions`; confirm the intended order.
    der = DER(est_labels, ground_t)*100
    print(der, audio_fn)
    return der




# CODE STORE:

        code for safe keeping

In [None]:
# Archived variant of the experiment that works on pre-cut segment files.
# It needs `segment_files` and `segment_path`, which are only assigned in the
# commented-out cell that follows; restore that cell before running this one.
time = 0  # running count of processed feature frames
threshold = 2 # experimental

# put the 2 gender gmms in here the male at 0th index
gender_Cs = np.load("Cs.npy") #np.array([[0.5],[0.5]])
gender_means = np.load("means.npy")
gender_covs = np.load("covs.npy")
gender_gmm_params = (gender_Cs, gender_means, gender_covs)



est_labels = []

spk_gmm_params = [ [],[],[] ]
label_for_gmm = []
last_seen_spk = []
num_times_pudated = []

initial_means = initial_vars = None
bool_flag = True
adapted = {}
for a_file in segment_files:
    # NOTE(review): `break` stops at the first file of another meeting, which
    # only works if all Bed005 files come first in segment_files — confirm.
    if a_file.split('.')[0] != "Bed005": break

    wav, sr = librosa.load(segment_path+a_file, sr=16000)
    # The signal is passed as y=...: it is keyword-only in librosa >= 0.10.
    mfccs = librosa.feature.mfcc(y=wav, n_mfcc=12, sr=16000)
    delta_mfccs = librosa.feature.delta(mfccs)
    X_mfcc = np.concatenate((mfccs, delta_mfccs)).T

    # Running (exponentially smoothed) per-dimension mean and variance.
    if bool_flag:
        initial_means = np.mean(X_mfcc, axis=0)
        initial_vars = np.var(X_mfcc, axis=0)
        bool_flag = False
    else:
        initial_means = initial_means + 0.01*(np.mean(X_mfcc, axis=0) - initial_means)
        initial_vars = initial_vars + 0.01*(np.var(X_mfcc, axis=0) - initial_vars)

    X_mfcc = (X_mfcc - initial_means)/initial_vars

    #======================================

    # novelity (novelity_detection returns five values; only the first two are needed here)
    selected_gmm, gmm_index, *_ = novelity_detection(X_mfcc, threshold, spk_gmm_params, gender_gmm_params)

    # continual learning: each model is adapted only the first time it is selected
    if not adapted.get(gmm_index, False):
        adapted_gmm, time = online_adaptation(X_mfcc, selected_gmm, time)
        adapted[gmm_index] = True
    else:
        # Keep the selected model unchanged so adapted_gmm is never stale.
        adapted_gmm = selected_gmm
        time += len(X_mfcc)

    # enroll if new speaker (update_params requires the current time)
    update_params(adapted_gmm, gmm_index, spk_gmm_params, label_for_gmm, last_seen_spk, time)

    # append estimate
    est_labels.append(label_for_gmm[gmm_index])

    # remove a speaker if dormant
    remove_dormant_speakers(spk_gmm_params, label_for_gmm, last_seen_spk, time)

    print(est_labels[-1], len(spk_gmm_params[0]), len(X_mfcc), eta(time), time)
    #======================================
    #======================================


In [None]:
# input_path = "/content/drive/MyDrive/ICSI-Dataset/"
# segment_path = "/content/drive/MyDrive/Segmented(ICSI)/"

# wavfiles = [f for f in listdir(input_path) if isfile(join(input_path, f))]
# for a_file in wavfiles:
#     audio_regions = auditok.split(input_path + a_file, min_dur=0.2, max_dur=2, max_silence=0.3, energy_threshold=55)
    
#     for i,r in enumerate(audio_regions):
# #         start = r.meta.start
# #         end = r.meta.end
# #         duration = end-start
#         r.save(segment_path + a_file[0:-4] + "-region_{i}.wav".format(i=i))
# segment_files = listdir(segment_path) #[f for f in listdir(segment_path) if isfile(join(segment_path, f))]
# segment_files = [f for f in listdir(segment_path) if isfile(join(segment_path, f))]

# print(len(segment_files))