In [1]:
from resemblyzer import preprocess_wav, VoiceEncoder
from spectralcluster import SpectralClusterer
from pathlib import Path
import librosa
import numpy as np
from scipy.io.wavfile import write
from scipy.io import wavfile
import os
import speech_recognition as sr
import time
import azure.cognitiveservices.speech as speechsdk
import subprocess
import ipywidgets as widgets
from IPython.display import display

# Azure Speech credentials are read from the environment so that no secret is
# stored in the notebook. The subscription key that used to be hardcoded on
# this line should be treated as compromised and rotated.
tts_key = os.environ.get("SPEECH_KEY", "")
region = os.environ.get("SPEECH_REGION", "westus")
if not tts_key:
    # Warn up front: run() swallows exceptions, so a missing key would
    # otherwise only show up as an empty transcript.
    print("SPEECH_KEY is not set; asr() will fail until it is exported.")

# Voice encoder used by run() to embed the audio, instantiated for the CPU.
encoder = VoiceEncoder("cpu")
# speech_recognition recognizer instance.
r = sr.Recognizer()
# Clusterer used by run() to assign a speaker label to every partial
# embedding; between 2 and 100 clusters are allowed.
clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=100,
    # p_percentile=0.90,
    # gaussian_blur_sigma=1
)

ModuleNotFoundError: ignored

In [None]:
def create_labelling(labels, wav_splits, sampling_rate=16000):
    """Collapse per-window speaker labels into contiguous speaker segments.

    Args:
        labels: sequence of speaker labels, one per entry of ``wav_splits``.
        wav_splits: sequence of slice-like objects whose ``start`` and
            ``stop`` attributes are sample indices of each analysis window.
        sampling_rate: samples per second used to convert sample indices to
            seconds. Defaults to 16000, the value that was previously
            hard-coded.

    Returns:
        A list of ``(label_as_str, start_seconds, end_seconds)`` tuples.
        Segment boundaries are placed at the midpoint of the window where
        the label changes. The first segment starts at 0 and the final
        segment ends at the midpoint of the last window. Empty input yields
        an empty list.
    """
    # Each window is represented by the time (in seconds) of its midpoint.
    midpoints = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    labelling = []
    segment_start = 0
    last_index = len(midpoints) - 1
    # The loop variable is deliberately not called ``time`` so it cannot be
    # confused with the ``time`` module imported at the top of the notebook.
    for i, midpoint in enumerate(midpoints):
        if i > 0 and labels[i] != labels[i - 1]:
            # Speaker changed: close the previous speaker's segment here.
            labelling.append((str(labels[i - 1]), segment_start, midpoint))
            segment_start = midpoint
        if i == last_index:
            # Always close the segment that is still open at the end.
            labelling.append((str(labels[i]), segment_start, midpoint))
    return labelling

def asr(audio):
    """Transcribe a single audio file with the Azure Speech SDK.

    Args:
        audio: path of the audio file, handed to ``AudioConfig`` as
            ``filename``.

    Returns:
        The ``text`` attribute of the recognition result.
    """
    # Credentials come from the module-level ``tts_key`` / ``region``.
    azure_recognizer = speechsdk.SpeechRecognizer(
        speech_config=speechsdk.SpeechConfig(subscription=tts_key, region=region),
        audio_config=speechsdk.audio.AudioConfig(filename=audio),
    )
    # Start a single recognition and wait for its result via .get().
    recognition = azure_recognizer.recognize_once_async().get()
    return recognition.text

def find_sub_list(sl, l):
    """Find every occurrence of the sequence ``sl`` inside the sequence ``l``.

    Args:
        sl: the pattern to look for.
        l: the sequence to search; it is sliced and each slice is compared
            to ``sl`` with ``==``.

    Returns:
        A list of ``(first_index, last_index)`` tuples (both inclusive), one
        per occurrence, in increasing order of ``first_index``. Returns
        ``False`` when there is no occurrence, when ``sl`` is empty, or when
        the comparison raises an ordinary exception.
    """
    sub_len = len(sl)
    if sub_len == 0:
        # An empty pattern has no first element to anchor on; report no match.
        return False
    try:
        first = sl[0]
        # Cheap first-element check before the slice comparison.
        matches = [
            (start, start + sub_len - 1)
            for start, item in enumerate(l)
            if item == first and l[start:start + sub_len] == sl
        ]
    except Exception:
        # Best-effort contract kept for callers: any comparison/indexing
        # failure means "not found". KeyboardInterrupt and SystemExit are no
        # longer swallowed, so the search can be interrupted.
        return False
    return matches if matches else False


In [None]:
def run():
    """Repeatedly diarize and transcribe ``file.wav``, printing the result.

    The loop runs 20 times. The first three passes only sleep; every later
    pass does the following:

    1. Reads and preprocesses ``file.wav``.
    2. Embeds it with ``encoder`` and clusters the partial embeddings with
       ``clusterer`` to get speaker segments.
    3. Drops the audio before the marker remembered from the previous pass,
       if that marker is found in the new audio.
    4. Writes the result to ``file_trimmed.wav``, empties ``split_wavs/`` and
       writes one wav file per segment longer than 0.6 s.
    5. Runs ``asr`` on each written segment, in the order written, and prints
       every transcript longer than four characters.

    Errors during a pass are swallowed on purpose (the pass is skipped after
    a short sleep). Only ``Exception`` is caught, so interrupting the kernel
    stops the loop.
    """
    audio_file_path = 'file.wav'
    wav_fpath = Path(audio_file_path)
    split_dir = 'split_wavs/'
    sample_rate = 16000

    # First 1000 samples of the audio left unsplit by the previous pass; used
    # to locate where already-processed audio ends. None until a pass with at
    # least one segment has completed.
    redundancy_indicator = None

    # iterate an arbitrary number of maximum steps
    for j in range(20):
        if j <= 2:
            time.sleep(.8)
            continue

        try:
            # tries to read and preprocess wav file (sometimes fails if file is being written to)
            try:
                wav_full = preprocess_wav(wav_fpath)
            except Exception:
                continue

            # runs diarization on the full audio
            _, cont_embeds, wav_splits = encoder.embed_utterance(wav_full, return_partials=True, rate=16)
            labels = clusterer.predict(cont_embeds)
            labelling = create_labelling(labels, wav_splits)

            # removes redundant audio (everything before the marker remembered
            # from the previous pass)
            wav = wav_full
            if redundancy_indicator is not None:
                try:
                    split_idx = find_sub_list(
                        [round(x, 2) for x in redundancy_indicator],
                        [round(x, 2) for x in wav],
                    )
                    if split_idx:
                        first_match = split_idx[0][0]
                        wav = wav[first_match:]
                        time_elapsed = first_match / sample_rate
                        # Shift segment times to the trimmed audio and drop
                        # segments that do not start after the trim point.
                        labelling = [
                            (label[0], label[1] - time_elapsed, label[2] - time_elapsed)
                            for label in labelling
                            if label[1] - time_elapsed > 0
                        ]
                except Exception:
                    # Best effort: if trimming fails, carry on with whatever
                    # state was reached rather than skipping the pass.
                    pass

            # writes new wave that has been trimmed for silence
            write('file_trimmed.wav', sample_rate, wav)

            # empties split_wavs directory
            for f in os.listdir(split_dir):
                os.remove(os.path.join(split_dir, f))

            # reads in file that has had silence trimmed out
            rate, data = wavfile.read('file_trimmed.wav')

            # slices and saves new wav files to split_wavs directory,
            # remembering the paths in the order they were written
            split_wavs = []
            num_frames_removed = 0
            for i, (_, seg_start, seg_end) in enumerate(labelling):
                if seg_end - seg_start > .6:
                    split_at_frame = int(rate * seg_end) - num_frames_removed
                    # Split exactly at split_at_frame so no sample is dropped
                    # and num_frames_removed matches what was cut off.
                    left_data, data = data[:split_at_frame], data[split_at_frame:]
                    num_frames_removed += len(left_data)
                    segment_path = split_dir + 'file_' + str(i) + '.wav'
                    wavfile.write(segment_path, rate, left_data)
                    split_wavs.append(segment_path)
            if labelling:
                # Start of the audio that was not split off in this pass.
                redundancy_indicator = data[:1000]

            # performs ASR and appends speaker's words to diarization;
            # os.listdir() order is arbitrary, so the recorded write order is
            # used to keep the transcript chronological
            diarization = []
            for segment_path in split_wavs:
                text = asr(segment_path)
                if len(text) > 4:
                    diarization.append('New Speaker: ' + text)
            print('\n'.join(diarization))
            time.sleep(1)
        except Exception:
            time.sleep(2.3)


In [None]:
def on_button_clicked(b):
    """Click handler: call run() inside the ``output`` widget's context."""
    with output:
        run()


# Create the trigger button and the output area, hook up the handler, then
# show both widgets below the cell.
button = widgets.Button(description="run")
output = widgets.Output()
button.on_click(on_button_clicked)

display(button, output)

Button(description='run', style=ButtonStyle())

Output()