This is a notebook used to generate speaker embeddings with the Speech2Phone model.

In [None]:
import os
import numpy as np
from glob import glob
from tqdm import tqdm

In [None]:
# Reinstall pydub from conda-forge: first remove any copy installed through
# pip or conda, then install the conda-forge build.
! python3 -m pip uninstall pydub -y
!conda remove pydub -y
!conda install -c conda-forge pydub -y

In [None]:
import pydub

In [None]:
# Install Speech2Phone requirements (pinned TensorFlow and tflearn versions).
! pip install tensorflow==1.14.0 tflearn==0.3.2

In [None]:
# Download the Speech2Phone checkpoint archive, unpack it, and move the
# extracted Save-Models/ directory under ./Speech2Phone/ (the path that the
# model-loading cell reads from).
!wget -O ./saver.zip https://www.dropbox.com/s/b19xt2wu3th9p36/Save-Models-Speaker-Diarization.zip?dl=0
!mkdir Speech2Phone
!unzip saver.zip
!mv  Save-Models/  Speech2Phone/Save-Models/


In [None]:
#Utils for Speech2Phone Preprocessing
from pydub import AudioSegment as audio

def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    """Return how many milliseconds of silence precede the first audible chunk.

    The audio is scanned from its start in consecutive windows of
    ``chunk_size`` milliseconds; the scan stops at the first window whose
    loudness (``dBFS``) is not below ``silence_threshold``.

    Args:
        sound: a pydub.AudioSegment.
        silence_threshold: loudness in dB below which a window is silent.
        chunk_size: window length in ms.

    Returns:
        The offset in ms of the first non-silent window, or None when the
        scan runs past the end of ``sound`` without finding one.
    """
    offset_ms = 0
    while True:
        window = sound[offset_ms:offset_ms + chunk_size]
        if not window.dBFS < silence_threshold:
            # First window that is loud enough: everything before it is silence.
            return offset_ms
        if offset_ms > len(sound):
            # Walked past the end of the audio: it is silent throughout.
            return None
        offset_ms += chunk_size

def remove_silence(sound):
    """Strip leading and trailing silence from ``sound``.

    Leading silence is measured with ``detect_leading_silence``; trailing
    silence is measured by running the same scan on the reversed audio.

    Args:
        sound: a pydub.AudioSegment.

    Returns:
        The trimmed segment, or None when either scan finds no audible chunk.
    """
    start_trim = detect_leading_silence(sound)
    if start_trim is None:
        return None
    end_trim = detect_leading_silence(sound.reverse())
    if end_trim is None:
        # The reversed scan uses a chunk grid aligned to the end of the audio,
        # so it can come up empty even though the forward scan succeeded.
        # Report "no usable audio" instead of failing on `duration - None`.
        return None
    duration = len(sound)
    return sound[start_trim:duration - end_trim]
    


In [None]:
import tflearn

# Create the model graph so the pretrained checkpoint can be restored into it.
# NOTE(review): the layer sequence presumably has to mirror the architecture
# the checkpoint was trained with — confirm before changing anything here.

# Input: batches of 13 x 216 matrices (13 matches the n_mfcc=13 MFCC
# coefficients computed in the preprocessing cell).
encoder = tflearn.input_data(shape=[None, 13,int(216)])
# Two stacked dropout layers; the second argument of tflearn.dropout is the
# keep probability, so these keep 90% and then 20% of the activations.
encoder = tflearn.dropout(encoder,0.9) # keep 90% (drop 10%)
encoder = tflearn.dropout(encoder,0.2) # keep 20% (drop 80%)
# Fully connected layer with 40 units and 'crelu' activation; its output is
# what `encoding_model` below returns as the embedding.
encoder = tflearn.fully_connected(encoder, 40,activation='crelu')
# Linear decoder with 572 outputs.
# NOTE(review): the meaning of 572 is not visible in this notebook.
decoder = tflearn.fully_connected(encoder, int(572), activation='linear')
# Regression head (Adam optimizer, mean-square loss) wrapping the decoder.
net = tflearn.regression(decoder, optimizer='adam', learning_rate=0.0007,loss='mean_square', metric=None)#categorical_crossentropy
model = tflearn.DNN(net, tensorboard_verbose=0,tensorboard_dir='tflearn_logs')

# Restore the weights from the checkpoint unpacked by the download cell.
model.load('./Speech2Phone/Save-Models/Model3-Best-40loc.tflearn')

# Second DNN that stops at the encoder layer and reuses the restored session,
# used to extract the embedding from the encoder layer.
encoding_model = tflearn.DNN(encoder, session=model.session)


In [None]:
# Set constants
# Dataset root, given relative to the notebook's working directory.
DATA_ROOT_PATH = '../../../LibriSpeech/voicefilter_data-3/'
# Directories holding the train and test splits.
TRAIN_DATA = os.path.join(DATA_ROOT_PATH, 'train')
TEST_DATA = os.path.join(DATA_ROOT_PATH, 'test')
# Glob pattern (not a regex, despite the name) selecting the reference wav
# files that get an embedding in the preprocessing cell.
glob_re_wav_emb = '*-ref_emb.wav'
# Pattern whose suffix ('-emb.pt') names the embedding file written next to
# each reference wav.
glob_re_emb = '*-emb.pt'

In [None]:
# Sanity check: show the contents of the training split directory.
os.listdir(TRAIN_DATA)

In [None]:
# Sanity check: show the contents of the test split directory.
os.listdir(TEST_DATA)

In [None]:
import ffmpeg

In [None]:
# Preprocess dataset: compute one Speech2Phone embedding per reference wav and
# save it next to the wav as a '-emb.pt' tensor.
#
# `librosa` and `torch` are used below but are not imported by any earlier
# cell shown in this notebook, so they are imported here together with
# `tempfile`.
import tempfile

import librosa
import torch

train_files = sorted(glob(os.path.join(TRAIN_DATA, glob_re_wav_emb)))
test_files = sorted(glob(os.path.join(TEST_DATA, glob_re_wav_emb)))

# Warn when either split is empty (typically a wrong DATA_ROOT_PATH).
if len(train_files) == 0 or len(test_files) == 0:
    print("check train and test path files not in directory")
files = train_files + test_files

# File-name suffixes derived from the glob patterns ('-ref_emb.wav', '-emb.pt').
wav_suffix = glob_re_wav_emb.replace('*', '')
emb_suffix = glob_re_emb.replace('*', '')

window = 5  # seconds of audio per Speech2Phone input
step = 1    # seconds between the starts of consecutive windows

for wave_file_path in tqdm(files):
    try:
        sound = audio.from_wav(wave_file_path)
    except Exception as e:
        print("erro ler arquivo", e)
        continue

    wave = remove_silence(sound)
    if wave is None:
        print("erro remove silence")
        continue
    if len(wave) == 0:
        # Nothing is left after trimming; repeating an empty clip below would
        # never reach the required duration and the loop would not terminate.
        print("empty audio after silence removal, skipping", wave_file_path)
        continue

    # Clips shorter than one window are repeated until they are long enough.
    if int(wave.duration_seconds) < window:
        padded = wave
        while int(padded.duration_seconds) <= window:
            padded += wave
        wave = padded

    file_embeddings = []
    total_seconds = int(wave.duration_seconds)
    # The segment is written to disk and re-read with librosa because pydub
    # and librosa load waveforms differently. A private temporary directory
    # replaces the shared '../aux.wav' scratch file and is cleaned up even
    # when a window fails.
    with tempfile.TemporaryDirectory() as tmp_dir:
        tmp_wav = os.path.join(tmp_dir, 'segment.wav')
        for begin in range(0, total_seconds - window + 1, step):
            end = begin + window
            try:
                segment = wave[begin * 1000:end * 1000]
                # export() returns the output file handle; close it right away.
                segment.export(tmp_wav, format='wav').close()
                y, sr = librosa.load(tmp_wav, sr=22050)  # sample rate = 22050
                mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
                file_embeddings.append(
                    np.array(encoding_model.predict([mfcc])[0]))
            except Exception as e:
                # Best effort: report the failing window and keep going.
                print(e)

    if not file_embeddings:
        # Every window failed; there is nothing to average for this file.
        print("no embedding extracted, skipping", wave_file_path)
        continue

    # The file-level embedding is the mean over all window embeddings.
    file_embedding = np.mean(np.array(file_embeddings), axis=0)
    # Swap the '-ref_emb.wav' suffix for '-emb.pt' (the glob guarantees the
    # path ends with the wav suffix).
    output_name = wave_file_path[:-len(wav_suffix)] + emb_suffix
    torch.save(torch.from_numpy(file_embedding.reshape(-1)), output_name)