# Create speaker embeddings from the FLAC audio files of the VCTK dataset

## Setup

This notebook expects a zip archive containing the `txt` and `wav48_silence_trimmed` folders of VCTK in your Google Drive, at the path specified below.
Connect (mount) your Google Drive before running the cells below.

Also, use a GPU runtime.

In [None]:
# Location of the zipped VCTK corpus, relative to the Google Drive root.
vctk_gdrive_path = 'VC/VCTK/VCTK-Corpus-mic1.zip'
# Absolute path of that archive (assumes Drive is mounted at /content/drive).
vctk_path = f'/content/drive/MyDrive/{vctk_gdrive_path}'

In [None]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /content/vctk

Copying and unzipping each take around 1 min.

In [None]:
# Quote the interpolated path so a Drive path containing spaces is passed as one argument.
!cp "$vctk_path" /content/vctk/VCTK-Corpus-mic1.zip

In [None]:
# -o overwrites existing files without asking, so re-running the cell does not
# stall on unzip's interactive "replace?" prompt.
!unzip -q -o /content/vctk/VCTK-Corpus-mic1.zip -d /content/vctk

In [None]:
# -p keeps this cell re-runnable: no error if the directory already exists.
!mkdir -p /content/vctk/spk_emb

In [None]:
# From this point on `vctk_path` no longer names the zip archive on Drive: it is
# rebound to the local directory the archive was extracted into. The trailing
# slash matters because later cells build paths by plain string concatenation.
vctk_path = '/content/vctk/'

### Requirements

In [None]:
# %pip installs into the environment of the running kernel. transformers and tqdm
# are imported further down, so they are listed here alongside the torch packages.
%pip install -q torch torchvision torchaudio transformers tqdm

## Code

In [None]:
from google.colab import files

# Ask the user to upload file(s) through Colab's upload dialog.
# NOTE(review): presumably this provides /content/Taylor_Swift.wav for the quick
# single test further down — confirm; `uploaded` itself is not read again here.
uploaded = files.upload()

In [None]:
from transformers import AutoFeatureExtractor, WavLMForXVector
import torch, torchaudio
import numpy as np
import os
from tqdm import tqdm
from torchaudio.functional import resample

In [None]:
# Both the feature extractor and the x-vector model come from the same checkpoint.
wavlm_checkpoint = "microsoft/wavlm-base-plus-sv"
feature_extractor = AutoFeatureExtractor.from_pretrained(wavlm_checkpoint)
model = WavLMForXVector.from_pretrained(wavlm_checkpoint)


In [None]:
# Move the model's parameters to the GPU (done in place on the module).
model.to('cuda')
# Bare last expression so the cell displays the device the model now lives on.
model.device

device(type='cuda', index=0)

In [None]:
def load_flac(audiofile_path, target_sr=16000):
  """Load an audio file and resample it to `target_sr`.

  Args:
    audiofile_path: path of the audio file, handed to `torchaudio.load`.
    target_sr: sampling rate in Hz of the returned waveform. Defaults to
      16000, the rate this notebook passes to the feature extractor.

  Returns:
    The resampled waveform tensor with all channels kept
    (layout as returned by `torchaudio.load`, i.e. presumably
    (channels, time) — TODO confirm).
  """
  waveform, source_sr = torchaudio.load(audiofile_path)
  return resample(waveform, source_sr, target_sr)

Quick single-file test:

In [None]:
# Smoke test: embed one audio file and save the resulting speaker embedding.
# Alternative input from the corpus:
#audiofile_path = '/content/vctk/wav48_silence_trimmed/p225/p225_001_mic1.flac'
audiofile_path = '/content/Taylor_Swift.wav'
data = load_flac(audiofile_path)
print(data.shape)
# Keep a single channel. Index -1 is channel 1 for a stereo file (what this cell
# selected before) and channel 0 for a mono file, where a hard-coded [1] would
# raise an IndexError.
data = data[-1]
print(data.shape)
# Named `inputs` so the builtin `input` is not shadowed in the notebook namespace.
inputs = feature_extractor(data, sampling_rate=16000, return_tensors='pt', padding=True)
# Follow the model's device instead of hard-coding 'cuda'.
inputs = inputs.to(model.device)
with torch.no_grad():
  embedding = model(**inputs).embeddings
embedding = embedding.squeeze().cpu().numpy()
print('model out', embedding.shape)
np.save('taylor.npy', embedding)

In [None]:
def create_spk_emb(audiofile_paths, units_path):
  """Compute one speaker embedding from several audio files and save it.

  The files are loaded with `load_flac`, reduced to mono, concatenated in the
  given order into one long waveform, and passed through the module-level
  `feature_extractor` and `model`. The resulting embedding is written with
  `np.save`.

  Args:
    audiofile_paths: iterable of audio file paths to pool into one embedding.
    units_path: destination path handed to `np.save`.

  Raises:
    ValueError: if `audiofile_paths` yields no paths.
  """
  waveforms = []
  for audiofile_path in audiofile_paths:
    waveform = load_flac(audiofile_path)
    if waveform.dim() > 1:
      # Average over the leading (channel) axis. For a single-channel file this
      # is the same as squeezing; for multi-channel audio it avoids
      # concatenating along the channel axis below.
      waveform = waveform.mean(dim=0)
    waveforms.append(waveform)
  if not waveforms:
    raise ValueError('create_spk_emb needs at least one audio file path')
  audio = torch.cat(waveforms)
  # Named `inputs` so the builtin `input` is not shadowed.
  inputs = feature_extractor(audio.numpy(), sampling_rate=16000, return_tensors='pt', padding=True)
  # Follow the model's device instead of hard-coding 'cuda'.
  inputs = inputs.to(model.device)
  with torch.no_grad():
    embedding = model(**inputs).embeddings
  embedding = embedding.squeeze().cpu().numpy()
  np.save(units_path, embedding)

In [None]:
# The loop further down treats every entry of spk_list as a speaker directory
# inside wav48_silence_trimmed. Keeping only directories drops stray files such
# as log.txt without failing when they are absent (list.remove raises ValueError
# in that case). Sorting makes the processing order reproducible.
audio_root = vctk_path + 'wav48_silence_trimmed'
spk_list = sorted(
    entry for entry in os.listdir(audio_root)
    if os.path.isdir(os.path.join(audio_root, entry))
)

In [None]:
# Offsets, relative to the current file's position in the speaker's file list,
# of the audio files that are pooled into one embedding.
indices_single = [0]                   # the file itself only
indices_windowed = list(range(-2, 3))  # the file plus two neighbours on each side

Select either the single or the windowed indices: `indices_single` creates each speaker embedding from one audio file only, while `indices_windowed` also includes the adjacent audio files.

In [None]:
# Offsets used by the embedding loop below; swap in indices_windowed to pool
# each file with its neighbours.
# NOTE(review): the storage cells at the end name the output `spk_emb_win`,
# which suggests the windowed variant was intended — confirm before running.
indices = indices_single  # or indices_windowed

In [None]:
# For every speaker, write one .npy embedding per audio file into
# <vctk_path>/spk_emb/<speaker>/, pooling the files selected by `indices`.
for spk in tqdm(spk_list):
  spk_emb_spk_path = vctk_path + 'spk_emb/' + spk
  audio_spk_path = vctk_path + 'wav48_silence_trimmed/' + spk
  os.makedirs(spk_emb_spk_path, exist_ok=True)
  # os.listdir returns entries in arbitrary order; sort so that the neighbours
  # picked by the window offsets really are the adjacent utterances.
  audio_paths = sorted(os.listdir(audio_spk_path))
  audio_paths_len = len(audio_paths)
  for i in range(audio_paths_len):
    # Offsets wrap around at both ends of the speaker's file list.
    paths = [
        os.path.join(audio_spk_path, audio_paths[(i + j) % audio_paths_len])
        for j in indices
    ]
    # splitext strips only the final extension, so names containing extra dots
    # are not truncated (and cannot collide) the way split('.')[0] would.
    spk_emb_name = os.path.splitext(audio_paths[i])[0] + '.npy'
    spk_emb_path = os.path.join(spk_emb_spk_path, spk_emb_name)
    create_spk_emb(paths, spk_emb_path)


# Storing the results

Zipping takes about 10 min.

In [None]:
!mv /content/vctk/spk_emb /content/vctk/spk_emb_win

In [None]:
!zip -q -r /content/vctk/VCTK-Corpus-mic1-spk_emb_win.zip /content/vctk/spk_emb_win

In [None]:
!cp /content/vctk/VCTK-Corpus-mic1-spk_emb_win.zip /content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-spk_emb_win.zip

This cell is only here to check the storage used on Google Drive.

In [None]:
!du -sh /content/drive/MyDrive/VC/VCTK/*