# Create speech units from FLAC audio files of the VCTK dataset

## Setup

This notebook expects a zip archive containing the `txt` and `wav48_silence_trimmed` folders of VCTK in your Google Drive, at the path specified below.
Mount your Google Drive at `/content/drive` before running the cells.

Also, use a GPU runtime.

In [1]:
# Location of the VCTK archive, relative to the Google Drive root.
vctk_gdrive_path = 'VC/VCTK/VCTK-Corpus-mic1.zip'
# Absolute path of the archive under the mounted Drive.
# NOTE: `vctk_path` is rebound further down to the local extraction directory,
# so re-run this cell before re-running the copy cell.
vctk_path = f'/content/drive/MyDrive/{vctk_gdrive_path}'

In [2]:
# -p: do not fail when the working directory already exists (safe to re-run).
!mkdir -p /content/vctk

Copying and unzipping each take around 1 minute.

In [3]:
# Copy the archive from Drive to local disk. The interpolated path is quoted so
# that a Drive location containing spaces is still passed as a single argument.
!cp "$vctk_path" /content/vctk/VCTK-Corpus-mic1.zip

In [4]:
# -q: quiet; -o: overwrite existing files without prompting, so re-running the
# cell does not stop at an interactive "replace?" question.
!unzip -q -o /content/vctk/VCTK-Corpus-mic1.zip -d /content/vctk

In [6]:
# Output folder for the extracted units; -p keeps the cell re-runnable.
!mkdir -p /content/vctk/units

In [7]:
# From here on `vctk_path` refers to the local extraction directory, no longer
# to the archive on Drive. Keep the trailing slash: downstream cells derive
# sub-paths from this prefix.
vctk_path = "/content/vctk/"

### Requirements

In [None]:
# %pip (instead of !pip) installs into the environment of the running kernel.
%pip install -q torch torchvision torchaudio

## Code

In [9]:
import torch, torchaudio
import numpy as np
import os
from tqdm import tqdm
from torchaudio.functional import resample

In [None]:
# Soft-unit HuBERT model fetched through torch.hub and moved to the GPU.
# The module is put into evaluation mode explicitly, rather than relying on the
# hub entry point to have done so, because it is only used for inference here.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True).cuda().eval()

In [11]:
def load_flac(audiofile_path, target_sr=16000):
  """Load an audio file as a single-channel waveform resampled to `target_sr`.

  Args:
    audiofile_path: path of an audio file readable by `torchaudio.load`.
    target_sr: sample rate in Hz to resample to (default 16000).

  Returns:
    Tensor of shape (1, num_samples) at `target_sr`.
  """
  data, sr = torchaudio.load(audiofile_path)
  # Callers add a batch dimension and feed the result to a single-channel
  # model, so average multi-channel recordings down to one channel.
  if data.shape[0] > 1:
    data = data.mean(dim=0, keepdim=True)
  data = resample(data, sr, target_sr)
  return data

Quick sanity check on a single file:

In [None]:
# Run one utterance through the whole pipeline and print the tensor shape after
# each stage.
audiofile_path = '/content/vctk/wav48_silence_trimmed/p225/p225_001_mic1.flac'
data = load_flac(audiofile_path).unsqueeze(0).cuda()  # add a leading batch dim, move to GPU
print(data.shape)
with torch.inference_mode():  # no autograd bookkeeping during extraction
  units = hubert.units(data)
print(units.shape)
units = units.squeeze().cpu().numpy()  # drop singleton dims, hand over to NumPy
print(units.shape)

In [18]:
def create_units(audiofile_path, units_path):
  """Extract units for one audio file with the notebook-level `hubert` model
  and store them with `np.save`.

  Args:
    audiofile_path: path of the audio file to encode (read via `load_flac`).
    units_path: destination handed to `np.save`.
  """
  data = load_flac(audiofile_path)
  data = data.unsqueeze(0).cuda()  # add a leading batch dim, move to GPU
  with torch.inference_mode():
    units = hubert.units(data)
  # Remove only the leading batch dimension. A bare squeeze() would also
  # collapse the frame axis when an utterance yields a single frame.
  units = units.squeeze(0).cpu().numpy()
  np.save(units_path, units)

In [19]:
# Speaker IDs are the sub-directories of the audio root. Filtering on
# directories skips stray files such as log.txt without raising when that file
# is absent; sorting makes the processing order deterministic.
wav_root = os.path.join(vctk_path, 'wav48_silence_trimmed')
spk_list = sorted(
  name for name in os.listdir(wav_root)
  if os.path.isdir(os.path.join(wav_root, name))
)

In [None]:
# Encode every FLAC file of every speaker, mirroring the
# wav48_silence_trimmed/<speaker>/ layout under units/<speaker>/.
for spk in tqdm(spk_list):
  units_spk_path = os.path.join(vctk_path, 'units', spk)
  audio_spk_path = os.path.join(vctk_path, 'wav48_silence_trimmed', spk)
  os.makedirs(units_spk_path, exist_ok=True)
  for audiofile in sorted(os.listdir(audio_spk_path)):
    stem, ext = os.path.splitext(audiofile)
    # Skip anything that is not a FLAC recording so a stray file in a speaker
    # folder does not abort the whole run.
    if ext.lower() != '.flac':
      continue
    audiofile_path = os.path.join(audio_spk_path, audiofile)
    units_path = os.path.join(units_spk_path, stem + '.npy')
    create_units(audiofile_path, units_path)


## Storing the results

Zipping takes about 10 minutes.

In [21]:
# Bundle the units folder into a single archive (q: quiet, r: recurse into sub-folders).
!zip -qr /content/vctk/VCTK-Corpus-mic1-units.zip /content/vctk/units

In [22]:
# Copy the units archive from local disk into the VC/VCTK folder on Google Drive.
!cp /content/vctk/VCTK-Corpus-mic1-units.zip /content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-units.zip

Optional: check how much storage the files in the VCTK Drive folder use.

In [None]:
# Per-entry size summary (-s) in human-readable units (-h) for the Drive folder.
!du -s -h /content/drive/MyDrive/VC/VCTK/*