# Create speech units from the FLAC audio files of the VCTK dataset

## Setup

This notebook expects a zip archive containing the `txt` and `wav48_silence_trimmed` folders of VCTK in your Google Drive, at the path specified below.
Mount your Google Drive at `/content/drive` before running the cells.

Also, use a GPU runtime: the model and the audio tensors are moved to CUDA.

In [1]:
# Location of the zipped VCTK corpus: first relative to the Drive root, then as an absolute path.
vctk_gdrive_path = 'VC/VCTK/VCTK-Corpus-mic1.zip'
vctk_path = f'/content/drive/MyDrive/{vctk_gdrive_path}'

In [2]:
!mkdir /content/vctk

Copying and unzipping each take about 1 minute.

In [3]:
!cp $vctk_path /content/vctk/VCTK-Corpus-mic1.zip

In [4]:
!unzip -q /content/vctk/VCTK-Corpus-mic1.zip -d /content/vctk

In [6]:
!mkdir /content/vctk/units

In [7]:
# NOTE: vctk_path is rebound here. It held the Drive archive path above; from this
# cell on it is the local working directory (with trailing slash).
vctk_path = "/content/vctk/"

### Requirements

In [8]:
!pip install torch torchvision torchaudio



## Code

In [9]:
import torch, torchaudio
import numpy as np
import os
from tqdm import tqdm
from torchaudio.functional import resample

In [10]:
# Fetch the soft-unit HuBERT model through torch.hub, then move it to the GPU.
hubert = torch.hub.load("bshall/hubert:main", "hubert_soft", trust_repo=True)
hubert = hubert.cuda()

Downloading: "https://github.com/bshall/hubert/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://github.com/bshall/hubert/releases/download/v0.2/hubert-soft-35d9f29f.pt" to /root/.cache/torch/hub/checkpoints/hubert-soft-35d9f29f.pt
100%|██████████| 361M/361M [00:01<00:00, 246MB/s]


In [11]:
def load_flac(audiofile_path, target_sr=16000):
  """Load an audio file as a single-channel waveform resampled to ``target_sr`` Hz.

  Args:
    audiofile_path: Path of the audio file, passed to ``torchaudio.load``.
    target_sr: Sample rate of the returned waveform; defaults to 16000.

  Returns:
    The waveform tensor with one channel in its first dimension.
  """
  data, sr = torchaudio.load(audiofile_path)
  # Average the channels of a multi-channel recording so callers always get
  # exactly one channel; single-channel files pass through untouched.
  if data.size(0) > 1:
    data = data.mean(dim=0, keepdim=True)
  data = resample(data, sr, target_sr)
  return data

Quick test on a single file:

In [17]:
# Sanity check on one utterance: print the input shape, the raw model output shape,
# and the shape of the array after conversion to NumPy.
audiofile_path = '/content/vctk/wav48_silence_trimmed/p225/p225_001_mic1.flac'
data = load_flac(audiofile_path).unsqueeze(0).cuda()
print(data.shape)
with torch.inference_mode():
  units = hubert.units(data)
print(units.shape)
units = units.squeeze().cpu().numpy()
print(units.shape)

torch.Size([1, 1, 32825])
torch.Size([1, 102, 256])
(102, 256)


In [18]:
def create_units(audiofile_path, units_path):
  """Extract HuBERT units from one audio file and save them as a NumPy array.

  Args:
    audiofile_path: Path of the input audio file, read through ``load_flac``.
    units_path: Destination passed to ``np.save``.
  """
  # Add a leading batch dimension, then move the waveform to the GPU.
  data = load_flac(audiofile_path).unsqueeze(0).cuda()
  with torch.inference_mode():
    units = hubert.units(data)
  # Drop only the batch dimension. A bare squeeze() would also collapse the frame
  # axis of a clip that yields a single frame, giving a 1-D array.
  units = units.squeeze(0).cpu().numpy()
  np.save(units_path, units)

In [19]:
# Speaker IDs are the sub-directories of the audio folder. Keeping only directories
# skips stray files such as log.txt without raising when that file is absent;
# sorting makes the processing order reproducible.
spk_list = sorted(
  entry
  for entry in os.listdir(vctk_path + 'wav48_silence_trimmed')
  if os.path.isdir(vctk_path + 'wav48_silence_trimmed/' + entry)
)

In [20]:
# Convert every FLAC file of every speaker, mirroring the speaker folders under units/.
for spk in tqdm(spk_list):
  units_spk_path = vctk_path + 'units/' + spk
  audio_spk_path = vctk_path + 'wav48_silence_trimmed/' + spk
  os.makedirs(units_spk_path, exist_ok=True)
  for audiofile in sorted(os.listdir(audio_spk_path)):
    stem, ext = os.path.splitext(audiofile)
    # Skip anything that is not a FLAC recording so a stray file cannot abort the long run.
    if ext.lower() != '.flac':
      continue
    audiofile_path = audio_spk_path + '/' + audiofile
    # splitext keeps the whole stem even when a file name contains extra dots.
    units_path = units_spk_path + '/' + stem + '.npy'
    create_units(audiofile_path, units_path)


100%|██████████| 110/110 [33:43<00:00, 18.40s/it]


# Storing the results

Zipping takes about 10 minutes.

In [21]:
!zip -q -r /content/vctk/VCTK-Corpus-mic1-units.zip /content/vctk/units

In [22]:
# Copy the units archive from local disk back to Google Drive so it outlives the runtime.
!cp /content/vctk/VCTK-Corpus-mic1-units.zip /content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-units.zip

Optional: check how much Drive storage the VCTK folder uses.

In [23]:
!du -sh /content/drive/MyDrive/VC/VCTK/*

11G	/content/drive/MyDrive/VC/VCTK/DS_10283_3443.zip
5.5K	/content/drive/MyDrive/VC/VCTK/README.txt
4.0K	/content/drive/MyDrive/VC/VCTK/speaker-info.txt
23M	/content/drive/MyDrive/VC/VCTK/txt
4.0K	/content/drive/MyDrive/VC/VCTK/update.txt
11G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-0.92.zip
6.3G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-mels.zip
6.7G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-units.zip
5.9G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1.zip
5.2G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic2.zip
8.3G	/content/drive/MyDrive/VC/VCTK/wav48_silence_trimmed
