# Create Mel-Spectrograms from FLAC audio files of the VCTK dataset

## Setup

This notebook expects a zip file containing the `txt` and `wav48_silence_trimmed` folders of VCTK in your Google Drive at the path specified below.
Connect (mount) your Google Drive at `/content/drive` before running the cells below.

In [None]:
# Location of the VCTK archive inside the Drive's MyDrive folder.
vctk_gdrive_path = 'VC/VCTK/VCTK-Corpus-mic1.zip'
# Full path of that archive as seen from the notebook.
vctk_path = f'/content/drive/MyDrive/{vctk_gdrive_path}'

In [None]:
# -p: succeed even if the directory already exists, so the cell can be re-run.
!mkdir -p /content/vctk

Copying and unzipping each take around 1 minute.

In [None]:
# Quote the interpolated path so a Drive path containing spaces stays a single argument.
!cp "{vctk_path}" /content/vctk/VCTK-Corpus-mic1.zip

In [None]:
# -n: never overwrite files that are already extracted, so re-running the cell
# does not stop at unzip's interactive "replace?" prompt.
!unzip -q -n /content/vctk/VCTK-Corpus-mic1.zip -d /content/vctk

In [None]:
# -p: succeed even if the directory already exists, so the cell can be re-run.
!mkdir -p /content/vctk/mels

In [None]:
# NOTE: vctk_path is re-bound here. From this cell on it names the local
# extraction directory (with trailing slash), not the zip file on Drive that
# the copy cell above reads. Re-running the copy cell after this one would
# therefore use the wrong source path.
vctk_path = '/content/vctk/'

### Requirements

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q torch torchvision torchaudio



## Code

In [None]:
import torch, torchaudio
import numpy as np
import os
from tqdm import tqdm
import torchaudio.transforms as transforms
import torch.nn.functional as F
from torchaudio.functional import resample

In [None]:
class LogMelSpectrogram(torch.nn.Module):
    """Log-magnitude mel-spectrogram transform.

    Wraps ``torchaudio.transforms.MelSpectrogram`` (``power=1.0``, Slaney mel
    scale and normalisation, ``center=False``) and applies a clamped natural
    log to its output. Because ``center=False``, framing is controlled by the
    explicit reflect padding added in ``forward``.

    Args:
        sample_rate: Sample rate of the input audio in Hz.
        n_fft: FFT size.
        win_length: Analysis window length; ``None`` means ``n_fft``.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of mel bins.

    The defaults reproduce the values that were previously hard-coded.
    """

    def __init__(self, sample_rate=16000, n_fft=1024, win_length=None,
                 hop_length=160, n_mels=128):
        super().__init__()
        # Kept on the instance so forward() derives its padding from the same
        # values the transform was built with.
        self.n_fft = n_fft
        self.hop_length = hop_length
        # The attribute name (including its spelling) is kept unchanged so that
        # existing attribute access and state_dict keys remain valid.
        self.melspctrogram = transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=n_fft if win_length is None else win_length,
            hop_length=hop_length,
            center=False,
            power=1.0,
            norm="slaney",
            onesided=True,
            n_mels=n_mels,
            mel_scale="slaney",
        )

    def forward(self, wav):
        """Return the log-mel spectrogram of ``wav``.

        Args:
            wav: Waveform tensor whose last axis is time. Assumed to be
                (batch, channels, time), as the reflect padding needs a
                batched input - TODO confirm for other callers.

        Returns:
            ``log(clamp(mel, min=1e-5))`` of the padded waveform.
        """
        # Symmetric reflect padding of (n_fft - hop_length) // 2 samples on
        # each side of the time axis (432 with the default settings).
        padding = (self.n_fft - self.hop_length) // 2
        wav = F.pad(wav, (padding, padding), "reflect")
        mel = self.melspctrogram(wav)
        # Clamp before the log so silent bins map to log(1e-5) instead of -inf.
        logmel = torch.log(torch.clamp(mel, min=1e-5))
        return logmel

In [None]:
# Single module-level transform instance; create_mel_spectrogram below calls it for every file.
melspectrogram = LogMelSpectrogram()



In [None]:
def load_flac(audiofile_path, target_sr=16000):
  """Load an audio file and resample it to ``target_sr`` Hz.

  Args:
    audiofile_path: Path handed to ``torchaudio.load``.
    target_sr: Sample rate of the returned waveform. Defaults to 16000, the
      rate that was previously hard-coded.

  Returns:
    The waveform tensor from ``torchaudio.load`` after resampling from the
    file's own sample rate to ``target_sr``.
  """
  waveform, source_sr = torchaudio.load(audiofile_path)
  return resample(waveform, source_sr, target_sr)

In [None]:
def create_mel_spectrogram(audiofile_path, mel_path):
  """Compute the log-mel spectrogram of one audio file and save it with NumPy.

  Args:
    audiofile_path: Audio file to read via ``load_flac``.
    mel_path: Destination passed to ``np.save``.
  """
  waveform = load_flac(audiofile_path)
  # Prepend a batch axis for the transform; squeeze() afterwards removes every
  # size-1 axis again before the array is written.
  features = melspectrogram(waveform.unsqueeze(0)).squeeze()
  np.save(mel_path, features.numpy())

In [None]:
# Speakers are the sub-directories of wav48_silence_trimmed. Filtering on
# directories skips log.txt and any other stray file, and does not raise when
# log.txt is absent (list.remove would). Sorted for a reproducible order.
audio_root = os.path.join(vctk_path, 'wav48_silence_trimmed')
spk_list = sorted(
    entry for entry in os.listdir(audio_root)
    if os.path.isdir(os.path.join(audio_root, entry))
)

In [None]:
# Convert every FLAC file of every speaker into mels/<speaker>/<file stem>.npy.
for spk in tqdm(spk_list):
  mel_spk_path = os.path.join(vctk_path, 'mels', spk)
  audio_spk_path = os.path.join(vctk_path, 'wav48_silence_trimmed', spk)
  os.makedirs(mel_spk_path, exist_ok=True)
  for audiofile in sorted(os.listdir(audio_spk_path)):
    # splitext strips only the final extension, so stems containing dots survive.
    stem, ext = os.path.splitext(audiofile)
    if ext.lower() != '.flac':
      # Anything that is not a FLAC file is not handed to the audio loader.
      continue
    audiofile_path = os.path.join(audio_spk_path, audiofile)
    mel_path = os.path.join(mel_spk_path, stem + '.npy')
    create_mel_spectrogram(audiofile_path, mel_path)


100%|██████████| 110/110 [15:00<00:00,  8.19s/it]


# Testing mel to audio

Let's check that the vocoder correctly generates audio from one of the generated mel-spectrograms.

In [None]:
import IPython.display as display

In [None]:
# Vocoder loaded through torch.hub from the bshall/hifigan repo (main branch),
# entry point "hifigan_hubert_soft"; map_location targets the CPU.
hifigan = torch.hub.load(
    "bshall/hifigan:main",
    "hifigan_hubert_soft",
    trust_repo=True,
    map_location=torch.device('cpu'),
)

Using cache found in /root/.cache/torch/hub/bshall_hifigan_main


In [None]:
# Load one of the generated spectrograms as a spot check and show its shape.
example_mel_path = '/content/vctk/mels/p228/p228_002_mic1.npy'
mel = np.load(example_mel_path)
mel.shape

(128, 491)

In [None]:
# Convert to a float32 tensor with a leading batch axis. reshape (instead of
# unsqueeze) keeps the result at (1, n_mels, frames) even when this cell is
# executed again on the already-batched tensor.
mel = torch.as_tensor(mel, dtype=torch.float32)
mel = mel.reshape(1, *mel.shape[-2:])
mel.shape

torch.Size([1, 128, 491])

In [None]:
# Run the vocoder on the batched mel inside inference_mode, i.e. without autograd tracking.
with torch.inference_mode():
  target = hifigan(mel)

In [None]:
# Drop the size-1 axes of the vocoder output and play it back at 16000 Hz.
audio = target.squeeze()
display.Audio(audio, rate=16000)

# Storing the results

Zipping takes about 10 minutes.

In [None]:
# Recursively archive the generated mels directory (quiet mode).
# NOTE(review): the directory is given as an absolute path, so archive entries
# presumably carry the content/vctk/mels/ prefix - confirm this is the layout
# consumers of the zip expect.
!zip -q -r /content/vctk/VCTK-Corpus-mic1-mels.zip /content/vctk/mels

In [None]:
# Copy the mel archive into the VC/VCTK folder on Drive; the destination is
# hard-coded here rather than derived from vctk_gdrive_path.
!cp /content/vctk/VCTK-Corpus-mic1-mels.zip /content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-mels.zip

In [None]:
# Sanity check: print the size of every entry in the Drive folder, including the copied mel archive.
!du -sh /content/drive/MyDrive/VC/VCTK/*

11G	/content/drive/MyDrive/VC/VCTK/DS_10283_3443.zip
5.5K	/content/drive/MyDrive/VC/VCTK/README.txt
4.0K	/content/drive/MyDrive/VC/VCTK/speaker-info.txt
23M	/content/drive/MyDrive/VC/VCTK/txt
4.0K	/content/drive/MyDrive/VC/VCTK/update.txt
11G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-0.92.zip
6.3G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1-mels.zip
5.9G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic1.zip
5.2G	/content/drive/MyDrive/VC/VCTK/VCTK-Corpus-mic2.zip
8.3G	/content/drive/MyDrive/VC/VCTK/wav48_silence_trimmed
