In [1]:
cd Prosody2Vec/

/home/dcor/niskhizov/Prosody2Vec


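# Data preparation

Extract HuBERT units, emotion2vec utterance embeddings, ECAPA-TDNN speaker embeddings, and log-mel spectrograms for each IEMOCAP `.wav` file, and cache them as one pickle per file.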

In [2]:
import glob
import os

import numpy as np
import torch
import torchaudio
from funasr import AutoModel
from IPython.display import Audio
from speechbrain.inference.speaker import EncoderClassifier



In [3]:

# Emotion2vec checkpoint to load. Alternative checkpoints:
#   "iic/emotion2vec_base"
#   "iic/emotion2vec_base_finetuned"
#   "iic/emotion2vec_plus_seed"
#   "iic/emotion2vec_plus_base"
model_id = "iic/emotion2vec_plus_large"

# `hub` selects the download source: "ms" or "modelscope" for China mainland
# users; "hf" or "huggingface" for other overseas users.
sed_model = AutoModel(model=model_id, hub="ms")


In [4]:
import torch, torchaudio

# Fetch the three pretrained components from torch.hub and move each to the GPU,
# in this order:
#   hubert   - content encoder  (hubert_soft; a hubert_discrete entry also exists)
#   acoustic - acoustic model   (hubert_soft; a hubert_discrete entry also exists)
#   hifigan  - vocoder          (hifigan_hubert_soft; hifigan_hubert_discrete also exists)
hubert, acoustic, hifigan = (
    torch.hub.load(hub_repo, entry_point, trust_repo=True).cuda()
    for hub_repo, entry_point in (
        ("bshall/hubert:main", "hubert_soft"),
        ("bshall/acoustic-model:main", "hubert_soft"),
        ("bshall/hifigan:main", "hifigan_hubert_soft"),
    )
)


In [5]:
from torch.nn import functional as F
class LogMelSpectrogram(torch.nn.Module):
    """Log-compressed mel spectrogram of a waveform.

    Wraps ``torchaudio.transforms.MelSpectrogram`` configured for a magnitude
    spectrogram (``power=1.0``) with Slaney mel scale and Slaney filter
    normalisation. Centring is disabled in the transform, so ``forward``
    reflect-pads the waveform itself before framing.

    Args:
        sample_rate: Sample rate of the input audio, in Hz.
        n_fft: FFT size.
        win_length: Analysis window length, in samples.
        hop_length: Hop between successive frames, in samples.
        n_mels: Number of mel filterbank channels.

    The defaults reproduce the original hard-coded configuration, so
    ``LogMelSpectrogram()`` behaves exactly as before.
    """

    def __init__(self, sample_rate=16000, n_fft=1024, win_length=1024, hop_length=160, n_mels=128):
        super().__init__()
        # Kept so forward() derives its padding from the same values the
        # transform was built with, instead of repeating literals.
        self.n_fft = n_fft
        self.hop_length = hop_length
        # NOTE: the attribute spelling ("melspctrogram") is left unchanged on
        # purpose; it is the registered submodule name.
        self.melspctrogram = torchaudio.transforms.MelSpectrogram(
            sample_rate=sample_rate,
            n_fft=n_fft,
            win_length=win_length,
            hop_length=hop_length,
            center=False,
            power=1.0,
            norm="slaney",
            onesided=True,
            n_mels=n_mels,
            mel_scale="slaney",
        )

    def forward(self, wav):
        """Return the log-mel spectrogram of ``wav``.

        Args:
            wav: Waveform tensor whose last dimension is time. It must be
                longer than ``(n_fft - hop_length) // 2`` samples, which is a
                requirement of reflect padding.

        Returns:
            ``log(max(mel, 1e-5))`` of the mel spectrogram of the padded input.
        """
        # center=False in the transform, so pad manually by half of the
        # frame/hop difference on each side.
        padding = (self.n_fft - self.hop_length) // 2
        wav = F.pad(wav, (padding, padding), "reflect")
        mel = self.melspctrogram(wav)
        # Floor the magnitudes so that log() never sees zero.
        logmel = torch.log(torch.clamp(mel, min=1e-5))
        return logmel
    
# Shared log-mel feature extractor with its default settings, moved to the GPU.
logmel = LogMelSpectrogram().cuda()

In [6]:

# Pretrained ECAPA speaker-embedding model from the SpeechBrain hub.
# Load it onto the GPU like the other models in this notebook: without
# run_opts SpeechBrain keeps the model on its default device (CPU), although
# the extraction loop hands it CUDA tensors.
spk_ecapa_tdnn = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    run_opts={"device": "cuda"},
)

In [7]:
# Root directory of the corpus to index.
# Alternative dataset root: './Emotion Speech Dataset/'
data_dir = '/home/dcor/niskhizov/Prosody2Vec/IEMOCAP_full_release/'
# Recursively collect every .wav file under data_dir. glob returns matches in
# an arbitrary, filesystem-dependent order, so sort them to make wav_files[0]
# and the processing order reproducible across runs.
wav_files = sorted(glob.glob(data_dir + '/**/*.wav', recursive=True))



In [8]:
# Keep only the paths that contain 'sentences' (presumably the per-utterance
# clips of the corpus -- confirm against the dataset layout).
wav_files = list(filter(lambda candidate: 'sentences' in candidate, wav_files))

In [9]:

# Sanity check: report how many clips were found, then load the first one.
# A bare len(...) in the middle of a cell is never displayed, so print it.
# Fail early with a clear message when the search/filter left nothing, rather
# than with an IndexError on wav_files[0].
assert wav_files, "wav_files is empty - check data_dir and the 'sentences' filter"
print(f"{len(wav_files)} wav files")
wav_file = wav_files[0]

wav, sr = torchaudio.load(wav_file)
wav.shape

In [10]:
# Render an inline audio player for the clip loaded above, at its own sample rate.
Audio(wav.squeeze().numpy(), rate=sr)

In [11]:
import pickle

In [12]:
# Create the output directory for the cached features. exist_ok=True makes the
# cell safe to re-run, and os.makedirs does not depend on a shell automagic.
os.makedirs("iemocap_embeddings", exist_ok=True)

In [13]:
def save_vecs(units, emo_vec, spk_vec, logmel, out_file):
    """Pickle one file's features to ``out_file``.

    The four feature objects are stored in a single dict under the keys
    ``'units'``, ``'emo_vec'``, ``'spk_vec'`` and ``'logmel'``. An existing
    file at ``out_file`` is overwritten.
    """
    payload = dict(units=units, emo_vec=emo_vec, spk_vec=spk_vec, logmel=logmel)
    with open(out_file, 'wb') as sink:
        pickle.dump(payload, sink)

In [14]:
import tqdm

In [19]:
# For every wav file, extract HuBERT units, an emotion2vec utterance embedding,
# a speaker embedding and a log-mel spectrogram, then cache them together in
# iemocap_embeddings/<wav stem>.pkl.

target_sr = 16000  # sample rate the LogMelSpectrogram instance is configured for

for wav_path in tqdm.tqdm(wav_files):
    wav, sr = torchaudio.load(wav_path)
    if sr != target_sr:
        # The file's sample rate used to be ignored; resample so the features
        # are not silently computed at the wrong rate.
        # NOTE(review): the pretrained encoders presumably expect 16 kHz too -- confirm.
        wav = torchaudio.functional.resample(wav, sr, target_sr)

    # Upload once and reuse for all GPU models. The emotion model is still
    # given the CPU tensor, as before.
    wav_gpu = wav.cuda()

    with torch.inference_mode():
        # Speech units from the content encoder (extra leading batch dimension).
        units = hubert.units(wav_gpu.unsqueeze(0))

        emo_vec = sed_model.generate(wav, granularity="utterance", extract_embedding=True, disable_pbar=True)[0]['feats']

        spk_vec = spk_ecapa_tdnn.encode_batch(wav_gpu)

        melspec = logmel(wav_gpu)

    # Build the output name from the file stem. splitext only strips the final
    # extension, whereas str.replace('.wav', ...) rewrote every occurrence.
    stem = os.path.splitext(os.path.basename(wav_path))[0]
    out_file = os.path.join("iemocap_embeddings", stem + ".pkl")
    save_vecs(units[0].cpu(), emo_vec, spk_vec[0][0].cpu(), melspec[0].cpu(), out_file)

