In [1]:
import torch
import torchaudio
import os
from torchaudio.datasets import SPEECHCOMMANDS, VoxCeleb1Identification, IEMOCAP

In [2]:
# Root directory holding every dataset used in this notebook.
# Set the VOICE_DATASET_ROOT environment variable to run on another machine;
# the original author's location is kept as the default so existing runs
# behave exactly as before.
root_path = os.environ.get("VOICE_DATASET_ROOT", "/home/braveenan/voice_dataset")

# Per-dataset sub-directories, passed as `root=` to the dataset constructors.
root_speechcommand = os.path.join(root_path, "SpeechCommand")
root_voxceleb = os.path.join(root_path, "VoxCeleb")
root_iemocap = os.path.join(root_path, "IEMOCAP")

In [3]:
# Select the WAVLM_LARGE bundle from torchaudio.pipelines and build the model
# that the later cells use as the shared upstream feature extractor.
# NOTE(review): get_model() presumably fetches/caches pretrained weights on
# first use -- confirm this works in an offline environment.
bundle = torchaudio.pipelines.WAVLM_LARGE
upstream_model = bundle.get_model()

In [4]:
# Build the three Speech Commands v0.01 splits from the local copy
# (download is disabled), in the order training / validation / testing.
sc_train, sc_val, sc_test = (
    SPEECHCOMMANDS(
        root=root_speechcommand,
        url="speech_commands_v0.01",
        download=False,
        subset=split_name,
    )
    for split_name in ("training", "validation", "testing")
)

# Combined number of items across the splits, then the first training item.
print(sum(len(split) for split in (sc_train, sc_val, sc_test)))
print(sc_train[0])

64721
(tensor([[ 9.1553e-05,  3.0518e-05,  1.8311e-04,  ..., -3.0518e-05,
         -9.1553e-05,  1.2207e-04]]), 16000, 'bed', '00176480', 0)


In [5]:
# Probe the upstream model with the first Speech Commands training waveform.
# Gradients are disabled because this is inference only.
with torch.no_grad():
    waveform = sc_train[0][0]
    # Call the module instance rather than `.forward` directly so that
    # nn.Module.__call__ runs (and with it any registered hooks).
    output = upstream_model(waveform)
    # extract_features returns a pair; the first element is the list of
    # intermediate features whose length is printed below.
    features, _ = upstream_model.extract_features(waveform)

# Shape of the first element returned by the model, and how many
# intermediate feature tensors were extracted.
print(output[0].shape)
print(len(features))

torch.Size([1, 49, 1024])
24


In [6]:
# Build the three VoxCeleb1 identification splits from the local copy
# (download is disabled), in the order train / dev / test.
vc_train, vc_val, vc_test = (
    VoxCeleb1Identification(
        root=root_voxceleb,
        download=False,
        subset=split_name,
    )
    for split_name in ("train", "dev", "test")
)

# Combined number of items across the splits, then the first training item.
print(sum(len(split) for split in (vc_train, vc_val, vc_test)))
print(vc_train[0])

153516
(tensor([[ 0.0703,  0.0703,  0.0916,  ..., -0.0863, -0.1171, -0.1537]]), 16000, 1, 'id10001-1zcIwhmdeo4-00001')


In [7]:
# Probe the upstream model with the first VoxCeleb1 training waveform.
# Gradients are disabled because this is inference only.
with torch.no_grad():
    waveform = vc_train[0][0]
    # Call the module instance rather than `.forward` directly so that
    # nn.Module.__call__ runs (and with it any registered hooks).
    output = upstream_model(waveform)
    # extract_features returns a pair; the first element is the list of
    # intermediate features whose length is printed below.
    features, _ = upstream_model.extract_features(waveform)

# Shape of the first element returned by the model, and how many
# intermediate feature tensors were extracted.
print(output[0].shape)
print(len(features))

torch.Size([1, 405, 1024])
24


In [8]:
# Split IEMOCAP by session: sessions 1-3 for training, session 4 for
# validation and session 5 for testing. utterance_type is left as None.
ic_train, ic_val, ic_test = (
    IEMOCAP(
        root=root_iemocap,
        sessions=session_ids,
        utterance_type=None,
    )
    for session_ids in ((1, 2, 3), (4,), (5,))
)

# Combined number of items across the splits, then the first training item.
print(sum(len(split) for split in (ic_train, ic_val, ic_test)))
print(ic_train[0])

7380
(tensor([[-0.0050, -0.0050, -0.0038,  ..., -0.0027, -0.0032, -0.0042]]), 16000, 'Ses01F_impro01_F000', 'neu', 'Ses01F')


In [9]:
# Probe the upstream model with the first IEMOCAP training waveform.
# Gradients are disabled because this is inference only.
with torch.no_grad():
    waveform = ic_train[0][0]
    # Call the module instance rather than `.forward` directly so that
    # nn.Module.__call__ runs (and with it any registered hooks).
    output = upstream_model(waveform)
    # extract_features returns a pair; the first element is the list of
    # intermediate features whose length is printed below.
    features, _ = upstream_model.extract_features(waveform)

# Shape of the first element returned by the model, and how many
# intermediate feature tensors were extracted.
print(output[0].shape)
print(len(features))

torch.Size([1, 97, 1024])
24
