In [1]:
import os
import torch
from tqdm import tqdm
import numpy as np
from wavlm.WavLM import WavLM, WavLMConfig
import torch.nn.functional as F
from IPython.display import Audio

  from .autonotebook import tqdm as notebook_tqdm


# Generate WavLM Representations and Save

## Parameters

In [2]:
wavlm_layer = 11 
fps=30
sample_rate=16000

## Data Folders

In [3]:
train_wavlm_reps_folder = 'dataset/Genea2023/trn/main-agent/wavlm_representations/'
val_wavlm_reps_folder = 'dataset/Genea2023/val/main-agent/wavlm_representations/'
train_audios_folder = 'dataset/Genea2023/trn/main-agent/audio16k_npy/' 
val_audios_folder = 'dataset/Genea2023/val/main-agent/audio16k_npy/'

In [4]:
if not os.path.exists(train_wavlm_reps_folder):
    os.makedirs(train_wavlm_reps_folder)
if not os.path.exists(val_wavlm_reps_folder):
    os.makedirs(val_wavlm_reps_folder) 

## Load Pre-Trained WavLM

In [5]:
checkpoint = torch.load('./wavlm/WavLM-Base+.pt')
wavlm_cfg = WavLMConfig(checkpoint['cfg'])
wavlm = WavLM(wavlm_cfg)
wavlm.load_state_dict(checkpoint['model'])
wavlm.eval()

WavLM(
  (feature_extractor): ConvFeatureExtractionModel(
    (conv_layers): ModuleList(
      (0): Sequential(
        (0): Conv1d(1, 512, kernel_size=(10,), stride=(5,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): Fp32GroupNorm(512, 512, eps=1e-05, affine=True)
        (3): GELU()
      )
      (1): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (2): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (3): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (4): Sequential(
        (0): Conv1d(512, 512, kernel_size=(3,), stride=(2,), bias=False)
        (1): Dropout(p=0.0, inplace=False)
        (2): GELU()
      )
      (5): Seque

## Train Audio Representations

In [6]:
for idx in tqdm(range(303, len(os.listdir(train_audios_folder)))):
    
    # Get Audio File
    audio_path = train_audios_folder + os.listdir(train_audios_folder)[idx]
    
    # Load with Numpy
    signal = np.load(audio_path)
    
    # Set to model innput format
    signal = torch.tensor(signal).unsqueeze(0)
    
    # Normalize
    if wavlm_cfg.normalize:
        signal_norm = torch.nn.functional.layer_norm(signal , signal.shape)
    else:
        signal_norm = signal
        
    # Run Model (rep=Desired Layer, layer_results=all layers)
    rep, layer_results = wavlm.extract_features(signal_norm, output_layer=wavlm_layer, ret_layer_results=True)[0]
    layer_reps = [x.transpose(0, 1) for x, _ in layer_results] # fix shape

    # Get Number of Seconds of Audio File
    n_secs = signal.shape[1] / sample_rate
    
    # Get Number of poses equivalent to audio file duration, given fps (alignment len)
    n_pose = n_secs * fps
    
    # Interpolate number of representations to match number of poses corresponding to audio file
    interp_reps = F.interpolate(rep.transpose(1, 2), size=int(n_pose), align_corners=True, mode='linear')
    
    # Prepare to save
    interp_reps = interp_reps.squeeze(0).transpose(0,1).cpu().detach().data.cpu().numpy()
    
    # Double check dimension
    assert (interp_reps.shape[0] == int(np.ceil(n_pose)) or interp_reps.shape[0] == int(np.floor(n_pose)))
    
    #Save
    path_name = train_wavlm_reps_folder + os.listdir(train_audios_folder)[idx]
    with open(path_name, 'wb') as f:
        np.save(f, interp_reps)
    
    # Normalize OutBatch of Desired Layer
    rep_norm = wavlm.encoder.layer_norm(rep)
    
    # Interpolate number of representations to match number of poses corresponding to audio file
    interp_reps_norm = F.interpolate(rep_norm.transpose(1, 2), size=int(n_pose), align_corners=True, mode='linear')
    
    # Prepare to save
    interp_reps_norm = interp_reps_norm.squeeze(0).transpose(0,1).cpu().detach().data.cpu().numpy()
    
    # Double check dimension
    assert (interp_reps_norm.shape[0] == int(np.ceil(n_pose)) or interp_reps_norm.shape[0] == int(np.floor(n_pose)))
    
    #Save
    path_name = train_wavlm_reps_folder + 'norm_' + os.listdir(train_audios_folder)[idx]
    with open(path_name, 'wb') as f:
        np.save(f, interp_reps_norm)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 69/69 [1:39:18<00:00, 86.36s/it]


## Val Audio Representations

In [7]:
for idx in tqdm(range(len(os.listdir(val_audios_folder)))):
    
    # Get Audio File
    audio_path = val_audios_folder + os.listdir(val_audios_folder)[idx]
    
    # Load with Numpy
    signal = np.load(audio_path)
    
    # Set to model innput format
    signal = torch.tensor(signal).unsqueeze(0)
    
    # Normalize
    if wavlm_cfg.normalize:
        signal_norm = torch.nn.functional.layer_norm(signal , signal.shape)
    else:
        signal_norm = signal
        
    # Run Model (rep=Desired Layer, layer_results=all layers)
    rep, layer_results = wavlm.extract_features(signal_norm, output_layer=wavlm_layer, ret_layer_results=True)[0]
    layer_reps = [x.transpose(0, 1) for x, _ in layer_results] # fix shape

    # Get Number of Seconds of Audio File
    n_secs = signal.shape[1] / sample_rate
    
    # Get Number of poses equivalent to audio file duration, given fps (alignment len)
    n_pose = n_secs * fps
    
    # Interpolate number of representations to match number of poses corresponding to audio file
    interp_reps = F.interpolate(rep.transpose(1, 2), size=int(n_pose), align_corners=True, mode='linear')
    
    # Prepare to save
    interp_reps = interp_reps.squeeze(0).transpose(0,1).cpu().detach().data.cpu().numpy()
    
    # Double check dimension
    assert (interp_reps.shape[0] == int(np.ceil(n_pose)) or interp_reps.shape[0] == int(np.floor(n_pose)))
    
    #Save
    path_name = val_wavlm_reps_folder + os.listdir(val_audios_folder)[idx]
    with open(path_name, 'wb') as f:
        np.save(f, interp_reps)
    
    # Normalize OutBatch of Desired Layer
    rep_norm = wavlm.encoder.layer_norm(rep)
    
    # Interpolate number of representations to match number of poses corresponding to audio file
    interp_reps_norm = F.interpolate(rep_norm.transpose(1, 2), size=int(n_pose), align_corners=True, mode='linear')
    
    # Prepare to save
    interp_reps_norm = interp_reps_norm.squeeze(0).transpose(0,1).cpu().detach().data.cpu().numpy()
    
    # Double check dimension
    assert (interp_reps_norm.shape[0] == int(np.ceil(n_pose)) or interp_reps_norm.shape[0] == int(np.floor(n_pose)))
    
    #Save
    path_name = val_wavlm_reps_folder + 'norm_' + os.listdir(val_audios_folder)[idx]
    with open(path_name, 'wb') as f:
        np.save(f, interp_reps_norm)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 41/41 [10:22<00:00, 15.19s/it]
