### Prepare Repo

In [1]:
# !git clone https://github.com/facebookresearch/av_hubert.git

# %cd av_hubert
# !git submodule init
# !git submodule update
# !pip install scipy
# !pip install sentencepiece
# !pip install python_speech_features
# !pip install scikit-video

# %cd fairseq
# !python -m pip install ./

In [2]:
%cd E:/university/FYT/repos/multi_modal_ser/finetune_encoder/audio_video/av_hubert/avhubert

import sys
sys.path.append("E:/university/FYT/repos/multi_modal_ser/finetune_encoder/audio_video/av_hubert/fairseq")
from fairseq import checkpoint_utils, options, tasks, utils
import cv2
import tempfile
import torch
import utils as avhubert_utils
from argparse import Namespace
from IPython.display import HTML
from scipy.io import wavfile
from python_speech_features import logfbank
import numpy as np
import sys
print(sys.version)

E:\university\FYT\repos\multi_modal_ser\finetune_encoder\audio_video\av_hubert\avhubert


  from .autonotebook import tqdm as notebook_tqdm


3.7.1 (default, Oct 28 2018, 08:39:03) [MSC v.1912 64 bit (AMD64)]


In [3]:
def stacker(feats, stack_order):
    """
    Concatenating consecutive audio frames
    Args:
    feats - numpy.ndarray of shape [T, F]
    stack_order - int (number of neighboring frames to concatenate
    Returns:
    feats - numpy.ndarray of shape [T', F']
    """
    feat_dim = feats.shape[1]
    if len(feats) % stack_order != 0:
        res = stack_order - len(feats) % stack_order
        res = np.zeros([res, feat_dim]).astype(feats.dtype)
        feats = np.concatenate([feats, res], axis=0)
    feats = feats.reshape((-1, stack_order, feat_dim)).reshape(-1, stack_order*feat_dim)
    return feats

### Download Model

In [4]:
# !wget https://dl.fbaipublicfiles.com/avhubert/model/lrs3_vox/vsr/base_vox_433h.pt -O E:/check_pts/avhubert.pt

### Build Model Pipeline

In [5]:
user_dir = "E:/university/FYT/repos/multi_modal_ser/finetune_encoder/audio_video/av_hubert/avhubert"
utils.import_user_module(Namespace(user_dir=user_dir))
ckpt_path = "E:/check_pts/avhubert.pt"
models, saved_cfg, task = checkpoint_utils.load_model_ensemble_and_task([ckpt_path])  
model = models[0]
if hasattr(models[0], 'decoder'):
    print(f"Checkpoint: fine-tuned")
    model = models[0].encoder.w2v_model
else:
    print(f"Checkpoint: pre-trained w/o fine-tuning")

Checkpoint: fine-tuned


In [8]:
smp_id = "Ses01F_impro05_M017"
video_path = "E:/datasets/preprocessed/face_raw/{}.mp4".format(smp_id)
audio_path = "E:/datasets/preprocessed/spectrogram/raw/{}.wav".format(smp_id)

# Load Video
# Mute transform first
transform = avhubert_utils.Compose([
  avhubert_utils.Normalize(0.0, 255.0),
  avhubert_utils.CenterCrop((task.cfg.image_crop_size, task.cfg.image_crop_size)),
  avhubert_utils.Normalize(task.cfg.image_mean, task.cfg.image_std)])

frames = avhubert_utils.load_video(video_path)
# frames = transform(frames)
frames = torch.FloatTensor(frames).unsqueeze(dim=0).unsqueeze(dim=0)
video_feats = frames

# Load Audio
sample_rate, wav_data = wavfile.read(audio_path)
in_features = model.feature_extractor_audio.proj.in_features
assert sample_rate == 16_000 and len(wav_data.shape) == 1
audio_feats = logfbank(wav_data, samplerate=sample_rate).astype(np.float32) # [T, F]
audio_feats = stacker(audio_feats, in_features//26) # [T/stack_order_audio, F*stack_order_audio]

# Match Audio Video
print(audio_feats.shape)
print(video_feats.shape)

diff = audio_feats.shape[0] - video_feats.shape[2]
if diff < 0:
    audio_feats = np.concatenate([audio_feats, np.zeros([-diff, audio_feats.shape[-1]], dtype=audio_feats.dtype)])
elif diff > 0:
    audio_feats = audio_feats[:-diff]

import torch.nn.functional as F
audio_feats = torch.from_numpy(audio_feats.astype(np.float32)).T
audio_feats = F.layer_norm(audio_feats, audio_feats.shape[1:])
audio_feats = audio_feats.unsqueeze(dim=0)


(160, 104)
torch.Size([1, 1, 186, 88, 88])


In [17]:
model.eval()
with torch.no_grad():
    # Specify output_layer if you want to extract feature of an intermediate layer
    feature, _ = model.extract_finetune(source={'video': video_feats , 
                                                'audio': audio_feats}, padding_mask=None, output_layer=None)
    feature = feature.squeeze(dim=0) 
print(f"Video feature shape: {feature.shape}")

Video feature shape: torch.Size([186, 768])


In [19]:
### Save Model Standalone Checkpoints

In [26]:
from torch.utils.data import Dataset, Subset
class MMSERDataset(Dataset):
    """multi model ser dataset."""
    
    def stacker(self, feats, stack_order):
        """
        Concatenating consecutive audio frames
        Args:
        feats - numpy.ndarray of shape [T, F]
        stack_order - int (number of neighboring frames to concatenate
        Returns:
        feats - numpy.ndarray of shape [T', F']
        """
        feat_dim = feats.shape[1]
        if len(feats) % stack_order != 0:
            res = stack_order - len(feats) % stack_order
            res = np.zeros([res, feat_dim]).astype(feats.dtype)
            feats = np.concatenate([feats, res], axis=0)
        feats = feats.reshape((-1, stack_order, feat_dim)).reshape(-1, stack_order*feat_dim)
        return feats
        
    def __load_label__(self, cutmap_path):
        sheet_df = pd.DataFrame()
        for ses in range(1, 6):
            extractionmapPATH = cutmap_path + \
                str(ses)+'.xlsx'
            xl = pd.ExcelFile(extractionmapPATH)
            sheets = xl.sheet_names
            for sheet in sheets:
                sheet_df = pd.concat([sheet_df, xl.parse(sheet)])
        self.df_ = sheet_df
        
        # remove samples not agreed
        self.df_ = pd.merge(self.df_, self.df_text, on=["smp_id"])
        self.df_["emotion_id"] = self.df_["emotion"].map(self.emo2id)
        self.df_ = self.df_[self.df_["emotion_id"].notna()].reset_index(drop=True)
        self.df_["session"] = self.df_["smp_id"].apply(lambda x: x.split("_")[0])
        self.df_ = self.df_[self.df_["smp_id"].str.startswith("Ses01F_impro")].reset_index(drop=True)
        
    def __load_text__(self, text_path):
        self.df_text = pd.read_csv(text_path)
        pass
    
    def __load_audio__(self, fn_path):
        self.fn_list = list(self.df_["smp_id"])
        self.raw_list = []
        for fn in self.fn_list:
            self.raw_list.append(wavfile.read(os.path.join(fn_path, fn)+'.wav')[1])
    
    def __load_video__(self, idx):
        frames = avhubert_utils.load_video(os.path.join(self.video_path, idx+".mp4"))
#         transform = avhubert_utils.Compose([
#           avhubert_utils.Normalize(0.0, 255.0),
#           avhubert_utils.CenterCrop((task.cfg.image_crop_size, task.cfg.image_crop_size)),
#           avhubert_utils.Normalize(task.cfg.image_mean, task.cfg.image_std)])

        frames = avhubert_utils.load_video(video_path)
        # frames = transform(frames)
        frames = torch.FloatTensor(frames).unsqueeze(dim=0).unsqueeze(dim=0)
        video_feats = frames
        return video_feats
    
    def __init__(self, 
                 fn_path, 
                 cutmap_path, 
                 text_path, 
                 video_path, 
                 emo2id,
                 audio_in_features = 104):
        
        self.emo2id = emo2id
        self.audio_in_features = audio_in_features
        self.video_path = video_path
        self.__load_text__(text_path)
        self.__load_label__(cutmap_path)
        self.__load_audio__(fn_path)
        
    def __len__(self):
        return self.df_.shape[0]
    
    def __getsingle__(self, idx):
        raw_audio = self.raw_list[idx]
        video_feats = self.__load_video__(self.fn_list[idx])
        audio_feats = logfbank(raw_audio, samplerate=AUDIORATE).astype(np.float32) # [T, F]
        audio_feats = self.stacker(audio_feats, self.audio_in_features//26) # [T/stack_order_audio, F*stack_order_audio]

        diff = audio_feats.shape[0] - video_feats.shape[2]
        if diff < 0:
            audio_feats = np.concatenate([audio_feats, np.zeros([-diff, audio_feats.shape[-1]], dtype=audio_feats.dtype)])
        elif diff > 0:
            audio_feats = audio_feats[:-diff]

        audio_feats = torch.from_numpy(audio_feats.astype(np.float32)).T
        audio_feats = F.layer_norm(audio_feats, audio_feats.shape[1:])
        audio_feats = audio_feats.unsqueeze(dim=0)
        return audio_feats, video_feats
    
    def __getitem__(self, idx):
        if isinstance(idx, slice):
            return [self.__getitem__(i) for i in range(*idx.indices(len(self)))]  # type: ignore
        else:
            audio_feats, video_feats = self.__getsingle__(idx)

            return {
                "sess": list(self.df_["session"])[idx],
                "fn": self.fn_list[idx],
                "audio": audio_feats,
                "video": video_feats,
                "text": list(self.df_["transcript"])[idx],
                "labels": list(self.df_["emotion_id"])[idx]
            }

In [28]:
mmser_ds = torch.load("E:/datasets/preprocessed/dataset/avhubert_ds.pt")

In [37]:
import os
AUDIORATE = 16000

model.extract_finetune(mmser_ds[11])

(tensor([[[-2.7083e-01, -1.8842e-02, -3.7301e-02,  ..., -1.2694e-03,
            1.6491e-01, -1.7138e-01],
          [-2.1259e-02, -7.6889e-02, -1.3348e-01,  ..., -2.9132e-01,
            2.7133e-01, -1.0294e-01],
          [ 9.6669e-02, -5.6949e-02, -1.4569e-01,  ..., -3.8035e-01,
            2.3745e-01,  3.0098e-04],
          ...,
          [ 1.5377e-01,  1.3366e-01, -1.1035e-01,  ..., -5.0409e-01,
            8.9877e-02,  8.1486e-02],
          [ 9.9678e-02,  1.1212e-01, -8.9646e-02,  ..., -5.1223e-01,
            1.3578e-01,  3.8791e-02],
          [-2.7174e-01,  1.2212e-01,  1.2415e-01,  ..., -3.3966e-01,
            1.7755e-01, -6.1008e-03]]], grad_fn=<NativeLayerNormBackward0>),
 None)