In [21]:
import librosa
from transformers import Wav2Vec2Processor
import numpy as np 

In [22]:
# HuBERT uses the processor of Wav2Vec 2.0, so the Wav2Vec2Processor class
# is what loads the HuBERT checkpoint's preprocessing configuration.
audio_processor = Wav2Vec2Processor.from_pretrained("facebook/hubert-xlarge-ls960-ft")

preprocessor_config.json:   0%|          | 0.00/212 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/138 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.49k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/292 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

In [24]:
audio_path = 'dataset/multiface/m--20171024--0000--002757580--GHS/audio/SEN_are_you_looking_for_employment.wav'

# Decode the clip, asking librosa for a 16 kHz sampling rate.
speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

# Run the processor without tensor conversion, then drop singleton dimensions
# so the result is a flat array of samples.
processor_output = audio_processor(
    speech_array,
    sampling_rate=sampling_rate,
    padding="longest",
    return_tensors=None,
)
audio_values = np.squeeze(processor_output.input_values)

# Report sample count, duration in seconds and the container type.
print(audio_values.shape)
print(f"secs: {audio_values.shape[0] / sampling_rate}")
print(f"{type(audio_values)}")

(49600,)
secs: 3.1
<class 'numpy.ndarray'>


In [25]:
# Display the sampling rate returned by librosa.load for the clip.
sampling_rate

16000

## Check correspondences of MEAD dataset

In [1]:
import numpy as np 
import cv2 
import os 
import glob
import pickle
import h5py
from skimage.transform import estimate_transform, warp

In [2]:
## test mediapipe original landmarks on the original images
img_path = 'dataset/mead_25fps/processed/images/M003/front/angry/level_1/000001.png'
img = cv2.imread(img_path)
# cv2.imread reports an unreadable or missing file by returning None instead of
# raising, so fail here with a clear message rather than on `img.shape`.
if img is None:
    raise FileNotFoundError(f"cv2.imread could not read image: {img_path}")
print(img.shape)

(1080, 1920, 3)


In [3]:
lmk_aligned_path = 'dataset/mead_25fps/processed/landmarks_original/mediapipe/M003/front/angry/level_1/001'

# Landmarks stored in 'landmarks_original.pkl'; the singleton axis 1 of the
# unpickled data is removed after conversion to an array.
original_pkl = os.path.join(lmk_aligned_path, 'landmarks_original.pkl')
with open(original_pkl, 'rb') as pkl_file:
    lmks_original = pickle.load(pkl_file)
lmks_original_478 = np.squeeze(np.asarray(lmks_original), axis=1)
print(lmks_original_478.shape)

# Landmarks stored in 'landmarks.pkl' in the same directory, handled the same way.
aligned_pkl = os.path.join(lmk_aligned_path, 'landmarks.pkl')
with open(aligned_pkl, 'rb') as pkl_file:
    landmark_478 = pickle.load(pkl_file)
landmark_478 = np.squeeze(np.asarray(landmark_478), axis=1)
print(landmark_478.shape)

(83, 478, 3)
(83, 478, 2)


In [7]:
def point2bbox(center, size):
    """Return three corners of the square of side ``size`` centred on ``center``.

    Args:
        center: indexable whose first two entries are the centre coordinates.
        size: side length of the square.

    Returns:
        A (3, 2) array with the corners in the order
        (c0 - s/2, c1 - s/2), (c0 - s/2, c1 + s/2), (c0 + s/2, c1 - s/2).
    """
    half = size / 2
    first_lo, first_hi = center[0] - half, center[0] + half
    second_lo, second_hi = center[1] - half, center[1] + half
    return np.array([
        [first_lo, second_lo],
        [first_lo, second_hi],
        [first_hi, second_lo],
    ])

def point2transform(center, size, target_size_height, target_size_width):
    """Estimate a similarity transform from a square box to the target corners.

    The three source points come from ``point2bbox(center, size)``; they are
    matched to three corner points built from the target dimensions.

    Args:
        center: centre of the source box.
        size: side length of the source box.
        target_size_height: target height in pixels.
        target_size_width: target width in pixels.

    Returns:
        The transform object returned by ``estimate_transform('similarity', ...)``.
    """
    source_corners = point2bbox(center, size)
    # NOTE(review): the second point takes its second coordinate from the width
    # and the third point takes its first coordinate from the height. The two
    # only coincide for square targets - confirm this is intended before
    # calling with height != width.
    target_corners = np.array([
        [0, 0],
        [0, target_size_width - 1],
        [target_size_height - 1, 0],
    ])
    return estimate_transform('similarity', source_corners, target_corners)

def warp_image_from_lmk(
        landmarks,
        img,
        scale=1.35,
        bb_center_shift_x=0.,
        bb_center_shift_y=-0.1,
        image_size=224):    # defaults from EMOTE preprocessing script
    """Crop ``img`` around the landmark bounding box and resample it to a square.

    Args:
        landmarks: 2-D array; column 0 and column 1 are used as the two
            coordinates that define the bounding box.
        img: image passed to ``warp``.
        scale: factor applied to the (already 1.1x enlarged) box size.
        bb_center_shift_x: shift of the box centre along the first coordinate,
            as a fraction of the box extent in that coordinate.
        bb_center_shift_y: same for the second coordinate.
        image_size: side length of the square output.

    Returns:
        ``(dst_image, dst_landmarks)``: the warped image of shape
        ``(image_size, image_size)`` and the first two landmark columns mapped
        through the same transform.
    """
    xs, ys = landmarks[:, 0], landmarks[:, 1]
    left, right = np.min(xs), np.max(xs)
    top, bottom = np.min(ys), np.max(ys)

    # Mean of the box extents, enlarged by 10 %.
    old_size = (right - left + bottom - top) / 2 * 1.1

    # Box midpoint, then moved by the requested fraction of each extent.
    center = np.array([
        right - (right - left) / 2.0,
        bottom - (bottom - top) / 2.0,
    ])
    center[0] += abs(right-left)*bb_center_shift_x
    center[1] += abs(bottom-top)*bb_center_shift_y

    # The crop side length is truncated to an integer.
    size = int(old_size * scale)

    tform = point2transform(center, size, image_size, image_size)
    # warp() needs the output-to-input mapping, hence the inverse transform.
    dst_image = warp(img, tform.inverse, output_shape=(image_size, image_size), order=3)
    dst_landmarks = tform(landmarks[:, :2])
    return dst_image, dst_landmarks

def draw_lmk_on_image(lmks, img, out_path):
    """Draw landmarks onto ``img`` and write the result to ``out_path``.

    Args:
        lmks: 2-D array; the first two columns are used as (x, y) pixel
            coordinates and truncated to integers.
        img: image array with at least two dimensions (height, width[, channels]).
            It is drawn on in place, so pass a copy to keep the original intact.
        out_path: destination path handed to ``cv2.imwrite``.

    Raises:
        OSError: if ``cv2.imwrite`` reports that the file was not written.
    """
    # Only height and width are needed; slicing also accepts 2-D images, which
    # the former three-way unpacking rejected.
    h, w = img.shape[:2]
    for px in lmks[:, :2]:
        x, y = int(px[0]), int(px[1])
        # Skip points that fall outside the image.
        if 0 <= x < w and 0 <= y < h:
            cv2.circle(img, (x, y), 1, (0, 0, 255), 1)
    # cv2.imwrite returns False when the image could not be saved; surface
    # that instead of silently producing no file.
    if not cv2.imwrite(out_path, img):
        raise OSError(f"cv2.imwrite failed to write image: {out_path}")

In [8]:
# Read the frame again from disk and warp it using the landmarks of index 0.
img = cv2.imread(img_path)
dst_image, dst_landmarks = warp_image_from_lmk(landmarks=lmks_original_478[0], img=img)

In [9]:
# The landmarks produced by the warp above should match the stored
# landmarks of the same frame (ground truth) within floating-point tolerance.
np.allclose(a=dst_landmarks, b=landmark_478[0])

True

In [51]:
# Scale the warped image by 255 and cast to 8-bit before handing it to OpenCV.
img_warped = (255 * dst_image).astype(np.uint8)
# Draw on a copy so img_warped stays clean for later cells.
draw_lmk_on_image(
    lmks=landmark_478[0],
    img=img_warped.copy(),
    out_path="test_mediapipe_aligned.png",
)

In [47]:
# Load the FAN 68-point landmarks stored for the same sequence and show their shape.
lmk_path = 'dataset/mead_25fps/processed/landmarks_aligned/fan/M003/front/angry/level_1/001/landmarks.pkl'
with open(lmk_path, mode='rb') as f:
    lmk_68 = pickle.load(f)
print(lmk_68.shape)

(83, 68, 2)


In [55]:
# Scale the first frame's FAN landmarks by 224 (the crop side length used above)
# - presumably they are stored normalised to the crop; confirm against the dataset.
# The multiplication already yields a new array, so lmk_68 itself is untouched.
lmk_68_224 = 224 * lmk_68[0]
draw_lmk_on_image(
    lmks=lmk_68_224,
    img=img_warped.copy(),
    out_path="test_fan_aligned.png",
)

## Test audio input

In [10]:
from transformers import Wav2Vec2Processor
import librosa
import numpy as np  
import torch
import warnings
warnings.filterwarnings('ignore')

In [30]:
# Rebinds audio_processor to the processor of the wav2vec2 base checkpoint.
audio_processor = Wav2Vec2Processor.from_pretrained(
    "facebook/wav2vec2-base-960h"
)

In [31]:
# NOTE(review): this is an absolute, user-specific path; the commented-out
# alternative is the repository-relative MEAD clip.
audio_path = "/local/home/yaqqin/Downloads/013.m4a" # "dataset/mead_25fps/original_data/M003/audio/angry/level_1/001.m4a"
# Load the clip here so the following cells work on this file on a fresh kernel,
# instead of on whatever speech_array / sampling_rate an earlier cell left behind.
speech_array, sampling_rate = librosa.load(audio_path, sr=16000)

In [23]:
# Display the sampling rate (samples per second) of the loaded audio.
sampling_rate

16000

In [24]:
# Clip duration in seconds: number of samples divided by samples per second.
len(speech_array) / sampling_rate

3.221375

In [25]:
# Run the processor on the waveform and keep only its input_values,
# returned as a PyTorch tensor.
audio_values = audio_processor(
    speech_array,
    sampling_rate=sampling_rate,
    padding="longest",
    return_tensors='pt',
).input_values

In [26]:
# Display the shape of the processor output tensor.
audio_values.shape

torch.Size([1, 51542])

In [27]:
# Size of dimension 1 divided by the sampling rate, i.e. the duration in seconds
# if that dimension holds the samples.
audio_values.shape[1] / sampling_rate

3.221375

In [28]:
from model.wav2vec import Wav2Vec2Model

In [29]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell no longer crashes on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

wav2vec = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
wav2vec.feature_extractor._freeze_parameters()
wav2vec.to(device)

audio_input = audio_values.float().to(device)
# Inference only: no gradients are needed.
with torch.no_grad():
    # frame_num=83 matches the 83 frames of the landmark arrays loaded earlier;
    # presumably the project model resamples its features to that many frames
    # - confirm against model.wav2vec.
    audio_emb = wav2vec(audio_input, frame_num = 83).last_hidden_state.cpu()
    print(audio_emb.shape)

pytorch_model.bin:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/wav2vec2-large-960h and are newly initialized: ['wav2vec2.masked_spec_embed']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 83, 1024])
