In [35]:
from transformers import AutoImageProcessor, VideoMAEForPreTraining
import numpy as np
import torch
from torchinfo import summary

# Synthetic clip: 16 random frames, each laid out channels-first as (3, 224, 224).
num_frames = 16
video = list(np.random.randint(0, 256, (num_frames, 3, 224, 224)))

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEForPreTraining.from_pretrained("MCG-NJU/videomae-base").to(device)
model.eval()

# The model lives on `device`, so its inputs must be moved there as well;
# otherwise the forward pass below fails with a device mismatch on CUDA machines.
pixel_values = image_processor(video, return_tensors="pt").pixel_values.to(device)

# One mask entry per token: (frames / tubelet_size) temporal slots, each holding
# (image_size / patch_size)^2 spatial patches.
num_patches_per_frame = (model.config.image_size // model.config.patch_size) ** 2
seq_length = (num_frames // model.config.tubelet_size) * num_patches_per_frame
bool_masked_pos = torch.randint(0, 2, (1, seq_length), device=device).bool()

# Forward pass intentionally left disabled; re-enable to compute the pre-training loss.
# outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
# loss = outputs.loss
print(pixel_values.shape)
print(model.config)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


torch.Size([1, 16, 3, 224, 224])
VideoMAEConfig {
  "_attn_implementation_autoset": true,
  "architectures": [
    "VideoMAEForPreTraining"
  ],
  "attention_probs_dropout_prob": 0.0,
  "decoder_hidden_size": 384,
  "decoder_intermediate_size": 1536,
  "decoder_num_attention_heads": 6,
  "decoder_num_hidden_layers": 4,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "videomae",
  "norm_pix_loss": true,
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_frames": 16,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3",
  "tubelet_size": 2,
  "use_mean_pooling": false
}



In [36]:
# Shell out to nvidia-smi to display the status of the machine's NVIDIA GPUs.
!nvidia-smi

Mon May 19 11:42:06 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off |   00000000:19:00.0 Off |                  N/A |
| 30%   36C    P2             55W /  350W |     688MiB /  24576MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  NVIDIA GeForce RTX 3090        Off |   00

In [37]:
import os
import cv2
import librosa
import numpy as np
from moviepy.editor import VideoFileClip
from PIL import Image
from scipy.fftpack import dct

def save_frame_as_uint8_image(frame, filename):
    """Write an RGB frame to ``filename`` as an 8-bit image.

    Args:
        frame: Array-like RGB image. Float (or boolean) frames whose maximum
            is at most 1 are treated as normalised to [0, 1] and scaled by
            255; every other frame is assumed to already be on a 0-255 scale.
        filename: Destination path handed to ``cv2.imwrite``.

    Raises:
        OSError: If ``cv2.imwrite`` reports that the image was not written.
    """
    frame = np.asarray(frame)

    # Only float/bool data can plausibly be a normalised [0, 1] image. Integer
    # frames are never rescaled, so a nearly black uint8 frame (max value 1)
    # is no longer blown up to full white.
    is_unit_scale_dtype = np.issubdtype(frame.dtype, np.floating) or frame.dtype == np.bool_
    if is_unit_scale_dtype and frame.size and frame.max() <= 1:
        frame = frame * 255

    # Clip before casting so out-of-range values saturate instead of wrapping.
    frame_uint8 = np.clip(frame, 0, 255).astype(np.uint8)

    # OpenCV writes channels in BGR order.
    frame_bgr = cv2.cvtColor(frame_uint8, cv2.COLOR_RGB2BGR)
    if not cv2.imwrite(filename, frame_bgr):
        raise OSError(f"Failed to write image to {filename!r}")



In [38]:
def normalize_audio(audio):
    """Peak-normalise an audio signal so its largest absolute sample is 1.

    Args:
        audio: Array-like of audio samples.

    Returns:
        A new array equal to ``audio`` divided by its peak absolute value.
        Empty input is returned as an (empty) array, and an all-zero (silent)
        signal is returned unscaled rather than being turned into NaNs.
    """
    audio = np.asarray(audio)
    if audio.size == 0:
        # np.max would raise on an empty array; there is nothing to scale.
        return audio

    peak = np.max(np.abs(audio))
    # Silent input: divide by 1.0 instead of 0 to avoid 0/0 = NaN while still
    # returning a fresh floating-point array like the normal path does.
    scale = peak if peak > 0 else 1.0
    return audio / scale

In [40]:
def save_mfcc_as_uint8_image(audio_segment, sr, filename):
    """Save the MFCCs of an audio segment as a 256x256 uint8 image.

    Thirteen MFCCs are computed with librosa, min-max scaled onto the 0-255
    range, cast to uint8, resized to 256x256 with bicubic resampling and
    written to ``filename``.
    """
    coeffs = librosa.feature.mfcc(y=audio_segment, sr=sr, n_mfcc=13)

    # Min-max scale; the small epsilon keeps the division safe when every
    # coefficient has the same value.
    lowest = np.min(coeffs)
    span = np.max(coeffs) - lowest + 1e-6
    scaled = 255 * (coeffs - lowest) / span

    # A fixed output size gives every segment an image of identical shape.
    picture = Image.fromarray(scaled.astype(np.uint8))
    picture.resize((256, 256), Image.BICUBIC).save(filename)



In [47]:
def extract_frames_and_mfccs(video_path, output_dir='output', num_segments=8):
    """Split a video into equal segments and save a frame + MFCC image for each.

    For every segment the frame at the segment's temporal midpoint is saved as
    ``frame_<k>.jpg`` and the MFCC image of the segment's audio is saved as
    ``mfcc_<k>.jpg`` (``k`` is 1-based), both inside ``output_dir``.

    Args:
        video_path: Path of the video file; it is read both as video
            (``VideoFileClip``) and as audio (``librosa.load`` at 48 kHz).
        output_dir: Directory for the generated images; created if missing.
        num_segments: Number of equal-length segments to cut the clip into.

    Raises:
        ValueError: If ``num_segments`` is smaller than 1.
    """
    if num_segments < 1:
        raise ValueError(f"num_segments must be at least 1, got {num_segments}")

    os.makedirs(output_dir, exist_ok=True)

    clip = VideoFileClip(video_path)
    try:
        duration = clip.duration
        audio, sr = librosa.load(video_path, sr=48000)

        segment_duration = duration / num_segments

        for i in range(num_segments):
            # Sample the frame in the middle of the segment.
            mid_time = (i + 0.5) * segment_duration
            frame = clip.get_frame(mid_time)

            frame_filename = os.path.join(output_dir, f"frame_{i+1}.jpg")
            save_frame_as_uint8_image(frame, frame_filename)

            # Audio samples covering the same segment.
            start_sample = int(i * segment_duration * sr)
            end_sample = int((i + 1) * segment_duration * sr)
            audio_segment = audio[start_sample:end_sample]

            mfcc_filename = os.path.join(output_dir, f"mfcc_{i+1}.jpg")
            save_mfcc_as_rgb_uint8_image(audio_segment, sr, mfcc_filename, "viridis")
    finally:
        # Release the video reader even if extraction fails part-way through.
        clip.close()

    print(f"Saved {num_segments} uint8 frame and MFCC images to '{output_dir}'")


In [48]:
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import matplotlib.cm as cm

def save_mfcc_as_rgb_uint8_image(audio, sr, output_path, cmap='viridis'):
    """Save the MFCCs of an audio signal as a colour-mapped RGB uint8 image.

    Args:
        audio: Audio samples; peak-normalised with ``normalize_audio`` first.
        sr: Sampling rate passed to ``librosa.feature.mfcc``.
        output_path: Destination path for the image.
        cmap: Name of the Matplotlib colormap used to colour the coefficients.
    """
    # Step 1: Compute MFCC
    audio = normalize_audio(audio)
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

    # Step 2: Normalize MFCC to 0-1 for the colormap
    mfcc_min = mfcc.min()
    mfcc_max = mfcc.max()
    mfcc_norm = (mfcc - mfcc_min) / (mfcc_max - mfcc_min + 1e-6)  # Prevent div by 0

    # Step 3: Map to RGB using a Matplotlib colormap.
    # `matplotlib.cm.get_cmap` was removed in Matplotlib 3.9; `plt.get_cmap`
    # is the lookup that works on both old and new releases.
    colormap = plt.get_cmap(cmap)
    mfcc_rgb = colormap(mfcc_norm)[:, :, :3]  # Drop alpha channel if present

    # Step 4: Convert to uint8 (0-255)
    mfcc_rgb_uint8 = (mfcc_rgb * 255).astype(np.uint8)

    # Step 5: Convert to PIL image and save
    img = Image.fromarray(mfcc_rgb_uint8)
    img.save(output_path)



In [49]:
# Run the extraction on a single clip with the function's defaults
# (output_dir='output', num_segments=8).
# NOTE(review): the absolute path is specific to one machine; adjust before re-running elsewhere.
extract_frames_and_mfccs("/home/varaudio/Thang/emotion_recognition/data/Video_Song_Actor_02/Actor_02/01-02-01-01-01-01-02.mp4")

Saved 8 uint8 frame and MFCC images to 'output'


In [51]:
# Read one saved frame back to confirm its dimensions.
frame_path = '/home/varaudio/Thang/emotion_recognition/test/output/frame_1.jpg'
image = cv2.imread(frame_path)
# cv2.imread returns None instead of raising when the file is missing or
# unreadable, so fail with a clear message rather than an AttributeError.
if image is None:
    raise FileNotFoundError(f"Could not read image: {frame_path}")
print(image.shape)

(720, 1280, 3)
