In [1]:
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'


In [4]:
import av
import numpy as np

from transformers import AutoImageProcessor, VideoMAEModel
from huggingface_hub import hf_hub_download

np.random.seed(0)

def read_video_pyav(container, indices):
    '''
    Decode the video with PyAV decoder.
    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): List of frame indices to decode.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
    '''
    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > indices[-1]:
            break
        if i in indices:
            frames.append(frame)
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])

def sample_frame_indices(clip_len, seg_len):
    '''
    Sample a given number of frame indices from the video evenly across the entire length.
    Args:
        clip_len (`int`): Total number of frames to sample.
        seg_len (`int`): Total number of frames in the video.
    Returns:
        indices (`List[int]`): List of sampled frame indices
    '''
    indices = np.linspace(0, seg_len - 1, num=clip_len, endpoint=True)
    indices = np.round(indices).astype(np.int64)
    return indices.tolist()

# def read_video_pyav(container, indices):
#     '''
#     Decode the video with PyAV decoder.
#     Args:
#         container (`av.container.input.InputContainer`): PyAV container.
#         indices (`List[int]`): List of frame indices to decode.
#     Returns:
#         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
#     '''
#     frames = []
#     container.seek(0)
#     start_index = indices[0]
#     end_index = indices[-1]
#     for i, frame in enumerate(container.decode(video=0)):
#         if i > end_index:
#             break
#         if i >= start_index and i in indices:
#             frames.append(frame)
#     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


# def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
#     '''
#     Sample a given number of frame indices from the video.
#     Args:
#         clip_len (`int`): Total number of frames to sample.
#         frame_sample_rate (`int`): Sample every n-th frame.
#         seg_len (`int`): Maximum allowed index of sample's last frame.
#     Returns:
#         indices (`List[int]`): List of sampled frame indices
#     '''
#     converted_len = int(clip_len * frame_sample_rate)
#     end_idx = np.random.randint(converted_len, seg_len)
#     start_idx = end_idx - converted_len
#     indices = np.linspace(start_idx, end_idx, num=clip_len)
#     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
#     return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)
container = av.open(file_path)

# sample 16 frames
# indices = sample_frame_indices(clip_len=16, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
indices = sample_frame_indices(clip_len=16, seg_len=container.streams.video[0].frames)
video = read_video_pyav(container, indices)
print(f'video:{video.shape}')
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
model = VideoMAEModel.from_pretrained("MCG-NJU/videomae-base")

inputs = image_processor(list(video), return_tensors="pt")

# forward pass
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
list(last_hidden_states.shape)

video:(16, 360, 640, 3)


  return torch.tensor(value)


[1, 1568, 768]

In [8]:
print(outputs)

BaseModelOutput(last_hidden_state=tensor([[[-0.8585, -0.6821,  0.0243,  ...,  0.1413,  0.6469, -0.0817],
         [-0.1544,  0.0288, -0.0167,  ..., -0.3857,  0.1204, -0.1545],
         [ 0.5996, -0.1285,  0.0552,  ...,  0.1270,  0.1242, -0.3623],
         ...,
         [ 0.6294, -0.0396, -0.0694,  ...,  0.4068, -0.1223,  0.0167],
         [ 0.4512, -0.2887,  0.0535,  ...,  0.2846, -0.3203,  0.0321],
         [ 0.6574, -0.0593, -0.0391,  ...,  0.4710, -0.1633, -0.1031]]],
       grad_fn=<NativeLayerNormBackward0>), hidden_states=None, attentions=None)


In [6]:
print(inputs['pixel_values'].dtype)

torch.float32


In [7]:
print(inputs['pixel_values'].shape)

torch.Size([1, 16, 3, 224, 224])


In [5]:
print(inputs)

{'pixel_values': tensor([[[[[-1.0048, -1.0048, -1.0219,  ..., -1.7583, -1.7069, -1.7069],
           [-1.0219, -1.0219, -1.0390,  ..., -1.7583, -1.7069, -1.7069],
           [-1.0562, -1.0562, -1.0562,  ..., -1.7583, -1.7069, -1.7069],
           ...,
           [-1.9638, -1.9638, -1.9467,  ...,  0.1254,  0.1083,  0.1083],
           [-1.9980, -1.9980, -1.9809,  ...,  0.1254,  0.1083,  0.0912],
           [-2.0494, -2.0494, -1.9980,  ...,  0.1083,  0.0912,  0.0741]],

          [[-1.7556, -1.7556, -1.7556,  ..., -1.8606, -1.8081, -1.8081],
           [-1.7731, -1.7731, -1.7731,  ..., -1.8606, -1.8081, -1.8081],
           [-1.8081, -1.8081, -1.8081,  ..., -1.8606, -1.8081, -1.8081],
           ...,
           [-2.0357, -2.0357, -2.0357,  ..., -0.4951, -0.4951, -0.4951],
           [-2.0357, -2.0357, -2.0357,  ..., -0.5126, -0.5126, -0.4951],
           [-2.0357, -2.0357, -2.0357,  ..., -0.5126, -0.4951, -0.4776]],

          [[-1.7173, -1.7173, -1.7173,  ..., -1.6824, -1.6302, -1.6302]

In [2]:
print(image_processor)

VideoMAEImageProcessor {
  "_valid_processor_keys": [
    "videos",
    "do_resize",
    "size",
    "resample",
    "do_center_crop",
    "crop_size",
    "do_rescale",
    "rescale_factor",
    "do_normalize",
    "image_mean",
    "image_std",
    "return_tensors",
    "data_format",
    "input_data_format"
  ],
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.485,
    0.456,
    0.406
  ],
  "image_processor_type": "VideoMAEImageProcessor",
  "image_std": [
    0.229,
    0.224,
    0.225
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}



In [3]:
print(model.embeddings)

VideoMAEEmbeddings(
  (patch_embeddings): VideoMAEPatchEmbeddings(
    (projection): Conv3d(3, 768, kernel_size=(2, 16, 16), stride=(2, 16, 16))
  )
)


In [6]:
print(inputs['pixel_values'].shape)

torch.Size([1, 16, 3, 224, 224])


In [1]:
import torch
from PIL import Image
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms('MobileCLIP-S1', pretrained='datacompdr')
model.eval()  # model in train mode by default, impacts some models with BatchNorm or stochastic depth active
tokenizer = open_clip.get_tokenizer('ViT-B-32')
image = preprocess(Image.open("docs/CLIP.png")).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

    text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)  # prints: [[1., 0., 0.]]

Label probs: tensor([[9.9980e-01, 1.8377e-04, 1.6858e-05]])


In [3]:
print(model.visual.trunk.fork_feat)

False


In [2]:
# model.visual.trunk.final_conv = torch.nn.Identity()
model.visual.trunk.head = torch.nn.Identity()
model.visual.head = torch.nn.Identity()


In [3]:
with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)

In [4]:
image_features.shape

torch.Size([1, 1024, 8, 8])

In [7]:
print(image_features)

tensor([[[[-1.6777e-01, -1.6147e-01, -1.6748e-01,  ...,  7.6636e-02,
           -1.5139e-01, -1.1881e-01],
          [-1.6004e-01, -1.6015e-01, -1.5510e-01,  ..., -1.1804e-01,
           -1.6248e-01, -7.1789e-04],
          [-1.6235e-01, -1.4091e-01, -1.5138e-01,  ..., -1.6069e-01,
           -1.1205e-01, -1.6799e-01],
          ...,
          [-1.6934e-01, -1.5087e-01, -1.6184e-01,  ..., -1.2955e-01,
           -8.4509e-02, -1.6715e-01],
          [-1.6059e-01, -1.5737e-01, -9.9238e-02,  ..., -1.6912e-01,
           -9.3838e-02, -1.5155e-01],
          [-1.5996e-01, -7.4727e-02, -1.6797e-01,  ..., -1.6365e-01,
           -1.6588e-01, -1.6966e-01]],

         [[ 1.7520e-03,  6.2955e-03,  1.3091e-02,  ..., -5.2338e-02,
            5.0242e-02,  4.2277e-02],
          [-1.0292e-02,  1.3960e-02, -1.4933e-03,  ...,  3.7706e-02,
            9.2762e-02,  2.4897e-01],
          [ 1.0124e-02, -1.5115e-02,  1.9859e-02,  ...,  1.9876e-02,
            1.5220e-01,  7.7530e-02],
          ...,
     

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class VideoFeatureExtractor(nn.Module):
    def __init__(self, pretrained_model):
        super(VideoFeatureExtractor, self).__init__()
        self.pretrained_model = pretrained_model  # 预训练的图像特征提取模型
        self.conv3d = nn.Conv3d(in_channels=1024, out_channels=1024, kernel_size=(2, 3, 3), padding=(0, 1, 1), stride=(2, 1, 1))

    def forward(self, x):
        # x shape: [B, num_frames, C, H, W]
        # 提取每一帧的特征
        b, t, c, h, w = x.shape
        x = x.view(b * t, c, h, w)
        x = self.pretrained_model(x)  # 假设预训练模型输出shape为[B*T, 1024, 8, 8]
        
        # 上采样特征图
        x = x.view(b, t, 1024, 8, 8)
        x = F.interpolate(x, size=(t, 14, 14), mode='trilinear', align_corners=False)  # 使用 trilinear 模式适用于三维数据
        
        # 时序特征融合
        x = self.conv3d(x)
        
        return x

# 假设你有一个预训练模型
pretrained_model = nn.Conv2d(in_channels=1024, out_channels=1024, kernel_size=3, stride=1, padding=1)
model = VideoFeatureExtractor(pretrained_model)

# 创建一些随机输入数据来测试模型
x = torch.rand(512, 16, 1024, 8, 8)  # B, num_frames, C, H, W 是输入数据的维度
output = model(x)
print("Output shape:", output.shape)  # 应该是 [B, num_frames/2, 1024, 14, 14]


RuntimeError: Given groups=1, weight of size [1024, 1024, 2, 3, 3], expected input[512, 16, 16, 14, 14] to have 1024 channels, but got 16 channels instead