In [2]:
from transformers import VideoMAEImageProcessor, VideoMAEForVideoClassification
import numpy as np
import torch
import cv2
from PIL import Image

cap = cv2.VideoCapture('./resultvideo.mp4')
frames = []

num_frames = 16

while len(frames) < num_frames:
    ret, frame = cap.read()
    if not ret:
        break
    frame = cv2.resize(frame, (224, 224))
    frames.append(frame)

video = np.array(frames)

video_pil = [] 
for f in video:
    pil_image = Image.fromarray(f)
    video_pil.append(pil_image)

processor = VideoMAEImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = VideoMAEForVideoClassification.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")


if len(video_pil) < num_frames:
    padding = [torch.zeros_like(processor(video_pil[0])).unsqueeze(0) for _ in range(num_frames - len(video_pil))]
    inputs = processor(video_pil + padding, return_tensors="pt", padding=True, max_length=num_frames)
else:
    inputs = processor(video_pil[:num_frames], return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
'''
predicted_class_idx = logits.argmax(-1).item()
print("Predicted class:", model.config.id2label[predicted_class_idx])'''

top_k = 5
probs, idx = torch.topk(torch.softmax(logits, dim=-1), k=top_k)
probs = probs.squeeze().tolist()
idx = idx.squeeze().tolist()




for i in range(top_k):
    predicted_class = model.config.id2label[idx[i]]
    probability = probs[i]
    print(f"Top {i+1} predicted class: {predicted_class}, Probability: {probability:.4f}")

Top 1 predicted class: tai chi, Probability: 0.3295
Top 2 predicted class: passing American football (not in game), Probability: 0.1145
Top 3 predicted class: dancing ballet, Probability: 0.0302
Top 4 predicted class: robot dancing, Probability: 0.0202
Top 5 predicted class: dunking basketball, Probability: 0.0175


In [1]:
import torch
