In [22]:
import cv2
import torch
import numpy as np
from timesformer.models.vit import TimeSformer

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the video classifier and load its weights from a local checkpoint file.
# NOTE(review): the checkpoint file name contains "96x4" while the model is
# built with num_frames=8 — confirm this checkpoint is meant to be used with
# 8-frame clips.
model = TimeSformer(
    img_size=224,
    num_classes=600,  # changed
    num_frames=8,
    attention_type='divided_space_time',
    pretrained_model='TimeSformer_divST_96x4_224_K600.pyth',
)

# Move the weights to the selected device and switch to inference mode.
model = model.to(device)
model.eval()

def load_video_frames(video_path, num_frames=8, resize=(224,224)):
    """Sample ``num_frames`` evenly spaced frames from a video as one clip tensor.

    Frame positions are chosen with ``np.linspace`` over the frame count the
    container reports. The returned clip always holds exactly ``num_frames``
    frames: if the video is shorter than ``num_frames``, frames are repeated at
    the duplicated sample positions; if decoding stops before the last sampled
    position, the final decoded frame is repeated to pad the clip.

    Args:
        video_path: Path handed to ``cv2.VideoCapture``.
        num_frames: Number of frames in the returned clip; must be >= 1.
        resize: Target size passed to ``cv2.resize`` (OpenCV order: width, height).

    Returns:
        A ``torch.float32`` tensor of shape ``(1, C, num_frames, H, W)`` holding
        RGB pixel values scaled to ``[0, 1]``.

    Raises:
        ValueError: If ``num_frames`` is < 1, the video reports no frames, or no
            frame could be decoded.

    NOTE(review): pixels are only rescaled to [0, 1]; no mean/std normalisation
    is applied — confirm this matches the preprocessing the checkpoint expects.
    """
    if num_frames < 1:
        raise ValueError("num_frames must be a positive integer.")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        if total_frames <= 0:
            raise ValueError("Could not read video or video has no frames.")

        frame_indices = np.linspace(0, total_frames - 1, num_frames).astype(int)

        # For short videos the truncated linspace repeats indices; remember how
        # many times each source frame must appear so the clip keeps its length.
        wanted, repeats = np.unique(frame_indices, return_counts=True)
        copies_for = dict(zip(wanted.tolist(), repeats.tolist()))

        # Frames must be decoded sequentially, but nothing past the last
        # sampled index is needed.
        for i in range(int(frame_indices[-1]) + 1):
            ret, frame = cap.read()
            if not ret:
                # The reported frame count can exceed what is decodable.
                break
            copies = copies_for.get(i, 0)
            if copies:
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = cv2.resize(frame, resize)
                frames.extend([frame] * copies)
    finally:
        # Release the capture handle on every path, including the error ones.
        cap.release()

    if not frames:
        raise ValueError("Could not read video or video has no frames.")

    if len(frames) < num_frames:
        print(
            f"Warning: only {len(frames)} frames extracted, expected {num_frames}; "
            "repeating the last frame to pad the clip"
        )
        frames.extend([frames[-1]] * (num_frames - len(frames)))

    frames = np.stack(frames)  # (T, H, W, C)
    frames = np.transpose(frames, (3, 0, 1, 2))  # (C, T, H, W)
    frames = frames / 255.0
    frames = torch.tensor(frames, dtype=torch.float32)
    frames = frames.unsqueeze(0)  # (1, C, T, H, W)
    return frames

video_path = 'input_video/tenis.mp4'

# Sample an 8-frame, 224x224 clip and place it on the same device as the model.
input_tensor = load_video_frames(video_path, num_frames=8, resize=(224,224)).to(device)

# Inference only: skip autograd bookkeeping for the forward pass.
with torch.no_grad():
    pred = model(input_tensor)

# The batch holds a single clip, so the arg-max over dim 1 is one scalar index.
predicted_class = pred.argmax(dim=1).item()

# Presumably one class name per line, with the line position equal to the
# class index — verify against how the label file was generated.
with open('label_map_600.txt', 'r') as f:
    class_names = [line.strip() for line in f]

print(f"Predicted class index: {predicted_class}, name: {class_names[predicted_class]}")


Predicted class index: 372, name: playing tennis
