# Experimenting with some pretrained video classification models

In [None]:
'''Taken from the torchvision documentation:

modified by gemini
'''

from torchvision.io.video import read_video
from torchvision.models.video import r3d_18, R3D_18_Weights
import torchvision.transforms as T
import torch

# Classify a short clip with a pretrained R3D-18 video classification model.
#
# NOTE on the RuntimeError this cell used to end with
# ("Calculated padded input size per channel: (2 x 118 x 118). Kernel size:
# (3 x 7 x 7)"): the first number is the *temporal* axis. The numbers are
# consistent with a stem convolution padding of (1, 3, 3), so a padded size of
# 2 x 118 x 118 means the tensor reaching the model was 0 x 112 x 112 -- the
# spatial size was already fine, but the clip contained no frames. Resizing
# height/width cannot fix that, so the frame count is validated right after
# loading instead.

video_path = "test/assets/videos/v_SoccerJuggling_g23_c01.avi"
max_frames = 32  # optionally shorten duration

# read_video returns three values; only the first (the frames) is used here.
# output_format="TCHW" requests frames laid out as (T, C, H, W).
vid, _, _ = read_video(video_path, output_format="TCHW")
vid = vid[:max_frames]

# Check current dimensions
print(f"Original video shape: {vid.shape}") # Should be T, C, H, W

# Fail early with an actionable message rather than deep inside the first
# Conv3d. RuntimeError is kept so the exception type matches what this cell
# raised before for the same condition.
if vid.shape[0] == 0:
    raise RuntimeError(
        f"No video frames were loaded from {video_path!r}. Check that the "
        "path points to a readable video file and that a video decoding "
        "backend is installed and able to decode it."
    )

# Step 1: Initialize model with the best available weights
weights = R3D_18_Weights.DEFAULT
model = r3d_18(weights=weights)
model.eval()  # inference behaviour for layers such as batch norm

# Step 2: Initialize the inference transforms
preprocess = weights.transforms()

# Step 3: Apply inference preprocessing transforms
# The weights' own transform performs the spatial resize and crop the model
# expects, so no manual per-frame resize is needed beforehand.
# unsqueeze(0) adds the leading batch dimension for a single clip.
batch = preprocess(vid).unsqueeze(0)

# Step 4: Use the model and print the predicted category
# Gradients are not needed for inference; disabling them saves memory.
with torch.no_grad():
    prediction = model(batch).squeeze(0).softmax(0)
label = prediction.argmax().item()
score = prediction[label].item()
category_name = weights.meta["categories"][label]
print(f"{category_name}: {100 * score}%")



Downloading: "https://download.pytorch.org/models/r3d_18-b3b3357e.pth" to /home/luke/.cache/torch/hub/checkpoints/r3d_18-b3b3357e.pth


100.0%


RuntimeError: Calculated padded input size per channel: (2 x 118 x 118). Kernel size: (3 x 7 x 7). Kernel size can't be greater than actual input size