https://colab.research.google.com/github/pytorch/pytorch.github.io/blob/master/assets/hub/facebookresearch_pytorchvideo_slowfast.ipynb

https://github.com/wufan-tb/yolo_slowfast

In [1]:
!pip install fvcore



In [2]:
!pip install av



In [3]:
import torch

In [4]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [5]:
sf_model = torch.hub.load(
    'facebookresearch/pytorchvideo',
    'slowfast_r50',
    pretrained=True
).eval().to(device)

Using cache found in /root/.cache/torch/hub/facebookresearch_pytorchvideo_main


In [6]:
from typing import Dict
import json
import urllib
from torchvision.transforms import Compose, Lambda
from torchvision.transforms._transforms_video import (
    CenterCropVideo,
    NormalizeVideo,
)
from pytorchvideo.data.encoded_video import EncodedVideo
from pytorchvideo.transforms import (
    ApplyTransformToKey,
    ShortSideScale,
    UniformTemporalSubsample,
    UniformCropVideo
)



In [7]:
json_url = "https://dl.fbaipublicfiles.com/pyslowfast/dataset/class_names/kinetics_classnames.json"
json_filename = "kinetics_classnames.json"

try: urllib.URLopener().retrieve(json_url, json_filename)
except: urllib.request.urlretrieve(json_url, json_filename)

In [8]:
with open(json_filename, "r") as f:
    kinetics_classnames = json.load(f)

kinetics_id_to_classname = {}
for k, v in kinetics_classnames.items():
    kinetics_id_to_classname[v] = str(k).replace('"', "")

kinetics_id_to_classname

{290: 'sharpening knives',
 115: 'eating ice cream',
 81: 'cutting nails',
 53: 'changing wheel',
 19: 'bench pressing',
 88: 'deadlifting',
 111: 'eating carrots',
 192: 'marching',
 358: 'throwing discus',
 231: 'playing flute',
 72: 'cooking on campfire',
 33: 'breading or breadcrumbing',
 218: 'playing badminton',
 276: 'ripping paper',
 244: 'playing saxophone',
 197: 'milking cow',
 169: 'juggling balls',
 130: 'flying kite',
 43: 'capoeira',
 187: 'making jewelry',
 100: 'drinking',
 228: 'playing cymbals',
 61: 'cleaning gutters',
 161: 'hurling (sport)',
 239: 'playing organ',
 361: 'tossing coin',
 395: 'wrestling',
 103: 'driving car',
 150: 'headbutting',
 147: 'gymnastics tumbling',
 186: 'making bed',
 0: 'abseiling',
 155: 'holding snake',
 278: 'rock climbing',
 71: 'cooking egg',
 182: 'long jump',
 17: 'bee keeping',
 365: 'trimming or shaving beard',
 63: 'cleaning shoes',
 86: 'dancing gangnam style',
 50: 'catching or throwing softball',
 164: 'ice skating',
 168: 

In [9]:
side_size = 256
mean = [0.45, 0.45, 0.45]
std = [0.225, 0.225, 0.225]
crop_size = 256
num_frames = 32
sampling_rate = 2
frames_per_second = 30
slowfast_alpha = 4
num_clips = 10
num_crops = 3

In [10]:
clip_duration = (num_frames * sampling_rate)/frames_per_second

In [11]:
source_video_path = 'source_video.mp4'
output_video_path = 'output_video.mp4'

In [12]:
class PackPathway(torch.nn.Module):
    """
    Transform for converting video frames as a list of tensors.
    """
    def __init__(self):
        super().__init__()

    def forward(self, frames: torch.Tensor):
        fast_pathway = frames
        slow_pathway = torch.index_select(
            frames,
            1,
            torch.linspace(
                0, frames.shape[1] - 1, frames.shape[1] // slowfast_alpha
            ).long(),
        )
        frame_list = [slow_pathway, fast_pathway]
        return frame_list

In [13]:
transform =  ApplyTransformToKey(
    key="video",
    transform=Compose(
        [
            UniformTemporalSubsample(num_frames),
            Lambda(lambda x: x/255.0),
            NormalizeVideo(mean, std),
            ShortSideScale(
                size=side_size
            ),
            CenterCropVideo(crop_size),
            PackPathway()
        ]
    ),
)

In [14]:
def get_video_interval(video_path, start_sec, clip_duration):
    end_sec = start_sec + clip_duration

    video = EncodedVideo.from_path(video_path)

    video_data = video.get_clip(start_sec=start_sec, end_sec=end_sec)

    video_data = transform(video_data)

    inputs = video_data["video"]
    inputs = [i.to(device)[None, ...] for i in inputs]

    return inputs

In [15]:
def predict(inputs):
    with torch.no_grad():
        preds = sf_model(inputs)

    post_act = torch.nn.Softmax(dim=1)
    action_id = preds.argmax(dim=1).item()

    return kinetics_id_to_classname[action_id]

In [16]:
start_sec = 0
interval = get_video_interval('source_video.mp4', start_sec, clip_duration)

print(len(interval))
print(interval[0].shape) # slow
print(interval[1].shape) # fast

predict(interval)

2
torch.Size([1, 3, 8, 256, 256])
torch.Size([1, 3, 32, 256, 256])


'clean and jerk'

In [17]:
for i in range(10):
    interval = get_video_interval('source_video.mp4', i, clip_duration)
    pred = predict(interval)
    print(i, 's', pred)

0 s clean and jerk
1 s clean and jerk
2 s clean and jerk
3 s clean and jerk
4 s clean and jerk
5 s clean and jerk
6 s clean and jerk
7 s clean and jerk
8 s clean and jerk
9 s clean and jerk


In [18]:
!pip install ultralytics



In [19]:
!pip install deep-sort-realtime



In [20]:
import cv2
from collections import defaultdict, deque
import numpy as np

In [21]:
from ultralytics import YOLO

yolo_model = YOLO("yolov8n.pt")
PERSON_CLASS_ID = 0

In [22]:
from deep_sort_realtime.deepsort_tracker import DeepSort

tracker = DeepSort(
    max_age=30,
    n_init=3,
    max_cosine_distance=0.3
)

In [23]:
import torch
import cv2
import numpy as np

In [24]:
def preprocess_clip(frames_np):
    frames = torch.from_numpy(frames_np)
    frames = frames.permute(3, 0, 1, 2)
    frames = frames.float()

    video_data = {"video": frames}
    video_data = transform(video_data)
    inputs = video_data["video"]

    inputs = [i.unsqueeze(0).to(device) for i in inputs]

    return inputs

In [25]:
def predict_clip(frames):
    inputs = preprocess_clip(frames)

    with torch.no_grad():
        preds = sf_model(inputs)

    action_id = preds.argmax(dim=1).item()

    return kinetics_id_to_classname[action_id]

In [26]:
from collections import defaultdict, deque

clip_buffers = defaultdict(lambda: deque(maxlen=32))
action_labels = {}

In [27]:
cap = cv2.VideoCapture(source_video_path)
fps = cap.get(cv2.CAP_PROP_FPS)
w = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
h = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

fourcc = cv2.VideoWriter_fourcc(*"mp4v")
out = cv2.VideoWriter(output_video_path, fourcc, fps, (w, h))

In [28]:
labels = []

In [29]:
from tqdm.notebook import tqdm

frame_index = 0
pbar = tqdm(total=300)

while cap.isOpened():

    ret, frame = cap.read()
    if not ret:
        break

    MIN_BOX_AREA_RATIO = 0.1
    frame_area = frame.shape[0] * frame.shape[1]

    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

    results = yolo_model(rgb, verbose=False)[0]
    detections = []

    for box in results.boxes:
        cls_id = int(box.cls[0])

        if cls_id != PERSON_CLASS_ID:
            continue

        x1, y1, x2, y2 = box.xyxy[0].cpu().numpy()

        w = x2 - x1
        h = y2 - y1
        area = w * h

        if area < MIN_BOX_AREA_RATIO * frame_area:
            continue

        conf = float(box.conf[0])
        detections.append(([x1, y1, x2 - x1, y2 - y1], conf, 'person'))

    tracks = tracker.update_tracks(detections, frame=frame)

    for track in tracks:
        if not track.is_confirmed():
            continue

        track_id = track.track_id
        l, t, r, b = track.to_ltrb()
        l, t, r, b = map(int, [l, t, r, b])

        person_crop = rgb[t:b, l:r]
        if person_crop.size == 0:
            continue

        person_crop = cv2.resize(person_crop, (256, 256))
        clip_buffers[track_id].append(person_crop)

        if len(clip_buffers[track_id]) == 32 and frame_index % 8 == 0:
            clip = np.array(clip_buffers[track_id])
            action = predict_clip(clip)
            action_labels[track_id] = action

        label = action_labels.get(track_id, "...")

        labels.append(label)

        cv2.rectangle(frame, (l, t), (r, b), (0, 255, 0), 2)
        cv2.putText(frame, f"ID {track_id}: {label}",
                    (l, t - 10),
                    cv2.FONT_HERSHEY_SIMPLEX,
                    0.6, (0, 255, 0), 2)

    out.write(frame)
    frame_index += 1
    pbar.update(1)

cap.release()
out.release()
pbar.close()

  0%|          | 0/300 [00:00<?, ?it/s]

In [32]:
from collections import Counter

label_counts = Counter(labels)
label_counts

Counter({'...': 33,
         'vault': 68,
         'playing trumpet': 48,
         'gymnastics tumbling': 8,
         'situp': 16,
         'headbutting': 8,
         'playing keyboard': 16,
         'waxing legs': 8,
         'lunge': 48,
         'clean and jerk': 40})