# TimeSformer
Copyright 2023, Denis Rothman

[Hugging Face documentation](https://huggingface.co/docs/transformers/model_doc/timesformer)



In [None]:
!pip install transformers

Install PyAV, a Pythonic binding to the FFmpeg libraries for audio/video processing. It is used below to open the video file and decode its frames.

In [None]:
!pip install av

Collecting av
  Downloading av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.0/31.0 MB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: av
Successfully installed av-10.0.0


In [None]:
from IPython.display import HTML
from base64 import b64encode
from huggingface_hub import hf_hub_download


# Fetch the demo clip from the "nielsr/video-demo" dataset repository on the Hub
file_path = hf_hub_download(
    repo_id="nielsr/video-demo",
    filename="eating_spaghetti.mp4",
    repo_type="dataset",
)

# Read the raw bytes of the clip
with open(file_path, 'rb') as video_file:
    video_data = video_file.read()

# Embed the clip as a base64 data URI and render it in an HTML5 player
encoded_video = b64encode(video_data).decode()
HTML("""
<video width="320" height="240" controls>
  <source src="data:video/mp4;base64,{0}" type="video/mp4">
</video>
""".format(encoded_video))

Downloading eating_spaghetti.mp4:   0%|          | 0.00/1.01M [00:00<?, ?B/s]

In [None]:
from transformers import TimesformerConfig, TimesformerModel

# Build a timesformer-base style configuration from the library defaults (no argument is overridden)
configuration = TimesformerConfig()

# Instantiate the bare model from that configuration; no pretrained checkpoint is loaded here
model = TimesformerModel(configuration)

# Read the configuration back from the model instance
configuration = model.config

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
import av
import torch
import numpy as np

from transformers import AutoImageProcessor, TimesformerForVideoClassification
from huggingface_hub import hf_hub_download

# Seed NumPy's global RNG: sample_frame_indices draws its window with np.random.randint
np.random.seed(0)

In [None]:
def read_video_pyav(container, indices):
    '''
    Decode the selected frames of a video with the PyAV decoder.

    The container is rewound to its start and its first video stream is decoded
    sequentially; decoding stops as soon as a frame past `indices[-1]` is seen.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Frame indices to decode. Only frames numbered between
            `indices[0]` and `indices[-1]` (inclusive) are considered, so the indices are
            expected in ascending order. A repeated index yields its frame only once.
    Returns:
        result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3),
            where num_frames is the number of requested frames that were actually decoded.
    Raises:
        IndexError: If `indices` is empty.
        ValueError: If none of the requested frames could be decoded.
    '''
    start_index = indices[0]
    end_index = indices[-1]
    # Hash-based membership test: `i in indices` on a list/array would rescan
    # every requested index for each decoded frame.
    wanted = set(np.asarray(indices).ravel().tolist())

    frames = []
    container.seek(0)
    for i, frame in enumerate(container.decode(video=0)):
        if i > end_index:
            break
        if i >= start_index and i in wanted:
            frames.append(frame)

    if not frames:
        # np.stack([]) would fail with an opaque "need at least one array to stack".
        raise ValueError(
            "No frame could be decoded for the requested indices "
            f"(first={start_index}, last={end_index})."
        )
    return np.stack([x.to_ndarray(format="rgb24") for x in frames])


def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
    '''
    Sample `clip_len` frame indices from a randomly placed window of a video.

    A window of `clip_len * frame_sample_rate` frames is placed at a random position
    (drawn from NumPy's global RNG) and `clip_len` evenly spaced indices are taken from it.

    Args:
        clip_len (`int`): Number of frame indices to return.
        frame_sample_rate (`int`): Spacing factor; the window spans `clip_len * frame_sample_rate` frames.
        seg_len (`int`): Exclusive upper bound for the window end, e.g. the number of frames in the video.
    Returns:
        indices (np.ndarray): int64 array of `clip_len` non-decreasing frame indices.
    Raises:
        ValueError: If `seg_len` is not larger than `clip_len * frame_sample_rate`.
    '''
    converted_len = int(clip_len * frame_sample_rate)
    if seg_len <= converted_len:
        # np.random.randint would raise a bare "low >= high" here; say what is actually wrong.
        raise ValueError(
            f"seg_len ({seg_len}) must be greater than clip_len * frame_sample_rate "
            f"({converted_len}) to sample a clip."
        )
    end_idx = np.random.randint(converted_len, seg_len)
    start_idx = end_idx - converted_len
    indices = np.linspace(start_idx, end_idx, num=clip_len)
    # Keep the last index strictly below end_idx, then truncate to integer frame numbers.
    indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
    return indices


# video clip consists of 300 frames (10 seconds at 30 FPS)
file_path = hf_hub_download(
    repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
)

# Open the container in a `with` block so the underlying file is closed once the
# frames have been decoded (the original left it open).
with av.open(file_path) as container:
    # sample 8 frames
    # NOTE(review): assumes the stream reports its frame count; confirm `frames` is non-zero for other files
    indices = sample_frame_indices(clip_len=8, frame_sample_rate=1, seg_len=container.streams.video[0].frames)
    video = read_video_pyav(container, indices)

# NOTE(review): the image processor is loaded from a VideoMAE checkpoint while the model is a
# TimeSformer checkpoint; confirm that this preprocessing is the one intended for the model.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base-finetuned-kinetics")
model = TimesformerForVideoClassification.from_pretrained("facebook/timesformer-base-finetuned-k400")

# The processor receives the clip as a list of per-frame arrays and returns PyTorch tensors
inputs = image_processor(list(video), return_tensors="pt")

# Inference only: no gradients are needed
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

# model predicts one of the 400 Kinetics-400 classes
predicted_label = logits.argmax(-1).item()

Downloading (…)rocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Could not find image processor class in the image processor config or the model config. Loading based on pattern matching with the model's feature extractor configuration.


Downloading (…)lve/main/config.json:   0%|          | 0.00/22.7k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/486M [00:00<?, ?B/s]

  return torch.tensor(value)


In [None]:
# Map the predicted class index to its label through the model configuration
print(model.config.id2label[predicted_label])

eating spaghetti
