In [1]:
!pip install torch torchvision transformers diffusers bitsandbytes imageio av gradiog

Collecting bitsandbytes
  Downloading bitsandbytes-0.44.1-py3-none-manylinux_2_24_x86_64.whl.metadata (3.5 kB)
Collecting av
  Downloading av-13.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting gradio
  Downloading gradio-5.6.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.3 (from gradio)
  Downloading gradio_client-1.4.3-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-mu

In [None]:
import av
import numpy as np
import torch
import gc
import imageio
from transformers import (
    AutoImageProcessor,
    AutoTokenizer,
    VisionEncoderDecoderModel,
    T5EncoderModel,
    BitsAndBytesConfig,
)
from diffusers import LattePipeline
import gradio as gr

# Pick the compute device once: prefer the GPU when CUDA is usable, else fall back to CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Function to clear GPU cache
def flush():
    """Run the Python garbage collector, then empty PyTorch's CUDA cache.

    Collection runs first so that objects dropped with `del` are actually
    freed before the cached CUDA blocks are released.
    """
    for release in (gc.collect, torch.cuda.empty_cache):
        release()

# Video-captioning stack, loaded once at import time and reused by every request:
# a frame preprocessor, the GPT-2 tokenizer used to decode captions, and the
# encoder-decoder captioning model itself.
image_processor = AutoImageProcessor.from_pretrained("MCG-NJU/videomae-base")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
caption_model = VisionEncoderDecoderModel.from_pretrained(
    "Neleac/timesformer-gpt2-video-captioning"
)
caption_model = caption_model.to(device)

# Helpers used by the main processing function


def _sample_frames(video_path, clip_len):
    """Decode evenly spaced RGB frames from the first video stream of a file.

    Args:
        video_path: Path of the video file to open with PyAV.
        clip_len: Number of evenly spaced frame positions to sample.

    Returns:
        A list of `rgb24` ndarrays. It can hold fewer than `clip_len` frames
        when the video is shorter than `clip_len` (duplicate positions are
        collapsed, as before).

    Raises:
        gr.Error: If the file has no video stream or no decodable frames.
    """
    # The context manager guarantees the container is closed even if decoding fails.
    with av.open(video_path) as container:
        if not container.streams.video:
            raise gr.Error("The uploaded file does not contain a video stream.")

        seg_len = container.streams.video[0].frames
        if seg_len <= 0:
            # PyAV reports 0 when the container does not store a frame count.
            # Sampling with that value would produce the indices {0, -1} and
            # caption a single frame, so count the frames by decoding once.
            seg_len = sum(1 for _ in container.decode(video=0))
        if seg_len <= 0:
            raise gr.Error("The uploaded video contains no decodable frames.")

        indices = set(
            np.linspace(0, seg_len - 1, num=clip_len, endpoint=True).astype(np.int64)
        )
        frames = []
        container.seek(0)
        for i, frame in enumerate(container.decode(video=0)):
            if i in indices:
                frames.append(frame.to_ndarray(format="rgb24"))
                if len(frames) == len(indices):
                    # Every requested position was found; skip decoding the tail.
                    break

    if not frames:
        raise gr.Error("Could not decode any frames from the uploaded video.")
    return frames


def _caption_frames(frames):
    """Generate a short caption for a list of RGB frames with the captioning model."""
    gen_kwargs = {
        "min_length": 10,
        "max_length": 20,
        "num_beams": 5,
        "no_repeat_ngram_size": 2,
    }
    pixel_values = image_processor(frames, return_tensors="pt").pixel_values.to(device)
    tokens = caption_model.generate(pixel_values, **gen_kwargs)
    return tokenizer.decode(tokens[0], skip_special_tokens=True)


def _encode_prompt(prompt, negative_prompt=""):
    """Encode the prompt with a 4-bit quantized T5 text encoder.

    The text encoder and a transformer-less Latte pipeline are loaded only for
    the duration of this call and released afterwards, including on failure.

    Returns:
        The `(prompt_embeds, negative_prompt_embeds)` pair from `encode_prompt`.
    """
    text_encoder = pipe = None
    try:
        text_encoder = T5EncoderModel.from_pretrained(
            "maxin-cn/Latte-1",
            subfolder="text_encoder",
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True, bnb_4bit_compute_dtype=torch.float16
            ),
            device_map="auto",
        )
        # transformer=None: only the text-encoding part of the pipeline is needed here.
        pipe = LattePipeline.from_pretrained(
            "maxin-cn/Latte-1",
            text_encoder=text_encoder,
            transformer=None,
            device_map="balanced",
        )
        with torch.no_grad():
            prompt_embeds, negative_prompt_embeds = pipe.encode_prompt(
                prompt, negative_prompt=negative_prompt
            )
        return prompt_embeds, negative_prompt_embeds
    finally:
        # Drop the local references before flushing so the memory can be reclaimed.
        del text_encoder, pipe
        flush()


def _render_video(prompt_embeds, negative_prompt_embeds, output_video_path):
    """Run the Latte pipeline on precomputed embeddings and write an MP4 file.

    The pipeline is reloaded without a text encoder in float16, and is released
    (with a cache flush) whether or not generation succeeds.

    Returns:
        `output_video_path`, after the video has been written.
    """
    pipe = None
    try:
        pipe = LattePipeline.from_pretrained(
            "maxin-cn/Latte-1",
            text_encoder=None,
            torch_dtype=torch.float16,
        ).to(device)
        with torch.no_grad():
            videos = pipe(
                video_length=16,
                num_inference_steps=50,
                negative_prompt=None,
                prompt_embeds=prompt_embeds,
                negative_prompt_embeds=negative_prompt_embeds,
                output_type="pt",
            ).frames.cpu()
    finally:
        del pipe
        flush()

    # Scale the clamped [0, 1] output to 8-bit and move channels last for the writer.
    videos = (videos.clamp(0, 1) * 255).to(dtype=torch.uint8)
    imageio.mimwrite(
        output_video_path, videos[0].permute(0, 2, 3, 1), fps=8, quality=5
    )
    return output_video_path


# Define the main processing function
def process_video(video_input):
    """Caption an uploaded video, then generate a new video from that caption.

    Args:
        video_input: The Gradio video value; either a file path or a dict with
            a "name" key holding the path.

    Returns:
        A `(caption, output_video_path)` tuple: the generated caption and the
        path of the generated MP4 file.

    Raises:
        gr.Error: If no video was provided or no frames could be decoded.
    """
    if video_input is None:
        raise gr.Error("Please upload a video first.")

    # Handle Gradio video input
    if isinstance(video_input, dict):
        video_path = video_input["name"]
    else:
        video_path = video_input  # Assume it's a filepath

    # Step 1: Generate caption from the input video
    clip_len = caption_model.config.encoder.num_frames
    frames = _sample_frames(video_path, clip_len)
    caption = _caption_frames(frames)

    # Step 2: Use the caption to generate a video
    prompt_embeds, negative_prompt_embeds = _encode_prompt(caption, negative_prompt="")
    output_video_path = _render_video(
        prompt_embeds, negative_prompt_embeds, "generated_video.mp4"
    )

    # Return both the caption and the generated video path
    return caption, output_video_path

# Build the Gradio UI: one video upload in, a caption textbox and a video out.
video_in = gr.Video(label="Upload a Video")
caption_out = gr.Textbox(label="Generated Caption")
video_out = gr.Video(label="Generated Video")

iface = gr.Interface(
    fn=process_video,
    inputs=video_in,
    outputs=[caption_out, video_out],
    title="Video Captioning and Generation",
    description="Upload a video to generate a caption and then generate a new video based on the caption.",
)

# Start the app; debug=True keeps the cell running so errors and logs stay visible.
iface.launch(debug=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/41.2k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Config of the encoder: <class 'transformers.models.timesformer.modeling_timesformer.TimesformerModel'> is overwritten by shared encoder config: TimesformerConfig {
  "_name_or_path": "facebook/timesformer-base-finetuned-k600",
  "architectures": [
    "TimesformerForVideoClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "attention_type": "divided_space_time",
  "drop_path_rate": 0,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "abseiling",
    "1": "acting in play",
    "2": "adjusting glasses",
    "3": "air drumming",
    "4": "alligator wrestling",
    "5": "answering questions",
    "6": "applauding",
    "7": "applying cream",
    "8": "archaeological excavation",
    "9": "archery",
    "10": "arguing",
    "11": "arm wrestling",
    "12": "arranging flowers",
    "13": "assembling bicycle",
    "14": "assembling computer",
    "15": "attending conference",
    "16": "auctioning",
    "17": "backflip (human)",
   

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://7dde54ccacb95f62e2.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


text_encoder/config.json:   0%|          | 0.00/780 [00:00<?, ?B/s]

(…)ext_encoder/model.safetensors.index.json:   0%|          | 0.00/19.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.87G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/4.19G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

model_index.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/20.5k [00:00<?, ?B/s]

tokenizer/added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/391M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/4 [00:00<?, ?it/s]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/4.23G [00:00<?, ?B/s]

transformer/config.json:   0%|          | 0.00/503 [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/4 [00:00<?, ?it/s]

Some weights of the model checkpoint were not used when initializing LatteTransformer3DModel: 
 ['caption_projection.y_embedding']


  0%|          | 0/50 [00:00<?, ?it/s]