In [2]:
import os
import logging
import time
import json
from typing import Any
import google.generativeai as genai
# Configure the google.generativeai client with the API key read from the
# GEMINI_API_KEY environment variable. `os.environ[...]` raises KeyError when
# the variable is not set, so this cell fails fast if the key is missing.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

In [5]:
import typing_extensions as typing2

# Shape of the structured JSON answer requested from the model: this class is
# passed as `response_schema` to `genai.GenerationConfig` in `process_video`.
# NOTE(review): documented with `#` comments rather than a docstring, since a
# docstring could presumably be picked up by schema generation — confirm.
class VideoResponse(typing2.TypedDict):
    location: str  # where the video takes place (the prompt asks for a best guess)
    video_description: str  # description of the video
    mood_of_video: list[str]  # mood(s) of the video, as a list of strings

In [3]:
def process_video(
    video_path: str,
    *,
    model_name: str = "gemini-1.5-flash",
    poll_interval: float = 10,
    request_timeout: float = 600,
) -> Any:
    """Upload a video to Gemini and return a structured description of it.

    The file is uploaded with ``genai.upload_file``, polled until it leaves the
    ``PROCESSING`` state, and then sent to the model together with a fixed
    prompt. The model is asked for JSON matching ``VideoResponse``; the JSON
    text of the response is parsed with ``json.loads`` and returned.

    The uploaded file is always deleted before this function exits, whether it
    returns normally or raises.

    Args:
        video_path: Path of the local video file to upload.
        model_name: Name of the Gemini model used to generate the description.
        poll_interval: Seconds to sleep between checks of the upload's
            processing state.
        request_timeout: Timeout, in seconds, passed to ``generate_content``
            through ``request_options``.

    Returns:
        The value produced by ``json.loads`` on the model's response text.

    Raises:
        ValueError: If the uploaded file ends up in the ``FAILED`` state.
    """
    video_file = genai.upload_file(path=video_path)
    try:
        # The upload is processed asynchronously; re-fetch the file until it is
        # no longer reported as PROCESSING.
        while video_file.state.name == "PROCESSING":
            logging.debug("Waiting for video to be processed.")
            time.sleep(poll_interval)
            video_file = genai.get_file(video_file.name)

        if video_file.state.name == "FAILED":
            raise ValueError(
                f"Processing of video {video_path!r} failed "
                f"(state: {video_file.state.name})"
            )

        model = genai.GenerativeModel(model_name)
        prompt = "Generate a description of the video, its mood and where the video takes place to your best guess."
        logging.debug("Generating description for video: %s", video_file.name)
        result = model.generate_content(
            [prompt, video_file],
            generation_config=genai.GenerationConfig(
                response_mime_type="application/json", response_schema=VideoResponse
            ),
            request_options={"timeout": request_timeout},
        )
        return json.loads(result.text)
    finally:
        # Clean up the remote copy on every exit path (success, FAILED state,
        # request error, interrupt) so uploads are not left behind.
        genai.delete_file(video_file.name)

In [6]:
# Run the pipeline on a sample clip; as the last expression in the cell, the
# returned value is displayed as the cell output.
# NOTE(review): the recorded output below is a raw JSON string, whereas
# process_video as defined returns the result of json.loads — the output looks
# stale (execution counts are out of order); re-run the notebook to confirm.
process_video("test_video_22s.mp4")

'{"location": "A sandy shore next to a still, reflective lake.", "mood_of_video": ["Calm", "Peaceful"], "video_description": "A dog walks along the edge of a still, reflective lake on a bright, sunny day."}\n'