In [1]:
# Install dependencies (run once per environment).
# `%pip` is an IPython magic, so this cell only works inside a notebook kernel.
# NOTE(review): versions are unpinned — consider pinning them so reruns are reproducible.
%pip install -q dspy python-dotenv google-genai fal-client pillow ffmpeg-python
print("Note: restart the kernel if packages were freshly installed.")


Note: you may need to restart the kernel to use updated packages.
Note: restart the kernel if packages were freshly installed.


In [2]:
# Basic imports and environment setup
import os
import json
import dspy
from dotenv import load_dotenv
from dspy import History

# Load environment variables (API keys) via python-dotenv before any os.getenv call.
load_dotenv()

# Configure the LM that drives the agent. The model configured here is OpenAI
# `gpt-5-mini`, authenticated through OPENAI_API_KEY; the Gemini-based tools defined
# further down read GOOGLE_API_KEY on their own and are not affected by this setting.
lm = dspy.LM("openai/gpt-5-mini", api_key=os.getenv("OPENAI_API_KEY"), temperature=1, max_tokens=16000)
dspy.configure(lm=lm)

print("DSPy configured for Video Generation agent.")


DSPy configured for Video Generation agent.


In [3]:
# Tools and utilities (ported from LangGraph as plain Python functions)
import re
import shlex
import shutil
import subprocess
from typing import Any, Dict, List, Optional

import ffmpeg  # type: ignore
from PIL import Image

# Helper: temp directories

def initialize_tmp_directories() -> None:
    """Create every scratch directory the tools write to (no-op for ones that exist)."""
    scratch_dirs = (
        "/tmp/assets",
        "/tmp/audio",
        "/tmp/unsplash",
        "/tmp/subtitles",
        "/tmp/videos",
        "/tmp/images",
    )
    for scratch_dir in scratch_dirs:
        # exist_ok keeps repeated calls harmless.
        os.makedirs(scratch_dir, exist_ok=True)


def _safe_remove_path(path: str) -> tuple[bool, Optional[str]]:
    """Delete a file, symlink, or directory tree without raising.

    Args:
        path: Filesystem path to delete.

    Returns:
        ``(True, None)`` when the path is gone afterwards (including when it was
        never there), or ``(False, message)`` where ``message`` is the
        stringified exception raised while deleting.
    """
    try:
        # lexists instead of exists: os.path.exists follows symlinks and reports
        # False for a dangling link, which would leave the link on disk while
        # this function reports success.
        if not os.path.lexists(path):
            return True, None
        # A symlink that points at a directory must be unlinked, not walked.
        if os.path.isdir(path) and not os.path.islink(path):
            shutil.rmtree(path, ignore_errors=False)
        else:
            os.remove(path)
        return True, None
    except Exception as e:
        return False, str(e)


def cleanup_tmp_directories(preserve_paths: Optional[List[str]] = None) -> Dict[str, Any]:
    """Empty the scratch directories under /tmp, leaving preserved entries alone.

    Args:
        preserve_paths: Exact paths of direct children that must not be deleted.
            Matching is a plain string comparison against ``<directory>/<name>``.

    Returns:
        A dict with ``removed_files`` (paths deleted), ``errors``
        (``"<path>: <message>"`` strings), ``preserved`` (the sorted preserve
        set) and ``directories_processed`` (directories that existed and were
        scanned).
    """
    keep = set(preserve_paths or [])
    deleted: List[str] = []
    failures: List[str] = []
    visited: List[str] = []

    for folder in (
        "/tmp/unsplash",
        "/tmp/unsplash_photos",
        "/tmp/audio",
        "/tmp/subtitles",
        "/tmp/videos",
        "/tmp/images",
    ):
        if not os.path.isdir(folder):
            continue
        visited.append(folder)

        # Remove every direct child that is not explicitly preserved.
        try:
            for entry in os.listdir(folder):
                entry_path = os.path.join(folder, entry)
                if entry_path in keep:
                    continue
                succeeded, message = _safe_remove_path(entry_path)
                if succeeded:
                    deleted.append(entry_path)
                elif message:
                    failures.append(f"{entry_path}: {message}")
        except Exception as exc:
            failures.append(f"{folder}: {exc}")

        # Drop the directory itself once empty; /tmp/videos is always kept.
        # Failures here are deliberately ignored (best effort).
        try:
            if folder != "/tmp/videos" and not os.listdir(folder):
                os.rmdir(folder)
        except Exception:
            pass

    return {
        "removed_files": deleted,
        "errors": failures,
        "preserved": sorted(keep),
        "directories_processed": visited,
    }


# Tool: create_story_board

def create_story_board(
    product_name: str,
    brand: str,
    target_audience: str,
    key_message: str,
    tone: str,
    scenes_count: int = 3,
    default_scene_duration_seconds: float = 6.0,
) -> str:
    """Create a simple storyboard that runs beginning → middle → end.

    Args:
        product_name: Product shown in every scene.
        brand: Brand name used in descriptions and image prompts.
        target_audience: Audience named in each scene description.
        key_message: Message repeated in each scene description.
        tone: Tone named in each scene description.
        scenes_count: Number of scenes, clamped to 1..10. Up to 3 scenes use the
            ids ``beginning``, ``middle``, ``end`` (truncated from the left);
            larger counts keep ``beginning``/``end`` and number the scenes in
            between as ``middle_1``, ``middle_2``, ...
        default_scene_duration_seconds: Duration given to every scene, clamped
            to 1.0..60.0 seconds.

    Returns:
        JSON string with a ``scenes`` array. On failure the array is empty and
        an ``error`` key describes the problem.
    """
    try:
        initialize_tmp_directories()

        count = max(1, min(10, int(scenes_count)))
        if count <= 3:
            scene_ids = ["beginning", "middle", "end"][:count]
        else:
            # Only three fixed names exist, so slicing them would silently cap
            # the storyboard at 3 scenes; expand the middle section instead.
            scene_ids = ["beginning", *(f"middle_{i}" for i in range(1, count - 1)), "end"]

        duration = float(max(1.0, min(60.0, default_scene_duration_seconds)))

        scenes: List[Dict[str, Any]] = []
        for sid in scene_ids:
            # Human-readable form of the id ("middle_2" -> "middle 2"); identical
            # to the id itself for the three base scenes.
            label = sid.replace("_", " ")
            desc = (
                f"{brand} {product_name} ad for {target_audience}. {label} of story, tone: {tone}. "
                f"Key message: {key_message}. Visual focus: product + person."
            )
            prompt = (
                f"Photorealistic {product_name} with a person interacting, cinematic lighting, "
                f"brand aesthetic for {brand}, clean composition, ad-ready. Scene: {label}."
            )
            scenes.append(
                {
                    "id": sid,
                    "title": label.capitalize(),
                    "description": desc,
                    "prompt": prompt,
                    "duration_seconds": duration,
                }
            )
        return json.dumps({"scenes": scenes}, ensure_ascii=False)
    except Exception as e:
        return json.dumps({"scenes": [], "error": f"create_story_board failed: {e}"})


# Tool: update_story_board

def update_story_board(storyboard_json: str, instructions: str) -> str:
    """Apply a lightweight edit directive to a storyboard.

    The only directive recognised is ``duration=<number>`` (case-insensitive,
    optional spaces around ``=``); it sets ``duration_seconds`` on every scene
    dict, clamped to 1.0..60.0.

    Args:
        storyboard_json: Storyboard as a JSON string (an already-parsed object
            is accepted as well).
        instructions: Free text that may contain the directive.

    Returns:
        JSON string ``{"scenes": [...]}``; on failure
        ``{"error": ..., "scenes": []}``.
    """
    try:
        if isinstance(storyboard_json, str):
            board = json.loads(storyboard_json)
        else:
            board = storyboard_json
        scene_list = board.get("scenes", []) if isinstance(board, dict) else []

        directive = re.search(r"duration\s*=\s*(\d+(?:\.\d+)?)", instructions.lower())
        if directive is not None:
            clamped = float(max(1.0, min(60.0, float(directive.group(1)))))
            for scene in scene_list:
                # Non-dict entries are passed through untouched.
                if isinstance(scene, dict):
                    scene["duration_seconds"] = clamped

        return json.dumps({"scenes": scene_list}, ensure_ascii=False)
    except Exception as exc:
        return json.dumps({"error": f"update_story_board failed: {exc}", "scenes": []})


# Tool: generate_image via Google Gemini Images

def _extract_text_from_genai(resp: Any) -> str:
    """Best-effort extraction of the first text string from a google-genai response.

    Looks at ``resp.text`` first, then walks ``resp.candidates[*].content.parts[*].text``
    and returns the first value that is a ``str``. Any error while reading
    attributes is swallowed.

    Returns:
        The text found, or ``""`` when nothing usable is present.
    """
    try:
        direct = getattr(resp, "text", None)
        if isinstance(direct, str):
            return direct

        for candidate in getattr(resp, "candidates", None) or []:
            part_list = getattr(getattr(candidate, "content", None), "parts", None) or []
            part_texts = (getattr(part, "text", None) for part in part_list)
            first_text = next((value for value in part_texts if isinstance(value, str)), None)
            if first_text is not None:
                return first_text
    except Exception:
        pass
    return ""


def generate_image(prompt: str, num_images: int = 1, output_basename: str = "product_ad_image") -> str:
    """Generate one or more images using Google Gemini and save PNG files under /tmp/images.

    Args:
        prompt: Text prompt sent to the image model.
        num_images: Maximum number of images to keep, clamped to 1..4. Values
            that cannot be converted to int fall back to 1.
        output_basename: File name stem; files are written as
            ``/tmp/images/<output_basename>_<index>.png``.

    Returns JSON: {"images": [paths...], "error"?: str}
    """
    try:
        # BytesIO is not among the file-level imports; without it every decode
        # attempt raises NameError and no image can ever be saved.
        from io import BytesIO
        from google import genai  # lazy import

        api_key = os.getenv("GOOGLE_API_KEY", "")
        if not api_key:
            return json.dumps({"images": [], "error": "GOOGLE_API_KEY not set"})

        try:
            limit = max(1, min(4, int(num_images)))
        except (TypeError, ValueError):
            limit = 1

        client = genai.Client(api_key=api_key)
        resp = client.models.generate_content(
            model="gemini-2.5-flash-image-preview",
            contents=[prompt],
        )
        os.makedirs("/tmp/images", exist_ok=True)

        saved: List[str] = []
        decode_errors: List[str] = []
        # Images arrive as inline binary parts, possibly interleaved with text parts.
        for candidate in getattr(resp, "candidates", None) or []:
            content = getattr(candidate, "content", None)
            for part in getattr(content, "parts", None) or []:
                if len(saved) >= limit:
                    break
                inline_data = getattr(part, "inline_data", None)
                if inline_data is None:
                    continue
                data = getattr(inline_data, "data", None)
                if not isinstance(data, (bytes, bytearray, memoryview)):
                    continue
                try:
                    image = Image.open(BytesIO(bytes(data)))
                    out_path = f"/tmp/images/{output_basename}_{len(saved)}.png"
                    image.save(out_path)
                except Exception as exc:
                    # Keep going with the remaining parts, but remember why this
                    # one failed so the caller is not left with a generic message.
                    decode_errors.append(str(exc))
                    continue
                saved.append(out_path)
            if len(saved) >= limit:
                break

        if not saved:
            # Prefer any text the model returned (often an explanation/refusal).
            txt = _extract_text_from_genai(resp)
            if txt:
                error = txt
            elif decode_errors:
                error = f"Failed to decode image data: {decode_errors[0]}"
            else:
                error = "No images in response"
            return json.dumps({"images": [], "error": error})
        return json.dumps({"images": saved})
    except Exception as e:
        return json.dumps({"images": [], "error": f"generate_image error: {e}"})


# Tool: kling video from image via fal.ai

def kling_generate_video_from_image(
    prompt: str,
    image_path: str = "",
    image_url: str = "",
    duration_seconds: float = 6.0,
    endpoint: str = "fal-ai/kling/v1",
    seed: Optional[int] = None,
) -> str:
    """Generate short video (<=10s typical) from image via fal.ai Kling endpoint.

    Args:
        prompt: Text prompt for the video.
        image_path: Local image to upload; used when the file exists.
        image_url: Image URL used when no local file is uploaded (or the
            upload fails).
        duration_seconds: Requested clip duration, sent as ``duration``.
        endpoint: fal.ai application id passed to ``fal_client.subscribe``.
        seed: Optional seed forwarded to the endpoint.

    Returns JSON with keys: request_id?, video_url?, logs?
    """
    try:
        import fal_client  # type: ignore
        # configure key from env if present
        fal_api_key = os.getenv("FAL_KEY") or os.getenv("FAL_API_KEY")
        if fal_api_key:
            os.environ.setdefault("FAL_KEY", fal_api_key)

        logs: List[str] = []

        # upload image if local path provided
        img_url = image_url
        if image_path and os.path.exists(image_path):
            try:
                img_url = fal_client.upload_file(image_path)
            except Exception as exc:
                # Fall back to the URL, but leave a trace of why the upload was skipped.
                logs.append(f"image upload failed, falling back to image_url: {exc}")
                img_url = image_url

        def on_queue_update(update: Any):
            # Collect progress messages; never let a logging problem abort the job.
            try:
                if isinstance(update, fal_client.InProgress):
                    for log in update.logs:
                        msg = log.get("message")
                        if isinstance(msg, str):
                            logs.append(msg)
            except Exception:
                pass

        args: Dict[str, Any] = {
            "prompt": prompt,
            "duration": float(duration_seconds),
        }
        if img_url:
            args["image_url"] = img_url
        if seed is not None:
            args["seed"] = int(seed)

        result = fal_client.subscribe(endpoint, arguments=args, with_logs=True, on_queue_update=on_queue_update)

        request_id = getattr(result, "request_id", None) if result else None
        if request_id is None and isinstance(result, dict):
            # A dict result carries its fields as keys, not attributes.
            request_id = result.get("request_id")

        out = {
            "request_id": request_id,
            "video_url": _kling_video_url(result),
            "logs": logs or None,
        }
        return json.dumps(out)
    except Exception as e:
        return json.dumps({"video_url": None, "logs": [f"error: {e}"]})


def _kling_video_url(result: Any) -> Optional[str]:
    """Pull the video URL out of a fal.ai result, flat keys first, then ``video.url``."""
    if not isinstance(result, dict):
        return None
    flat = result.get("video_url") or result.get("url") or result.get("output_url")
    if flat:
        return flat
    # fal.ai video endpoints nest the file under a "video" object.
    video = result.get("video")
    if isinstance(video, dict) and video.get("url"):
        return video["url"]
    return flat


# Tool: run ffmpeg binary

def run_ffmpeg_binary(command: str, output_path: str = "", timeout_seconds: int = 180) -> str:
    """Execute ffmpeg command safely. Returns JSON summary including stdout/stderr tails.

    The command is tokenised with ``shlex`` and run without a shell. ``ffmpeg``
    is prepended when the first token is not an ffmpeg executable, and ``-y``
    is inserted when absent so the run never blocks on an overwrite prompt.

    Args:
        command: ffmpeg arguments, with or without the leading ``ffmpeg``.
        output_path: Expected output file; its parent directory is created and
            the path is echoed back in the result.
        timeout_seconds: Timeout for the process; values below 10 are raised to 10.

    Returns:
        JSON string with ``output_path``, ``ffmpeg_command``, ``stdout_tail``
        and ``stderr_tail``; an ``error`` key is present on any failure.
    """
    try:
        if not isinstance(command, str) or not command.strip():
            return json.dumps({"error": "Command must be a non-empty string"})
        if not shutil.which("ffmpeg"):
            return json.dumps({"error": "ffmpeg not found on PATH. Please install ffmpeg."})
        if output_path:
            out_dir = os.path.dirname(output_path) or "."
            os.makedirs(out_dir, exist_ok=True)

        # Tokenise first and edit the token list. Editing the raw string with
        # split()/join() would collapse whitespace inside quoted arguments
        # (file names, drawtext strings) before shlex ever sees them.
        try:
            cmd_list = shlex.split(command.strip())
        except ValueError as e:
            return json.dumps({"error": f"Invalid command: {e}", "ffmpeg_command": command.strip()})

        # Accept "ffmpeg", "FFMPEG", "/usr/bin/ffmpeg", "ffmpeg.exe"; prepend otherwise.
        program = os.path.splitext(os.path.basename(cmd_list[0]))[0].lower() if cmd_list else ""
        if program != "ffmpeg":
            cmd_list.insert(0, "ffmpeg")
        if "-y" not in cmd_list:
            cmd_list.insert(1, "-y")
        planned = shlex.join(cmd_list)

        effective_timeout = max(10, int(timeout_seconds))
        try:
            completed = subprocess.run(
                cmd_list,
                capture_output=True,
                text=True,
                check=False,
                timeout=effective_timeout,
            )
        except subprocess.TimeoutExpired:
            # Report the timeout that was actually enforced, not the requested one.
            return json.dumps({"error": f"FFmpeg timed out after {effective_timeout}s", "ffmpeg_command": planned})
        except OSError as e:
            return json.dumps({"error": f"Failed to run ffmpeg: {e}", "ffmpeg_command": planned})

        # Only the tails are returned to keep the payload small.
        stdout_tail = (completed.stdout or "")[-2000:]
        stderr_tail = (completed.stderr or "")[-4000:]

        if completed.returncode != 0:
            return json.dumps({
                "error": f"FFmpeg failed with code {completed.returncode}: {stderr_tail[-800:]}",
                "ffmpeg_command": planned,
                "stdout_tail": stdout_tail,
                "stderr_tail": stderr_tail,
                "output_path": output_path,
            })

        return json.dumps({
            "output_path": output_path,
            "ffmpeg_command": planned,
            "stdout_tail": stdout_tail,
            "stderr_tail": stderr_tail,
        })
    except Exception as e:
        return json.dumps({"error": f"run_ffmpeg_binary error: {e}"})


# Tool: score video with metadata + LLM rubric

def score_video(video_path: str, target_quality_score: Optional[float] = None) -> str:
    """Score a video from its probed metadata using a Gemini text rubric.

    The model only receives the file path, the probed metadata and the optional
    target score as text; the ratings it returns are clamped to 0..10 and
    averaged.

    Args:
        video_path: Path of the video file; must exist.
        target_quality_score: Optional target mentioned in the prompt.

    Returns:
        JSON string with ``quality_score``, ``breakdown`` and ``feedback``, or
        an ``error`` key (plus ``raw`` when the model reply was not JSON).
    """
    try:
        if not os.path.exists(video_path):
            return json.dumps({"error": f"Video not found: {video_path}"})

        # Metadata is best effort: a failed probe leaves whatever was collected so far.
        metadata: Dict[str, Any] = {}
        try:
            probe_info = ffmpeg.probe(video_path)  # type: ignore
            all_streams = probe_info.get("streams", [])
            video_streams = [s for s in all_streams if s.get("codec_type") == "video"]
            audio_streams = [s for s in all_streams if s.get("codec_type") == "audio"]
            container = probe_info.get("format", {})
            if video_streams:
                first_video = video_streams[0]
                metadata["width"] = first_video.get("width")
                metadata["height"] = first_video.get("height")
                metadata["avg_frame_rate"] = first_video.get("avg_frame_rate")
            if audio_streams:
                metadata["audio_channels"] = audio_streams[0].get("channels")
            metadata["duration"] = container.get("duration")
        except Exception:
            pass

        google_key = os.getenv("GOOGLE_API_KEY", "")
        if not google_key:
            return json.dumps({"error": "GOOGLE_API_KEY not set for scoring"})

        from google import genai
        client = genai.Client(api_key=google_key)

        if target_quality_score is not None:
            target_line = f"Target quality score: {float(target_quality_score)}"
        else:
            target_line = ""
        rubric_prompt = (
            "You are a strict video quality evaluator. Return ONLY a JSON object with keys "
            "visual_quality (0-10 int), audio_quality (0-10 int), narrative_coherence (0-10 int), feedback (string).\n"
            f"Video Path: {video_path}\nMetadata: {json.dumps(metadata)}\n{target_line}\n"
        )
        reply = client.models.generate_content(model="gemini-2.5-flash", contents=[rubric_prompt])
        raw_text = _extract_text_from_genai(reply)

        # Take the outermost {...} span so surrounding prose or code fences are ignored.
        try:
            open_idx = raw_text.find("{")
            close_idx = raw_text.rfind("}")
            if open_idx == -1 or close_idx == -1:
                candidate_json = raw_text
            else:
                candidate_json = raw_text[open_idx : close_idx + 1]
            parsed = json.loads(candidate_json)
        except Exception:
            return json.dumps({"error": "Failed to parse JSON from model", "raw": raw_text})

        def _to_rating(value: Any) -> int:
            # Anything int() rejects counts as 0.
            try:
                return max(0, min(10, int(value)))
            except Exception:
                return 0

        ratings = {
            "visual_quality": _to_rating(parsed.get("visual_quality")),
            "audio_quality": _to_rating(parsed.get("audio_quality")),
            "narrative_coherence": _to_rating(parsed.get("narrative_coherence")),
        }
        overall = round(sum(ratings.values()) / 3.0, 2)
        return json.dumps(
            {
                "quality_score": overall,
                "breakdown": ratings,
                "feedback": parsed.get("feedback", ""),
            }
        )
    except Exception as exc:
        return json.dumps({"error": f"scoring failed: {exc}"})


In [4]:
# ReAct Signature and Agent

class VideoReActSignature(dspy.Signature):
    """
    You are a product advertisement creative agent.
    Tools:
    - create_story_board(product_name, brand, target_audience, key_message, tone, scenes_count=3, default_scene_duration_seconds=6.0)
    - update_story_board(storyboard_json, instructions)
    - generate_image(prompt, num_images=1, output_basename="product_ad_image")
    - kling_generate_video_from_image(prompt, image_path="", image_url="", duration_seconds=6.0, endpoint="fal-ai/kling/v1", seed=None)
    - run_ffmpeg_binary(command, output_path="", timeout_seconds=180)
    - score_video(video_path, target_quality_score=None)

    Workflow: Create storyboard → update storyboard (if needed) → generate image → generate video → (optional) ffmpeg postprocess → score video.
    Kling videos are limited to ~10 seconds; for longer videos, generate multiple clips and stitch with ffmpeg.

    Produce outputs:
    - `action`: one of {create_story_board, update_story_board, generate_image, kling_generate_video_from_image, run_ffmpeg_binary, score_video, answer_direct}
    - `tool_result`: JSON string from the tool call (may be empty for answer_direct)
    - `answer`: concise user-facing update or the final summary
    """
    # NOTE(review): the docstring above is presumably what dspy sends to the LM as
    # instructions — treat edits to it as behavior changes. The tool signatures it
    # lists are written by hand and must be kept in sync with the functions above.

    # Inputs supplied by the caller on every turn.
    user_message: str = dspy.InputField(description="User request for the advertisement video")
    history: dspy.History = dspy.InputField(description="Conversation history")

    # Outputs the LM is asked to produce for the turn.
    reasoning: str = dspy.OutputField(description="Brief plan and justification")
    action: str = dspy.OutputField(description="Chosen action/tool")
    tool_result: str = dspy.OutputField(description="Tool output used to answer")
    answer: str = dspy.OutputField(description="Final answer or progress update")


class VideoGenerationAgent(dspy.Module):
    """ReAct agent that plans and produces a product-ad video with the tools in this file.

    The agent keeps one ``dspy.History`` across calls so follow-up requests can
    refer to earlier turns.
    """

    def __init__(self, max_iters: int = 5):
        """Build the ReAct loop.

        Args:
            max_iters: Upper bound on ReAct iterations per request.
        """
        super().__init__()
        self.conversation_history = dspy.History(messages=[])
        self.react = dspy.ReAct(
            VideoReActSignature,
            tools=[
                create_story_board,
                update_story_board,
                generate_image,
                kling_generate_video_from_image,
                run_ffmpeg_binary,
                score_video,
            ],
            max_iters=max_iters,
        )

    def forward(self, user_message: str):
        """Run one turn and record it in the conversation history.

        Args:
            user_message: The user's request for this turn.

        Returns:
            The prediction returned by the ReAct module.
        """
        initialize_tmp_directories()
        # The history passed in holds PREVIOUS turns only; the current request
        # already travels as `user_message`, so it must not be appended first
        # (that would send it to the model twice).
        result = self.react(user_message=user_message, history=self.conversation_history)
        answer = getattr(result, "answer", "")
        # dspy.History entries are keyed by signature field names, not by
        # chat-style "role"/"content" keys, so store the turn in that shape.
        self.conversation_history.messages.append(
            {
                "user_message": user_message,
                "answer": answer if isinstance(answer, str) else "",
            }
        )
        return result


# Notebook-wide agent instance used by the example cell below
# (max_iters=5 is also the constructor default).
agent = VideoGenerationAgent(max_iters=5)
print("VideoGenerationAgent ready.")


VideoGenerationAgent ready.


In [5]:
# Example usage / smoke tests
# Every step below calls an external service (LM, Gemini, fal.ai) and needs the
# matching API key in the environment; each tool reports failures as JSON instead
# of raising, so the cell runs to the end even when a step fails.

# 1) Create storyboard
# `payload` is only used by the direct create_story_board call in step 2; the agent
# call below receives the same information as free text.
payload = {
    "product_name": "EcoBottle",
    "brand": "GreenLife",
    "target_audience": "health-conscious young adults",
    "key_message": "Reusable, stylish, and sustainable",
    "tone": "inspiring",
}
resp = agent(user_message=(
    "Create a 3-scene product ad storyboard for EcoBottle by GreenLife aimed at "
    "health-conscious young adults with an inspiring tone; key message: Reusable, stylish, sustainable."
))
print({
    "action": getattr(resp, "action", ""),
    "answer": getattr(resp, "answer", ""),
})

# 2) (Optional) Generate image for the first scene — direct tool call for demo
storyboard = create_story_board(**payload)
# Only the first 120 characters of the JSON are printed to keep the output short.
print("Storyboard:", storyboard[:120], "...")

img_result = generate_image(
    prompt="Photorealistic water bottle with person at gym, cinematic lighting, brand aesthetic", num_images=1
)
print("Image result:", img_result)

# 3) (Optional) Attempt Kling video generation (requires FAL_KEY/FAL_API_KEY)
# No image is passed here, so the request carries only the prompt and duration.
# NOTE(review): the recorded run of this cell returned "Path /v1 not found" for the
# default endpoint — confirm the endpoint id against the fal.ai documentation.
vid_result = kling_generate_video_from_image(
    prompt="Dynamic pan of athlete lifting EcoBottle, energetic mood",
    image_path="",  # optionally pass a saved image path from generate_image
    image_url="",   # or a public URL
    duration_seconds=6.0,
)
print("Kling result:", vid_result)

# 4) (Optional) If a local video is available, score it
# Example: score a sample video if present
# The path is relative, so it is resolved against the kernel's working directory.
sample_path = "example_video.mp4"
if os.path.exists(sample_path):
    print("Scoring example video...")
    print(score_video(sample_path))
else:
    print("No local sample video found to score.")




{'action': 'answer_direct', 'answer': 'Storyboard for EcoBottle — GreenLife\nTarget audience: health-conscious young adults\nTone: inspiring\nKey message: Reusable, stylish, sustainable.\nScene duration: 6.0s each\n\nScenes:\n1) id: beginning\n   title: Beginning\n   duration_seconds: 6.0\n   description: Close-up introduction of the EcoBottle in active use — establishes product, design and the “Reusable” message.\n   output_path: /tmp/images/ecobottle_scene_1.png\n   prompt (image-generation-ready):\n   "Scene 1 - Beginning (save as /tmp/images/ecobottle_scene_1.png): Photorealistic high-resolution PNG. Close-up of a health-conscious young adult jogging in a sunlit park, taking a confident sip from a sleek EcoBottle by GreenLife. Golden-hour cinematic lighting, shallow depth of field, strong emphasis on bottle design and visible GreenLife logo, natural skin tones, minimal background clutter, 3:2 crop, high detail. Text overlay placeholder: \'Reusable.\' Ensure consistent EcoBottle app



Image result: {"images": [], "error": "No images in response"}
Kling result: {"video_url": null, "logs": ["error: Path /v1 not found"]}
Scoring example video...
{"quality_score": 5.67, "breakdown": {"visual_quality": 8, "audio_quality": 6, "narrative_coherence": 3}, "feedback": "The technical metadata indicates a robust foundation with 1080p resolution at 60 frames per second, suggesting excellent potential for visual clarity and fluidity. However, a 'strict' evaluation reserves perfect scores for flawless execution beyond mere specifications. Without direct visual inspection, a score of 8 acknowledges the strong technical base but anticipates potential imperfections such as compression artifacts, suboptimal lighting, or minor focus issues that commonly prevent absolute visual perfection. Audio quality, without any provided metadata, is rated a provisional 6. This assumes an average performance, likely marred by common issues like inconsistent levels, ambient noise, or less-than-pristi