Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
100 changes: 100 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,106 @@ The Edit Decision List is a declarative JSON format for multi-step edits. Operat

**Available operations:** `trim`, `split`, `concat`, `reorder`, `extract`, `fade`, `speed`, `mix_audio`, `volume`, `replace_audio`, `normalize`

## Screen Recording Pipeline

CutAgent doesn't capture screens — FFmpeg (its underlying engine) handles that part. Capture with FFmpeg, then immediately hand the file to CutAgent for post-production.

### Step 1: Record your screen with FFmpeg

**macOS (avfoundation)**

```bash
# List available devices first
ffmpeg -f avfoundation -list_devices true -i ""

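# Record screen (video device index 1) with audio device index 0 — usually the built-in microphone; capturing system audio requires a loopback device. Check the indices against the device list above.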
ffmpeg -f avfoundation -i "1:0" -t 300 screen.mp4
```

**Linux (x11grab)**

```bash
# Full-screen capture at 1920×1080
ffmpeg -f x11grab -s 1920x1080 -r 30 -i :0.0 -t 300 screen.mp4
```

**Windows (gdigrab)**

```bash
# Full desktop capture
ffmpeg -f gdigrab -framerate 30 -i desktop -t 300 screen.mp4
```

### Step 2: Post-process with CutAgent

After recording, the typical cleanup steps are silence detection (to find dead air at the start/end or during pauses), trimming, and audio normalization.

```bash
# Inspect the recording
cutagent probe screen.mp4

# Find silence intervals (dead air, pauses)
cutagent silence screen.mp4 --threshold -35 --min-duration 0.5

# Get a full content map (scenes + silence + suggested cuts)
cutagent summarize screen.mp4

# Trim to the content window (remove intro/outro dead air)
cutagent trim screen.mp4 --start 00:00:02.1 --end 00:08:43.7 -o content.mp4

# Normalize audio loudness for streaming/sharing
cutagent normalize content.mp4 -o final.mp4
```

### Python pipeline example

This example auto-detects silence boundaries and builds the full post-processing pipeline programmatically:

```python
from cutagent import probe, detect_silence, execute_edl
from cutagent.models import format_time

recording = "screen.mp4"
duration = probe(recording).duration

# Detect silent intervals in the recording
silences = detect_silence(recording, threshold=-35.0, min_duration=0.5)

# Derive the content window. A silence only counts as intro/outro dead air
# when it actually touches the start/end of the recording; otherwise a
# mid-recording pause (or a lone outro silence) would be mistaken for the
# intro and produce a wrong or empty trim range.
EDGE_TOLERANCE = 0.5  # seconds of slack when matching a silence to an edge
start_sec, end_sec = 0.0, duration
if silences:
    first, last = silences[0], silences[-1]
    if first.start <= EDGE_TOLERANCE:
        start_sec = first.end
    if last.end >= duration - EDGE_TOLERANCE and last.start > start_sec:
        end_sec = last.start

# A fully silent recording would collapse the window; keep the whole file.
if start_sec >= end_sec:
    start_sec, end_sec = 0.0, duration

content_start = format_time(start_sec)
content_end = format_time(end_sec)

# Build and execute the EDL: trim dead air → normalize audio
edl = {
    "version": "1.0",
    "inputs": [recording],
    "operations": [
        {"op": "trim", "source": "$input.0", "start": content_start, "end": content_end},
        {"op": "normalize", "source": "$0", "target_lufs": -16.0},
    ],
    "output": {"path": "final.mp4", "codec": "libx264"},
}

result = execute_edl(edl)
print(result.to_dict())
```

### EDL example — screen recording workflow

```json
{
"version": "1.0",
"inputs": ["screen.mp4"],
"operations": [
{"op": "trim", "source": "$input.0", "start": "00:00:02.1", "end": "00:08:43.7"},
{"op": "normalize", "source": "$0", "target_lufs": -16.0},
{"op": "text", "source": "$1",
"entries": [{"text": "Demo", "position": "bottom-right", "font_size": 32,
"start": "0", "end": "5", "font_color": "white"}]}
],
"output": {"path": "final.mp4", "codec": "libx264"}
}
```

## Architecture

```
Expand Down
47 changes: 42 additions & 5 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,8 @@
import pytest


def _ffmpeg_has_drawtext() -> bool:
ffmpeg_dir = os.environ.get("CUTAGENT_FFMPEG_DIR")
from pathlib import Path
ffmpeg_bin = str(Path(ffmpeg_dir) / "ffmpeg") if ffmpeg_dir else "ffmpeg"
def _ffmpeg_has_drawtext(ffmpeg_bin: str = "ffmpeg") -> bool:
"""Check if the given FFmpeg binary supports the drawtext filter."""
try:
result = subprocess.run(
[ffmpeg_bin, "-filters"],
Expand All @@ -25,12 +23,51 @@ def _ffmpeg_has_drawtext() -> bool:
return False


def _find_drawtext_ffmpeg() -> str | None:
    """Locate an FFmpeg executable that supports the drawtext filter.

    Candidates are tried in this order, and the first one accepted by
    ``_ffmpeg_has_drawtext`` is returned:

    1. ``ffmpeg`` inside the directory named by ``CUTAGENT_FFMPEG_DIR``
       (only when that environment variable is set and non-empty).
    2. The ``ffmpeg`` resolved on ``PATH`` via ``shutil.which``.
    3. The executable reported by ``static_ffmpeg``, if that package can be
       imported and queried without error.

    Returns:
        Path (or command name) of a suitable binary, or ``None`` if no
        candidate qualifies.
    """
    import shutil
    from pathlib import Path

    configured_dir = os.environ.get("CUTAGENT_FFMPEG_DIR")
    if configured_dir:
        from_env = str(Path(configured_dir) / "ffmpeg")
        if _ffmpeg_has_drawtext(from_env):
            return from_env

    on_path = shutil.which("ffmpeg")
    if on_path and _ffmpeg_has_drawtext(on_path):
        return on_path

    try:
        from static_ffmpeg.run import get_or_fetch_platform_executables_else_raise

        bundled, _ = get_or_fetch_platform_executables_else_raise()
        if _ffmpeg_has_drawtext(bundled):
            return bundled
    except Exception:
        # Best-effort probe: any failure here simply means this candidate
        # is unavailable, so fall through to "not found".
        pass
    return None


_drawtext_ffmpeg = _find_drawtext_ffmpeg()

requires_drawtext = pytest.mark.skipif(
not _ffmpeg_has_drawtext(),
_drawtext_ffmpeg is None,
reason="FFmpeg with drawtext filter not available",
)


@pytest.fixture(autouse=True)
def _use_drawtext_ffmpeg_cli():
    """Expose the drawtext-capable FFmpeg to each test via ``CUTAGENT_FFMPEG``.

    When a suitable binary was found at import time, the environment
    variable is set to it for the duration of the test and then restored to
    its previous state (removed again if it was unset before). When none was
    found, the environment is left untouched.
    """
    if _drawtext_ffmpeg is not None:
        previous = os.environ.get("CUTAGENT_FFMPEG")
        os.environ["CUTAGENT_FFMPEG"] = _drawtext_ffmpeg
        yield
        # Teardown: put the variable back exactly as it was.
        if previous is not None:
            os.environ["CUTAGENT_FFMPEG"] = previous
        else:
            os.environ.pop("CUTAGENT_FFMPEG", None)
    else:
        yield


def _run_cli(*args: str, input_text: Optional[str] = None) -> subprocess.CompletedProcess:
"""Run cutagent CLI as a subprocess and return the result."""
cmd = [sys.executable, "-m", "cutagent"] + list(args)
Expand Down
137 changes: 137 additions & 0 deletions tests/test_screen_recording_pipeline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,137 @@
"""Integration tests for the screen recording post-processing pipeline.

Simulates a typical screen recording (video with periods of silence) and
validates the full CutAgent pipeline:

probe → detect_silence → trim content → normalize → verify output

No actual screen capture is performed — a synthetic video is generated with
FFmpeg's lavfi source to replicate a recording that starts and ends with
silence (e.g. pre-/post-session dead air).
"""

from __future__ import annotations

import subprocess

import pytest

from cutagent import (
detect_silence,
execute_edl,
normalize_audio,
probe,
trim,
)
from cutagent.models import format_time


@pytest.fixture(scope="module")
def screen_recording(tmp_path_factory) -> str:
    """Render a synthetic 10-second 1280×720 clip standing in for a screen capture.

    The audio track is 2 s of silence, 6 s of a 440 Hz tone, then 2 s of
    silence, imitating a recording that begins and ends with dead air
    around the real content. The file is created once per test module.

    Returns:
        Path of the generated ``screen.mp4`` as a string.
    """
    target = str(tmp_path_factory.mktemp("recordings") / "screen.mp4")

    video_source = "testsrc=duration=10:size=1280x720:rate=30"
    # silence → tone → silence, joined into one audio stream by `concat`
    audio_graph = ";".join(
        [
            "anullsrc=r=44100:cl=stereo:d=2[s0]",
            "sine=frequency=440:duration=6[t0]",
            "anullsrc=r=44100:cl=stereo:d=2[s1]",
            "[s0][t0][s1]concat=n=3:v=0:a=1",
        ]
    )

    command = ["ffmpeg", "-y", "-hide_banner", "-loglevel", "error"]
    command += ["-f", "lavfi", "-i", video_source]
    command += ["-f", "lavfi", "-i", audio_graph]
    command += ["-c:v", "libx264", "-preset", "ultrafast"]
    command += ["-c:a", "aac", "-b:a", "128k"]
    command += ["-pix_fmt", "yuv420p", "-shortest", target]

    subprocess.run(command, check=True, capture_output=True)
    return target


class TestScreenRecordingPipeline:
    """Pipeline checks (probe, silence detection, trim, normalize) on the synthetic recording."""

    @staticmethod
    def _content_window(recording):
        """Return ``(start, end)`` timestamps between the first and last detected silence.

        Asserts that at least two silences were detected before deriving the window.
        """
        found = detect_silence(recording, threshold=-35.0, min_duration=0.5)
        assert len(found) >= 2
        return format_time(found[0].end), format_time(found[-1].start)

    def test_probe_returns_expected_metadata(self, screen_recording):
        """Probing the recording reports ~10 s of 1280×720 video plus an audio stream."""
        meta = probe(screen_recording)
        assert meta.duration == pytest.approx(10.0, abs=0.5)
        assert meta.width == 1280
        assert meta.height == 720
        assert meta.video_stream is not None
        assert meta.audio_stream is not None

    def test_silence_detection_finds_intro_and_outro(self, screen_recording):
        """At least two silences are found: one at the start and one at the end."""
        found = detect_silence(screen_recording, threshold=-35.0, min_duration=0.5)
        assert len(found) >= 2
        intro, outro = found[0], found[-1]
        # The intro silence begins at (roughly) time zero ...
        assert intro.start == pytest.approx(0.0, abs=0.3)
        # ... and the outro silence runs up to (roughly) the end of the clip.
        assert outro.end == pytest.approx(10.0, abs=0.5)

    def test_trim_to_content_is_shorter(self, screen_recording, tmp_path):
        """Cutting away the leading and trailing silence yields a shorter file."""
        begin, finish = self._content_window(screen_recording)
        destination = str(tmp_path / "content.mp4")

        outcome = trim(screen_recording, start=begin, end=finish, output=destination)

        assert outcome.success
        clipped = probe(destination)
        assert clipped.duration < probe(screen_recording).duration

    def test_full_edl_trim_and_normalize(self, screen_recording, tmp_path):
        """An EDL that trims dead air and then normalizes loudness runs end to end."""
        begin, finish = self._content_window(screen_recording)
        destination = str(tmp_path / "final.mp4")

        trim_step = {"op": "trim", "source": "$input.0", "start": begin, "end": finish}
        normalize_step = {"op": "normalize", "source": "$0", "target_lufs": -16.0}
        edl = {
            "version": "1.0",
            "inputs": [screen_recording],
            "operations": [trim_step, normalize_step],
            "output": {"path": destination, "codec": "libx264"},
        }

        outcome = execute_edl(edl)

        assert outcome.success
        rendered = probe(destination)
        assert rendered.duration > 0
        assert rendered.duration < probe(screen_recording).duration

    def test_normalize_standalone(self, screen_recording, tmp_path):
        """Normalizing alone succeeds and keeps the duration (within 0.5 s)."""
        destination = str(tmp_path / "normalized.mp4")
        outcome = normalize_audio(screen_recording, output=destination, target_lufs=-16.0)
        assert outcome.success
        source_duration = probe(screen_recording).duration
        assert probe(destination).duration == pytest.approx(source_duration, abs=0.5)