Notebook to generate "snippets" based on sign-spotting ground truth, for qualitative analysis.

In [None]:
from pathlib import Path
import random
import pandas as pd

In [None]:
# Root directory that the later cells search (via glob/rglob) for "*-passage*.mp4" videos.
# NOTE(review): absolute, machine-specific path — adjust before running on another host.
webdataset_extracted_dir = Path("/data/petabyte/cleong/data/DBL_Deaf_Bibles/webdataset_extracted/ase/chronological_bible_translation_in_american_sign_language_119_introductions_and_passages/")

In [None]:
# Collect every passage video below the dataset root and peek at a handful of them.
passage_files = list(webdataset_extracted_dir.rglob("*-passage*.mp4"))
# random.sample raises ValueError when k exceeds the population size, so cap k
# to keep this cell runnable when fewer than five videos are found.
random.sample(passage_files, k=min(5, len(passage_files)))


In [None]:
# Pick the first passage video whose name matches "fail ... conquer ... ai".
# next(..., None) selects the same first match as list(...)[0] did, but lets us
# raise a descriptive error instead of a bare IndexError when nothing matches.
fail_conquer_ai_mp4 = next(webdataset_extracted_dir.rglob("*-passage*fail*conquer*ai*.mp4"), None)
if fail_conquer_ai_mp4 is None:
    raise FileNotFoundError(
        f"No video matching '*-passage*fail*conquer*ai*.mp4' found under {webdataset_extracted_dir}"
    )
fail_conquer_ai_mp4

In [None]:
# Tab-separated annotation file for CBT-033; load it and preview the first rows.
fail_conquer_ai_tsv = Path(
    "/opt/home/cleong/projects/semantic_and_visual_similarity/sign-bibles-dataset/sign_bibles_dataset/data_analysis/gloss_annotations/avodah/CBT-033-ase-2-Passage _ Israel Fails to Conquer Ai.tsv"
)
fail_conquer_ai_truth_df = pd.read_csv(fail_conquer_ai_tsv, sep="\t")
fail_conquer_ai_truth_df.head()

In [None]:
from pathlib import Path
import logging
import subprocess
import pandas as pd
from tqdm import tqdm

# Configure root logging: INFO and above, rendered as "LEVEL: message".
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s: %(message)s",
)


def export_snippets_from_df(
    df: pd.DataFrame,
    input_mp4: Path,
    output_dir: Path,
    start_col: str = "Begin Time - hh:mm:ss.ms",
    end_col: str = "End Time - hh:mm:ss.ms",
    label_col: str = "Sign",
) -> None:
    """
    Export video snippets from an input mp4 using time ranges in a DataFrame.

    One clip is written per row, re-encoded with libx264 / aac and named
    ``{index}_start_{start}_end_{end}_{label}.mp4``. The export is best-effort:
    rows with a missing start or end time are skipped, and ffmpeg failures are
    logged as warnings without interrupting the remaining rows.

    Args:
        df: DataFrame containing start, end, and label columns.
        input_mp4: Path to the source video file.
        output_dir: Directory to save extracted clips (created if missing).
        start_col: Name of the start time column (hh:mm:ss.ms format).
        end_col: Name of the end time column (hh:mm:ss.ms format).
        label_col: Name of the column with the label (optional; rows without a
            label, or frames without this column, use "UNKNOWN").

    Returns:
        None. If ``input_mp4`` does not exist an error is logged and nothing is exported.
    """
    input_mp4 = Path(input_mp4)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if not input_mp4.exists():
        logging.error("Input video does not exist: %s", input_mp4)
        return

    # The docstring promises the label column is optional, so tolerate its absence.
    has_label = label_col in df.columns

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Exporting clips"):
        # A missing timestamp would otherwise be passed to ffmpeg as the string "nan".
        if pd.isna(row[start_col]) or pd.isna(row[end_col]):
            logging.warning("Skipping row %s: missing start or end time", idx)
            continue

        start_time = str(row[start_col])
        end_time = str(row[end_col])
        label_value = row[label_col] if has_label else None
        label = str(label_value) if pd.notna(label_value) else "UNKNOWN"

        # Make filename safe
        start_safe = start_time.replace(":", "_").replace(".", "_")
        end_safe = end_time.replace(":", "_").replace(".", "_")
        label_safe = "".join(c if c.isalnum() else "_" for c in label)

        output_file = output_dir / f"{idx}_start_{start_safe}_end_{end_safe}_{label_safe}.mp4"

        cmd = [
            "ffmpeg",
            "-y",
            "-ss",
            start_time,
            "-to",
            end_time,
            "-i",
            str(input_mp4),
            "-c:v",
            "libx264",
            "-crf",
            "18",
            "-preset",
            "veryfast",
            "-c:a",
            "aac",
            "-movflags",
            "+faststart",
            str(output_file),
        ]

        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            # %s rather than %d: DataFrame index labels are not guaranteed to be integers.
            logging.warning(
                "Failed to extract clip %s (row %s): %s",
                output_file,
                idx,
                (e.stderr or b"").decode("utf-8", errors="ignore"),
            )


In [None]:
from pathlib import Path
import logging
import subprocess
import tempfile
import pandas as pd
from tqdm import tqdm

# Configure root logging (INFO and above, "LEVEL: message").
# basicConfig does nothing if the root logger already has handlers, so repeating it here is harmless.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s: %(message)s",
)

def export_gifs_from_df(
    df: pd.DataFrame,
    input_mp4: Path,
    output_dir: Path,
    start_col: str = "Begin Time - hh:mm:ss.ms",
    end_col: str = "End Time - hh:mm:ss.ms",
    label_col: str = "Sign",
    scale: int = 320,
    fps: int = 10,
    fast: bool = True,
    start_sec_col: str = "Begin Time - ss.msec",
    end_sec_col: str = "End Time - ss.msec",
) -> None:
    """
    Export GIF snippets from an input mp4 using time ranges in a DataFrame.

    One GIF is written per row, named
    ``{index}_start_{start}_end_{end}_{label}.gif``. The hh:mm:ss.ms columns are
    used only for the file name; seeking uses the numeric seconds columns.
    ffmpeg failures (including palette generation in the two-pass mode) are
    logged as warnings and do not interrupt the remaining rows.

    Args:
        df: DataFrame containing start, end, and label columns.
        input_mp4: Path to the source video file.
        output_dir: Directory to save extracted gifs (created if missing).
        start_col: Name of the start time column (hh:mm:ss.ms format).
        end_col: Name of the end time column (hh:mm:ss.ms format).
        label_col: Name of the column with the label (optional; rows without a
            label, or frames without this column, use "UNKNOWN").
        scale: Width of output gif (height auto-scaled).
        fps: Frames per second for gif.
        fast: If True, use one-pass (lower quality but faster).
        start_sec_col: Name of the start time column in seconds.
        end_sec_col: Name of the end time column in seconds.

    Returns:
        None. If ``input_mp4`` does not exist an error is logged and nothing is exported.
    """
    input_mp4 = Path(input_mp4)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if not input_mp4.exists():
        logging.error("Input video does not exist: %s", input_mp4)
        return

    has_label = label_col in df.columns
    # Shared by the one-pass export and both passes of the palette export.
    resize_filter = f"fps={fps},scale={scale}:-1:flags=lanczos"

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Exporting gifs"):
        start_time = str(row[start_col])
        end_time = str(row[end_col])
        label_value = row[label_col] if has_label else None
        label = str(label_value) if pd.notna(label_value) else "UNKNOWN"

        # Safe filename parts
        start_safe = start_time.replace(":", "_").replace(".", "_")
        end_safe = end_time.replace(":", "_").replace(".", "_")
        label_safe = "".join(c if c.isalnum() else "_" for c in label)

        output_file = (
            output_dir
            / f"{idx}_start_{start_safe}_end_{end_safe}_{label_safe}.gif"
        )

        # Duration instead of absolute "to"
        start_sec = float(row[start_sec_col])
        end_sec = float(row[end_sec_col])
        duration = max(0, end_sec - start_sec)
        source_args = ["-ss", str(start_sec), "-t", str(duration), "-i", str(input_mp4)]

        try:
            if fast:
                # One-pass, faster but lower color quality
                subprocess.run(
                    ["ffmpeg", "-y", *source_args, "-vf", resize_filter, str(output_file)],
                    check=True,
                    capture_output=True,
                )
            else:
                # Two-pass palette (slower but higher quality). Both passes must run
                # while the temporary directory still exists; otherwise the palette
                # file is deleted before the second pass can read it.
                with tempfile.TemporaryDirectory() as tmpdir:
                    palette_file = Path(tmpdir) / "palette.png"

                    # Palette gen
                    subprocess.run(
                        [
                            "ffmpeg", "-y",
                            *source_args,
                            "-vf", f"{resize_filter},palettegen",
                            str(palette_file),
                        ],
                        check=True,
                        capture_output=True,
                    )

                    # Palette use
                    subprocess.run(
                        [
                            "ffmpeg", "-y",
                            *source_args,
                            "-i", str(palette_file),
                            "-lavfi", f"{resize_filter} [x]; [x][1:v] paletteuse",
                            str(output_file),
                        ],
                        check=True,
                        capture_output=True,
                    )
        except subprocess.CalledProcessError as e:
            # %s rather than %d: DataFrame index labels are not guaranteed to be integers.
            logging.warning(
                "Failed to extract gif %s (row %s): %s",
                output_file,
                idx,
                (e.stderr or b"").decode("utf-8", errors="ignore"),
            )


In [None]:
from pathlib import Path
import logging
import subprocess
import pandas as pd
from tqdm import tqdm

# Configure root logging (INFO and above, "LEVEL: message").
# basicConfig does nothing if the root logger already has handlers, so repeating it here is harmless.
logging.basicConfig(
    level=logging.INFO,
    format="%(levelname)s: %(message)s",
)

def export_frames_from_df(
    df: pd.DataFrame,
    input_mp4: Path,
    output_dir: Path,
    start_col: str = "Begin Time - hh:mm:ss.ms",
    end_col: str = "End Time - hh:mm:ss.ms",
    label_col: str = "Sign",
    step: int = 1,
    image_format: str = "png",
    start_sec_col: str = "Begin Time - ss.msec",
    end_sec_col: str = "End Time - ss.msec",
) -> None:
    """
    Export video frames from an input mp4 using time ranges in a DataFrame.

    Each row gets its own sub-directory
    ``{index}_start_{start}_end_{end}_{label}`` containing images named
    ``frame_0001.<ext>``, ``frame_0002.<ext>``, ... The hh:mm:ss.ms columns are
    used only for the directory name; seeking uses the numeric seconds columns.
    ffmpeg failures are logged as warnings and do not interrupt the remaining rows.

    Args:
        df: DataFrame containing start, end, and label columns.
        input_mp4: Path to the source video file.
        output_dir: Directory to save extracted frames (created if missing).
        start_col: Name of the start time column (hh:mm:ss.ms format).
        end_col: Name of the end time column (hh:mm:ss.ms format).
        label_col: Name of the column with the label (optional; rows without a
            label, or frames without this column, use "UNKNOWN").
        step: Take every Nth frame (e.g., step=3 takes every 3rd frame).
        image_format: Output image format (e.g., 'png', 'jpg').
        start_sec_col: Name of the start time column in seconds.
        end_sec_col: Name of the end time column in seconds.

    Raises:
        ValueError: If ``step`` is smaller than 1.
    """
    # step feeds mod(n, step) in the ffmpeg select filter; zero or negative values
    # would not select "every Nth frame", so reject them up front.
    if step < 1:
        raise ValueError(f"step must be >= 1, got {step}")

    input_mp4 = Path(input_mp4)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    if not input_mp4.exists():
        logging.error("Input video does not exist: %s", input_mp4)
        return

    has_label = label_col in df.columns
    # The "\," keeps the comma inside mod(...) from being read as a filter separator.
    frame_filter = f"select='not(mod(n\\,{step}))',setpts=N/FRAME_RATE/TB"

    for idx, row in tqdm(df.iterrows(), total=len(df), desc="Exporting frames"):
        start_time = str(row[start_col])
        end_time = str(row[end_col])
        label_value = row[label_col] if has_label else None
        label = str(label_value) if pd.notna(label_value) else "UNKNOWN"

        # Safe filename parts
        start_safe = start_time.replace(":", "_").replace(".", "_")
        end_safe = end_time.replace(":", "_").replace(".", "_")
        label_safe = "".join(c if c.isalnum() else "_" for c in label)

        clip_dir = (
            output_dir
            / f"{idx}_start_{start_safe}_end_{end_safe}_{label_safe}"
        )
        clip_dir.mkdir(parents=True, exist_ok=True)

        start_sec = float(row[start_sec_col])
        end_sec = float(row[end_sec_col])
        duration = max(0, end_sec - start_sec)

        cmd = [
            "ffmpeg",
            "-y",
            "-ss", str(start_sec),
            "-t", str(duration),
            "-i", str(input_mp4),
            "-vf", frame_filter,
            str(clip_dir / f"frame_%04d.{image_format}"),
        ]

        try:
            subprocess.run(cmd, check=True, capture_output=True)
        except subprocess.CalledProcessError as e:
            # %s rather than %d: DataFrame index labels are not guaranteed to be integers.
            logging.warning(
                "Failed to extract frames for row %s: %s",
                idx,
                (e.stderr or b"").decode("utf-8", errors="ignore"),
            )


In [None]:
# Write one .mp4 clip per annotated row of the CBT-033 passage.
export_snippets_from_df(
    fail_conquer_ai_truth_df,
    fail_conquer_ai_mp4,
    Path("./snippets/cbt033/mp4"),
)


In [None]:
# Write one 320px-wide, 15 fps GIF per annotated row of the CBT-033 passage.
export_gifs_from_df(
    fail_conquer_ai_truth_df,
    fail_conquer_ai_mp4,
    Path("./snippets/cbt033/gifs"),
    fps=15,
    scale=320,
)


In [None]:
# Dump the individual frames (defaults: every frame, PNG) for each annotated row of CBT-033.
export_frames_from_df(
    fail_conquer_ai_truth_df,
    fail_conquer_ai_mp4,
    Path("./snippets/cbt033/frames"),
)


In [None]:
# Recursively pack the CBT-033 export directory into snippets/cbt033.zip.
!zip -r snippets/cbt033.zip snippets/cbt033/

In [None]:
# List every directory below the results folder whose full path matches "*queries/*"
# and save the listing to top_setting_actual_queries.txt.
!find "/opt/home/cleong/projects/semantic-sign-language-search/setup_signCLIP/fairseq/examples/MMPT/results/asl_finetune_checkpoint_best/samplespergloss_5/start_0_end_None/windowsize500_step100" -wholename "*queries/*" -type d > top_setting_actual_queries.txt

In [None]:
# Recursively pack the same results folder into top_setting.zip.
# NOTE(review): the archive is built from an absolute, machine-specific path.
!zip -r top_setting.zip "/opt/home/cleong/projects/semantic-sign-language-search/setup_signCLIP/fairseq/examples/MMPT/results/asl_finetune_checkpoint_best/samplespergloss_5/start_0_end_None/windowsize500_step100"

In [None]:
# Batch export: for each CBT annotation TSV, locate its passage video and
# write both mp4 clips and GIFs under ./snippets/cbtNNN/.
cbt_tsvs = [
    *Path(
        "/opt/home/cleong/projects/semantic_and_visual_similarity/sign-bibles-dataset/sign_bibles_dataset/data_analysis/gloss_annotations/avodah/"
    ).glob("CBT*.tsv")
]
for tsv in cbt_tsvs:
    # The second dash-separated field of the file name is parsed as the passage number.
    cbt_number = int(tsv.name.split("-")[1])
    print(tsv.name)
    print(f"CBT # {cbt_number:03d}")

    # Exactly one matching passage video is expected; anything else stops the run.
    corresponding_vids = [*webdataset_extracted_dir.glob(f"*cbt*{cbt_number:03d}*-passage*.mp4")]
    print(corresponding_vids)
    assert len(corresponding_vids) == 1
    corresponding_vid = corresponding_vids[0]

    truth_df = pd.read_csv(tsv, sep="\t")

    export_snippets_from_df(
        truth_df,
        corresponding_vid,
        Path(f"./snippets/cbt{cbt_number:03d}/mp4"),
    )
    export_gifs_from_df(
        truth_df,
        corresponding_vid,
        Path(f"./snippets/cbt{cbt_number:03d}/gifs"),
        fps=15,
        scale=320,
    )




In [None]:
# Pack every exported snippet into snippets.zip.
# snippets/ also contains the cbt033.zip written by an earlier cell; exclude .zip
# files so that archive is not nested (and its content duplicated) inside this one.
!zip -r snippets.zip snippets/ -x "*.zip"