## Create captions for training

1. Load the Gemini descriptions (one JSON object per line).
2. Grab `file_name` and `short_description` from each entry.
3. Append `geolocation_hints` and `insignia_markings` when they are non-empty strings that do not contain "none" or "not".

### Load json

In [None]:
import json
from datetime import datetime


# Load the Gemini descriptions: the file holds one JSON object per line.
meta_data = []
# Explicit UTF-8 so non-ASCII text decodes the same way on every platform
# instead of depending on the locale's default encoding.
with open("data/results/metadata/metadata.json", "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines (e.g. an extra newline at the end of the file) are not
        # valid JSON documents, so skip them rather than crash.
        if line.strip():
            meta_data.append(json.loads(line))

### Take relevant key-value pairs

In [2]:
# Build one caption record per image entry in ``meta_data``.
#
# Each record holds "file_name" and "captions" (the short description,
# optionally followed by the geolocation hints and insignia markings), plus
# "idx" when the entry carries a parseable "start" timestamp.

# Only image files are captioned here. For video captions, swap in
# (".mp4", ".avi", ".mov", ".mkv") instead.
IMAGE_EXTENSIONS = (".jpg", ".jpeg", ".png", ".bmp", ".tiff", ".webp")
# Optional fields appended to the caption, in this order.
EXTRA_CAPTION_KEYS = ("geolocation_hints", "insignia_markings")
# A field containing any of these substrings (case-insensitive) is treated as
# "nothing to report" and left out of the caption.
# NOTE(review): this is a substring match, so a hint such as "Nottingham" is
# dropped as well - confirm that this is intended.
EMPTY_MARKERS = ("none", "not")

filtered = []
for item in meta_data:
    description = item.get("short_description")
    file_name = item.get("file_name")
    # A missing, null or non-string file_name cannot be an image path; skip it
    # instead of crashing on ``.lower()``.
    if not isinstance(file_name, str):
        continue
    if not file_name.lower().endswith(IMAGE_EXTENSIONS):
        continue
    if description is None:
        continue
    entry = {
        "file_name": file_name,
        "captions": description,
    }

    # "start" is an "MM:SS" offset; idx is the number of whole 30-second
    # windows that precede it.
    start_val = item.get("start")
    if start_val not in (None, ""):
        try:
            t = datetime.strptime(start_val, "%M:%S")
        except (TypeError, ValueError):
            # Best effort: a malformed or non-string timestamp leaves the
            # record without an idx rather than dropping the record.
            # NOTE(review): %M only accepts 0-59, so offsets of an hour or
            # more also end up here.
            pass
        else:
            entry["idx"] = (t.minute * 60 + t.second) // 30

    # Append each optional field that carries real content.
    for key in EXTRA_CAPTION_KEYS:
        extra = item.get(key, "")
        if not isinstance(extra, str) or not extra.strip():
            continue
        lowered = extra.lower()
        if any(marker in lowered for marker in EMPTY_MARKERS):
            continue
        entry["captions"] += f". {extra}"
    filtered.append(entry)

### Output new json

Remove non-ASCII characters (non-English text and emojis) from strings.

In [None]:
# Write the caption records as JSON Lines: one JSON object per line.
with open("data/training_captions_images.jsonl", "w") as out_file:
    for record in filtered:
        ascii_record = {}
        for key, value in record.items():
            # Characters that cannot be encoded as ASCII are dropped from
            # every string value; non-string values pass through untouched.
            # NOTE(review): this also rewrites "file_name", which may then no
            # longer match the file on disk - confirm that this is intended.
            if isinstance(value, str):
                value = value.encode("ascii", "ignore").decode()
            ascii_record[key] = value
        out_file.write(json.dumps(ascii_record) + "\n")