# Stock Video Metadata Pipeline

In [1]:
# Running on host - Cell 1: Install dependencies
!pip install pandas opencv-python Pillow transformers torch paramiko ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.154-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.154-py3-none-any.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/1.0 MB ? eta 0:00:00
Downloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics

   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- -----

In [2]:
# Running on Mobile - Cell 1: Install dependencies
!pip install pandas opencv-python Pillow ultralytics paramiko
# (Skip transformers/torch on mobile if you don’t need captioning here)



In [16]:
# Cell 2: Imports & Paths (fault‐tolerant for missing crypto libs)
import sys
from pathlib import Path
import pandas as pd

# 1) Locate & add the `scripts/` folder
repo_root = Path.cwd()
scripts_path = repo_root / "scripts"
if not scripts_path.is_dir():
    raise FileNotFoundError("Missing 'scripts/' directory in repo root")
sys.path.insert(0, str(scripts_path))

# 2) Import pipeline modules, with graceful fallbacks
from config      import get_smb_root, get_repo_root
from discovery   import discover_video_batches, save_inventory

# enrichment is already fault‐tolerant per our earlier patch
from enrichment  import enrich_dataframe

from export      import export_blackbox_csv, export_blackbox_xml

# upload may warn if Paramiko isn’t installed
from upload      import upload_batch_or_zip

# 3) Resolve key paths
batches_root = get_smb_root()
metadata_dir  = repo_root / "metadata"
metadata_dir.mkdir(exist_ok=True)

print("📂 Repo root:    ", repo_root)
print("📂 Batches root: ", batches_root)
print("📂 Metadata dir: ", metadata_dir)

📂 Repo root:     C:\Users\david\JupyterLab\stock-video-metadata
📂 Batches root:  B:\Video\StockFootage\Batches
📂 Metadata dir:  C:\Users\david\JupyterLab\stock-video-metadata\metadata


In [17]:
# Cell 3: Load or Discover Inventory Metadata
from pprint import pprint

batches = {}

try:
    if 'batches_root' in globals() and batches_root and batches_root.exists():
        print(f"📂 Using discovered video batches from: {batches_root}")
        batches = discover_video_batches(batches_root)
        if batches:
            save_inventory(
                batches,
                out_json = metadata_dir / "batch_metadata.json",
                out_csv  = metadata_dir / "video_inventory.csv"
            )
            print(f"✅ Discovered {len(batches)} batches and saved metadata.")
        else:
            print("⚠️ No batches discovered.")

    elif (metadata_dir / "video_inventory.csv").exists():
        print(f"📁 Loading video inventory from existing CSV in: {metadata_dir}")
        df_inventory = pd.read_csv(metadata_dir / "video_inventory.csv")
        print(f"✅ Loaded {len(df_inventory)} video entries from CSV.")
        batches = None  # discovery dict not used in this fallback path

    else:
        raise FileNotFoundError("Neither batches directory nor video inventory CSV is available.")

    # Show preview if we discovered or loaded something
    if batches:
        pprint({k: len(v) for k, v in batches.items()})
    elif 'df_inventory' in locals():
        print(df_inventory[["filename", "full_path"]].head())

except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    print("💡 Make sure either `batches_root` is valid or `video_inventory.csv` exists.")

except Exception as e:
    print(f"🚨 Unexpected error:\n{e}")

📂 Using discovered video batches from: B:\Video\StockFootage\Batches
✅ Inventory written to C:\Users\david\JupyterLab\stock-video-metadata\metadata\batch_metadata.json and C:\Users\david\JupyterLab\stock-video-metadata\metadata\video_inventory.csv
✅ Discovered 1 batches and saved metadata.
{'well_pump': 16}


In [19]:
# ✅ Enrich Inventory with Enhanced AI Metadata (multi-frame, multi-model)

import pandas as pd
from pathlib import Path
from scripts.enrichment import enrich_dataframe

metadata_dir = Path("metadata")
inventory_csv = metadata_dir / "video_inventory.csv"
enriched_csv = metadata_dir / "enriched_videos.csv"

try:
    # Load inventory
    if not inventory_csv.exists():
        raise FileNotFoundError(f"Missing inventory file: {inventory_csv}")
    df_inventory = pd.read_csv(inventory_csv)
    print(f"📥 Loaded {len(df_inventory)} videos from inventory.")

    # Optionally load enriched data to preserve existing values
    df_enriched = None
    if enriched_csv.exists():
        df_enriched = pd.read_csv(enriched_csv)
        print(f"🧠 Found existing enrichment data: {len(df_enriched)} rows")

        # Merge enriched data into inventory, preserving enhanced fields
        enrichment_cols = [
            "AI_Description", "AI_Keywords", "YOLO_Objects", "Hybrid_Description"
        ]
        for col in enrichment_cols:
            if col in df_enriched.columns:
                df_inventory[col] = df_enriched.set_index("filename").reindex(df_inventory["filename"])[col].values

    # Drop entries without a valid full_path
    df_inventory = df_inventory[df_inventory["full_path"].notna()]
    if df_inventory.empty:
        raise ValueError("No valid video paths to enrich.")

    # Enrich!
    df_enriched = enrich_dataframe(df_inventory)
    print(f"✨ Enriched {len(df_enriched)} videos with AI+YOLO metadata.")

    # Save enriched CSV
    df_enriched.to_csv(enriched_csv, index=False)
    print(f"💾 Saved enriched metadata to: {enriched_csv}")

    # Preview enriched sample
    preview_cols = [
        "filename", "AI_Description", "AI_Keywords", "YOLO_Objects", "Hybrid_Description"
    ]
    display(df_enriched[preview_cols].head())

except FileNotFoundError as e:
    print(f"❌ Inventory not found: {e}")
except Exception as e:
    print(f"🚨 Unexpected error during enrichment: {e}")


📥 Loaded 16 videos from inventory.
🧠 Found existing enrichment data: 16 rows




✨ Enriched 16 videos with AI+YOLO metadata.
💾 Saved enriched metadata to: metadata\enriched_videos.csv
🚨 Unexpected error during enrichment: "['AI_Description', 'AI_Keywords', 'YOLO_Objects', 'Hybrid_Description'] not in index"


## 📤 Optional — Save the enriched version

If enrich_dataframe() runs successfully, you can persist results:

In [8]:
# Save the enriched version
(df_videos if df_videos is not None else df_inventory).to_csv(metadata_dir / "enriched_videos.csv", index=False)

## Export BlackBox‐Ready CSV + XML: Modular Cell 5

In [9]:
# Cell 5: Export BlackBox‐Ready CSV + XML
from scripts.export import export_blackbox_csv, export_blackbox_xml

# 1) Export the three‐column CSV
csv_path = metadata_dir / "blackbox_metadata.csv"
export_blackbox_csv(df_videos, csv_path)

# 2) Export one metadata.xml per batch, creating any missing batch folders
export_blackbox_xml(df_videos, batches_root)

KeyError: "None of [Index(['Filename', 'Description', 'Keywords'], dtype='object')] are in the [columns]"

In [10]:
# (Ignore for Modular `script`) Cell 5: Export BlackBox‐Ready CSV + XML (Safe, Cross-Platform)

csv_path = metadata_dir / "blackbox_metadata.csv"
xml_errors = []

try:
    export_blackbox_csv(df_videos, csv_path)
    print(f"✅ Exported BlackBox CSV:\n📄 {csv_path}")
except Exception as e:
    print(f"❌ Failed to export CSV: {e}")

try:
    # Ensure all XML batch subdirs exist before writing
    if not batches_root.exists():
        raise FileNotFoundError(f"Batches root not found: {batches_root}")
    
    # Patch export_blackbox_xml to create dirs if needed
    from xml.etree import ElementTree as ET
    from scripts.export import export_blackbox_xml as original_export_xml

    def export_blackbox_xml_safe(df, batches_root):
        from collections import defaultdict
        from pathlib import Path
        import xml.etree.ElementTree as ET

        grouped = df.groupby("batch")
        for batch, group in grouped:
            batch_dir = batches_root / batch
            if not batch_dir.exists():
                try:
                    batch_dir.mkdir(parents=True, exist_ok=True)
                    print(f"📁 Created missing batch directory: {batch_dir}")
                except Exception as e:
                    xml_errors.append((batch, str(e)))
                    continue
            
            root = ET.Element("videos")
            for _, row in group.iterrows():
                video_el = ET.SubElement(root, "video")
                ET.SubElement(video_el, "filename").text = row.get("filename", "")
                ET.SubElement(video_el, "description").text = row.get("Description", "")
                ET.SubElement(video_el, "keywords").text = row.get("Keywords", "")
            
            xml_path = batch_dir / "metadata.xml"
            try:
                tree = ET.ElementTree(root)
                tree.write(str(xml_path), encoding="utf-8", xml_declaration=True)
                print(f"✅ Exported XML for {batch}: {xml_path}")
            except Exception as e:
                xml_errors.append((batch, str(e)))

    # Call patched exporter
    export_blackbox_xml_safe(df_videos, batches_root)

    if xml_errors:
        print(f"\n⚠️ XML export completed with {len(xml_errors)} errors:")
        for batch, err in xml_errors:
            print(f" - Batch `{batch}`: {err}")

except Exception as e:
    print(f"🚨 Failed during XML export: {e}")
    print("💡 Ensure `batches_root` is accessible and batch folders exist or can be created.")

❌ Failed to export CSV: "None of [Index(['Filename', 'Description', 'Keywords'], dtype='object')] are in the [columns]"
🚨 Failed during XML export: 'batch'
💡 Ensure `batches_root` is accessible and batch folders exist or can be created.


# Upload or ZIP per batch ()Modular Cell 6)

In [6]:
# Upload or ZIP per batch
from upload import upload_batch_or_zip
zip_dir = metadata_dir/"blackbox_uploads"
for batch_name in df_videos['batch_name'].unique():
    batch_path = batches_root/batch_name
    upload_batch_or_zip(batch_path, zip_dir)

📦 Created ZIP for manual upload: /private/var/mobile/Containers/Shared/AppGroup/08003265-A677-4358-B939-7E86CE490040/File Provider Storage/Repositories/cda_ASSETS/blackbox-stock-video-metadata/metadata/blackbox_uploads/well_pump.zip


In [7]:
# (Ignore for modular scripts) Cell 6: Upload All Batches to BlackBox SFTP
for batch_name in df_videos['batch_name'].unique():
    upload_batch(batches_root / batch_name)

print("🎉 Upload complete for all batches.")

NameError: name 'upload_batch' is not defined