# Stock Video Metadata Pipeline

In [1]:
# Running on host - Cell 1: Install dependencies
!pip install pandas opencv-python Pillow transformers torch paramiko ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.154-py3-none-any.whl.metadata (37 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.154-py3-none-any.whl (1.0 MB)
   ---------------------------------------- 0.0/1.0 MB ? eta -:--:--
   ---------------------------------------- 1.0/1.0 MB ? eta 0:00:00
Downloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics

   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- ------------------- 1/2 [ultralytics]
   -------------------- -----

In [2]:
# Running on Mobile - Cell 1: Install dependencies
# !pip install pandas opencv-python Pillow ultralytics paramiko
# (Skip transformers/torch on mobile if you don’t need captioning here)



In [1]:
# Cell 2: Imports & Paths (fault‐tolerant for missing crypto libs)
import sys
from pathlib import Path
import pandas as pd

# 1) Locate & add the `scripts/` folder
repo_root = Path.cwd()
scripts_path = repo_root / "scripts"
if not scripts_path.is_dir():
    raise FileNotFoundError("Missing 'scripts/' directory in repo root")
sys.path.insert(0, str(scripts_path))

# 2) Import pipeline modules, with graceful fallbacks
from config      import get_smb_root, get_repo_root
from discovery   import discover_video_batches, save_inventory

# enrichment is already fault‐tolerant per our earlier patch
from enrichment  import VideoEnricher

from export      import export_blackbox_csv, export_blackbox_xml

# upload may warn if Paramiko isn’t installed
from upload      import upload_batch_or_zip

# 3) Resolve key paths
batches_root = get_smb_root()
metadata_dir  = repo_root / "metadata"
metadata_dir.mkdir(exist_ok=True)

print("📂 Repo root:    ", repo_root)
print("📂 Batches root: ", batches_root)
print("📂 Metadata dir: ", metadata_dir)

  from pkg_resources import resource_filename


📂 Repo root:     C:\Users\david\JupyterLab\stock-video-metadata
📂 Batches root:  [WindowsPath('B:/Video/StockFootage/Batches'), WindowsPath('B:/Video/StockFootage/Curated')]
📂 Metadata dir:  C:\Users\david\JupyterLab\stock-video-metadata\metadata


In [2]:
# Cell 3: 📊 Load or Discover Inventory Metadata
from pathlib import Path
from pprint import pprint

batches = {}
try:
    # batches_root might be a list of shares or a single Path
    roots = batches_root if isinstance(batches_root, (list, tuple)) else [batches_root]
    # pick first existing path
    valid = [Path(r) for r in roots if Path(r).exists()]

    if valid:
        root = valid[0]
        print(f"📂 Using discovered video batches from: {root}")
        # (we leave batches_root alone — discovery will pick it up via get_smb_root())
        batches = discover_video_batches()        # ← no arguments
        if batches:
            save_inventory(
                batches,
                out_json = metadata_dir / "batch_metadata.json",
                out_csv  = metadata_dir / "video_inventory.csv"
            )
            print(f"✅ Discovered {len(batches)} batches and saved metadata.")
        else:
            print("⚠️ No batches discovered.")

    elif (metadata_dir / "video_inventory.csv").exists():
        inv_csv = metadata_dir / "video_inventory.csv"
        print(f"📁 Loading video inventory from CSV: {inv_csv}")
        df_inventory = pd.read_csv(inv_csv)
        print(f"✅ Loaded {len(df_inventory)} entries from CSV.")
        batches = None

    else:
        raise FileNotFoundError(
            "Neither a valid batches_root nor video_inventory.csv was found."
        )

    # preview
    if batches:
        pprint({k: len(v) for k, v in batches.items()})
    else:
        print(df_inventory[["filename","full_path"]].head())

except FileNotFoundError as e:
    print(f"❌ Error: {e}")
    print("💡 Ensure `batches_root` is correct or `metadata/video_inventory.csv` exists.")
except Exception as e:
    print(f"🚨 Unexpected error:\n{e}")

📂 Using discovered video batches from: B:\Video\StockFootage\Batches
🔍 Searching in: B:\Video\StockFootage\Batches
🎯 Found 1 batch(es).
✅ Inventory saved to:
  - C:\Users\david\JupyterLab\stock-video-metadata\metadata\batch_metadata.json
  - C:\Users\david\JupyterLab\stock-video-metadata\metadata\video_inventory.csv
✅ Discovered 1 batches and saved metadata.
{'well_pump': 16}


In [8]:
# Cell 4: Enrich Inventory with Enhanced AI Metadata (multi-frame, multi-model)

import pandas as pd
from pathlib import Path
from scripts.enrichment import enrich_dataframe  # <-- imports your new function!

df_videos = None

try:
    inventory_csv = metadata_dir / "video_inventory.csv"
    if not inventory_csv.exists():
        raise FileNotFoundError(f"Missing inventory file: {inventory_csv}")

    # Load video inventory
    df_videos = pd.read_csv(inventory_csv)
    print(f"📥 Loaded {len(df_videos)} videos from inventory.")

    # Drop entries without a valid full_path
    df_videos = df_videos[df_videos["full_path"].notna()]
    if df_videos.empty:
        raise ValueError("No valid video paths to enrich.")

    # *** ENRICH with new script ***
    df_enriched = enrich_dataframe(df_videos)
    print(f"✨ Enriched {len(df_enriched)} videos with AI+YOLO metadata.")

    # Preview
    preview_cols = [
        "filename", "AI_Description", "AI_Keywords", "YOLO_Objects", "Hybrid_Description"
    ]
    display(df_enriched[preview_cols].head())

except FileNotFoundError as e:
    print(f"❌ Inventory not found: {e}")
except Exception as e:
    print(f"🚨 Unexpected error while loading inventory:\n{e}")

ImportError: cannot import name 'enrich_dataframe' from 'scripts.enrichment' (C:\Users\david\JupyterLab\stock-video-metadata\scripts\enrichment.py)

In [13]:
# Cell 4 Modular Script
from scripts.enrichment import VideoEnricher
enricher = VideoEnricher()
df = pd.read_csv("video_inventory.csv")
df_enriched = enricher.enrich_dataframe(df, enriched_csv="enriched_videos.csv")
df_enriched.to_csv("enriched_videos.csv", index=False)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

FileNotFoundError: [Errno 2] No such file or directory: 'video_inventory.csv'

In [3]:
# Cell 4: 🚀 Enrich & Export
from scripts.enrichment import VideoEnricher

# instantiate & point at the CSV we just wrote
enricher = VideoEnricher()
inv_csv    = metadata_dir / "video_inventory.csv"
out_csv    = metadata_dir / "enriched_videos.csv"

# read, enrich, save
df = pd.read_csv(inv_csv)
df_enriched = enricher.enrich_dataframe(df, enriched_csv=str(out_csv))
df_enriched.to_csv(out_csv, index=False)

print(f"✅ Wrote enriched metadata to: {out_csv}")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

✅ Wrote enriched metadata to: C:\Users\david\JupyterLab\stock-video-metadata\metadata\enriched_videos.csv


## 📤 Optional — Save the enriched version

If enrich_dataframe() runs successfully, you can persist results:

In [8]:
(df_videos if df_videos is not None else df_inventory).to_csv(metadata_dir / "enriched_videos.csv", index=False)

## Modular Cell 5

In [9]:
# Cell 5: Export BlackBox‐Ready CSV + XML
from scripts.export import export_blackbox_csv, export_blackbox_xml

# 1) Export the three‐column CSV
csv_path = metadata_dir / "blackbox_metadata.csv"
export_blackbox_csv(df_videos, csv_path)

# 2) Export one metadata.xml per batch, creating any missing batch folders
export_blackbox_xml(df_videos, batches_root)

KeyError: "None of [Index(['Filename', 'Description', 'Keywords'], dtype='object')] are in the [columns]"

In [10]:
# (Ignore for Modular `script`) Cell 5: Export BlackBox‐Ready CSV + XML (Safe, Cross-Platform)

csv_path = metadata_dir / "blackbox_metadata.csv"
xml_errors = []

try:
    export_blackbox_csv(df_videos, csv_path)
    print(f"✅ Exported BlackBox CSV:\n📄 {csv_path}")
except Exception as e:
    print(f"❌ Failed to export CSV: {e}")

try:
    # Ensure all XML batch subdirs exist before writing
    if not batches_root.exists():
        raise FileNotFoundError(f"Batches root not found: {batches_root}")
    
    # Patch export_blackbox_xml to create dirs if needed
    from xml.etree import ElementTree as ET
    from scripts.export import export_blackbox_xml as original_export_xml

    def export_blackbox_xml_safe(df, batches_root):
        from collections import defaultdict
        from pathlib import Path
        import xml.etree.ElementTree as ET

        grouped = df.groupby("batch")
        for batch, group in grouped:
            batch_dir = batches_root / batch
            if not batch_dir.exists():
                try:
                    batch_dir.mkdir(parents=True, exist_ok=True)
                    print(f"📁 Created missing batch directory: {batch_dir}")
                except Exception as e:
                    xml_errors.append((batch, str(e)))
                    continue
            
            root = ET.Element("videos")
            for _, row in group.iterrows():
                video_el = ET.SubElement(root, "video")
                ET.SubElement(video_el, "filename").text = row.get("filename", "")
                ET.SubElement(video_el, "description").text = row.get("Description", "")
                ET.SubElement(video_el, "keywords").text = row.get("Keywords", "")
            
            xml_path = batch_dir / "metadata.xml"
            try:
                tree = ET.ElementTree(root)
                tree.write(str(xml_path), encoding="utf-8", xml_declaration=True)
                print(f"✅ Exported XML for {batch}: {xml_path}")
            except Exception as e:
                xml_errors.append((batch, str(e)))

    # Call patched exporter
    export_blackbox_xml_safe(df_videos, batches_root)

    if xml_errors:
        print(f"\n⚠️ XML export completed with {len(xml_errors)} errors:")
        for batch, err in xml_errors:
            print(f" - Batch `{batch}`: {err}")

except Exception as e:
    print(f"🚨 Failed during XML export: {e}")
    print("💡 Ensure `batches_root` is accessible and batch folders exist or can be created.")

❌ Failed to export CSV: "None of [Index(['Filename', 'Description', 'Keywords'], dtype='object')] are in the [columns]"
🚨 Failed during XML export: 'batch'
💡 Ensure `batches_root` is accessible and batch folders exist or can be created.


# Modular Cell 6

In [6]:
# Upload or ZIP per batch
from upload import upload_batch_or_zip
zip_dir = metadata_dir/"blackbox_uploads"
for batch_name in df_videos['batch_name'].unique():
    batch_path = batches_root/batch_name
    upload_batch_or_zip(batch_path, zip_dir)

📦 Created ZIP for manual upload: /private/var/mobile/Containers/Shared/AppGroup/08003265-A677-4358-B939-7E86CE490040/File Provider Storage/Repositories/cda_ASSETS/blackbox-stock-video-metadata/metadata/blackbox_uploads/well_pump.zip


In [7]:
# (Ignore for modular scripts) Cell 6: Upload All Batches to BlackBox SFTP
for batch_name in df_videos['batch_name'].unique():
    upload_batch(batches_root / batch_name)

print("🎉 Upload complete for all batches.")

NameError: name 'upload_batch' is not defined