In [None]:
# Bulk-copy everything under the public LILA WCS prefix into ./wcs_dataset_all.
# -m runs the transfers in parallel and -r recurses into sub-directories; the URL is
# quoted so the trailing * is expanded by gsutil rather than by the local shell.
# NOTE(review): with several sources gsutil presumably needs ./wcs_dataset_all to
# already exist as a directory — confirm before running on a fresh machine.
gsutil -m cp -r "gs://public-datasets-lila/wcs-unzipped/*" ./wcs_dataset_all

In [None]:
python3 - <<'EOF'
# The heredoc delimiter is quoted so the shell passes this Python source through
# verbatim (no $variable or backtick expansion inside the script).
import ijson
from collections import defaultdict

# Stream the annotations one at a time instead of loading the whole file, and
# count how many annotations reference each category ID.
category_counts = defaultdict(int)
with open("wcs_camera_traps.json", "rb") as f:  # 'rb' mode for ijson
    for ann in ijson.items(f, "annotations.item"):
        # Only category_id is read here; nothing is printed per annotation so the
        # output stays small no matter how many annotations the file holds.
        category_counts[ann["category_id"]] += 1

# Rank categories by annotation count (descending) and keep the 20 largest.
top_20 = sorted(category_counts.items(), key=lambda x: -x[1])[:20]
top_20_ids = [cat_id for cat_id, _ in top_20]

# One summary line per top category: "<category_id>:<annotation count>".
for cat_id, count in top_20:
    print(f"{cat_id}:{count}")

print("Top 20 category IDs:", top_20_ids)
EOF

In [None]:
import json

# Load the annotation file. JSON text is UTF-8, so read it as such instead of
# relying on the platform's default encoding.
with open("wcs_camera_traps.json", encoding="utf-8") as f:
    data = json.load(f)

# Category IDs whose images should be exported for download.
target_category_ids = {
    2, 372, 71, 96, 111, 374, 3, 115, 10,
    317, 90, 11, 8, 468, 24
}

# Step 1: IDs of images that have at least one annotation in a target category.
target_image_ids = {
    ann["image_id"]
    for ann in data["annotations"]
    if ann["category_id"] in target_category_ids
}

# Step 2: Map image_id -> file name
id_to_filename = {img["id"]: img["file_name"] for img in data["images"]}

# Step 3: Build the full GCS path for every target image that has an image record.
# The list is sorted so the output file is identical from run to run; iterating
# the set directly would make the line order depend on hash ordering.
gcs_prefix = "gs://public-datasets-lila/wcs-unzipped/animals/"
output_paths = sorted(
    gcs_prefix + id_to_filename[img_id]
    for img_id in target_image_ids
    if img_id in id_to_filename
)

# Report annotated images that have no matching entry in data["images"] rather
# than dropping them without a trace.
missing_count = len(target_image_ids) - len(output_paths)
if missing_count:
    print(f"Warning: {missing_count} target image IDs have no image record and were skipped.")

# Step 4: Save one path per line.
with open("target_animal_image_paths.txt", "w") as f:
    f.writelines(path + "\n" for path in output_paths)

print(f"Saved {len(output_paths)} image paths.")


In [None]:
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

# Text file that main() reads; each non-empty line is passed to gsutil as a source URL.
INPUT_FILE = "target_animal_image_paths.txt"
# Local directory that receives every downloaded image (flat, no sub-directories).
OUTPUT_DIR = "downloaded_images"
MAX_WORKERS = 10  # Threads, adjust depending on bandwidth/CPU

# Create the download directory up front; exist_ok makes re-running this cell harmless.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def download_image(line):
    """Copy a single GCS object into OUTPUT_DIR using ``gsutil cp``.

    Args:
        line: One line from the path list; surrounding whitespace is ignored.

    Returns:
        None for a blank line, otherwise a one-line status message saying
        whether the file was skipped (already present), downloaded, or failed.
        Failures are reported in the message and never raised.
    """
    gcs_url = line.strip()
    if gcs_url == "":
        return None

    # Flatten the object path below the bucket prefix into a single file name.
    flat_name = gcs_url.replace(
        "gs://public-datasets-lila/wcs-unzipped/", ""
    ).replace("/", "_")
    target = os.path.join(OUTPUT_DIR, flat_name)

    # A file that is already on disk is not fetched again.
    if os.path.exists(target):
        return f"✔ Skipped: {flat_name}"

    command = ["gsutil", "cp", gcs_url, target]
    try:
        proc = subprocess.run(
            command,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        if proc.returncode != 0:
            return f"❌ Failed: {flat_name} — {proc.stderr.decode().strip()}"
        return f"✅ Downloaded: {flat_name}"
    except Exception as exc:
        # Covers e.g. gsutil not being installed or undecodable stderr.
        return f"💥 Error: {flat_name} — {str(exc)}"

def main():
    """Download every path listed in INPUT_FILE using a pool of worker threads.

    Each line is handed to download_image; status messages are printed in the
    order the downloads finish, and blank lines (which yield no message) are
    silently ignored.
    """
    with open(INPUT_FILE, "r") as path_file:
        gcs_lines = list(path_file)

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = []
        for gcs_line in gcs_lines:
            pending.append(pool.submit(download_image, gcs_line))

        # as_completed yields each future as soon as its download finishes.
        for finished in as_completed(pending):
            message = finished.result()
            if not message:
                continue
            print(message)

# Start the downloads only when this code runs as the top-level program,
# not when it is imported from another module.
if __name__ == "__main__":
    main()


In [None]:
import json
import os
import shutil
from collections import defaultdict

# === Paths ===
coco_json_path = "wcs_camera_traps.json"   # annotation file (images / annotations / categories)
download_dir = "downloaded_images"         # flat directory holding the downloaded images
output_dir = "organized_by_species"        # one sub-directory per species is created here

# Downloaded files are named after their object path below
# gs://public-datasets-lila/wcs-unzipped/, i.e. "animals/<file_name>" with every
# "/" replaced by "_". The lookup name built here must use the same scheme,
# otherwise no downloaded file is ever found.
gcs_subdir = "animals"

# === Load JSON ===
with open(coco_json_path, 'r') as f:
    data = json.load(f)

# === Map categories and images ===
category_id_to_name = {cat["id"]: cat["name"] for cat in data["categories"]}
image_id_to_file = {
    img["id"]: f"{gcs_subdir}/{img['file_name']}".replace("/", "_")
    for img in data["images"]
}

# === Map file_name to species ===
# An image can carry several annotations, so each file maps to a list of species.
file_to_species = defaultdict(list)

for ann in data["annotations"]:
    file_name = image_id_to_file.get(ann["image_id"])
    species = category_id_to_name.get(ann["category_id"])
    # Annotations pointing at an unknown image or category are ignored.
    if file_name and species:
        file_to_species[file_name].append(species)

# === Organize files ===
os.makedirs(output_dir, exist_ok=True)

for file_name, species_list in file_to_species.items():
    src_path = os.path.join(download_dir, file_name)
    if not os.path.exists(src_path):
        print(f"⚠️ Skipping {file_name} (not found)")
        continue

    # De-duplicate so an image is copied once per species; sorted only to keep
    # the log order stable between runs.
    for species in sorted(set(species_list)):
        species_dir = os.path.join(output_dir, species)
        os.makedirs(species_dir, exist_ok=True)
        dst_path = os.path.join(species_dir, file_name)
        shutil.copy2(src_path, dst_path)
        print(f"📦 Copied {file_name} to {species_dir}")

print("✅ Done organizing by species!")
