In [None]:
# Mirror everything under the wcs-unzipped prefix into ./wcs_dataset_all.
# The wildcard makes this a multi-source copy, and gsutil expects the
# destination of a multi-source copy to be an existing directory, so create it
# first (mkdir -p is a no-op when it is already there).
mkdir -p ./wcs_dataset_all
gsutil -m cp -r "gs://public-datasets-lila/wcs-unzipped/*" ./wcs_dataset_all

In [None]:
# Stream wcs_camera_traps.json and report the 50 most frequent category IDs.
# The heredoc delimiter is quoted so the shell passes the Python source through
# verbatim (no $ or backtick expansion).
python3 - <<'EOF'
import ijson
from collections import defaultdict

# Count category occurrences from local file.
# ijson yields one annotation at a time, so the whole file is never held in memory.
category_counts = defaultdict(int)
with open("wcs_camera_traps.json", "rb") as f:  # 'rb' mode for ijson
    for ann in ijson.items(f, "annotations.item"):
        category_counts[ann["category_id"]] += 1
        # Running tally, one line per annotation.
        # NOTE(review): assumes every annotation carries a 'category_name' key — confirm
        # against the metadata, otherwise this line raises KeyError.
        print(f"{ann['category_name']}:{category_counts[ann['category_id']]}")

# Get top 50 categories (most frequent first)
top_50 = sorted(category_counts.items(), key=lambda x: -x[1])[:50]
top_50_ids = [cat_id for cat_id, _ in top_50]

print("Top 50 category IDs:", top_50_ids)
EOF

In [None]:
import json

# Load annotation data (a dict with "annotations" and "images" lists).
with open("wcs_camera_traps.json") as f:
    data = json.load(f)

# Define target category IDs
target_category_ids = {
    2, 372, 71, 96, 111, 374, 3, 115, 10,
    317, 90, 11, 8, 468, 24
}

# Step 1: Get image IDs that have at least one annotation with a target category ID
target_image_ids = {
    ann["image_id"]
    for ann in data["annotations"]
    if ann["category_id"] in target_category_ids
}

# Step 2: Map image_id -> file name
id_to_filename = {img["id"]: img["file_name"] for img in data["images"]}

# Step 3: Create list of full GCS paths.
# The URL is the bucket prefix followed directly by file_name, exactly as the
# other WCS cells in this notebook build it. The downloader strips this same
# prefix and flattens "/" to "_", and the organizer looks files up as
# file_name.replace("/", "_"); those two names only match when nothing extra is
# inserted between the prefix and file_name.
# NOTE(review): assumes file_name already includes its top-level folder
# (e.g. "animals/...") — confirm against the metadata.
gcs_prefix = "gs://public-datasets-lila/wcs-unzipped/"
# Sorted so the output file is identical from run to run (set order is not).
output_paths = sorted(
    f"{gcs_prefix}{id_to_filename[img_id]}"
    for img_id in target_image_ids
    if img_id in id_to_filename
)

# Step 4: Save to a file, one path per line
with open("target_animal_image_paths.txt", "w") as f:
    f.writelines(path + "\n" for path in output_paths)

print(f"Saved {len(output_paths)} image paths.")


In [None]:
import os
import subprocess
from concurrent.futures import ThreadPoolExecutor, as_completed

# Text file read line by line; each non-blank line is handed to `gsutil cp` as the source.
INPUT_FILE = "target_animal_image_paths.txt"
# Single flat directory that receives every downloaded file.
OUTPUT_DIR = "downloaded_images"
MAX_WORKERS = 10  # Threads, adjust depending on bandwidth/CPU

# Make sure the download directory exists before any worker writes into it.
os.makedirs(OUTPUT_DIR, exist_ok=True)

def download_image(line):
    """Fetch one object with ``gsutil cp`` and describe what happened.

    Args:
        line: One raw line from the path list; surrounding whitespace is ignored.

    Returns:
        ``None`` for a blank line, otherwise a one-line status string
        (skipped, downloaded, failed, or error). Exceptions are reported in the
        returned string rather than raised.
    """
    line = line.strip()
    if line == "":
        return None

    # Flatten the object path (bucket prefix removed) into a single file name.
    safe_name = line.replace(
        "gs://public-datasets-lila/wcs-unzipped/", ""
    ).replace("/", "_")
    destination = os.path.join(OUTPUT_DIR, safe_name)

    # Anything already on disk is left untouched.
    if os.path.exists(destination):
        return f"✔ Skipped: {safe_name}"

    try:
        result = subprocess.run(
            ["gsutil", "cp", line, destination],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE
        )
        if result.returncode != 0:
            return f"❌ Failed: {safe_name} — {result.stderr.decode().strip()}"
        return f"✅ Downloaded: {safe_name}"
    except Exception as e:
        return f"💥 Error: {safe_name} — {str(e)}"

def main():
    """Download every path listed in INPUT_FILE using a pool of MAX_WORKERS threads.

    Status strings are printed in completion order; blank input lines produce
    no output.
    """
    with open(INPUT_FILE, "r") as path_list:
        entries = path_list.readlines()

    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        pending = [pool.submit(download_image, entry) for entry in entries]
        # as_completed yields each future as soon as its download finishes.
        statuses = (finished.result() for finished in as_completed(pending))
        for status in filter(None, statuses):
            print(status)

if __name__ == "__main__":
    main()


In [None]:
import json
import os
import shutil
from collections import defaultdict

# === Paths ===
coco_json_path = "wcs_camera_traps.json"   # annotation metadata
download_dir = "downloaded_images"         # flat folder of downloaded files
output_dir = "organized_by_species"        # one sub-folder per species is created here

# === Load JSON ===
with open(coco_json_path, 'r') as f:
    data = json.load(f)

# === Map categories and images ===
category_id_to_name = {}
for category in data["categories"]:
    category_id_to_name[category["id"]] = category["name"]

# Image paths are flattened ("/" -> "_") to match the names used in download_dir.
image_id_to_file = {}
for image in data["images"]:
    image_id_to_file[image["id"]] = image["file_name"].replace("/", "_")

# === Map file_name to species ===
file_to_species = defaultdict(list)

for annotation in data["annotations"]:
    flat_name = image_id_to_file.get(annotation["image_id"])
    label = category_id_to_name.get(annotation["category_id"])
    # Ignore annotations whose image or category cannot be resolved.
    if not (flat_name and label):
        continue
    file_to_species[flat_name].append(label)

# === Organize files ===
os.makedirs(output_dir, exist_ok=True)

for file_name, labels in file_to_species.items():
    source = os.path.join(download_dir, file_name)
    if os.path.exists(source):
        # A file annotated with several species is copied into each species folder once.
        for label in set(labels):
            species_dir = os.path.join(output_dir, label)
            os.makedirs(species_dir, exist_ok=True)
            shutil.copy2(source, os.path.join(species_dir, file_name))
            print(f"📦 Copied {file_name} to {species_dir}")
    else:
        print(f"⚠️ Skipping {file_name} (not found)")

print("✅ Done organizing by species!")


In [None]:
import pandas as pd
from tqdm import tqdm
import os

# Paths
metadata_csv_path = 'na.csv'
output_txt_path = 'files_to_download.txt'
base_path_in_bucket = 'nacti-unzipped/'

# Read the metadata CSV (the 'common_name' and 'filename' columns are used below)
print(f"Reading metadata from {metadata_csv_path}...")
metadata_df = pd.read_csv(metadata_csv_path)
print(f"Metadata loaded: {len(metadata_df)} rows.")

# Initialize tracking: how many files have been selected per species so far
species_download_count = {species: 0 for species in metadata_df['common_name'].unique()}
max_images_per_species = 5000
# Number of species still below the cap. Keeping this counter avoids re-scanning
# every species count on every row just to decide whether to stop.
species_below_cap = len(species_download_count)

# Prepare output file
count_total = 0
with open(output_txt_path, 'w') as f:
    # Only two columns are needed, so iterate them directly rather than
    # materialising a Series per row with iterrows().
    rows = zip(metadata_df['common_name'], metadata_df['filename'])
    # tqdm progress bar
    for common_name, filename in tqdm(rows, total=len(metadata_df), desc="Processing images"):
        # Rows are taken in file order: the first max_images_per_species rows of each species.
        if species_download_count[common_name] < max_images_per_species:
            full_path = 'gs://public-datasets-lila/' + base_path_in_bucket + filename
            f.write(full_path + '\n')
            species_download_count[common_name] += 1
            count_total += 1
            if species_download_count[common_name] == max_images_per_species:
                species_below_cap -= 1

        # Stop early if we already have enough images for every species
        if species_below_cap == 0:
            print(f"Reached target of {max_images_per_species} images per species. Stopping early.")
            break

print(f"\nFinished. Total images selected: {count_total}")
print(f"File list saved to {output_txt_path}")


In [None]:
# Bulk-download every URL in files_to_download.txt; -I makes gsutil read the
# source list from stdin. A multi-source copy expects the destination to be an
# existing directory, so create it first (no-op if it already exists).
mkdir -p lila_species_subset
gsutil -m cp -I < files_to_download.txt lila_species_subset/


In [None]:
import os
import pandas as pd
import shutil
from tqdm import tqdm

# Paths
metadata_csv_path = 'na.csv'                      # metadata with 'filename' and 'common_name' columns
downloaded_images_path = 'lila_species_subset'    # flat folder of downloaded files
organized_dataset_path = 'lila_species_organized' # receives one sub-folder per species

# Load metadata
print(f"Loading metadata from {metadata_csv_path}...")
metadata_df = pd.read_csv(metadata_csv_path)
print(f"Loaded {len(metadata_df)} metadata entries.")

# Index species by bare file name (directory part dropped); when two rows share
# a bare name, the later row wins.
filename_to_species = {}
for bucket_path, label in zip(metadata_df['filename'], metadata_df['common_name']):
    filename_to_species[os.path.basename(bucket_path)] = label

# Make sure the output directory exists
os.makedirs(organized_dataset_path, exist_ok=True)

# List all files to organize
all_files = os.listdir(downloaded_images_path)
print(f"Found {len(all_files)} downloaded images to organize.")

# Move each file into its species folder, with a tqdm progress bar
for filename in tqdm(all_files, desc="Organizing images"):
    species_name = filename_to_species.get(filename)
    if not species_name:
        print(f"Warning: {filename} not found in metadata.")
        continue

    # Create the species folder on first use
    species_folder = os.path.join(organized_dataset_path, species_name)
    os.makedirs(species_folder, exist_ok=True)

    # Move (not copy) the file out of the download folder
    shutil.move(
        os.path.join(downloaded_images_path, filename),
        os.path.join(species_folder, filename),
    )

print("\n✅ Done organizing files by species!")


In [None]:
import json
import os
from tqdm import tqdm

def main():
    """Write ``download_commands.sh``, a bash script that downloads WCS images by species.

    Reads ``wcs_camera_traps.json``, keeps the images annotated with one of the
    listed species, and emits one ``gsutil -m cp -I <species_folder>/`` command
    per species with its source URLs supplied on stdin through a heredoc.

    ``gsutil cp`` accepts many sources but only one destination per invocation,
    so source/destination pairs cannot be chained in a single command; grouping
    by destination folder keeps the parallel ``-m`` copy and also avoids
    command-line length limits.

    An image annotated with several listed species is assigned to whichever of
    those annotations appears last in the file.
    """
    # Load annotation data
    with open("wcs_camera_traps.json") as f:
        data = json.load(f)

    # Define the top 50 species (excluding empty/human/unknown)
    top_50_species = [
        "tayassu pecari", "meleagris ocellata", "bos taurus", "aepyceros melampus",
        "equus quagga", "crax rubra", "dasyprocta punctata", "madoqua guentheri",
        "leopardus pardalis", "cephalophus nigrifrons", "loxodonta africana",
        "mitu tuberosum", "pecari tajacu", "didelphis pernigra", "panthera onca",
        "giraffa camelopardalis", "psophia leucoptera", "tapirus terrestris",
        "mazama americana", "puma concolor", "cuniculus paca", "urocyon cinereoargenteus",
        "syncerus caffer", "dasyprocta leporina", "mazama temama", "muntiacus muntjak",
        "sylvilagus brasiliensis", "coua serriana", "capra aegagrus", "tapirus bairdii",
        "papio anubis", "macaca nemestrina", "cricetomys gambianus", "didelphis sp",
        "tragelaphus oryx", "agouti paca", "cercopithecus lhoesti", "equus grevyi",
        "penelope jacquacu", "nanger granti", "crocuta crocuta", "equus ferus",
        "eira barbara", "dasypus novemcinctus", "argusianus argus", "alectoris rufa",
        "psophia crepitans", "francolinus nobilis", "didelphis marsupialis", "nasua narica"
    ]
    wanted_species = set(top_50_species)  # set for O(1) membership tests

    # Step 1: Map category_id -> folder name (spaces become underscores).
    # Folder names come only from the list above, so they contain nothing that
    # needs shell quoting in the generated script.
    category_id_to_species = {
        category["id"]: category["name"].replace(" ", "_")
        for category in tqdm(data["categories"], desc="Processing categories")
        if category["name"] in wanted_species
    }

    # Step 2: Get image IDs and their species (last matching annotation wins)
    image_id_to_species = {}
    for ann in tqdm(data["annotations"], desc="Processing annotations"):
        if ann["category_id"] in category_id_to_species:
            image_id_to_species[ann["image_id"]] = category_id_to_species[ann["category_id"]]

    # Step 3: Map image_id -> file_name
    id_to_filename = {img["id"]: img["file_name"] for img in tqdm(data["images"], desc="Processing images")}

    # Step 4: Group source URLs by destination species folder
    urls_by_folder = {}
    for img_id, species_folder in tqdm(image_id_to_species.items(), desc="Generating commands"):
        if img_id in id_to_filename:
            file_path = id_to_filename[img_id]
            gcs_path = f"gs://public-datasets-lila/wcs-unzipped/{file_path}"
            urls_by_folder.setdefault(species_folder, []).append(gcs_path)

    # Step 5: Save commands to a file
    with open("download_commands.sh", "w") as f:
        f.write("#!/bin/bash\n")

        for folder in sorted(urls_by_folder):
            # The destination directory must exist before a multi-source copy.
            f.write(f"mkdir -p {folder}\n")
            # -I reads the source URLs from stdin; the quoted delimiter stops
            # the shell from expanding anything inside the URL list.
            # NOTE(review): each object lands in the folder under its base name
            # only, so objects sharing a base name within one species overwrite
            # each other — confirm base names are unique per species.
            f.write(f"gsutil -m cp -I {folder}/ <<'EOF'\n")
            f.writelines(url + "\n" for url in urls_by_folder[folder])
            f.write("EOF\n")

    download_count = sum(len(urls) for urls in urls_by_folder.values())
    print(f"✅ Saved {download_count} download commands to 'download_commands.sh'.")
    print("Run this script with: bash download_commands.sh")

if __name__ == "__main__":
    main()

In [None]:
import os
import json
import random
import requests
from tqdm import tqdm
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

# Configuration
# Path of the annotation JSON.
# NOTE(review): other cells open "wcs_camera_traps.json" directly; this path expects
# the file inside a directory of the same name — confirm which layout is on disk.
METADATA_FILE = "wcs_camera_traps.json/wcs_camera_traps.json"
# HTTPS prefix; an image's file_name is appended to it to form the download URL.
DOWNLOAD_BASE = "https://storage.googleapis.com/public-datasets-lila/wcs-unzipped/"
# Number of threads in the download pool.
NUM_WORKERS = 8
# How many of the most-annotated species are kept.
TOP_N_SPECIES = 50
# Upper bound on the images sampled for each species.
MAX_IMAGES_PER_SPECIES = 20000
# Download attempts per image before it is reported as failed.
MAX_RETRIES = 3

def load_metadata():
    """Parse the JSON document stored at METADATA_FILE and return it."""
    with open(METADATA_FILE, 'r') as metadata_handle:
        metadata = json.load(metadata_handle)
    return metadata

def main():
    """Sample images for the most-annotated species and download them over HTTPS.

    Steps:
      1. Load the metadata and index image IDs by species name.
      2. Keep the TOP_N_SPECIES species with the most annotations.
      3. For each, sample up to MAX_IMAGES_PER_SPECIES distinct images and queue
         them under ``wcs_download/<species_folder>/<file_name>``.
      4. Download the queue with NUM_WORKERS threads, then print per-species
         totals of files on disk versus files queued.

    Sampling uses a fixed seed so a re-run selects the same images; together
    with the skip-if-present check this lets an interrupted run resume instead
    of fetching a fresh random subset.

    Raises:
        KeyError: if an annotation references a category ID missing from
            ``data["categories"]``.
    """
    print("Loading metadata...")
    data = load_metadata()

    print("Building species index...")
    species_images = defaultdict(list)
    category_map = {c["id"]: c["name"] for c in data["categories"]}
    image_id_to_file = {img["id"]: img["file_name"] for img in data["images"]}

    for ann in tqdm(data["annotations"], desc="Indexing"):
        species_name = category_map[ann["category_id"]]
        species_images[species_name].append(ann["image_id"])

    print("Selecting top species...")
    # Ranked by number of annotations, most first.
    top_species = sorted(
        [(k, len(v)) for k, v in species_images.items()],
        key=lambda x: -x[1]
    )[:TOP_N_SPECIES]

    # Private generator with a fixed seed: reproducible and independent of the
    # global `random` state.
    rng = random.Random(0)

    download_queue = []
    species_counts = defaultdict(int)

    for species, _ in tqdm(top_species, desc="Sampling"):
        folder_name = species.replace(" ", "_").lower()
        base_species_path = os.path.join("wcs_download", folder_name)
        os.makedirs(base_species_path, exist_ok=True)

        # Drop repeated image IDs (order preserved) so one file is never queued
        # twice, and keep only IDs that have a known file name so the cap is
        # spent on downloadable images.
        candidate_ids = [
            img_id
            for img_id in dict.fromkeys(species_images[species])
            if img_id in image_id_to_file
        ]
        image_ids = rng.sample(candidate_ids, min(MAX_IMAGES_PER_SPECIES, len(candidate_ids)))
        for img_id in image_ids:
            img_file = image_id_to_file[img_id]
            url = f"{DOWNLOAD_BASE}{img_file}"
            sub_path = os.path.normpath(img_file)
            dest_path = os.path.join(base_species_path, sub_path)
            os.makedirs(os.path.dirname(dest_path), exist_ok=True)
            download_queue.append((url, dest_path, folder_name))
            species_counts[folder_name] += 1

    def download_file(args):
        """Download one (url, dest_path, folder_name) item; return True on success."""
        url, dest_path, folder_name = args
        if os.path.exists(dest_path):
            return True
        # Stream into a sibling temp file and rename only once complete, so an
        # interrupted transfer never leaves a truncated file that a later run
        # would mistake for a finished download.
        tmp_path = dest_path + ".part"
        for attempt in range(MAX_RETRIES):
            try:
                # The context manager releases the connection even on error.
                with requests.get(url, stream=True, timeout=30) as response:
                    response.raise_for_status()
                    with open(tmp_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            f.write(chunk)
                os.replace(tmp_path, dest_path)
                return True
            except Exception as e:
                if attempt == MAX_RETRIES - 1:
                    print(f"Failed to download {url}: {e}")
        # All attempts failed: remove any partial data.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        return False

    print(f"\nDownloading {len(download_queue)} images...")
    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        results = list(tqdm(
            executor.map(download_file, download_queue),
            total=len(download_queue),
            desc="Progress"
        ))

    failed = sum(1 for ok in results if not ok)
    if failed:
        print(f"\n{failed} downloads failed after {MAX_RETRIES} attempts each.")

    print("\nDownload Summary:")
    for species, count in species_counts.items():
        species_path = os.path.join("wcs_download", species)
        # Files on disk (all sub-folders) versus files queued for this species.
        total_files = sum([len(files) for _, _, files in os.walk(species_path)])
        print(f"{species:20s}: {total_files:5d}/{min(MAX_IMAGES_PER_SPECIES, count):5d}")

if __name__ == "__main__":
    main()
