In [1]:
import pandas as pd, random, sys
from collections import defaultdict, Counter

In [None]:
# === PARAMS (edit these) ======================================================
# NOTE(review): INPUT_CSV names a .ipynb notebook although it is handed to
# pd.read_csv further down — presumably it should point at the metadata CSV;
# confirm the intended path before running.
INPUT_CSV = "C:/Users/chris/Desktop/Documents/Code/MultimodalDataChallenge2025/generatemetadatafile.ipynb"
OUTPUT_CSV = "shopping_list.csv"  # where the upload file is written
PER_CLASS = 30  # how many images to request per class, per metadata type
TYPES = [
    "Latitude",
    "Longitude",
    "Habitat",
    "Substrate",
    "eventDate",
]
SEED = 42  # passed to random.seed() so the selection is reproducible
# ============================================================================

# Price in credits of one item of each metadata type (used by the cost report).
COSTS = {
    "Latitude": 1,
    "Longitude": 1,
    "Habitat": 2,
    "Substrate": 2,
    "eventDate": 2,
}

def _norm_empty(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    return "" if s.lower() in {"", "nan", "none"} else s

# Load & sanity checks
df = pd.read_csv(INPUT_CSV)

# Metadata columns to validate and normalize: the five standard ones plus any
# additional type configured in TYPES.  dict.fromkeys() drops duplicates while
# keeping order, so the default TYPES yields exactly the standard five.
metadata_cols = list(dict.fromkeys(
    ["Habitat", "Latitude", "Longitude", "Substrate", "eventDate", *TYPES]
))
required_cols = ["filename_index", "taxonID_index", *metadata_cols]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

# Normalize empties.  The sampling step picks rows whose value equals "", so
# every sampled column has to pass through _norm_empty first; otherwise NaN
# cells would never match and the pool for that type would silently be empty.
for col in metadata_cols:
    df[col] = df[col].map(_norm_empty)

# Sampling
# Seed the global RNG once so repeated runs pick the same images.
random.seed(SEED)
shopping = []                      # (filename_index, metadata_type) pairs
shortfalls = defaultdict(Counter)  # class -> {metadata_type: missing count}

for cls, group in df.groupby("taxonID_index", sort=True):
    for mtype in TYPES:
        # Candidates are the rows of this class whose value for mtype is empty.
        candidates = group.loc[group[mtype] == "", "filename_index"].tolist()
        deficit = PER_CLASS - len(candidates)
        if deficit > 0:
            # Not enough empties: take them all and record how many are lacking.
            shortfalls[cls][mtype] = deficit
            picked = candidates
        else:
            picked = random.sample(candidates, PER_CLASS)
        shopping += [(fname, mtype) for fname in picked]

# Save CSV (no header): "image,metadata_type"
# The column names only label the in-memory frame; header=False keeps them
# out of the written file.
out = pd.DataFrame(data=shopping, columns=["image", "metadata_type"])
out.to_csv(path_or_buf=OUTPUT_CSV, header=False, index=False)

# Report
# Number of selected items per metadata type.
counts = Counter(kind for _fname, kind in shopping)
classes = df["taxonID_index"].nunique()
target_per_type = PER_CLASS * classes

# Cost if every class had supplied PER_CLASS items for every type, versus the
# cost of what was actually selected.
target_cost = sum(COSTS[t] * target_per_type for t in TYPES)
selected_cost = sum(n * COSTS[t] for t, n in counts.items())

report_lines = [
    f"Classes: {classes}",
    f"Selected per type: {dict(counts)}",
    f"Estimated cost (selected): {selected_cost} credits",
    f"Ideal target cost (no shortfalls): {target_cost} credits",
]
print("\n".join(report_lines))

# Shortfalls summary (only show types with deficits)
# At most this many classes are listed; raise it for a fuller dump.
MAX_SHORTFALL_ROWS = 15
missing_total = sum(sum(c.values()) for c in shortfalls.values())
if missing_total > 0:
    print("\nShortfalls (couldn't find enough empties):")
    # Collect every class that has at least one positive deficit, in sorted
    # class order, before printing so we know whether anything gets cut off.
    deficit_rows = []
    for cls in sorted(shortfalls):
        row = {k: v for k, v in shortfalls[cls].items() if v > 0}
        if row:
            deficit_rows.append((cls, row))

    for cls, row in deficit_rows[:MAX_SHORTFALL_ROWS]:
        print(f"  class {cls}: {row}")
    # Only claim truncation when rows were really omitted (previously the
    # marker was printed even when exactly MAX_SHORTFALL_ROWS rows existed).
    if len(deficit_rows) > MAX_SHORTFALL_ROWS:
        print("  … (truncated)")

# Peek at output
# NOTE(review): display is not imported in this file — presumably the
# IPython/Jupyter builtin; confirm if this is ever run as a plain script.
print("\nFirst 10 rows of shopping list:")
display(out.iloc[:10])
print(f"\nSaved to: {OUTPUT_CSV}")


SyntaxError: (unicode error) 'unicodeescape' codec can't decode bytes in position 2-3: truncated \UXXXXXXXX escape (241134964.py, line 2)