In [1]:
import pandas as pd, random, sys
from collections import defaultdict, Counter

In [6]:
# === PARAMS (edit these) ======================================================
INPUT_CSV  = "C:/Users/chris/Desktop/Documents/Code/MultimodalDataChallenge2025/metadata.csv"   # existing metadata CSV
OUTPUT_CSV = "shopping_list.csv"           # output upload file
PER_CLASS  = 30                            # how many per class per type
SEED       = 42

# Buy these solo types individually:
SOLO_TYPES = ["Habitat","Substrate","eventDate"]

# Treat this pair as a single decision: only buy if BOTH are missing on the same image:
PAIR       = ("Latitude","Longitude")
# ============================================================================

import pandas as pd, random
from collections import defaultdict, Counter

COSTS = {"Latitude":1, "Longitude":1, "Habitat":2, "Substrate":2, "eventDate":2}
PAIR_LABEL = f"{PAIR[0]}+{PAIR[1]}"

def _norm_empty(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    return "" if s.lower() in {"", "nan", "none"} else s

# Load & checks
df = pd.read_csv(INPUT_CSV)
required_cols = ["filename_index","taxonID_index","Habitat","Latitude","Longitude","Substrate","eventDate"]
missing = [c for c in required_cols if c not in df.columns]
if missing:
    raise ValueError(f"Missing columns in CSV: {missing}")

# Normalize empties
for col in ["Habitat","Latitude","Longitude","Substrate","eventDate"]:
    df[col] = df[col].map(_norm_empty)

random.seed(SEED)
shopping = []
shortfalls = defaultdict(Counter)

# Group by class and sample
for cls, g in df.groupby("taxonID_index", sort=True):
    # 1) Paired Latitude+Longitude: require BOTH missing
    pair_pool = g[(g[PAIR[0]] == "") & (g[PAIR[1]] == "")]["filename_index"].tolist()
    if len(pair_pool) < PER_CLASS:
        shortfalls[cls][PAIR_LABEL] = PER_CLASS - len(pair_pool)
        pair_chosen = pair_pool
    else:
        pair_chosen = random.sample(pair_pool, PER_CLASS)
    for fname in pair_chosen:
        shopping.append((fname, PAIR[0]))
        shopping.append((fname, PAIR[1]))

    # 2) Solo types (Habitat, Substrate, eventDate)
    for mtype in SOLO_TYPES:
        pool = g[g[mtype] == ""]["filename_index"].tolist()
        if len(pool) < PER_CLASS:
            shortfalls[cls][mtype] = PER_CLASS - len(pool)
            chosen = pool
        else:
            chosen = random.sample(pool, PER_CLASS)
        shopping.extend((fname, mtype) for fname in chosen)

# Save CSV (no header): "image,metadata_type"
out = pd.DataFrame(shopping, columns=["image","metadata_type"])
out.to_csv(OUTPUT_CSV, index=False, header=False)

# Reporting
counts = Counter(mt for _, mt in shopping)
classes = df["taxonID_index"].nunique()
target_per_type = PER_CLASS * classes
target_cost = (target_per_type * (COSTS[PAIR[0]] + COSTS[PAIR[1]])) \
            + sum(target_per_type * COSTS[t] for t in SOLO_TYPES)
selected_cost = sum(counts[t] * COSTS[t] for t in counts)

print(f"Classes: {classes}")
print("Selected per type:", dict(counts))
print(f"Estimated cost (selected): {selected_cost} credits")
print(f"Ideal target cost (no shortfalls): {target_cost} credits")

missing_total = sum(sum(c.values()) for c in shortfalls.values())
if missing_total > 0:
    print("\nShortfalls (couldn't find enough empties):")
    shown = 0
    for cls in sorted(shortfalls):
        row = {k:v for k,v in shortfalls[cls].items() if v>0}
        if row:
            print(f"  class {cls}: {row}")
            shown += 1
            if shown >= 15:
                print("  … (truncated)")
                break

print("\nFirst 10 rows of shopping list:")
display(out.head(5000))
print(f"\nSaved to: {OUTPUT_CSV}")


Classes: 183
Selected per type: {'Latitude': 5394, 'Longitude': 5394, 'Habitat': 5409, 'Substrate': 5402, 'eventDate': 5411}
Estimated cost (selected): 43232 credits
Ideal target cost (no shortfalls): 43920 credits

Shortfalls (couldn't find enough empties):
  class 36.0: {'Latitude+Longitude': 5, 'Habitat': 4, 'Substrate': 6, 'eventDate': 4}
  class 59.0: {'Latitude+Longitude': 4, 'Habitat': 5, 'Substrate': 3, 'eventDate': 3}
  class 89.0: {'Latitude+Longitude': 11, 'Habitat': 8, 'Substrate': 9, 'eventDate': 9}
  class 103.0: {'Latitude+Longitude': 6, 'Habitat': 3, 'Substrate': 6, 'eventDate': 3}
  class 108.0: {'Latitude+Longitude': 8, 'Habitat': 7, 'Substrate': 8, 'eventDate': 7}
  class 118.0: {'Latitude+Longitude': 5, 'Habitat': 5, 'Substrate': 5, 'eventDate': 5}
  class 132.0: {'Latitude+Longitude': 1, 'Habitat': 1, 'Substrate': 1, 'eventDate': 2}
  class 146.0: {'Latitude+Longitude': 8, 'Habitat': 6, 'Substrate': 6, 'eventDate': 6}
  class 148.0: {'Latitude+Longitude': 17, 'Habi

Unnamed: 0,image,metadata_type
0,fungi_train022085.jpg,Latitude
1,fungi_train022085.jpg,Longitude
2,fungi_train003606.jpg,Latitude
3,fungi_train003606.jpg,Longitude
4,fungi_train000863.jpg,Latitude
...,...,...
4995,fungi_train020889.jpg,Longitude
4996,fungi_train018582.jpg,Latitude
4997,fungi_train018582.jpg,Longitude
4998,fungi_train000985.jpg,Latitude



Saved to: shopping_list.csv
