In [1]:
# === PARAMS (edit) ============================================================
INPUT_CSV      = "C:/Users/chris/Desktop/Documents/Code/MultimodalDataChallenge2025/metadata.csv"      # has columns incl. filename_index, taxonID_index, Habitat, Latitude, Longitude, Substrate, eventDate
SHOPPING_CSV   = "C:/Users/chris/Desktop/Documents/Code/MultimodalDataChallenge2025/shopping_list.csv"              # two columns: image,metadata_type (no header)
PER_CLASS_GOAL = 30
# ============================================================================

import pandas as pd
from collections import Counter, defaultdict

COSTS = {"Latitude":1, "Longitude":1, "Habitat":2, "Substrate":2, "eventDate":2}
META_COLS = ["Habitat","Latitude","Longitude","Substrate","eventDate"]

def _norm_empty(x):
    if pd.isna(x): return ""
    s = str(x).strip()
    return "" if s.lower() in {"", "nan", "none"} else s

# Load data
df = pd.read_csv(INPUT_CSV)
df[META_COLS] = df[META_COLS].applymap(_norm_empty)

shop = pd.read_csv(SHOPPING_CSV, header=None, names=["image","metadata_type"])
# Validate metadata types
bad_types = set(shop["metadata_type"]) - set(META_COLS)
if bad_types:
    raise ValueError(f"Unknown metadata types in shopping list: {bad_types}")

# Map shopping list to classes
name_to_class = dict(zip(df["filename_index"], df["taxonID_index"]))
shop["taxonID_index"] = shop["image"].map(name_to_class)

# Basic validations
missing_in_master = shop[shop["taxonID_index"].isna()]
if not missing_in_master.empty:
    print("WARNING: These shopping-list images are not in the master CSV (ignored):")
    display(missing_in_master.head(10))
shop = shop.dropna(subset=["taxonID_index"])
shop["taxonID_index"] = shop["taxonID_index"].astype(int)

# Check we aren't buying something already present
present_checks = shop.merge(df[["filename_index"] + META_COLS], left_on="image", right_on="filename_index", how="left")
present_checks["already_present"] = present_checks.apply(lambda r: _norm_empty(r[r["metadata_type"]]) != "", axis=1)
bad_buys = present_checks[present_checks["already_present"]]
if not bad_buys.empty:
    print(f"WARNING: {len(bad_buys)} shopping entries request metadata that already exists (ignored in 'after' simulation).")
    display(bad_buys.head(10))

# Current counts per class
def counts_from_df(dataframe):
    grp = dataframe.groupby("taxonID_index")
    out = pd.DataFrame(index=sorted(dataframe["taxonID_index"].unique()))
    for col in META_COLS:
        out[col] = grp[col].apply(lambda s: (s != "").sum())
    return out.sort_index()

current_counts = counts_from_df(df)

# Simulate after-purchase counts (ignore buys where metadata already present)
to_apply = present_checks[~present_checks["already_present"]][["image","metadata_type","taxonID_index"]]
after_counts = current_counts.copy()
for (cls, mtype), n in to_apply.groupby(["taxonID_index","metadata_type"]).size().items():
    after_counts.loc[cls, mtype] = after_counts.loc[cls, mtype] + n

# Summaries
classes = df["taxonID_index"].nunique()
selected_counts = Counter(to_apply["metadata_type"])
selected_cost = sum(selected_counts[t] * COSTS[t] for t in selected_counts)

print(f"Classes: {classes}")
print("Selected (valid) purchases per type:", dict(selected_counts))
print(f"Estimated credits for valid purchases: {selected_cost}")

print("\nCurrent counts (first 10 classes):")
display(current_counts.head(10))

print("After-purchase counts (first 10 classes):")
display(after_counts.head(10))

# Goal check: how many NEW items per class per type (vs. PER_CLASS_GOAL)
goal_df = pd.DataFrame(index=after_counts.index)
for col in META_COLS:
    goal_df[col] = (after_counts[col] - current_counts[col]).clip(lower=0)
print("\nNewly added per class (should be close to goal where possible):")
display(goal_df.head(10))

# Per-class shortfall relative to goal (only for types that were in the shopping list at all)
goal_shortfall = pd.DataFrame(index=after_counts.index)
for col in META_COLS:
    planned = PER_CLASS_GOAL if col in set(shop["metadata_type"]) else 0
    goal_shortfall[col] = (planned - goal_df[col]).clip(lower=0)
print("Per-class shortfalls vs. requested PER_CLASS_GOAL (0 means met or exceeded):")
display(goal_shortfall.head(10))

# Sanity: list any classes where we requested but had zero valid buys
zero_buys = []
for col in META_COLS:
    if col in set(shop["metadata_type"]):
        z = goal_df.index[goal_df[col] == 0].tolist()
        if z:
            zero_buys.append((col, len(z)))
if zero_buys:
    print("\nNote: Some classes received 0 valid buys for certain types (likely no empties). Summary:")
    print(zero_buys)


  df[META_COLS] = df[META_COLS].applymap(_norm_empty)


Classes: 183
Selected (valid) purchases per type: {'Latitude': 5394, 'Longitude': 5394, 'Habitat': 5409, 'Substrate': 5402, 'eventDate': 5411}
Estimated credits for valid purchases: 43232

Current counts (first 10 classes):


Unnamed: 0,Habitat,Latitude,Longitude,Substrate,eventDate
0.0,5.0,5.0,8.0,3.0,7.0
1.0,6.0,4.0,8.0,1.0,3.0
2.0,0.0,3.0,4.0,4.0,4.0
3.0,9.0,14.0,10.0,9.0,13.0
4.0,2.0,3.0,3.0,3.0,0.0
5.0,20.0,7.0,7.0,7.0,9.0
6.0,4.0,2.0,1.0,9.0,9.0
7.0,2.0,2.0,4.0,0.0,1.0
8.0,1.0,2.0,0.0,0.0,0.0
9.0,6.0,7.0,4.0,6.0,2.0


After-purchase counts (first 10 classes):


Unnamed: 0,Habitat,Latitude,Longitude,Substrate,eventDate
0.0,35.0,35.0,38.0,33.0,37.0
1.0,36.0,34.0,38.0,31.0,33.0
2.0,30.0,33.0,34.0,34.0,34.0
3.0,39.0,44.0,40.0,39.0,43.0
4.0,32.0,33.0,33.0,33.0,30.0
5.0,50.0,37.0,37.0,37.0,39.0
6.0,34.0,32.0,31.0,39.0,39.0
7.0,32.0,32.0,34.0,30.0,31.0
8.0,31.0,32.0,30.0,30.0,30.0
9.0,36.0,37.0,34.0,36.0,32.0



Newly added per class (should be close to goal where possible):


Unnamed: 0,Habitat,Latitude,Longitude,Substrate,eventDate
0.0,30.0,30.0,30.0,30.0,30.0
1.0,30.0,30.0,30.0,30.0,30.0
2.0,30.0,30.0,30.0,30.0,30.0
3.0,30.0,30.0,30.0,30.0,30.0
4.0,30.0,30.0,30.0,30.0,30.0
5.0,30.0,30.0,30.0,30.0,30.0
6.0,30.0,30.0,30.0,30.0,30.0
7.0,30.0,30.0,30.0,30.0,30.0
8.0,30.0,30.0,30.0,30.0,30.0
9.0,30.0,30.0,30.0,30.0,30.0


Per-class shortfalls vs. requested PER_CLASS_GOAL (0 means met or exceeded):


Unnamed: 0,Habitat,Latitude,Longitude,Substrate,eventDate
0.0,0.0,0.0,0.0,0.0,0.0
1.0,0.0,0.0,0.0,0.0,0.0
2.0,0.0,0.0,0.0,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0
6.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0,0.0
