In [1]:
!pip install wildlife-datasets git+https://github.com/WildlifeDatasets/wildlife-tools --quiet --upgrade-strategy only-if-needed

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.9/108.9 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.8/139.8 kB[0m [31m12.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.8/127.8 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.6/69.6 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00

In [2]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
from sklearn.metrics import adjusted_rand_score
from wildlife_datasets.datasets import AnimalCLEF2026
from wildlife_tools.features import DeepFeatures
from wildlife_tools.similarity import CosineSimilarity
import torchvision.transforms as T
from transformers import AutoModel
import timm
import torch

# 1. Setup Device & Config
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
batch_size = 32
root = '/kaggle/input/animal-clef-2026'

# Grid of DBSCAN eps candidates and the DBSCAN neighbourhood size used for tuning.
EPS_GRID = np.arange(0.05, 0.50, 0.02)
DBSCAN_MIN_SAMPLES = 2
# eps given to TexasHornedLizards when no LynxID2025 value could be tuned.
FALLBACK_EPS = 0.24
# Per-channel normalization applied after ToTensor (these match the commonly
# used ImageNet channel statistics).
NORM_MEAN = (0.485, 0.456, 0.406)
NORM_STD = (0.229, 0.224, 0.225)


def build_transform(size):
    """Return the resize -> tensor -> normalize pipeline for a `size` x `size` input."""
    return T.Compose([
        T.Resize(size=(size, size)),
        T.ToTensor(),
        T.Normalize(mean=NORM_MEAN, std=NORM_STD),
    ])


def load_model(model_name, cache):
    """Return `model_name` in eval mode on `device`, loading it only once.

    Args:
        model_name: timm hub id (names containing 'MegaDescriptor') or a
            Hugging Face repo id loaded through `AutoModel`.
        cache: dict mapping model names to already-loaded models; updated in place.
    """
    if model_name not in cache:
        if 'MegaDescriptor' in model_name:
            loaded = timm.create_model(model_name, pretrained=True)
        else:
            loaded = AutoModel.from_pretrained(model_name, trust_remote_code=True)
        cache[model_name] = loaded.eval()
    # `.to` is a no-op when the model already lives on `device`.
    return cache[model_name].to(device)


def relabel_noise(clusters):
    """Return a copy of `clusters` where every noise point (-1) gets its own new label.

    New labels start right after the largest existing label, so noise points
    are scored as singletons instead of as one shared "-1" cluster.
    """
    labels = clusters.copy()
    noise_mask = (labels == -1)
    first_new = labels.max() + 1
    labels[noise_mask] = np.arange(first_new, first_new + noise_mask.sum())
    return labels


# 2. LOAD Data
print("loading dataset...")
dataset_tuning = AnimalCLEF2026(root, transform=None, load_label=True)

# 3. Define Models to Tune
# species -> (model identifier, square input size in pixels)
model_configs = {
    'LynxID2025': ('conservationxlabs/miewid-msv3', 512),
    'SalamanderID2025': ("hf-hub:BVRA/MegaDescriptor-L-384", 384),
    'SeaTurtleID2022': ("hf-hub:BVRA/MegaDescriptor-L-384", 384),
}

best_eps_values = {}
# Models already built in this cell, keyed by model name, so a checkpoint shared
# by several species is loaded only once.
loaded_models = {}

print("\n--- Starting Automatic EPS Tuning ---")

df = dataset_tuning.df
for species, (model_name, size) in model_configs.items():
    print(f"\nProcessing {species}...")

    # Get Training Data Only
    idx_train = df[(df['dataset'] == species) & (df['split'] == 'train')].index
    if len(idx_train) == 0:
        print(f"Skipping {species} (No training data found)")
        continue

    sub_dataset = dataset_tuning.get_subset(idx_train)

    # Setup Transforms & Model
    transform = build_transform(size)
    sub_dataset.set_transform(transform)
    model = load_model(model_name, loaded_models)

    # Extract Features
    print(f"Extracting features for {len(sub_dataset)} images...")
    extractor = DeepFeatures(model=model, device=device, batch_size=batch_size)
    features = extractor(sub_dataset)

    # Calculate Similarity & Distance
    matcher = CosineSimilarity()
    similarity = matcher(features, features)
    # Clamp at zero so the precomputed distance matrix has no negative entries.
    distance = np.maximum(1 - similarity, 0)

    # Grid Search for Best EPS (scored by adjusted Rand index against identities)
    true_labels = sub_dataset.df['identity'].values
    best_ari = -1
    best_eps = 0.0

    for eps in EPS_GRID:
        clustering = DBSCAN(eps=eps, metric='precomputed', min_samples=DBSCAN_MIN_SAMPLES)
        clusters = clustering.fit_predict(distance)

        # Handle Noise (-1)
        unique_labels = relabel_noise(clusters)

        ari = adjusted_rand_score(true_labels, unique_labels)

        if ari > best_ari:
            best_ari = ari
            # Same value, stored as a plain Python float rather than np.float64.
            best_eps = float(eps)

    print(f"  -> Best EPS: {best_eps:.2f} (ARI Score: {best_ari:.4f})")
    best_eps_values[species] = best_eps

# Drop the cache references so only the last-used model stays referenced (via `model`).
loaded_models.clear()

# 4. Print Final Dictionary
# TexasHornedLizards is not tuned in the loop above; it reuses the LynxID2025
# value, or the fallback if LynxID2025 was skipped.
best_eps_values['TexasHornedLizards'] = best_eps_values.get('LynxID2025', FALLBACK_EPS)

print("eps_opt = {")
for k, v in best_eps_values.items():
    print(f"    '{k}': {v:.2f},")
print("}")

2026-02-05 20:27:50.228848: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1770323270.388045      24 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1770323270.439681      24 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1770323270.977997      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770323270.978032      24 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1770323270.978035      24 computation_placer.cc:177] computation placer alr

Using device: cuda
loading dataset...

--- Starting Automatic EPS Tuning ---

Processing LynxID2025...


config.json:   0%|          | 0.00/528 [00:00<?, ?B/s]

configuration_miewid.py:   0%|          | 0.00/777 [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/conservationxlabs/miewid-msv3:
- configuration_miewid.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_miewid.py: 0.00B [00:00, ?B/s]

heads.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/conservationxlabs/miewid-msv3:
- heads.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/conservationxlabs/miewid-msv3:
- modeling_miewid.py
- heads.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/206M [00:00<?, ?B/s]

Building Model Backbone for efficientnetv2_rw_m model
config.model_name efficientnetv2_rw_m
model_name efficientnetv2_rw_m


model.safetensors:   0%|          | 0.00/214M [00:00<?, ?B/s]



final_in_features 2152
Extracting features for 2957 images...


100%|███████████████████████████████████████████████████████████████| 93/93 [01:57<00:00,  1.26s/it]


  -> Best EPS: 0.37 (ARI Score: 0.1520)

Processing SalamanderID2025...


config.json:   0%|          | 0.00/609 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Extracting features for 1388 images...


100%|███████████████████████████████████████████████████████████████| 44/44 [01:03<00:00,  1.43s/it]


  -> Best EPS: 0.19 (ARI Score: 0.0866)

Processing SeaTurtleID2022...
Extracting features for 8729 images...


100%|█████████████████████████████████████████████████████████████| 273/273 [06:08<00:00,  1.35s/it]


  -> Best EPS: 0.17 (ARI Score: 0.8115)
eps_opt = {
    'LynxID2025': 0.37,
    'SalamanderID2025': 0.19,
    'SeaTurtleID2022': 0.17,
    'TexasHornedLizards': 0.37,
}


In [3]:

import pandas as pd
import numpy as np

submission_dfs = []

print("--- Generating Final Submission ---")

# Baseline eps used whenever a dataset has no tuned value.
BASELINE_EPS = 0.24
# Model used for datasets that have no entry in `model_configs` (TexasHornedLizards).
DEFAULT_MODEL_CONFIG = ("conservationxlabs/miewid-msv3", 512)

# Ensure TexasHornedLizards has a value (fallback to Lynx or baseline if missing)
if 'TexasHornedLizards' not in best_eps_values:
    best_eps_values['TexasHornedLizards'] = best_eps_values.get('LynxID2025', BASELINE_EPS)

# Iterate through all competition datasets
competition_datasets = [
    'LynxID2025', 'SalamanderID2025', 'SeaTurtleID2022', 'TexasHornedLizards'
]

# Models built in this cell, keyed by model name, so each checkpoint is loaded once.
test_models = {}
df_full = dataset_tuning.df

for name in competition_datasets:
    print(f"\nProcessing Test Set: {name}")

    # 1. Get the TEST subset
    idx_test = df_full[(df_full['dataset'] == name) & (df_full['split'] == 'test')].index

    if len(idx_test) == 0:
        print(f"Warning: No test data found for {name}")
        continue

    dataset_test = dataset_tuning.get_subset(idx_test)
    print(f"  -> Found {len(dataset_test)} test images")

    # 2. Setup Model (same mapping as the tuning step, read from `model_configs`
    #    so the two cells cannot drift apart)
    model_name, size = model_configs.get(name, DEFAULT_MODEL_CONFIG)

    # Load Model (only on first use of this model name)
    if model_name not in test_models:
        if 'MegaDescriptor' in model_name:
            test_models[model_name] = timm.create_model(model_name, pretrained=True).eval()
        else:
            test_models[model_name] = AutoModel.from_pretrained(model_name, trust_remote_code=True).eval()
    # `.to` is a no-op when the model already lives on `device`.
    model = test_models[model_name].to(device)

    # 3. Extract Features
    transform = T.Compose([
        T.Resize(size=(size, size)),
        T.ToTensor(),
        T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ])
    dataset_test.set_transform(transform)

    extractor = DeepFeatures(model=model, device=device, batch_size=batch_size)
    features = extractor(dataset_test)

    # 4. Clustering with OPTIMIZED EPS
    # Retrieve the tuned eps value; a dataset skipped during tuning has no entry,
    # so fall back to the baseline instead of raising KeyError.
    eps = best_eps_values.get(name)
    if eps is None:
        eps = BASELINE_EPS
        print(f"  -> No tuned EPS for {name}; falling back to baseline {eps:.3f}")
    else:
        print(f"  -> Using optimized EPS: {eps:.3f}")

    matcher = CosineSimilarity()
    similarity = matcher(features, features)
    distance = np.maximum(1 - similarity, 0) # Convert similarity to distance

    clustering = DBSCAN(eps=eps, metric='precomputed', min_samples=2)
    clusters = clustering.fit_predict(distance)

    # 5. Format Labels for Submission
    # "cluster_{dataset}_{label}"
    unique_labels = clusters.copy()

    # Handle Noise (-1): Assign unique IDs so they don't cluster together
    max_label = unique_labels.max()
    noise_mask = (unique_labels == -1)
    if noise_mask.sum() > 0:
        # Create new IDs starting after the max existing label
        new_ids = np.arange(max_label + 1, max_label + 1 + noise_mask.sum())
        unique_labels[noise_mask] = new_ids

    # Create prediction strings
    predictions = [f"cluster_{name}_{label}" for label in unique_labels]

    # 6. Create DataFrame
    # Note: Adjust 'image_id' if your dataset column is named differently (e.g., 'id')
    ids = dataset_test.df['image_id'].values
    sub = pd.DataFrame({'image_id': ids, 'prediction': predictions})
    submission_dfs.append(sub)

# Combine and Save
if len(submission_dfs) > 0:
    # ignore_index gives the combined frame one continuous 0..N-1 index instead of
    # repeating each per-dataset index; the CSV itself is written without the index.
    full_submission = pd.concat(submission_dfs, ignore_index=True)
    full_submission.to_csv('submission.csv', index=False)
    print("\n" + "="*40)
    print(f"SUCCESS: Saved submission.csv with {len(full_submission)} rows.")
    print("="*40)
    print(full_submission.head())
else:
    print("\nError: No predictions generated.")

--- Generating Final Submission ---

Processing Test Set: LynxID2025
  -> Found 946 test images
Building Model Backbone for efficientnetv2_rw_m model
config.model_name efficientnetv2_rw_m
model_name efficientnetv2_rw_m




final_in_features 2152


100%|███████████████████████████████████████████████████████████████| 30/30 [00:41<00:00,  1.39s/it]


  -> Using optimized EPS: 0.370

Processing Test Set: SalamanderID2025
  -> Found 689 test images


100%|███████████████████████████████████████████████████████████████| 22/22 [00:30<00:00,  1.40s/it]


  -> Using optimized EPS: 0.190

Processing Test Set: SeaTurtleID2022
  -> Found 500 test images


100%|███████████████████████████████████████████████████████████████| 16/16 [00:21<00:00,  1.37s/it]


  -> Using optimized EPS: 0.170

Processing Test Set: TexasHornedLizards
  -> Found 274 test images
Building Model Backbone for efficientnetv2_rw_m model
config.model_name efficientnetv2_rw_m
model_name efficientnetv2_rw_m




final_in_features 2152


100%|█████████████████████████████████████████████████████████████████| 9/9 [00:17<00:00,  1.96s/it]

  -> Using optimized EPS: 0.370

SUCCESS: Saved submission.csv with 2409 rows.
   image_id             prediction
0         3  cluster_LynxID2025_48
1         5   cluster_LynxID2025_0
2        12  cluster_LynxID2025_49
3        13  cluster_LynxID2025_50
4        18   cluster_LynxID2025_0



