In [23]:
import pandas as pd
# Load the training hierarchy annotations. Later cells read the taxonomic rank
# columns (Domain ... Species) and the annotation_id column from this frame.
hierarchy_annotations = pd.read_csv("../fgvc-comp-2025/datasets/hierarchy_labels_train.csv")

In [24]:
import json
# Load the taxon-index -> taxon-name lookup. JSON object keys are always strings,
# so later lookups must stringify the integer index first.
# A context manager is used so the file handle is closed deterministically
# instead of being left to the garbage collector.
with open("../fgvc-comp-2025/datasets/index_to_taxon.json", encoding="utf-8") as taxon_file:
    index_to_taxon_map = json.load(taxon_file)

In [25]:
# Taxonomic ranks, ordered from the most general to the most specific
levels = ["Domain", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species"]

def find_leaf(row):
    """
    Locate the deepest populated taxonomic rank of a single annotation row.

    Every rank in ``levels`` is visited in order; each populated entry
    (neither null nor an empty string) is converted to ``int`` and replaces
    the previous candidate, so the last populated rank wins.

    Returns a Series with two entries:
    - leaf_level: name of the deepest populated rank (None if no rank is populated).
    - leaf_value: the integer value stored at that rank (None if no rank is populated).
    """
    deepest = (None, None)
    for rank in levels:
        entry = row.get(rank)
        # Skip ranks that carry no classification for this row
        if pd.isnull(entry) or entry == "":
            continue
        deepest = (rank, int(entry))
    rank_name, rank_value = deepest
    return pd.Series({"leaf_level": rank_name, "leaf_value": rank_value})

# Annotate every row with its deepest populated rank and the value stored there
hierarchy_annotations[["leaf_level", "leaf_value"]] = hierarchy_annotations.apply(find_leaf, axis=1)

# Group the distinct leaf values by the rank at which their rows terminate
leaf_nodes_by_level = {}
for lvl in levels:
    ends_at_lvl = hierarchy_annotations["leaf_level"] == lvl
    leaf_nodes = hierarchy_annotations.loc[ends_at_lvl, "leaf_value"].unique().tolist()
    leaf_nodes_by_level[lvl] = leaf_nodes
    print(f"{lvl} leaf nodes: {sorted(leaf_nodes)}")


Domain leaf nodes: []
Kingdom leaf nodes: []
Phylum leaf nodes: [8]
Class leaf nodes: [9, 10, 13, 16, 18, 19, 23, 27]
Order leaf nodes: [28, 29, 30, 32, 53, 60]
Family leaf nodes: [67, 77, 78, 82, 86, 89, 102, 106, 108]
Genus leaf nodes: [112, 113, 114, 116, 121, 122, 123, 125, 127, 130, 132, 133, 139, 140, 145, 147, 148, 155, 156, 157, 158, 160]
Species leaf nodes: [161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193]


In [26]:
def find_first_leaf_in_row(row, leaf_nodes_by_level, levels):
    """
    Find the first rank (in the order given by ``levels``) whose value in
    ``row`` is one of the known leaf nodes for that rank.

    Populated entries are converted to ``int`` before the membership test,
    because the leaf-node collections hold integers. The search stops at the
    first match.

    Returns a Series with ``detected_leaf_level`` and ``detected_leaf_value``;
    both are None when no rank of the row matches a leaf node.
    """
    match = {"detected_leaf_level": None, "detected_leaf_value": None}
    for rank in levels:
        raw = row.get(rank)
        # Unpopulated ranks cannot be leaf nodes
        if pd.isnull(raw) or raw == "":
            continue
        candidate = int(raw)
        if candidate in leaf_nodes_by_level[rank]:
            match = {"detected_leaf_level": rank, "detected_leaf_value": candidate}
            break
    return pd.Series(match)

# Tag every row with the first rank (in `levels` order) whose value is a known leaf node
detected_leaves = hierarchy_annotations.apply(
    find_first_leaf_in_row, args=(leaf_nodes_by_level, levels), axis=1
)
hierarchy_annotations[["detected_leaf_level", "detected_leaf_value"]] = detected_leaves

# Show the first rows: rank columns next to the detected leaf, as a sanity check
preview_columns = [
    "Domain", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species",
    "detected_leaf_level", "detected_leaf_value",
]
print(hierarchy_annotations[preview_columns].head())

   Domain  Kingdom  Phylum  Class  Order  Family  Genus  Species  \
0       0        1       4   25.0   47.0   101.0  157.0      NaN   
1       0        1       6   19.0   55.0   103.0  115.0    162.0   
2       0        1       6   19.0   37.0    72.0  155.0      NaN   
3       0        1       5   22.0   54.0    79.0  133.0      NaN   
4       0        1       5   22.0   54.0    79.0  133.0      NaN   

  detected_leaf_level  detected_leaf_value  
0               Genus                  157  
1               Class                   19  
2               Class                   19  
3               Genus                  133  
4               Genus                  133  


In [27]:
# Keep only the annotation id and its detected leaf index, exposed as "label"
new_df = hierarchy_annotations[["annotation_id", "detected_leaf_value"]].rename(
    columns={"detected_leaf_value": "label"}
)
# Translate each leaf index into its taxon name; the map was loaded from JSON,
# so its keys are strings and the index is stringified before the lookup
new_df["label"] = new_df["label"].map(lambda taxon_idx: index_to_taxon_map[str(int(taxon_idx))])

In [28]:
# Destination of the per-annotation leaf-node labels (columns: annotation_id, label).
# The path is relative to the notebook's working directory.
csv_output_path = "cfg/hierarchy/leafnode_labels_train.csv"

# index=False keeps the DataFrame's row index out of the written file
new_df.to_csv(csv_output_path, index=False)

In [29]:
# The file at csv_output_path holds the per-annotation leaf-node labels, not the
# frequency distribution (which is only printed below), so the message says so.
print(f"Leaf-node labels saved to {csv_output_path}")

# Number of annotations per leaf-node label, ordered alphabetically by label
freq_dist = new_df["label"].value_counts().sort_index()

# Print frequency distribution
print("Frequency Distribution of Labels:")
print(freq_dist)


Frequency distribution saved to cfg/hierarchy/leafnode_labels_train.csv
Frequency Distribution of Labels:
label
Acanthoptilum                   300
Actiniaria                     1800
Actinopterygii                  300
Amphipoda                       300
Asteroidea                     1500
Benthocodon pedunculata         300
Caridea                         300
Ceriantharia                    300
Chionoecetes tanneri            300
Chorilia longipes               300
Corallimorphus pilatus          300
Crinoidea                       900
Delectopecten                   300
Funiculina                      300
Gastropoda                      300
Gersemia juliepackardae         300
Heterocarpus                    300
Heteropolypus ritteri           300
Holothuroidea                  2999
Isidella tentaculum             300
Isididae                        300
Keratoisis                      300
Lithodidae                      600
Merluccius productus            300
Microstomus pacificus   

In [9]:
import pandas as pd
from utils.utils import df_split, map_label_to_idx
def assign_class_weights(df):
    """
    Compute an inverse-frequency weight for every distinct value in ``df["label"]``.

    Each weight is ``total_samples / (n_classes * class_count)``, so classes with
    fewer samples receive larger weights. A class whose count is not positive
    gets a weight of 0.0 rather than triggering a division by zero.

    Returns a dict mapping each label value to its weight, ordered by sorted label.
    """
    counts = df["label"].value_counts().sort_index()
    n_samples = counts.sum()
    n_labels = len(counts)
    return {
        label: (n_samples / (n_labels * n_in_class) if n_in_class > 0 else 0.0)
        for label, n_in_class in counts.items()
    }

# Reload the leaf-node label table from the CSV
df = pd.read_csv("cfg/hierarchy/leafnode_labels_train.csv")
print(df)

# map_label_to_idx / df_split are project helpers from utils.utils; judging from
# the printed label_map, it maps each label name to an integer class index
df, label_map = map_label_to_idx(df, "label")
train_df, val_df = df_split(df, validation_ratio=0.2, seed=42)

# Per-label sample counts of the training split only
freq_dist = train_df["label"].value_counts().sort_index()
print(label_map)
print(freq_dist)

class_weights = assign_class_weights(train_df)
# Re-key the weights by class index instead of label name.
# NOTE(review): this lookup raises KeyError if any label in label_map is missing
# from class_weights (e.g. a class absent from the training split, or a "label"
# column that no longer holds the names) - confirm against utils.utils.
mapped_class_weights = {
    class_idx: class_weights[class_name] for class_name, class_idx in label_map.items()
}
print(mapped_class_weights)

        annotation_id          label
0             1_1.png   Sebastolobus
1             2_2.png  Holothuroidea
2             3_3.png  Holothuroidea
3             4_4.png     Keratoisis
4             4_5.png     Keratoisis
...               ...            ...
23694  8979_23696.png      Crinoidea
23695  8979_23697.png      Crinoidea
23696  8980_23698.png      Crinoidea
23697  8980_23699.png      Crinoidea
23698  8981_23700.png      Crinoidea

[23699 rows x 2 columns]
{'Sebastolobus': 0, 'Holothuroidea': 1, 'Keratoisis': 2, 'Munnopsidae': 3, 'Chionoecetes tanneri': 4, 'Asteroidea': 5, 'Munidopsis': 6, 'Serpulidae': 7, 'Delectopecten': 8, 'Crinoidea': 9, 'Tunicata': 10, 'Pandalus amplus': 11, 'Isidella tentaculum': 12, 'Paragorgia': 13, 'Porifera': 14, 'Terebellidae': 15, 'Ophiuroidea': 16, 'Actiniaria': 17, 'Ceriantharia': 18, 'Isididae': 19, 'Strongylocentrotus fragilis': 20, 'Caridea': 21, 'Gastropoda': 22, 'Benthocodon pedunculata': 23, 'Octopus rubescens': 24, 'Microstomus pacificus':