In [52]:
import pandas as pd
# Training hierarchy labels; the later cells read one column per taxonomic level from it.
hierarchy_annotations = pd.read_csv(
    filepath_or_buffer="../fgvc-comp-2025/datasets/hierarchy_labels_train.csv"
)

In [53]:
import json
# Lookup table from stringified node index to taxon name.
# A context manager guarantees the file handle is closed once parsing finishes
# (the bare open() previously leaked it until garbage collection).
with open("../fgvc-comp-2025/datasets/index_to_taxon.json") as taxon_file:
    index_to_taxon_map = json.load(taxon_file)

In [54]:
# Taxonomic ranks, listed from the broadest (Domain) down to the most specific (Species).
levels = [
    "Domain",
    "Kingdom",
    "Phylum",
    "Class",
    "Order",
    "Family",
    "Genus",
    "Species",
]

In [55]:
# The hierarchical levels (`levels`) are defined in the previous cell, ordered from Domain down to Species.

def find_leaf(row, level_order=None):
    """
    Locate the deepest populated hierarchical level of a single row.

    Parameters
    ----------
    row : pd.Series or mapping
        One annotation row; looked up with ``row.get(level)``, so absent
        levels are simply treated as missing.
    level_order : sequence of str, optional
        Level names in the order they should be scanned. When omitted, the
        notebook-level ``levels`` list is used (the original behaviour).
        Passing it explicitly keeps the function independent of later cells
        that rebind ``levels``.

    Returns
    -------
    pd.Series
        - leaf_level: the last level in scan order whose value is neither
          null nor the empty string (None if every level is missing).
        - leaf_value: that level's value cast to int (None if no level is set).
    """
    if level_order is None:
        level_order = levels

    leaf_level = None
    leaf_value = None
    for level in level_order:
        value = row.get(level)
        if pd.notnull(value) and value != "":
            # Keep overwriting: the last populated level seen wins.
            leaf_level = level
            leaf_value = int(value)
    return pd.Series({"leaf_level": leaf_level, "leaf_value": leaf_value})

# Tag every row with the level at which its classification stops, and the value there
hierarchy_annotations[["leaf_level", "leaf_value"]] = hierarchy_annotations.apply(
    find_leaf, axis="columns"
)

# Gather, per level, the distinct leaf values of the rows that stop at that level
leaf_nodes_by_level = {}
for lvl in levels:
    stops_at_lvl = hierarchy_annotations["leaf_level"] == lvl
    leaf_nodes = hierarchy_annotations.loc[stops_at_lvl, "leaf_value"].unique().tolist()
    leaf_nodes_by_level[lvl] = leaf_nodes
    print(f"{lvl} leaf nodes: {sorted(leaf_nodes)}")


Domain leaf nodes: []
Kingdom leaf nodes: []
Phylum leaf nodes: [8]
Class leaf nodes: [9, 10, 13, 16, 18, 19, 23, 27]
Order leaf nodes: [28, 29, 30, 32, 53, 60]
Family leaf nodes: [67, 77, 78, 82, 86, 89, 102, 106, 108]
Genus leaf nodes: [112, 113, 114, 116, 121, 122, 123, 125, 127, 130, 132, 133, 139, 140, 145, 147, 148, 155, 156, 157, 158, 160]
Species leaf nodes: [161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193]


In [56]:
def find_first_leaf_in_row(row, leaf_nodes_by_level, levels):
    """
    Scan the levels in the given order and report the first one at which the
    row's value appears among that level's leaf nodes.

    Returns a Series with "detected_leaf_level" and "lnh_label"; both are
    None when no level of the row matches a leaf node.
    """
    for rank in levels:
        raw = row.get(rank)
        # Nothing recorded at this level: move on to the next one
        if pd.isnull(raw) or raw == "":
            continue
        # Compare as int, since the leaf nodes were cast to int earlier
        node_id = int(raw)
        if node_id in leaf_nodes_by_level[rank]:
            return pd.Series({"detected_leaf_level": rank, "lnh_label": node_id})
    return pd.Series({"detected_leaf_level": None, "lnh_label": None})

# Label each row with the first level whose value is a known leaf node
hierarchy_annotations[["detected_leaf_level", "lnh_label"]] = hierarchy_annotations.apply(
    find_first_leaf_in_row,
    axis=1,
    args=(leaf_nodes_by_level, levels),
)

# Spot-check the first rows: raw level columns next to the detected leaf
print(
    hierarchy_annotations.loc[
        :,
        [
            "Domain", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species",
            "detected_leaf_level", "lnh_label",
        ],
    ].head()
)

   Domain  Kingdom  Phylum  Class  Order  Family  Genus  Species  \
0       0        1       4   25.0   47.0   101.0  157.0      NaN   
1       0        1       6   19.0   55.0   103.0  115.0    162.0   
2       0        1       6   19.0   37.0    72.0  155.0      NaN   
3       0        1       5   22.0   54.0    79.0  133.0      NaN   
4       0        1       5   22.0   54.0    79.0  133.0      NaN   

  detected_leaf_level  lnh_label  
0               Genus        157  
1               Class         19  
2               Class         19  
3               Genus        133  
4               Genus        133  


In [57]:
levels = ["Domain", "Kingdom", "Phylum", "Class", "Order", "Family", "Genus", "Species", 'lnh_label']


def _index_to_taxon_name(value):
    """
    Translate one node index into its taxon name via ``index_to_taxon_map``.

    Null values and empty strings are returned unchanged. Values that cannot
    be cast to int are also returned unchanged: after this cell has run once
    the columns hold taxon names, and re-running the cell used to crash with
    ``ValueError`` on ``int("<taxon name>")``.
    """
    if pd.isnull(value) or value == "":
        return value
    try:
        key = str(int(value))
    except (TypeError, ValueError):
        # Not a numeric index (e.g. already a taxon name): leave as is
        return value
    return index_to_taxon_map[key]


# Apply the mapping to each hierarchy level column in hierarchy_annotations
for level in levels:
    if level in hierarchy_annotations.columns:
        hierarchy_annotations[level] = hierarchy_annotations[level].apply(_index_to_taxon_name)

print(hierarchy_annotations.head())

  annotation_id     Domain   Kingdom         Phylum          Class  \
0       1_1.png  Eukaryota  Animalia       Chordata      Teleostei   
1       2_2.png  Eukaryota  Animalia  Echinodermata  Holothuroidea   
2       3_3.png  Eukaryota  Animalia  Echinodermata  Holothuroidea   
3       4_4.png  Eukaryota  Animalia       Cnidaria   Octocorallia   
4       4_5.png  Eukaryota  Animalia       Cnidaria   Octocorallia   

             Order          Family         Genus                  Species  \
0      Perciformes      Sebastidae  Sebastolobus                      NaN   
1     Synallactida   Stichopodidae  Apostichopus  Apostichopus leukothele   
2      Elasipodida      Elpidiidae   Scotoplanes                      NaN   
3  Scleralcyonacea  Keratoisididae    Keratoisis                      NaN   
4  Scleralcyonacea  Keratoisididae    Keratoisis                      NaN   

  leaf_level  leaf_value detected_leaf_level      lnh_label  
0      Genus         157               Genus   Sebasto

In [58]:
# Keep the annotation id, the levels from Phylum down to Species, and the lnh label
new_df = hierarchy_annotations.loc[
    :,
    ["annotation_id", "Phylum", "Class", "Order", "Family", "Genus", "Species", "lnh_label"],
]

new_df

Unnamed: 0,annotation_id,Phylum,Class,Order,Family,Genus,Species,lnh_label
0,1_1.png,Chordata,Teleostei,Perciformes,Sebastidae,Sebastolobus,,Sebastolobus
1,2_2.png,Echinodermata,Holothuroidea,Synallactida,Stichopodidae,Apostichopus,Apostichopus leukothele,Holothuroidea
2,3_3.png,Echinodermata,Holothuroidea,Elasipodida,Elpidiidae,Scotoplanes,,Holothuroidea
3,4_4.png,Cnidaria,Octocorallia,Scleralcyonacea,Keratoisididae,Keratoisis,,Keratoisis
4,4_5.png,Cnidaria,Octocorallia,Scleralcyonacea,Keratoisididae,Keratoisis,,Keratoisis
...,...,...,...,...,...,...,...,...
23694,8979_23696.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea
23695,8979_23697.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea
23696,8980_23698.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea
23697,8980_23699.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea


In [59]:
levels = ["Phylum", "Class", "Order", "Family", "Genus", "Species", 'lnh_label']
def fill_missing_levels(row):
    """
    Forward-fill missing hierarchical levels within one row.

    Walks ``levels`` in order; whenever a level is null or the empty string it
    receives the value of the most recent preceding level that was populated
    (None if no earlier level was populated). Columns not listed in
    ``levels`` are left untouched.

    Parameters
    ----------
    row : pd.Series
        One row of the hierarchy table.

    Returns
    -------
    pd.Series
        A filled copy of ``row``. The input is not modified: pandas documents
        UDFs that mutate the object handed to ``DataFrame.apply`` as
        unsupported.
    """
    filled = row.copy()
    last_val = None
    for level in levels:
        current_val = filled.get(level)
        if pd.notnull(current_val) and current_val != "":
            last_val = current_val
        else:
            filled[level] = last_val
    return filled

# Run the gap-filling row by row over the whole table
new_df = new_df.apply(fill_missing_levels, axis="columns")
new_df

Unnamed: 0,annotation_id,Phylum,Class,Order,Family,Genus,Species,lnh_label
0,1_1.png,Chordata,Teleostei,Perciformes,Sebastidae,Sebastolobus,Sebastolobus,Sebastolobus
1,2_2.png,Echinodermata,Holothuroidea,Synallactida,Stichopodidae,Apostichopus,Apostichopus leukothele,Holothuroidea
2,3_3.png,Echinodermata,Holothuroidea,Elasipodida,Elpidiidae,Scotoplanes,Scotoplanes,Holothuroidea
3,4_4.png,Cnidaria,Octocorallia,Scleralcyonacea,Keratoisididae,Keratoisis,Keratoisis,Keratoisis
4,4_5.png,Cnidaria,Octocorallia,Scleralcyonacea,Keratoisididae,Keratoisis,Keratoisis,Keratoisis
...,...,...,...,...,...,...,...,...
23694,8979_23696.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea
23695,8979_23697.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea
23696,8980_23698.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea
23697,8980_23699.png,Echinodermata,Crinoidea,Comatulida,Zenometridae,Psathyrometra,Psathyrometra fragilis,Crinoidea


In [60]:
# Write the gap-filled hierarchy table to disk, without the index column
csv_output_path = "cfg/hierarchy/hierarchy_labels_train_noNone.csv"
new_df.to_csv(path_or_buf=csv_output_path, index=False)

In [61]:
# The file at csv_output_path holds the gap-filled hierarchy table, not the
# frequency distribution, so the message names what was actually saved.
print(f"Hierarchy labels saved to {csv_output_path}")

# Number of rows per value of the (gap-filled) Species column, ordered by label
freq_dist = new_df["Species"].value_counts().sort_index()

# Print frequency distribution
print("Frequency Distribution of Labels:")
print(freq_dist)


Frequency distribution saved to cfg/hierarchy/hierarchy_labels_train_noNone.csv
Frequency Distribution of Labels:
Species
Abyssocucumis abyssorum    300
Acanthascinae              300
Acanthoptilum              300
Actinernus                 300
Actiniaria                 300
                          ... 
Terebellidae               300
Tunicata                   300
Umbellula                  300
Vesicomyidae               300
Zoantharia                 300
Name: count, Length: 79, dtype: int64


In [7]:
import pandas as pd
# Locations of the two label tables being compared
csv_output_path_test = "cfg/hierarchy/self_training_hierarchy_labels_test_noNone.csv"
csv_output_path_train = "cfg/hierarchy/hierarchy_labels_train_noNone.csv"

test_df = pd.read_csv(csv_output_path_test)
train_df = pd.read_csv(csv_output_path_train)

# Distinct non-null "Species" labels on each side
test_species_labels = set(test_df["Species"].dropna().unique())
train_species_labels = set(train_df["Species"].dropna().unique())

# Labels that occur in test but never in train
missing_in_train = test_species_labels.difference(train_species_labels)

print("Labels in 'Species' column present in test but absent in train:")
print(missing_in_train)

# Restrict the test table to the rows carrying one of those labels
has_missing_label = test_df["Species"].isin(missing_in_train)
missing_in_train_df = test_df.loc[has_missing_label]

# Count how many test examples each missing label has, ordered by label
missing_in_train_freq_dist = missing_in_train_df["Species"].value_counts().sort_index()

print("Frequency distribution of examples in missing_in_train:")
print(missing_in_train_freq_dist)

Labels in 'Species' column present in test but absent in train:
set()
Frequency distribution of examples in missing_in_train:
Series([], Name: count, dtype: int64)


In [64]:
import pandas as pd
from utils.utils import df_split, map_label_to_idx
def assign_class_weights(df):
    """
    Compute a weight per class from the "label" column of ``df``.

    Each class gets ``total_samples / (n_classes * count)``, where ``count``
    is how often the class occurs in the column; a class with a zero count
    gets 0.0.

    Returns a dict mapping each label value to its weight.
    """
    counts = df["label"].value_counts().sort_index()
    n_samples = counts.sum()
    n_labels = len(counts)

    # The zero-count guard avoids a division by zero
    return {
        label: (n_samples / (n_labels * n_occurrences) if n_occurrences > 0 else 0.0)
        for label, n_occurrences in counts.items()
    }

# Reload the gap-filled hierarchy table written earlier
df = pd.read_csv("cfg/hierarchy/hierarchy_labels_train_noNone.csv")
print(df)

# Rank whose labels get indexed and weighted
rank = "Species"
df, label_map = map_label_to_idx(df, rank)
train_df, val_df = df_split(df, validation_ratio=0.2, seed=42)

freq_dist = train_df[rank].value_counts().sort_index()
print(label_map)
print(freq_dist)

class_weights = assign_class_weights(train_df)
# NOTE(review): this lookup assumes the keys of label_map are the values found in
# train_df["label"], and that every key survives the train/validation split;
# otherwise class_weights[label] raises KeyError. Confirm against map_label_to_idx.
mapped_class_weights = dict(
    (one_hot, class_weights[label]) for label, one_hot in label_map.items()
)
print(mapped_class_weights)

AttributeError: module 'matplotlib' has no attribute 'get_data_path'