In [1]:
!pip install torch hf_transfer huggingface_hub datasets



# invasive_plants_hawaii EDA
Tripp Lyons

## Load the dataset

In [2]:
import os

# Opt in to the hf_transfer download backend. The flag is deliberately set
# before `datasets` is imported — presumably so the Hub client picks it up
# when it is first loaded (TODO confirm).
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from datasets import load_dataset

dataset_id = "imageomics/invasive_plants_hawaii"

# Available splits are "dorsal", "ventral" and "both"; this notebook uses "both".
# The split is converted straight to a pandas DataFrame so that no reference to
# the intermediate Dataset object is kept around.
both_df = load_dataset(dataset_id, split="both").to_pandas()

both_df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,session,filename,sampling_type,image,site,day,plant,level,view,healthy,...,mechanical_damage,other_remarks,expert_healthy,expert_rust,expert_leaf_miner,expert_other_insect,expert_mechanical_damage,expert_confidence,expert_other_remarks,expert_notes
0,session_1_1_16_2025,DSC00752.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,,No,Yes,No,No,No,Clear,,spots
1,session_1_1_16_2025,DSC00753.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,V,No,...,No,,No,Yes,No,No,No,Clear,,spots
2,session_1_1_16_2025,DSC00675.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,,No,Yes,No,No,No,Clear,,
3,session_1_1_16_2025,DSC00676.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,V,No,...,No,,No,Yes,No,No,No,Clear,,
4,session_1_1_16_2025,DSC00687.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,,No,Yes,No,No,No,Maybe,,


## Are there strong correlations between leaf damage and location site?

In [3]:
import pandas as pd
import numpy as np

# Build one summary row per site: the number of images recorded at that site
# and, for every damage label, how many of them were annotated "Yes", "No" or
# "Maybe".
site_results = []

damage_types = [
    "healthy",
    "rust",
    "leaf_miner",
    "other_insect",
    "mechanical_damage",
]

damage_amounts = ["Yes", "No", "Maybe"]

for site in both_df["site"].unique():
    # Select the site's rows once instead of re-filtering for every damage type.
    site_rows = both_df[both_df["site"] == site]
    result = {"site": site, "total": len(site_rows)}
    for damage_type in damage_types:
        values = site_rows[damage_type].value_counts()
        for damage_amount in damage_amounts:
            # Answers that never occur at this site are missing from the
            # value counts, so fall back to a count of zero.
            result[f"{damage_type}_{damage_amount}"] = values.get(damage_amount, 0)
    site_results.append(result)

df = pd.DataFrame(site_results).set_index("site").sort_index()

# Despite the column name this is a fraction in [0, 1]; the ".2%" format spec
# below turns it into a percentage for display.
df["healthy_percentage"] = df["healthy_Yes"] / df["total"]
print("Healthy percentage by site:")
# Iterating the column directly avoids chained indexing and the nested
# double quotes inside an f-string, which only parse on Python 3.12+.
for site, healthy_fraction in df["healthy_percentage"].items():
    print(f"Site {site}: {healthy_fraction:.2%}")


Healthy percentage by site:
Site 1: 24.81%
Site 2: 36.36%
Site 3: 3.45%
Site 5: 0.00%
Site 6: 1.72%
Site 7: 16.87%
Site 9: 22.22%
Site 10: 16.04%
Site 11: 39.53%
Site 12: 27.08%


## Are there any missing or inconsistent metadata values?

In [4]:
# Per-column data-quality summary: missing values, number of distinct values,
# and how many numeric entries lie more than 3 standard deviations from the mean.
results = []

for column in both_df.columns:
    # The image column is skipped; it is not metadata.
    if column == "image":
        continue

    values = both_df[column]
    row_count = len(values)

    # Keep only the entries that are Python ints/floats, then z-score them.
    numeric_values = values[values.apply(lambda x: isinstance(x, (int, float)))]
    centered = numeric_values - np.mean(numeric_values)
    z_scores = centered / np.std(numeric_values)
    outlier_count = np.sum(np.abs(z_scores) > 3)

    # Missing entries are real nulls plus any literal "nan" strings.
    nan_count = values.isna().sum() + (values == "nan").sum()

    # The "*_percentage" fields are fractions of the column length (0 to 1).
    results.append(
        {
            "column": column,
            "nan_count": nan_count,
            "unique_count": values.nunique(),
            "outlier_count": outlier_count,
            "nan_percentage": nan_count / row_count,
            "outlier_percentage": outlier_count / row_count,
        }
    )

df = pd.DataFrame(results).set_index("column")

df

Unnamed: 0_level_0,nan_count,unique_count,outlier_count,nan_percentage,outlier_percentage
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
session,0,12,0,0.0,0.0
filename,0,1596,0,0.0,0.0
sampling_type,0,2,0,0.0,0.0
site,0,10,0,0.0,0.0
day,0,6,0,0.0,0.0
plant,0,4,0,0.0,0.0
level,982,4,0,0.598051,0.0
view,0,2,0,0.0,0.0
healthy,0,3,0,0.0,0.0
rust,0,3,0,0.0,0.0
