In [1]:
!pip install torch hf_transfer huggingface_hub datasets



# invasive_plants_hawaii EDA
Tripp Lyons

## Load the dataset

In [2]:
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

from datasets import load_dataset

dataset_id = "imageomics/invasive_plants_hawaii"

# splits are "dorsal", "ventral", "both"
both_dataset = load_dataset(dataset_id, split="both")
both_df = both_dataset.to_pandas()
del both_dataset

both_df.head()

  from .autonotebook import tqdm as notebook_tqdm


Unnamed: 0,session,filename,sampling_type,image,site,day,plant,level,view,healthy,...,mechanical_damage,other_remarks,expert_healthy,expert_rust,expert_leaf_miner,expert_other_insect,expert_mechanical_damage,expert_confidence,expert_other_remarks,expert_notes
0,session_1_1_16_2025,DSC00752.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,,No,Yes,No,No,No,Clear,,spots
1,session_1_1_16_2025,DSC00753.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,V,No,...,No,,No,Yes,No,No,No,Clear,,spots
2,session_1_1_16_2025,DSC00675.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,,No,Yes,No,No,No,Clear,,
3,session_1_1_16_2025,DSC00676.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,V,No,...,No,,No,Yes,No,No,No,Clear,,
4,session_1_1_16_2025,DSC00687.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,,No,Yes,No,No,No,Maybe,,


In [3]:
import pandas as pd

metadata_url = "https://huggingface.co/datasets/imageomics/invasive_plants_hawaii/resolve/main/metadata/full_dataset.csv"

metadata_df = pd.read_csv(metadata_url)[["filename", "index"]]

print(metadata_df)

both_df = pd.merge(both_df, metadata_df, left_on="filename", right_on="filename", how="left")

both_df["unique_index"] = both_df["index"].astype(str) + " " + both_df["session"].astype(str) + " " + both_df["site"].astype(str) + " " + both_df["day"].astype(str) + " " + both_df["plant"].astype(str) + " " + both_df["sampling_type"].astype(str)

both_df.head()

                        filename  index
0                   DSC00752.png      1
1                   DSC00753.png      1
2                   DSC00675.png      2
3                   DSC00676.png      2
4                   DSC00687.png      3
...                          ...    ...
1637  PXL_20250129_071728427.png     10
1638  PXL_20250129_072340989.png     11
1639  PXL_20250129_072347509.png     11
1640  PXL_20250129_070740688.png     12
1641  PXL_20250129_070746621.png     12

[1642 rows x 2 columns]


Unnamed: 0,session,filename,sampling_type,image,site,day,plant,level,view,healthy,...,expert_healthy,expert_rust,expert_leaf_miner,expert_other_insect,expert_mechanical_damage,expert_confidence,expert_other_remarks,expert_notes,index,unique_index
0,session_1_1_16_2025,DSC00752.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,Yes,No,No,No,Clear,,spots,1,1 session_1_1_16_2025 1 16 exp opportunistic
1,session_1_1_16_2025,DSC00753.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,V,No,...,No,Yes,No,No,No,Clear,,spots,1,1 session_1_1_16_2025 1 16 exp opportunistic
2,session_1_1_16_2025,DSC00675.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,Yes,No,No,No,Clear,,,2,2 session_1_1_16_2025 1 16 exp opportunistic
3,session_1_1_16_2025,DSC00676.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,V,No,...,No,Yes,No,No,No,Clear,,,2,2 session_1_1_16_2025 1 16 exp opportunistic
4,session_1_1_16_2025,DSC00687.png,opportunistic,{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHD...,1,16,exp,,D,No,...,No,Yes,No,No,No,Maybe,,,3,3 session_1_1_16_2025 1 16 exp opportunistic


## Are there strong correlations between leaf damage and location site?

In [4]:
import pandas as pd
import numpy as np

site_results = []

damage_types = [
    "healthy",
    "rust",
    "leaf_miner",
    "other_insect",
    "mechanical_damage"
]

damage_amounts = ["Yes", "No", "Maybe"]

for site in both_df["site"].unique():
    result = {
        "site": site,
        "total": len(both_df[both_df["site"] == site])
    }
    for damage_type in damage_types:
        values = both_df[
            (both_df["site"] == site)
        ][damage_type].value_counts()
        for damage_amount in damage_amounts:
            if damage_amount not in values:
                result[f"{damage_type}_{damage_amount}"] = 0
            else:
                result[f"{damage_type}_{damage_amount}"] = values[damage_amount]
    site_results.append(result)

df = pd.DataFrame(site_results)
df.set_index("site", inplace=True)
df.sort_index(inplace=True)

df["healthy_percentage"] = df["healthy_Yes"] / df["total"]
print("Healthy percentage by site:")
for site in df.index:
    print(f"Site {site}: {df.loc[site]["healthy_percentage"]:.2%} (n={int(df.loc[site]['total'])})")


Healthy percentage by site:
Site 1: 24.81% (n=266)
Site 2: 36.36% (n=44)
Site 3: 3.45% (n=58)
Site 5: 0.00% (n=76)
Site 6: 1.72% (n=116)
Site 7: 16.87% (n=166)
Site 9: 23.01% (n=452)
Site 10: 16.04% (n=374)
Site 11: 39.53% (n=86)
Site 12: 27.08% (n=96)


## Are there any missing or inconsistent metadata values?

In [5]:
results = []

for column in both_df.columns:
    if column == "image":
        continue

    values = both_df[column]

    is_numeric = values.apply(lambda x: isinstance(x, (int, float)))
    numeric_values = values[is_numeric]

    z_scores = (numeric_values - np.mean(numeric_values)) / np.std(numeric_values)
    outliers = np.abs(z_scores) > 3

    result = {
        "column": column,
        "nan_count": values.isna().sum() + (values == "nan").sum(),
        "unique_count": values.nunique(),
        "outlier_count": np.sum(outliers),
        "non_nan_count": len(values) - values.isna().sum() - (values == "nan").sum()
    }

    for count_type in ["nan", "outlier"]:
        result[f"{count_type}_percentage"] = result[f"{count_type}_count"] / len(values)

    results.append(result)

df = pd.DataFrame(results)
df.set_index("column", inplace=True)

df

Unnamed: 0_level_0,nan_count,unique_count,outlier_count,non_nan_count,nan_percentage,outlier_percentage
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
session,0,12,0,1734,0.0,0.0
filename,0,1596,0,1734,0.0,0.0
sampling_type,0,2,0,1734,0.0,0.0
site,0,10,0,1734,0.0,0.0
day,0,6,0,1734,0.0,0.0
plant,0,4,0,1734,0.0,0.0
level,982,4,0,752,0.566321,0.0
view,0,2,0,1734,0.0,0.0
healthy,0,3,0,1734,0.0,0.0
rust,0,3,0,1734,0.0,0.0


## Are the dorsal and ventral labels consistent?

In [14]:
view = both_df["view"]
print(both_df["unique_index"].value_counts())

matching_count = 0
total_count = 0

for unique_index in both_df["unique_index"].unique():
    for damage_type in damage_types:
        values = both_df[both_df["unique_index"] == unique_index][damage_type]
        if len(values) == 2:
            if (values == values.iloc[0]).all():
                matching_count += 1
            else:
                print(damage_type, unique_index, values)
            total_count += 1

print(f"Matching count: {matching_count}, Total count: {total_count}")
print(f"Matching percentage: {(matching_count / total_count) * 100:.2f}%")


unique_index
1 session_8_1_27_2025 9 24 2 systematic         12
7 session_8_1_27_2025 9 24 2 systematic         12
4 session_8_1_27_2025 9 24 2 systematic         12
5 session_8_1_27_2025 9 24 1 systematic         12
4 session_8_1_27_2025 9 24 1 systematic         12
                                                ..
4 session_6_1_27_2025 9 24 exp opportunistic     2
5 session_6_1_27_2025 9 24 exp opportunistic     2
7 session_6_1_27_2025 9 24 exp opportunistic     2
8 session_6_1_27_2025 9 24 exp opportunistic     2
12 session_12_1_28_2025 10 28 2 systematic       2
Name: count, Length: 647, dtype: int64
mechanical_damage 30 session_3_1_19_2025 3 19 2 systematic 976    Yes
979     No
Name: mechanical_damage, dtype: object
Matching count: 2679, Total count: 2680
Matching percentage: 99.96%
