In [6]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

train_dir = "../input/herbarium-2022-fgvc9/train_metadata.json"
test_dir = "../input/herbarium-2022-fgvc9/test_metadata.json"

#Load training data
with open("../input/herbarium-2022-fgvc9/train_metadata.json") as json_file:
    train_meta = json.load(json_file)

#Load test data
with open("../input/herbarium-2022-fgvc9/test_metadata.json") as json_file:
    test_meta = json.load(json_file)

In [7]:
image_ids = [image["image_id"] for image in train_meta["images"]]
image_dirs = [train_dir + image['file_name'] for image in train_meta["images"]]
category_ids = [annotation['category_id'] for annotation in train_meta['annotations']]
genus_ids = [annotation['genus_id'] for annotation in train_meta['annotations']]

test_ids = [image['image_id'] for image in test_meta]
test_dirs = [test_dir + image['file_name'] for image in test_meta]

#Create the initial training dataframe with the above defined columns
train_df = pd.DataFrame({
    "image_id" : image_ids,
    "image_dir" : image_dirs,
    "category" : category_ids,
    "genus" : genus_ids})

#Create a testing dataframe
test_df = pd.DataFrame({
    "test_id" : test_ids,
    "test_dir" : test_dirs
})

#Add a genus column to the dataframe
genus_map = {genus['genus_id'] : genus['genus'] for genus in train_meta['genera']}
train_df['genus'] = train_df['genus'].map(genus_map)

In [8]:
##Create a family column in the datagframe based on the genus names
    # Step 1: Create dictionary of genus -> family mapping
genus_family_map = {}
for category in train_meta["categories"]:
    genus = category['genus']
    family = category['family']
    genus_family_map[genus] = family

    # Step 2: Create new column with default value of None™
train_df['family'] = None

    # Step 3: Update values in new column based on genus -> family mapping
for i, row in train_df.iterrows():
    genus = row['genus']
    if genus in genus_family_map:
        family = genus_family_map[genus]
        train_df.at[i, 'family'] = family

train_df

Unnamed: 0,image_id,image_dir,category,genus,family
0,00000__001,../input/herbarium-2022-fgvc9/train_metadata.j...,0,Abies,Pinaceae
1,00000__002,../input/herbarium-2022-fgvc9/train_metadata.j...,0,Abies,Pinaceae
2,00000__003,../input/herbarium-2022-fgvc9/train_metadata.j...,0,Abies,Pinaceae
3,00000__004,../input/herbarium-2022-fgvc9/train_metadata.j...,0,Abies,Pinaceae
4,00000__005,../input/herbarium-2022-fgvc9/train_metadata.j...,0,Abies,Pinaceae
...,...,...,...,...,...
839767,15504__032,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,Zygophyllum,Zygophyllaceae
839768,15504__033,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,Zygophyllum,Zygophyllaceae
839769,15504__035,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,Zygophyllum,Zygophyllaceae
839770,15504__036,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,Zygophyllum,Zygophyllaceae


In [27]:
#Filter only the images of plants that are in the Poaceae family
grass_images = train_df.loc[train_df['family'] == 'Poaceae']
#Reset index
grass_images = Poaceae_family.reset_index(drop=True)
grass_images

Unnamed: 0,image_id,image_dir,category,genus,family
0,00333__001,../input/herbarium-2022-fgvc9/train_metadata.j...,333,Agrostis,Poaceae
1,00333__002,../input/herbarium-2022-fgvc9/train_metadata.j...,333,Agrostis,Poaceae
2,00333__003,../input/herbarium-2022-fgvc9/train_metadata.j...,333,Agrostis,Poaceae
3,00333__004,../input/herbarium-2022-fgvc9/train_metadata.j...,333,Agrostis,Poaceae
4,00333__005,../input/herbarium-2022-fgvc9/train_metadata.j...,333,Agrostis,Poaceae
...,...,...,...,...,...
53542,15501__101,../input/herbarium-2022-fgvc9/train_metadata.j...,15501,Zuloagaea,Poaceae
53543,15501__103,../input/herbarium-2022-fgvc9/train_metadata.j...,15501,Zuloagaea,Poaceae
53544,15501__105,../input/herbarium-2022-fgvc9/train_metadata.j...,15501,Zuloagaea,Poaceae
53545,15501__106,../input/herbarium-2022-fgvc9/train_metadata.j...,15501,Zuloagaea,Poaceae


In [30]:
#Add category_id and species column
grass_images["category_id"] = None
grass_images["species"] = None

# Extract category_id and species values from categories where the family is Poaceae
rows = []
for category in train_meta["categories"]:
    if category["family"] == "Poaceae":
        rows.append({
            "category_id": category["category_id"],
            "species": category["species"]
        })


[{'category_id': 333, 'species': 'blasdalei'},
 {'category_id': 334, 'species': 'breviculmis'},
 {'category_id': 335, 'species': 'clavata'},
 {'category_id': 336, 'species': 'densiflora'},
 {'category_id': 337, 'species': 'elliottiana'},
 {'category_id': 338, 'species': 'exarata'},
 {'category_id': 339, 'species': 'hallii'},
 {'category_id': 340, 'species': 'howellii'},
 {'category_id': 341, 'species': 'hyemalis'},
 {'category_id': 342, 'species': 'idahoensis'},
 {'category_id': 343, 'species': 'mertensii'},
 {'category_id': 344, 'species': 'microphylla'},
 {'category_id': 345, 'species': 'oregonensis'},
 {'category_id': 346, 'species': 'pallens'},
 {'category_id': 347, 'species': 'perennans'},
 {'category_id': 348, 'species': 'rossiae'},
 {'category_id': 349, 'species': 'scabra'},
 {'category_id': 350, 'species': 'variabilis'},
 {'category_id': 351, 'species': 'vinealis'},
 {'category_id': 458, 'species': 'texana'},
 {'category_id': 477, 'species': 'aequalis'},
 {'category_id': 478, '