In [5]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
import cv2

train_dir = "../input/herbarium-2022-fgvc9/train_metadata.json"
test_dir = "../input/herbarium-2022-fgvc9/test_metadata.json"


with open("../input/herbarium-2022-fgvc9/train_metadata.json") as json_file:
    train_meta = json.load(json_file)
    
with open("../input/herbarium-2022-fgvc9/test_metadata.json") as json_file:
    test_meta = json.load(json_file)

In [6]:
image_ids = [image["image_id"] for image in train_meta["images"]]
image_dirs = [train_dir + image['file_name'] for image in train_meta["images"]]
category_ids = [annotation['category_id'] for annotation in train_meta['annotations']]
genus_ids = [annotation['genus_id'] for annotation in train_meta['annotations']]

test_ids = [image['image_id'] for image in test_meta]
test_dirs = [test_dir + image['file_name'] for image in test_meta]

train_df = pd.DataFrame({
    "image_id" : image_ids,
    "image_dir" : image_dirs,
    "category" : category_ids,
    "genus" : genus_ids})

test_df = pd.DataFrame({
    "test_id" : test_ids,
    "test_dir" : test_dirs
})

train_df

Unnamed: 0,image_id,image_dir,category,genus
0,00000__001,../input/herbarium-2022-fgvc9/train_metadata.j...,0,1
1,00000__002,../input/herbarium-2022-fgvc9/train_metadata.j...,0,1
2,00000__003,../input/herbarium-2022-fgvc9/train_metadata.j...,0,1
3,00000__004,../input/herbarium-2022-fgvc9/train_metadata.j...,0,1
4,00000__005,../input/herbarium-2022-fgvc9/train_metadata.j...,0,1
...,...,...,...,...
839767,15504__032,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,2584
839768,15504__033,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,2584
839769,15504__035,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,2584
839770,15504__036,../input/herbarium-2022-fgvc9/train_metadata.j...,15504,2584


In [7]:
genus_map = {genus['genus_id'] : genus['genus'] for genus in train_meta['genera']}
train_df['genus'] = train_df['genus'].map(genus_map)

# Step 1: Create dictionary of genus -> family mapping
genus_family_map = {}
for category in train_meta["categories"]:
    genus = category['genus']
    family = category['family']
    genus_family_map[genus] = family

# Step 2: Create new column with default value of None
train_df['family'] = None

# Step 3: Update values in new column based on genus -> family mapping
for i, row in train_df.iterrows():
    genus = row['genus']
    if genus in genus_family_map:
        family = genus_family_map[genus]
        train_df.at[i, 'family'] = family

poaceae_images = train_df[train_df['family'] == 'Poaceae']
print(len(poaceae_images))

53547
