In [1]:
import pandas as pd
import numpy as np
import json

## Herbaria 2022

In [2]:
HERBARIA22_TRAIN_PATH = "./../../data/kaggle-herbaria/herbarium-2021/train_images/"
HERBARIA22_TEST_PATH = "./../../data/kaggle-herbaria/herbarium-2022/test_images/"
HERBARIA22_METADATA_TRAIN = "./../../data/kaggle-herbaria/herbarium-2022/train_metadata.json"
HERBARIA22_METADATA_TEST = "./../../data/kaggle-herbaria/herbarium-2022/test_metadata.json"

In [3]:
with open(HERBARIA22_METADATA_TRAIN) as f:
    train_metadata = json.load(f)

with open(HERBARIA22_METADATA_TEST) as f:
    test_metadata = json.load(f)

In [14]:
categories = {cat['category_id']: cat for cat in train_metadata['categories']}
images = {im['image_id']: im for im in train_metadata['images']}
institutions = {inst['institution_id']: inst for inst in train_metadata['institutions']}

In [15]:
train22_processed = [] # will be turned into a dataframe later. sublists are of the form <image_id>, <image_path>, <species>, <genus>, <family>, <institution>

for i in range(len(train_metadata['annotations'])):
    image = train_metadata['annotations'][i]

    image_id = image['image_id']
    image_path = images[image_id]['file_name']
    category = categories[image['category_id']]
    institution = institutions[image['institution_id']]    

    sublist = [image_id, image_path, category['species'], category['genus'], category['family'], institution['collectionCode']]
    train22_processed += [sublist]
    
    

In [26]:
df = pd.DataFrame(train22_processed, columns=['image_id', 'filename', 'species', 'genus', 'family', 'institution'])
df.to_csv("/projectnb/herbdl/data/kaggle-herbaria/train_2022_metadata.csv")
df.head()

Unnamed: 0,image_id,filename,species,genus,family,institution
0,00000__001,000/00/00000__001.jpg,amabilis,Abies,Pinaceae,A
1,00000__002,000/00/00000__002.jpg,amabilis,Abies,Pinaceae,A
2,00000__003,000/00/00000__003.jpg,amabilis,Abies,Pinaceae,A
3,00000__004,000/00/00000__004.jpg,amabilis,Abies,Pinaceae,A
4,00000__005,000/00/00000__005.jpg,amabilis,Abies,Pinaceae,A


#### Labeling

In [19]:
labeled = []
for i, row in df.iterrows():
    species = row['species']
    genus = row['genus']
    family = row['family']
    institution = row['institution']

    img_id = row['image_id']
    filename = row['filename']
    label = f"This is an image of species {species}, in the genus {genus} of family {family}. It is part of the collection of institution {institution}."

    labeled += [[img_id, filename, label]]

In [27]:
labeled_df = pd.DataFrame(labeled, columns=["image_id", "filename", "label"])
labeled_df.to_csv('/projectnb/herbdl/data/kaggle-herbaria/train_2022_labeled.csv')
labeled_df['label'][0]

'This is an image of species amabilis, in the genus Abies of family Pinaceae. It is part of the collection of institution A.'

## Herbaria 2021

In [73]:
HERBARIA21_TRAIN_PATH = "./herbarium-2021/train/"
HERBARIA21_TEST_PATH = "./herbarium-2021/test/"
HERBARIA21_METADATA_TRAIN = "./herbarium-2021/train/metadata.json"
HERBARIA21_METADATA_TEST = "./herbarium-2021/test/metadata.json"

In [74]:
with open(HERBARIA21_METADATA_TRAIN) as f:
    train_metadata = json.load(f)

with open(HERBARIA21_METADATA_TEST) as f:
    test_metadata = json.load(f)

In [82]:
train_metadata.keys()

dict_keys(['annotations', 'categories', 'images', 'info', 'licenses', 'institutions'])

In [92]:
train_metadata['categr']

[{'category_id': 60492,
  'id': 1814367,
  'image_id': 1814367,
  'institution_id': 0},
 {'category_id': 10824,
  'id': 1308257,
  'image_id': 1308257,
  'institution_id': 0},
 {'category_id': 33076,
  'id': 1270453,
  'image_id': 1270453,
  'institution_id': 3},
 {'category_id': 24799,
  'id': 1123834,
  'image_id': 1123834,
  'institution_id': 0},
 {'category_id': 17018,
  'id': 1042410,
  'image_id': 1042410,
  'institution_id': 0},
 {'category_id': 61302,
  'id': 1526092,
  'image_id': 1526092,
  'institution_id': 0},
 {'category_id': 54546,
  'id': 1621621,
  'image_id': 1621621,
  'institution_id': 0},
 {'category_id': 9574,
  'id': 1473291,
  'image_id': 1473291,
  'institution_id': 0},
 {'category_id': 22425,
  'id': 1056165,
  'image_id': 1056165,
  'institution_id': 0},
 {'category_id': 37047,
  'id': 1612588,
  'image_id': 1612588,
  'institution_id': 0},
 {'category_id': 45524,
  'id': 1676961,
  'image_id': 1676961,
  'institution_id': 0},
 {'category_id': 54204,
  'id': 1

In [97]:
institutions

{0: {'id': 0, 'name': 'New York Botanical Garden'},
 1: {'id': 1, 'name': 'Auckland War Memorial Museum'},
 2: {'id': 2, 'name': 'Bishop Museum'},
 3: {'id': 3, 'name': 'Queensland Herbarium'},
 4: {'id': 4, 'name': 'Naturalis Biodiversity Center'}}

In [93]:
categories = {cat['id']: cat for cat in train_metadata['categories']}
images = {im['id']: im for im in train_metadata['images']}
institutions = {inst['id']: inst for inst in train_metadata['institutions']}

In [99]:
train21_processed = [] # will be turned into a dataframe later. sublists are of the form <image_id>, <image_path>, <species>, <order>, <family>, <institution>

for i in range(len(train_metadata['annotations'])):
    image = train_metadata['annotations'][i]

    image_id = image['id']
    image_path = images[image_id]['file_name']
    category = categories[image['category_id']]
    institution = institutions[image['institution_id']]    

    sublist = [image_id, image_path, category['name'], category['order'], category['family'], institution['name']]
    train21_processed += [sublist]
    
    

In [101]:
df = pd.DataFrame(train21_processed, columns=['image_id', 'filename', 'species', 'genus', 'family', 'institution'])
df.to_csv("./train_2021_metadata.csv")
df.head()

Unnamed: 0,image_id,filename,species,genus,family,institution
0,1814367,images/604/92/1814367.jpg,Thysanocarpus curvipes Hook.,Brassicales,Brassicaceae,New York Botanical Garden
1,1308257,images/108/24/1308257.jpg,Cassia grandis L.f.,Fabales,Fabaceae,New York Botanical Garden
2,1270453,images/330/76/1270453.jpg,Leptospermum whitei Cheel,Myrtales,Myrtaceae,Queensland Herbarium
3,1123834,images/247/99/1123834.jpg,Fallopia scandens (L.) Holub,Caryophyllales,Polygonaceae,New York Botanical Garden
4,1042410,images/170/18/1042410.jpg,Cyperus dentatus Torr.,Poales,Cyperaceae,New York Botanical Garden


In [103]:
df.shape

(2257759, 6)

In [105]:
labeled = []
for i, row in df.iterrows():
    species = row['species']
    genus = row['genus']
    family = row['family']
    institution = row['institution']

    img_id = row['image_id']
    filename = row['filename']
    label = f"This is an image of species {species}, in the genus {genus} of family {family}. It is part of the collection of institution {institution}."

    labeled += [[img_id, filename, label]]

labeled_df = pd.DataFrame(labeled, columns=["image_id", "filename", "label"])
labeled_df.to_csv('./train_2021_labeled.csv')
labeled_df

Unnamed: 0,image_id,filename,label
0,1814367,images/604/92/1814367.jpg,This is an image of species Thysanocarpus curv...
1,1308257,images/108/24/1308257.jpg,This is an image of species Cassia grandis L.f...
2,1270453,images/330/76/1270453.jpg,This is an image of species Leptospermum white...
3,1123834,images/247/99/1123834.jpg,This is an image of species Fallopia scandens ...
4,1042410,images/170/18/1042410.jpg,This is an image of species Cyperus dentatus T...
...,...,...,...
2257754,812040,images/511/30/812040.jpg,"This is an image of species Quercus rubra L., ..."
2257755,701754,images/354/99/701754.jpg,This is an image of species Malus fusca (Raf.)...
2257756,662006,images/565/47/662006.jpg,This is an image of species Solanum elaeagnifo...
2257757,234505,images/574/99/234505.jpg,This is an image of species Sphyrospermum buxi...


In [28]:
train22_labeled = pd.read_csv('/projectnb/herbdl/data/kaggle-herbaria/train_2022_labeled.csv')
train22_metadata = pd.read_csv('/projectnb/herbdl/data/kaggle-herbaria/train_2022_metadata.csv')

In [29]:
train22_labeled.head()

Unnamed: 0.1,Unnamed: 0,image_id,filename,label
0,0,00000__001,000/00/00000__001.jpg,"This is an image of species amabilis, in the g..."
1,1,00000__002,000/00/00000__002.jpg,"This is an image of species amabilis, in the g..."
2,2,00000__003,000/00/00000__003.jpg,"This is an image of species amabilis, in the g..."
3,3,00000__004,000/00/00000__004.jpg,"This is an image of species amabilis, in the g..."
4,4,00000__005,000/00/00000__005.jpg,"This is an image of species amabilis, in the g..."


In [30]:
train22_metadata.sort_values('image_id')

Unnamed: 0.1,Unnamed: 0,image_id,filename,species,genus,family,institution
0,0,00000__001,000/00/00000__001.jpg,amabilis,Abies,Pinaceae,A
1,1,00000__002,000/00/00000__002.jpg,amabilis,Abies,Pinaceae,A
2,2,00000__003,000/00/00000__003.jpg,amabilis,Abies,Pinaceae,A
3,3,00000__004,000/00/00000__004.jpg,amabilis,Abies,Pinaceae,A
4,4,00000__005,000/00/00000__005.jpg,amabilis,Abies,Pinaceae,A
...,...,...,...,...,...,...,...
839767,839767,15504__032,155/04/15504__032.jpg,pachyacanthum,Zygophyllum,Zygophyllaceae,UTC
839768,839768,15504__033,155/04/15504__033.jpg,pachyacanthum,Zygophyllum,Zygophyllaceae,USF
839769,839769,15504__035,155/04/15504__035.jpg,pachyacanthum,Zygophyllum,Zygophyllaceae,COLO
839770,839770,15504__036,155/04/15504__036.jpg,pachyacanthum,Zygophyllum,Zygophyllaceae,COLO
