In [71]:
import pandas as pd
import numpy as np
import json

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Herbaria 2022

In [72]:
HERBARIA22_TRAIN_PATH = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2021/train_images/"
HERBARIA22_TEST_PATH = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2022/test_images/"
HERBARIA22_METADATA_TRAIN = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2022/train_metadata.json"
HERBARIA22_METADATA_TEST = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2022/test_metadata.json"

In [73]:
with open(HERBARIA22_METADATA_TRAIN) as f:
    train_metadata = json.load(f)

with open(HERBARIA22_METADATA_TEST) as f:
    test_metadata = json.load(f)

In [74]:
categories = {cat['category_id']: cat for cat in train_metadata['categories']}
images = {im['image_id']: im for im in train_metadata['images']}
institutions = {inst['institution_id']: inst for inst in train_metadata['institutions']}

In [75]:
train22_processed = [] # will be turned into a dataframe later. sublists are of the form <image_id>, <image_path>, <species>, <genus>, <family>, <institution>

for i in range(len(train_metadata['annotations'])):
    image = train_metadata['annotations'][i]

    image_id = image['image_id']
    image_path = images[image_id]['file_name']
    category = categories[image['category_id']]
    institution = institutions[image['institution_id']]    

    sublist = [image_id, image_path, category['species'], category['genus'], category['family'], institution['collectionCode']]
    train22_processed += [sublist]
    
    

In [76]:
df = pd.DataFrame(train22_processed, columns=['image_id', 'filename', 'species', 'genus', 'family', 'institution'])
df.to_csv("/projectnb/herbdl/data/kaggle-herbaria/train_2022_metadata.csv")
df.head()

Unnamed: 0,image_id,filename,species,genus,family,institution
0,00000__001,000/00/00000__001.jpg,amabilis,Abies,Pinaceae,A
1,00000__002,000/00/00000__002.jpg,amabilis,Abies,Pinaceae,A
2,00000__003,000/00/00000__003.jpg,amabilis,Abies,Pinaceae,A
3,00000__004,000/00/00000__004.jpg,amabilis,Abies,Pinaceae,A
4,00000__005,000/00/00000__005.jpg,amabilis,Abies,Pinaceae,A


In [77]:
len(df)

839772

#### Labeling

In [78]:
labeled = []
for i, row in df.iterrows():
    species = row['species']
    genus = row['genus']
    family = row['family']
    institution = row['institution']

    img_id = row['image_id']
    filename = row['filename']
    caption = f"This is an image of species {species}, in the genus {genus} of family {family}. It is part of the collection of institution {institution}."
    scientificName = family + " " + genus + " " + species

    labeled += [[img_id, filename, caption, scientificName, family, genus, species]]

kaggle22 = pd.DataFrame(labeled, columns=["image_id", "filename", "caption", "scientificName", "family", "genus", "species"])

In [79]:
le = LabelEncoder()
le.fit(kaggle22['scientificName'])
kaggle22['scientificNameEncoded'] = le.transform(kaggle22['scientificName'])

kaggle22.head()

In [None]:
train22, val22 = train_test_split(kaggle22, test_size=0.2, random_state=42)

In [80]:
train22.head()

Unnamed: 0,image_id,filename,caption,scientificName,family,genus,species,scientificNameEncoded
686446,12640__012,126/40/12640__012.jpg,"This is an image of species cordovensis, in th...",Asteraceae Roldana cordovensis,Asteraceae,Roldana,cordovensis,3150
83405,01612__004,016/12/01612__004.jpg,"This is an image of species sophoroides, in th...",Fabaceae Astragalus sophoroides,Fabaceae,Astragalus,sophoroides,7279
155613,02962__081,029/62/02962__081.jpg,"This is an image of species whitneyi, in the g...",Cyperaceae Carex whitneyi,Cyperaceae,Carex,whitneyi,6101
455876,08401__098,084/01/08401__098.jpg,"This is an image of species philadelphicum, in...",Liliaceae Lilium philadelphicum,Liliaceae,Lilium,philadelphicum,9199
141438,02707__054,027/07/02707__054.jpg,"This is an image of species hoodii, in the gen...",Cyperaceae Carex hoodii,Cyperaceae,Carex,hoodii,5846


In [81]:
val22.head()

Unnamed: 0,image_id,filename,caption,scientificName,family,genus,species,scientificNameEncoded
412294,07635__028,076/35/07635__028.jpg,"This is an image of species remota, in the gen...",Malvaceae Iliamna remota,Malvaceae,Iliamna,remota,9543
363802,06751__013,067/51/06751__013.jpg,"This is an image of species clokeyi, in the ge...",Polemoniaceae Gilia clokeyi,Polemoniaceae,Gilia,clokeyi,12694
790220,14572__001,145/72/14572__001.jpg,"This is an image of species variabilis, in the...",Bromeliaceae Tillandsia variabilis,Bromeliaceae,Tillandsia,variabilis,4603
780204,14383__110,143/83/14383__110.jpg,"This is an image of species buceras, in the ge...",Combretaceae Terminalia buceras,Combretaceae,Terminalia,buceras,5273
502646,09270__114,092/70/09270__114.jpg,"This is an image of species elaeagnoides, in t...",Melastomataceae Miconia elaeagnoides,Melastomataceae,Miconia,elaeagnoides,9794


In [84]:
train22.shape, val22.shape

((671817, 8), (167955, 8))

In [96]:
train22.to_csv('/projectnb/herbdl/data/kaggle-herbaria/train_2022.csv')
val22.to_csv('/projectnb/herbdl/data/kaggle-herbaria/val_2022.csv')

In [111]:
train22.to_json('/projectnb/herbdl/data/kaggle-herbaria/train_2022.json', orient='records', lines=True)
val22.to_json('/projectnb/herbdl/data/kaggle-herbaria/val_2022.json', orient='records', lines=True)

## Herbaria 2021

In [113]:
HERBARIA21_TRAIN_PATH = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2021/train/"
HERBARIA21_TEST_PATH = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2021/test/"
HERBARIA21_METADATA_TRAIN = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2021/train/metadata.json"
HERBARIA21_METADATA_TEST = "/projectnb/herbdl/data/kaggle-herbaria/herbarium-2021/test/metadata.json"

In [114]:
with open(HERBARIA21_METADATA_TRAIN) as f:
    train_metadata = json.load(f)

with open(HERBARIA21_METADATA_TEST) as f:
    test_metadata = json.load(f)

In [115]:
train_metadata.keys()

dict_keys(['annotations', 'categories', 'images', 'info', 'licenses', 'institutions'])

In [116]:
train_metadata['categories']

[{'family': 'Orchidaceae',
  'order': 'Asparagales',
  'name': 'Aa calceata (Rchb.f.) Schltr.',
  'id': 0},
 {'family': 'Orchidaceae',
  'order': 'Asparagales',
  'name': 'Aa matthewsii (Rchb.f.) Schltr.',
  'id': 1},
 {'family': 'Orchidaceae',
  'order': 'Asparagales',
  'name': 'Aa paleacea (Kunth) Rchb.f.',
  'id': 2},
 {'family': 'Fabaceae',
  'order': 'Fabales',
  'name': 'Abarema abbottii (Rose & Leonard) Barneby & J.W.Grimes',
  'id': 3},
 {'family': 'Fabaceae',
  'order': 'Fabales',
  'name': 'Abarema acreana (J.F.Macbr.) L.Rico',
  'id': 4},
 {'family': 'Fabaceae',
  'order': 'Fabales',
  'name': 'Abarema adenophora (Ducke) Barneby & J.W.Grimes',
  'id': 5},
 {'family': 'Fabaceae',
  'order': 'Fabales',
  'name': 'Abarema agropecuaria Barneby & J.W.Grimes',
  'id': 6},
 {'family': 'Fabaceae',
  'order': 'Fabales',
  'name': 'Abarema alexandrae (Urb.) Barneby & J.W.Grimes',
  'id': 7},
 {'family': 'Fabaceae',
  'order': 'Fabales',
  'name': 'Abarema auriculata (Benth.) Barneby 

In [117]:
categories = {cat['id']: cat for cat in train_metadata['categories']}
images = {im['id']: im for im in train_metadata['images']}
institutions = {inst['id']: inst for inst in train_metadata['institutions']}

In [118]:
train21_processed = [] # will be turned into a dataframe later. sublists are of the form <image_id>, <image_path>, <species>, <order>, <family>, <institution>

for i in range(len(train_metadata['annotations'])):
    image = train_metadata['annotations'][i]

    image_id = image['id']
    image_path = images[image_id]['file_name']
    category = categories[image['category_id']]
    institution = institutions[image['institution_id']]
    name = category['name']
    species = name.split(' ')[1]
    genus = name.split(' ')[0]
    family = category['family']

    sublist = [image_id, image_path, species, genus, family, institution['name']]
    train21_processed += [sublist]
    
    

In [119]:
df = pd.DataFrame(train21_processed, columns=['image_id', 'filename', 'species', 'genus', 'family', 'institution'])
df.head()

Unnamed: 0,image_id,filename,species,genus,family,institution
0,1814367,images/604/92/1814367.jpg,curvipes,Thysanocarpus,Brassicaceae,New York Botanical Garden
1,1308257,images/108/24/1308257.jpg,grandis,Cassia,Fabaceae,New York Botanical Garden
2,1270453,images/330/76/1270453.jpg,whitei,Leptospermum,Myrtaceae,Queensland Herbarium
3,1123834,images/247/99/1123834.jpg,scandens,Fallopia,Polygonaceae,New York Botanical Garden
4,1042410,images/170/18/1042410.jpg,dentatus,Cyperus,Cyperaceae,New York Botanical Garden


In [120]:
df.shape

(2257759, 6)

In [121]:
labeled = []
for i, row in df.iterrows():
    species = row['species']
    genus = row['genus']
    family = row['family']
    institution = row['institution']

    img_id = row['image_id']
    filename = row['filename'].replace("images/", "")
    caption = f"This is an image of species {species}, in the genus {genus} of family {family}. It is part of the collection of institution {institution}."
    scientificName = family + " " + genus + " " + species

    labeled += [[img_id, filename, caption, scientificName, family, genus, species]]

kaggle21 = pd.DataFrame(labeled, columns=["image_id", "filename", "caption", "scientificName", "family", "genus", "species"])

In [122]:
le = LabelEncoder()
le.fit(kaggle21['scientificName'])
kaggle21['scientificNameEncoded'] = le.transform(kaggle21['scientificName'])

train21, val21 = train_test_split(kaggle21, test_size=0.2, random_state=42)

In [123]:
train21.head()

Unnamed: 0,image_id,filename,caption,scientificName,family,genus,species,scientificNameEncoded
1873445,1123740,494/80/1123740.jpg,"This is an image of species mollis, in the gen...",Apocynaceae Prestonia mollis,Apocynaceae,Prestonia,mollis,3320
644516,1242345,161/93/1242345.jpg,"This is an image of species parsonsia, in the ...",Lythraceae Cuphea parsonsia,Lythraceae,Cuphea,parsonsia,32636
1177436,2238072,069/53/2238072.jpg,"This is an image of species egensis, in the ge...",Melastomataceae Bellucia egensis,Melastomataceae,Bellucia,egensis,34222
1700471,1458,001/99/1458.jpg,"This is an image of species acradenia, in the ...",Fabaceae Acacia acradenia,Fabaceae,Acacia,acradenia,22767
1958054,344538,219/79/344538.jpg,"This is an image of species halleanum, in the ...",Onagraceae Epilobium halleanum,Onagraceae,Epilobium,halleanum,39755


In [124]:
val21.head()

Unnamed: 0,image_id,filename,caption,scientificName,family,genus,species,scientificNameEncoded
720000,1926097,109/97/1926097.jpg,"This is an image of species equisetifolia, in ...",Casuarinaceae Casuarina equisetifolia,Casuarinaceae,Casuarina,equisetifolia,13986
2107948,25793,426/57/25793.jpg,"This is an image of species hoffmannseggiana, ...",Rubiaceae Palicourea hoffmannseggiana,Rubiaceae,Palicourea,hoffmannseggiana,54157
1715913,822223,532/16/822223.jpg,"This is an image of species fragilis_x, in the...",Salicaceae Salix fragilis_x,Salicaceae,Salix,fragilis_x,55603
697108,1593343,359/14/1593343.jpg,"This is an image of species polyantha, in the ...",Marcgraviaceae Marcgravia polyantha,Marcgraviaceae,Marcgravia,polyantha,33963
1173726,531146,316/21/531146.jpg,"This is an image of species affinis, in the ge...",Rubiaceae Kadua affinis,Rubiaceae,Kadua,affinis,53838


In [125]:
train21.to_csv('/projectnb/herbdl/data/kaggle-herbaria/train_2021.csv')
val21.to_csv('/projectnb/herbdl/data/kaggle-herbaria/val_2021.csv')

In [126]:
train21.to_json('/projectnb/herbdl/data/kaggle-herbaria/train_2021.json', orient='records', lines=True)
val21.to_json('/projectnb/herbdl/data/kaggle-herbaria/val_2021.json', orient='records', lines=True)