## Selecting and Preprocessing iNaturalist Data

In [1]:
import json
import pandas as pd

In [2]:
# read the iNaturalist 2018/2019 training annotations and category tables
def _read_json(path):
    """Parse the JSON file at `path` and return the decoded object."""
    with open(path) as fh:
        return json.load(fh)


data2019 = _read_json('train2019.json')
cat2019 = _read_json('categories2019.json')
data2018 = _read_json('train2018.json')
cat2018 = _read_json('categories2018.json')

In [3]:
# check out what keys are available
data2019.keys()

dict_keys(['info', 'images', 'licenses', 'annotations', 'categories'])

In [4]:
# get numeric ID from binomial name
def getID(binom_name, cat=cat2019):
    """Return the numeric category ID for a binomial (scientific) name.

    Args:
        binom_name: species name to look up; compared case-insensitively
            against each row's 'name' field.
        cat: iterable of category dicts with 'name' and 'id' keys.
            Defaults to the 2019 category table (bound when this cell runs).

    Returns:
        The 'id' of the first matching row, or None when no row matches.
        Returning the value (rather than printing it) lets callers use the
        result; as the last expression of a cell it is still displayed.
    """
    wanted = binom_name.lower()
    for row in cat:
        if row['name'].lower() == wanted:
            return row['id']
    return None

In [5]:
# example lookup in the default (2019) category table; the name match is case-insensitive
getID("acer macrophyllum")

405


In [6]:
# key is binomial name; value is a tuple (common name, (2019_id, 2018_id)).
# An ID of None means no category ID is recorded here for that year.
# NOTE(review): the 2019 ID 68 for Asclepias Speciosa resolves to
# 'train_val2019/Insects/68' in the getDirectory(year=2019) output, which looks
# like the wrong category for a plant -- confirm against cat2019.
# NOTE(review): the getID("acer macrophyllum") lookup against cat2019 gives 405,
# yet Acer Macrophyllum has no 2019 ID below -- confirm whether 405 belongs here.
species_map = {"Corvus Corax": ("Raven", (212, 3278)),
"Sceloporus Occidentalis": ("Western Fence Lizard", (165, 4436)),
"Quercus Kelloggii": ("Black Oak", (657, 6851)),
"Cornus Nuttallii": ("Pacific Dogwood", (442, None)),
"Asclepias Speciosa": ("Milkweed", (68, 6904)),
"Pinus Ponderosa": ("Ponderosa Pine", (981, 7963)),
"Castilleja Miniata": ("Giant Red Paintbrush", (736, None)),
"Toxicodendron Diversilobum": ("Poison Oak", (None, 7741)),
"Odocoileus Hemionus": ("Mule Deer", (None, 4068)),
"Sequoiadendron Giganteum":("Redwood Tree", (None, 7935) ),
"Sarcodes Sanguinea": ("Snowplant", (None, 6607)),
"Calocedrus Decurrens":("California Incense Cedar", (None,7920)),
"Eschscholzia Californica": ("California Poppy", (None, 7512)),
"Juniperus Occidentalis": ("Western Juniper", (None, 7930)),
"Pinus Albicaulis": ("White Bark Pine", (None,7949)),
"Pinus Sabiniana": ("Digger Pine", (None,7966)),
"Quercus Lobata": ("Valley Oak", (None, 6853)),
"Pinus Attenuata": ("Knobcone Pine", (None,7950)),
"Aesculus Californica": ("California Buckeye", (None,7767)),
"Umbellularia Californica": ("California Laurel", (None,7246)),
"Torreya Californica": ("California Nutmeg", (None,7979)),
"Pseudotsuga Menziesii": ("Douglas Fir", (None,7971)),
"Acer Macrophyllum": ("Broadleaf Maple", (None,7757)),
"Populus Tremuloides": ("Quaking Aspen", (None,7330))
}

In [207]:
def getDirectory(year=2019, species_map=species_map):
    '''Return (binomial name, common name, directory) for each species found.

    Args:
        year: dataset year, 2018 or 2019; selects both the annotation dict
            (data2018 / data2019) and which ID of each species entry is used.
        species_map: dict mapping binomial name to
            (common name, (2019_id, 2018_id)).

    Returns:
        A list of (binomial name, common name, dirname) tuples in species_map
        order, where dirname is the first three "/"-separated components of
        the first image file name whose third component equals the species'
        ID for that year. Species whose ID is None, or whose ID matches no
        image, are left out.

    Raises:
        ValueError: if year is neither 2018 nor 2019.
    '''
    # Pick the dataset explicitly instead of eval()-ing a constructed name.
    if year == 2019:
        data, id_pos = data2019, 0
    elif year == 2018:
        data, id_pos = data2018, 1
    else:
        raise ValueError("year must be 2018 or 2019, got %r" % (year,))

    # Single pass over the image list: remember the first directory seen for
    # each category ID, rather than rescanning every image for every species.
    first_dir = {}
    for row in data["images"]:
        parts = row.get("file_name", "").split("/")
        if len(parts) < 3:
            # no category component in this path; nothing to index
            continue
        first_dir.setdefault(parts[2], "/".join(parts[:3]))

    dirs = []
    for binom_name, entry in species_map.items():
        common_name = entry[0]
        category_id = entry[1][id_pos]
        if category_id is None:
            # species has no ID recorded for this year
            continue
        dirname = first_dir.get(str(category_id))
        if dirname is not None:
            dirs.append((binom_name, common_name, dirname))
    return dirs

### Directories for each species in `species_map`

In [206]:
# 2018 directories; species with no 2018 ID or no matching image are left out of the result
getDirectory(year=2018)

[('Corvus Corax', 'Raven', 'train_val2018/Aves/3278'),
 ('Sceloporus Occidentalis',
  'Western Fence Lizard',
  'train_val2018/Reptilia/4436'),
 ('Quercus Kelloggii', 'Black Oak', 'train_val2018/Plantae/6851'),
 ('Asclepias Speciosa', 'Milkweed', 'train_val2018/Plantae/6904'),
 ('Pinus Ponderosa', 'Ponderosa Pine', 'train_val2018/Plantae/7963'),
 ('Toxicodendron Diversilobum', 'Poison Oak', 'train_val2018/Plantae/7741'),
 ('Odocoileus Hemionus', 'Mule Deer', 'train_val2018/Mammalia/4068'),
 ('Sequoiadendron Giganteum', 'Redwood Tree', 'train_val2018/Plantae/7935'),
 ('Sarcodes Sanguinea', 'Snowplant', 'train_val2018/Plantae/6607'),
 ('Calocedrus Decurrens',
  'California Incense Cedar',
  'train_val2018/Plantae/7920'),
 ('Eschscholzia Californica',
  'California Poppy',
  'train_val2018/Plantae/7512'),
 ('Juniperus Occidentalis', 'Western Juniper', 'train_val2018/Plantae/7930'),
 ('Pinus Albicaulis', 'White Bark Pine', 'train_val2018/Plantae/7949'),
 ('Pinus Sabiniana', 'Digger Pine', 

In [208]:
# 2019 directories; species with no 2019 ID or no matching image are left out of the result
getDirectory(year=2019)

[('Corvus Corax', 'Raven', 'train_val2019/Birds/212'),
 ('Sceloporus Occidentalis',
  'Western Fence Lizard',
  'train_val2019/Reptiles/165'),
 ('Quercus Kelloggii', 'Black Oak', 'train_val2019/Plants/657'),
 ('Cornus Nuttallii', 'Pacific Dogwood', 'train_val2019/Plants/442'),
 ('Asclepias Speciosa', 'Milkweed', 'train_val2019/Insects/68'),
 ('Pinus Ponderosa', 'Ponderosa Pine', 'train_val2019/Plants/981'),
 ('Castilleja Miniata', 'Giant Red Paintbrush', 'train_val2019/Plants/736')]

In [210]:
# brief EDA on dataset: one category ID (as a string) per annotation
im_catlist2018 = [str(row['category_id']) for row in data2018['annotations']]
im_catlist2019 = [str(row['category_id']) for row in data2019['annotations']]

# total image count
print("2018 total images:", len(im_catlist2018))
print("2019 total images:", len(im_catlist2019))
# unique species
print("2018 total species:", len(set(im_catlist2018)))
print("2019 total species:", len(set(im_catlist2019)))

# image IDs annotated with category 212. This previously read from an undefined
# global `data`; 212 is the 2019 ID listed for Corvus Corax in species_map, so
# the 2019 annotations are used. NOTE(review): confirm 2019 was the intended year.
im_id = [row["image_id"] for row in data2019["annotations"] if row["category_id"] == 212]

2018 total images: 437513
2019 total images: 265213
2018 total species: 8142
2019 total species: 1010


In [211]:
# number of images per category ID, for each dataset year
df_cnt2018 = pd.Series(im_catlist2018, name="category_id").value_counts()
cat_table = pd.Series(im_catlist2019, name="category_id")
df_cnt2019 = cat_table.value_counts()

In [212]:
# the largest 2018 class has 1000 photos, but the mean is ~54 and the median only 22
df_cnt2018.describe()

count    8142.000000
mean       53.735323
std       117.111695
min         2.000000
25%        16.000000
50%        22.000000
75%        30.000000
max      1000.000000
Name: category_id, dtype: float64

In [213]:
# 2019 has fewer species (1010) but more photos per class: mean ~263, max 500
df_cnt2019.describe()

count    1010.000000
mean      262.587129
std       167.614673
min        16.000000
25%       111.000000
50%       212.000000
75%       477.750000
max       500.000000
Name: category_id, dtype: float64

In [214]:
# look at the image count for a single class (the first species found in 2018)
dirs2018 = getDirectory(2018)
dirs2019 = getDirectory(2019)
# the count tables are indexed by category ID as a string, i.e. the last
# component of the directory path; use the 2018 table for a 2018 directory
print(dirs2018[0][0], df_cnt2018.loc[dirs2018[0][2].split("/")[2]])

Corvus Corax 677


In [215]:
# image counts for each of our species; the category ID is the last path component
clscnt2018 = [(name, df_cnt2018.loc[path.split("/")[2]]) for name, _, path in dirs2018]
clscnt2019 = [(name, df_cnt2019.loc[path.split("/")[2]]) for name, _, path in dirs2019]
_df2018 = pd.DataFrame(clscnt2018, columns=["binom_name", "count2018"])
_df2019 = pd.DataFrame(clscnt2019, columns=["binom_name", "count2019"])

In [216]:
_df2018.head()

Unnamed: 0,binom_name,count2018
0,Corvus Corax,677
1,Sceloporus Occidentalis,1000
2,Quercus Kelloggii,77
3,Asclepias Speciosa,61
4,Pinus Ponderosa,57


In [242]:
# combine the per-year directory tables and image counts into a single frame
name_cols = ["binom_name", "common_name"]
_dirs2018 = pd.DataFrame(dirs2018, columns=name_cols + ["dir2018"])
_dirs2019 = pd.DataFrame(dirs2019, columns=name_cols + ["dir2019"])
# outer joins keep species that appear in only one of the two years
dirs = _dirs2018.merge(_dirs2019, how="outer", on=name_cols)
df = (
    dirs
    .merge(_df2018, how="outer", on="binom_name")
    .merge(_df2019, how="outer", on="binom_name")
)

In [243]:
import numpy as np 
# blank out missing values (species absent from one year) so later cells can
# test for ""; fillna returns a new frame instead of mutating in place, and
# no regex matching is involved in replacing NaN
df = df.fillna("")

In [341]:
df

Unnamed: 0,binom_name,common_name,dir2018,dir2019,count2018,count2019,copy_cmd2018,copy_cmd2019
0,Corvus Corax,Raven,train_val2018/Aves/3278,train_val2019/Birds/212,677.0,500.0,gsutil -m cp -r train_val2018/Aves/3278 gs://m...,gsutil -m cp -r train_val2018/Aves/3278 gs://m...
1,Sceloporus Occidentalis,Western Fence Lizard,train_val2018/Reptilia/4436,train_val2019/Reptiles/165,1000.0,500.0,gsutil -m cp -r train_val2018/Reptilia/4436 gs...,gsutil -m cp -r train_val2018/Reptilia/4436 gs...
2,Quercus Kelloggii,Black Oak,train_val2018/Plantae/6851,train_val2019/Plants/657,77.0,500.0,gsutil -m cp -r train_val2018/Plantae/6851 gs:...,gsutil -m cp -r train_val2018/Plantae/6851 gs:...
3,Asclepias Speciosa,Milkweed,train_val2018/Plantae/6904,train_val2019/Insects/68,61.0,500.0,gsutil -m cp -r train_val2018/Plantae/6904 gs:...,gsutil -m cp -r train_val2018/Plantae/6904 gs:...
4,Pinus Ponderosa,Ponderosa Pine,train_val2018/Plantae/7963,train_val2019/Plants/981,57.0,500.0,gsutil -m cp -r train_val2018/Plantae/7963 gs:...,gsutil -m cp -r train_val2018/Plantae/7963 gs:...
5,Toxicodendron Diversilobum,Poison Oak,train_val2018/Plantae/7741,,604.0,,gsutil -m cp -r train_val2018/Plantae/7741 gs:...,
6,Odocoileus Hemionus,Mule Deer,train_val2018/Mammalia/4068,,950.0,,gsutil -m cp -r train_val2018/Mammalia/4068 gs...,
7,Sequoiadendron Giganteum,Redwood Tree,train_val2018/Plantae/7935,,350.0,,gsutil -m cp -r train_val2018/Plantae/7935 gs:...,
8,Sarcodes Sanguinea,Snowplant,train_val2018/Plantae/6607,,90.0,,gsutil -m cp -r train_val2018/Plantae/6607 gs:...,
9,Calocedrus Decurrens,California Incense Cedar,train_val2018/Plantae/7920,,52.0,,gsutil -m cp -r train_val2018/Plantae/7920 gs:...,


In [346]:
# build the gsutil commands that copy each species directory from the GCP instance to the bucket
# destination folder is "<first word>_<second word>" of the binomial name
bucket_dest = "gs://muirsquest/" + df["binom_name"].apply(lambda name: "{}_{}".format(*name.split()))
for dir_col, cmd_col in (("dir2018", "copy_cmd2018"), ("dir2019", "copy_cmd2019")):
    df[cmd_col] = "gsutil -m cp -r " + df[dir_col] + "/. " + bucket_dest
    # species without a directory for that year get no command
    df.loc[df[dir_col] == "", cmd_col] = ""

In [347]:
# save the gsutil copy commands, one per line, in a text file per dataset year
for out_path, cmd_col in (('2018dirs.txt', "copy_cmd2018"), ('2019dirs.txt', "copy_cmd2019")):
    with open(out_path, 'w') as out_file:
        out_file.writelines('%s\n' % cmd for cmd in df[cmd_col])

In [311]:
def getNegClass(year=2019, species_map=species_map):
    '''Return the file names of all images outside our species classes.

    Args:
        year: dataset year, 2018 or 2019; selects both the annotation dict
            (data2018 / data2019) and which ID of each species entry is used.
        species_map: dict mapping binomial name to
            (common name, (2019_id, 2018_id)).

    Returns:
        A list with the "file_name" of every image whose third "/"-separated
        path component is not one of the species IDs for that year. The full
        list is returned; any sampling is left to the caller.

    Raises:
        ValueError: if year is neither 2018 nor 2019.
    '''
    # Pick the dataset explicitly instead of eval()-ing a constructed name.
    if year == 2019:
        data, id_pos = data2019, 0
    elif year == 2018:
        data, id_pos = data2018, 1
    else:
        raise ValueError("year must be 2018 or 2019, got %r" % (year,))

    # A set gives constant-time membership tests across the whole image list.
    positive_ids = {
        str(entry[1][id_pos])
        for entry in species_map.values()
        if entry[1][id_pos] is not None
    }

    negatives = []
    for row in data["images"]:
        file_name = row.get("file_name", "")
        parts = file_name.split("/")
        if len(parts) < 3:
            # No category component in the path. Skip just this row instead of
            # silently aborting the scan and returning a truncated list.
            continue
        if parts[2] not in positive_ids:
            negatives.append(file_name)
    return negatives

In [322]:
# negative-class image paths for 2019 (getNegClass defaults to year=2019)
negdirs2019 = getNegClass()

In [316]:
# negative-class image paths for 2018
negdirs2018 = getNegClass(year=2018)

In [339]:
# save the negative-class image paths, one per line, in a text file per dataset year
for out_path, neg_paths in (('2018negdirs.txt', negdirs2018), ('2019negdirs.txt', negdirs2019)):
    with open(out_path, 'w') as out_file:
        out_file.writelines('%s\n' % path for path in neg_paths)

In [332]:
# one-column frame holding the 2018 negative-class image paths
dfn = pd.DataFrame(negdirs2018, columns=["negdirs"])

In [333]:
# draw 8000 of the 2018 negative paths; random_state=42 is passed, presumably to keep the draw repeatable
sampled_negs = dfn.sample(n=8000,random_state=42)

In [334]:
sampled_negs.head()

Unnamed: 0,negdirs
211499,train_val2018/Mammalia/4211/4b56fa12f3177000ba...
57699,train_val2018/Insecta/1566/765482acc10df135229...
199862,train_val2018/Aves/3355/37766b0263bce315fa62d6...
51040,train_val2018/Plantae/6660/b359c5e6625dbbad194...
200208,train_val2018/Reptilia/4362/2d2244d25249979636...


In [337]:
# one gsutil command per sampled image, copying it into the bucket's negclass folder.
# The trailing "/" makes gsutil treat the destination as a folder; without it a
# single-file copy can be written to an object literally named "negclass" when
# that prefix does not exist yet, with each later copy overwriting it.
sampled_negs["cmd"] = "gsutil cp " + sampled_negs["negdirs"] + " " + "gs://muirsquest/negclass/"

In [338]:
# save the sampled negative-class copy commands, one per line
with open('negSampleCmds8000.txt', 'w') as out_file:
    out_file.writelines('%s\n' % cmd for cmd in sampled_negs["cmd"])