In [66]:
from nltk.corpus import wordnet as wn 
import pandas as pd
import os
import shutil
import random

In [27]:
# Get bird names from nltk corpus
bird1 = wn.synset('bird.n.01')
bird_list1 = list(set([w for s in bird1.closure(lambda s: s.hyponyms()) for w in s.lemma_names()]))

bird2 = wn.synset('bird.n.02')
bird_list2 = list(set([w for s in bird2.closure(lambda s: s.hyponyms()) for w in s.lemma_names()]))

len(bird_list1), len(bird_list2)

(1736, 27)

In [28]:
bird_list = list(set(bird_list1+bird_list2))
len(bird_list)

1747

In [29]:
# Clean up bird_list data
bird_list = [bird.replace("_"," ") for bird in bird_list]
bird_list

['Adelie',
 'Cygnus olor',
 'cardinal',
 'Eudromias morinellus',
 'yellow warbler',
 'Maryland yellowthroat',
 'goony',
 'black-fronted bush shrike',
 'black-necked stork',
 'shrike',
 'grebe',
 'bantam',
 'bellbird',
 'throstle',
 'apteryx',
 'honeycreeper',
 'Cistothorus palustris',
 'tanager',
 'Melanotis caerulescens',
 'Tringa totanus',
 'great grey owl',
 'mourning dove',
 'Delichon urbica',
 'chunga',
 'Spheniscus demersus',
 'kaki',
 'button-quail',
 'hepatic tanager',
 'common spoonbill',
 'whydah',
 'trogon',
 'courlan',
 'Emberiza hortulana',
 'New World sparrow',
 'Parus caeruleus',
 'pyrrhuloxia',
 'pea-chick',
 'sulphur-crested cockatoo',
 'white-crowned sparrow',
 'golden eagle',
 'scrubbird',
 'antbird',
 'moa',
 'shoebill',
 'wood swallow',
 'jacamar',
 'Oreortyx picta palmeri',
 'Calidris canutus',
 'Luscinia megarhynchos',
 'Leptoptilus dubius',
 'Cursorius cursor',
 'crow blackbird',
 'penguin',
 'Bartramia longicauda',
 'Auriparus flaviceps',
 'Spinus pinus',
 'mar

In [30]:
# Get generic bird name
generic_bird_name = []
for a in bird_list:
    generic_bird_name.append(a.split(" ")[-1])

generic_bird_name = list(set(generic_bird_name))
pattern = '|'.join(generic_bird_name)

In [31]:
# Get ImageNet class keys to download images from ImageNet
df = pd.read_csv("./ImageNet-Datasets-Downloader/classes_in_imagenet.csv")
df["class_name"] = df["class_name"].str.lower()
df.dropna(inplace=True)
df.head()

Unnamed: 0,synid,class_name,urls,flickr_urls
0,n00004475,organism,8,6
1,n00005787,benthos,1264,626
2,n00006024,heterotroph,1,0
3,n00006484,cell,1251,628
4,n00007846,person,1242,1138


In [32]:
df['class_name'].shape[0], df['class_name'].nunique()

(21839, 20067)

In [33]:
# Removing bird from imagenet dataset
df_non_bird = df[~df['class_name'].isin(bird_list)]
df_non_bird = df_non_bird[~df_non_bird['class_name'].str.contains(pattern)]

In [34]:
# Removing duplicates
df_non_bird = df_non_bird.drop_duplicates('class_name')

In [35]:
# Get non_bird id and name
# limit to only 7501 classes via random sampling
df_non_bird = df_non_bird.sample(n=7501,replace=False,random_state=42)
non_bird_id = df_non_bird['synid'].tolist()
non_bird_name = df_non_bird['class_name'].tolist()
non_bird_dict = dict(zip(non_bird_id, non_bird_name))

In [36]:
len(non_bird_dict)

7501

In [14]:
# Creare function to turn list of strings into single long string (to be executed on command line)
def convert_list_to_long_string(target_list):
    long_string=""
    for item in target_list:
        long_string += " " + item
    long_string = long_string[1:]
    return long_string

non_bird_id_string = convert_list_to_long_string(non_bird_id)
non_bird_id_string[:53]

'n10223606 n04390977 n03938401 n09936892 n01491874 n01'

In [15]:
# Download non_bird image from imagenet 
'''
!python3 ImageNet-Datasets-Downloader/downloader.py \
-data_root dataset/non_bird_images \
-use_class_list True \
-class_list $non_bird_id_string\
-images_per_class 10
'''

'\n!python3 ImageNet-Datasets-Downloader/downloader.py -data_root dataset/non_bird_images -use_class_list True -class_list $non_bird_id_string-images_per_class 10\n'

In [64]:
# Download image with bing image downloader
from bing_image_downloader import downloader
for query in non_bird_name:
    downloader.download(query, limit=5,  output_dir='dataset/temp', force_replace=False, verbose=False)

[%] Downloading Images to /Users/jychan/Desktop/birdr/dataset/temp/portraitist
[!] Issue getting: https://live.staticflickr.com/6050/6295387364_3b5e906718_z.jpg
[!] Error:: HTTP Error 404: Not Found


[%] Done. Downloaded 5 images.
[%] Downloading Images to /Users/jychan/Desktop/birdr/dataset/temp/pimento butter


[%] Done. Downloaded 5 images.
[%] Downloading Images to /Users/jychan/Desktop/birdr/dataset/temp/headpiece


[%] Done. Downloaded 5 images.
[%] Downloading Images to /Users/jychan/Desktop/birdr/dataset/temp/security consultant


[%] Done. Downloaded 5 images.
[%] Downloading Images to /Users/jychan/Desktop/birdr/dataset/temp/flower-of-an-hour
[Error]Invalid image, not saving https://objects.liquidweb.services/images/202009/inat_1601294267-5f71d25206291.jpg

[!] Issue getting: https://objects.liquidweb.services/images/202009/inat_1601294267-5f71d25206291.jpg
[!] Error:: Invalid image, not saving https://objects.liquidweb.services/images/202009/inat_1601294267-5f71d25206291.jp

KeyboardInterrupt: 

In [65]:
# Move non-bird image to 0_non_bird directory in train/valid/test directory
# train -7k, valid-2k, test-1k

# Move all images to non_bird_images directory
non_bird_dir = './dataset/non_bird_images'

file_names = os.listdir(non_bird_dir)
name=1
for file_name in file_names:
    for image in os.listdir(os.path.join(non_bird_dir,file_name)):
        new_filename = 'a'+str(name)+'.jpg'
        os.rename(os.path.join(non_bird_dir, file_name,image), os.path.join(non_bird_dir, file_name, new_filename))
        shutil.move(os.path.join(non_bird_dir, file_name,new_filename), non_bird_dir)
        name+=1


In [68]:
# Training set
source_dir = './dataset/non_bird_images'
target_dir = './dataset/train/0_non_bird'

for image in random.sample(os.listdir(source_dir), 7000):
    shutil.move(os.path.join(source_dir,image), target_dir)

# Validation set
target_dir = './dataset/valid/0_non_bird'
for image in random.sample(os.listdir(source_dir), 2000):
    shutil.move(os.path.join(source_dir,image), target_dir)

# Test set
target_dir = './dataset/test/0_non_bird'
for image in random.sample(os.listdir(source_dir), 1000):
    shutil.move(os.path.join(source_dir,image), target_dir)