# MS Classifier for Reticulated Giraffe Collection Classification
- The MS Classifier only predicts 'giraffe', not 'reticulated giraffe' (at least according to the predictions shown on the API demo page)

   

In [1]:
#to reflect changes made in modules
%load_ext autoreload
%autoreload 2

## Keys

In [2]:
import os
import warnings

# SECURITY: the MongoDB connection string embeds a username and password, so it must not be
# committed with the notebook. Export it before starting Jupyter, e.g.
#   export WILDBOOK_DB_KEY="mongodb+srv://<user>:<password>@<host>/<db>?retryWrites=true&w=majority"
# NOTE(review): the credentials that used to be hard-coded in this cell (and the commented-out
# MS API key) should be treated as leaked and rotated.
DB_KEY = os.environ.get("WILDBOOK_DB_KEY")  # connect to database here (see owners for access)
if not DB_KEY:
    # warn instead of raising so the rest of the notebook can still be opened and read
    warnings.warn("WILDBOOK_DB_KEY is not set; the database connection cannot be established.")

In [3]:
import os, sys
# sys.path.append(os.path.join(sys.path[0], '../'))
sys.path.append(os.path.join(os.path.abspath(os.getcwd()), '../'))

#distance visualization
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib.pyplot as plt

#import flickr and db modules
from wildbook_social import Flickr, Database

# set up: build the Database handle from the connection string
# (the second argument is presumably the database name — confirm in wildbook_social)
db = Database(DB_KEY, 'flickr_june_2019')
# db = Database(DB_KEY, 'imgs_for_species_classifier')
# Flickr helper constructed with the database handle
fr = Flickr(db)

In [4]:
## get the current mongoDB database collection object
db_obj = db.getDB()

In [None]:
from wildbook_social import SpeciesClassifier 
# NOTE: a later cell runs `from IPython.display import Image, display`, which rebinds the
# name `Image`. The two imports shadow each other, so re-running this cell after that one
# makes display_image() pick up wildbook_social.Image instead of the IPython class.
from wildbook_social import Image

## instance of the MS Species Classification API and Save Class to reformat Flickr data for API
sc = SpeciesClassifier()
img = Image()

# Select MongoDB Collection + MS Classifier Setup

In [None]:
# saveTo = 'reticulated giraffe general africa bbox'
saveTo = 'reticulated giraffe general multilingual'

print('You are working with the collection: ', saveTo)

In [None]:
## rename 'url_l' field in docs to just 'url'
db.renameField(saveTo, 'url_l', 'url')

# Batch Relevance Filtration with MS Classifier
- Automatically filter through unlabeled images and have classifier mark as relevant if species is in frame
- Run only one of the two options: (1) classify the entire collection, or (2) classify smaller subsets of the collection

In [None]:
# query the documents whose 'relevant' field is still None (not yet labelled)
res = db_obj[saveTo].find({'relevant':None})
# materialize the cursor; the length of this list is used later as numToClassify
res_list = list(res)
len(res_list)

In [None]:
saveTo

### (A) Classify Entire Collection ...

In [None]:
numToClassify = len(res_list) # can also be set manually (e.g. to 100) if you don't want to classify the entire collection in one go
confidence = 0.0 # as long as MS classifier *thinks* there is a giraffe, we'll mark it relevant, and double check manually later on when filtering for captive/wild
species_keyword = 'Giraffe'

# encode the metadata of up to numToClassify documents in the form the MS classifier expects
flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)
print(len(flickr_img_dicts))

# run the classifier on the encoded images of this collection
sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)

### (B) ... Or Classify Smaller Subsets of Collection to avoid Timeout

In [None]:
num_unfiltered = db_obj[saveTo].count_documents({'relevant': None})
num_unfiltered

In [None]:
# NOTE(review): the divisor 50 does not match the batch size (numToClassify = 100), so
# num_batches is only an upper bound on the number of passes; the loop below stops early
# once a batch comes back empty instead of calling the classifier with nothing to classify.
num_batches = int(num_unfiltered / 50) + 1
numToClassify = 100
confidence = 0.0
species_keyword = 'Giraffe'

print('Num. of Batches: ', num_batches)
for i in range(num_batches):
    print('Batch:', i)
    # encode the next batch of documents for the classifier
    flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)
    print(len(flickr_img_dicts))
    if len(flickr_img_dicts) == 0:
        # nothing left to send; assumes later calls would also return nothing — TODO confirm
        break
    sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)
print('Done with all batches')

# Visualizing MS Species Classifier Results

In [None]:
import ipyplot

In [None]:
## get images labeled as relevant and irrelevant
# the projection {"url": 1} limits each returned document to its url; the '_id' of each
# document is read from these results later and used as the image label
images = db_obj[saveTo].find({"relevant": True}, {"url": 1})
images_irrel = db_obj[saveTo].find({"relevant": False}, {"url": 1})

In [None]:
list_of_imgs = list(images)
list_of_imgs_irrel = list(images_irrel)

In [None]:
imgs_url = [dic['url'] for dic in list_of_imgs]
imgs_url_irrel = [dic['url'] for dic in list_of_imgs_irrel]

In [None]:
labels = [dic['_id'] for dic in list_of_imgs]
labels_irrel = [dic['_id'] for dic in list_of_imgs_irrel]

## Compare counts
- plot count of relevant vs non-relevant for each collection

In [None]:
saveTo

In [None]:
# number of urls collected for the relevant / irrelevant images
count_rel = len(imgs_url)
count_irrel = len(imgs_url_irrel)

# single-row table (index 0) with one column per label
data = {'relevant': count_rel, 'irrelevant': count_irrel}
df_counts = pd.DataFrame(data, index=[0])
print(df_counts)

### Plot Images in A Grid

In [None]:
ipyplot.plot_images(imgs_url, labels, max_images = 10, img_width=100)

# Double Checking Relevant Images + Classifying as Captive/Wild
- Images were marked relevant (i.e. as containing a giraffe) by the MS Classifier at a 0.0 confidence threshold, to pool all possible results
- wild category now has option to declare relevant images as:
    - wild (y)
    - unknown (u)
    - not wild (n)
    
    This way, we can more accurately classify an image that contains a reticulated giraffe even when we are unsure of its wild/captive status because too little information/context is available

In [None]:
# ## saveTo = 'reticulated giraffe general'
# ## saveTo = 'reticulated giraffe general africa bbox'
# ## saveTo = 'reticulated giraffe general multilingual' 
# ## saveTo = 'reticulated giraffe general multilingual africa bbox'
# ## saveTo = 'reticulated giraffe specific'
# ## saveTo = 'reticulated giraffe specific africa bbox'

saveTo

In [None]:
amt_remaining_to_check = db_obj[saveTo].count_documents({"$and": [{"relevant": True}, {"wild": None}]})
print(amt_remaining_to_check)

In [None]:
amt_remaining_to_double_check = db_obj[saveTo].count_documents({"$and": [{"relevant": True}, {"double_checked": False}]})
print(amt_remaining_to_double_check)

In [None]:
amount = 100
db.doubleCheckRelevantImages(saveTo, amount, first_round = False)

## Deleting Duplicates so I don't keep filtering/annotating the same image over and over again

In [None]:
# saveTo = 'reticulated giraffe specific africa bbox'


In [None]:
print(saveTo)
db.renameField(saveTo, 'url_l', 'url')

In [None]:
res = db_obj[saveTo].find()
all_urls = [item['url'] for item in res]
len(all_urls)

In [None]:
import numpy as np
unique_urls = np.unique(all_urls)
len(unique_urls)

In [None]:
# # pass in each url and only keep one item
# # DELETES DOCUMENTS!!!
# for url in unique_urls:
#     res = db_obj[saveTo].find({'url': url}, { 'item': 0, 'status': 0, '_id': 1 })
#     count = db_obj[saveTo].count_documents({'url': url})
#     id_list = [item['_id'] for item in res]
#     #only keep the image at id_list[0]
#     for i in range(1, len(id_list)):
#         db_obj[saveTo].delete_one({'_id': id_list[i]})
#     print('We kept: {}'.format(id_list[0]))

## Cross Checking Images for Consistent Wild/Relevant Status Across Collections

In [5]:
import numpy as np
def get_all_unique_urls(db_obj, species_collections):
    ''' returns the distinct urls of relevant, double-checked documents across collections

    db_obj: handle indexed by collection name (db_obj[name] must offer find()).
    species_collections: iterable of collection names to scan.

    Returns the numpy array produced by np.unique over every collected url.
    '''
    reviewed_filter = {'relevant': True, 'double_checked': True}

    # find() yields documents shaped like {_id: ..., url: ...}; keep only the url of each
    urls = []
    for name in species_collections:
        matching_docs = list(db_obj[name].find(reviewed_filter, {'url'}))
        urls.extend(doc['url'] for doc in matching_docs)

    # np.unique drops urls that occur in more than one collection
    return np.unique(np.array(urls))

In [8]:
reticulated_giraffe_collections = ['reticulated giraffe general', 
                                   'reticulated giraffe general africa bbox', 
                                   'reticulated giraffe general multilingual', 
                                   'reticulated giraffe general multilingual africa bbox',
                                   'reticulated giraffe specific', 
                                   'reticulated giraffe specific africa bbox']
unique_urls = get_all_unique_urls(db_obj, reticulated_giraffe_collections )

In [9]:
print(len(unique_urls))

622


### Create Dataframe with Wild Status for each url/collection

In [12]:
# create a dataframe with unique_urls
import pandas as pd

# placeholder values, one per url: wild status not known yet (None), nothing checked yet (False)
n_urls = len(unique_urls)
dummy_vals = [None] * n_urls
wild_status_checked = [False] * n_urls

# column layout: the url, one wild-status column per collection, then the review flag
frame_columns = {'url': unique_urls}
for status_column in ('ret_giraffe_general_wild_status',
                      'ret_giraffe_general_africa_bbox_wild_status',
                      'ret_giraffe_general_multilingual_wild_status',
                      'ret_giraffe_general_multilingual_africa_bbox_wild_status',
                      'ret_giraffe_specific_wild_status',
                      'ret_giraffe_specific_africa_bbox_wild_status'):
    frame_columns[status_column] = dummy_vals
frame_columns['wild_status_checked'] = wild_status_checked

unique_urls_df = pd.DataFrame(frame_columns)
unique_urls_df.head()

Unnamed: 0,url,ret_giraffe_general_wild_status,ret_giraffe_general_africa_bbox_wild_status,ret_giraffe_general_multilingual_wild_status,ret_giraffe_general_multilingual_africa_bbox_wild_status,ret_giraffe_specific_wild_status,ret_giraffe_specific_africa_bbox_wild_status,wild_status_checked
0,https://live.staticflickr.com/31337/4816541245...,,,,,,,False
1,https://live.staticflickr.com/31337/4948338951...,,,,,,,False
2,https://live.staticflickr.com/65535/4801236293...,,,,,,,False
3,https://live.staticflickr.com/65535/4801803078...,,,,,,,False
4,https://live.staticflickr.com/65535/4825166207...,,,,,,,False


In [13]:
def get_wild_status(url, saveTo, database=None):
    ''' returns the stored 'wild' label of the document with this url

    url: image url to look up.
    saveTo: name of the collection to search.
    database: optional handle indexed by collection name; defaults to the
        notebook-level db_obj when omitted.

    Returns the value of the document's 'wild' field, or None when no document
    matches or the matching document has no 'wild' field.
    '''
    if database is None:
        database = db_obj

    status = database[saveTo].find_one({'url': url}, {'wild'})
    if status is None:
        return None
    # .get(): a matching document without a 'wild' field yields None instead of a KeyError
    return status.get('wild')

In [14]:
# fill each wild-status column by looking every url up in the corresponding collection
status_column_sources = [
    ('ret_giraffe_general_wild_status', 'reticulated giraffe general'),
    ('ret_giraffe_general_africa_bbox_wild_status', 'reticulated giraffe general africa bbox'),
    ('ret_giraffe_general_multilingual_wild_status', 'reticulated giraffe general multilingual'),
    ('ret_giraffe_general_multilingual_africa_bbox_wild_status', 'reticulated giraffe general multilingual africa bbox'),
    ('ret_giraffe_specific_wild_status', 'reticulated giraffe specific'),
    ('ret_giraffe_specific_africa_bbox_wild_status', 'reticulated giraffe specific africa bbox'),
]
for status_column, source_collection in status_column_sources:
    # bind the collection name as a default so each lambda uses its own collection
    unique_urls_df[status_column] = unique_urls_df['url'].apply(
        lambda url, name=source_collection: get_wild_status(url, name))


In [15]:
unique_urls_df.head()

Unnamed: 0,url,ret_giraffe_general_wild_status,ret_giraffe_general_africa_bbox_wild_status,ret_giraffe_general_multilingual_wild_status,ret_giraffe_general_multilingual_africa_bbox_wild_status,ret_giraffe_specific_wild_status,ret_giraffe_specific_africa_bbox_wild_status,wild_status_checked
0,https://live.staticflickr.com/31337/4816541245...,,,,,True,,False
1,https://live.staticflickr.com/31337/4948338951...,,,True,,,,False
2,https://live.staticflickr.com/65535/4801236293...,,,True,,,,False
3,https://live.staticflickr.com/65535/4801803078...,,,True,,,,False
4,https://live.staticflickr.com/65535/4825166207...,,,,,False,,False


In [16]:
unique_urls_df.to_csv('reticulated_giraffe_unique_urls_df.csv')

### Load in CSV (if already saved)

In [None]:
# import pandas as pd
# the csv is written together with the dataframe index, so read that first column back
# as the index (otherwise it comes back as an extra 'Unnamed: 0' data column)
# unique_urls_df = pd.read_csv('reticulated_giraffe_unique_urls_df.csv', index_col=0)

### Manually Check URLs and Wild Statuses + Update Accordingly

In [17]:
# helper functions for cross-checking docs across collections
from IPython.display import Image, display

def get_residing_collections_of_doc(url, species_collections):
    ''' returns list of collections in which get_wild_status finds a non-None wild status for the url '''
    return [name for name in species_collections
            if get_wild_status(url, name) is not None]

def display_image(img_url, collection):
    ''' shows the image at img_url and prints the metadata stored for it in the given collection '''
    item = db_obj[collection].find_one({'url': img_url})
    display(Image(img_url, height=500, width=500))

    # turn the stored coordinates into a location via the database helper
    latitude, longitude = item['latitude'], item['longitude']
    location = db.coordsToLocation(latitude, longitude)

    summary = 'ID: {}\n Title: {}\n Tags: {}\n Location: ({},{}) --> {}\n'.format(
        item['_id'], item['title'], item['tags'], latitude, longitude, location)
    print(summary)
    print('Url:{}\n'.format(img_url))

    # the confidence line is printed only for documents that carry a 'confidence' field
    if 'confidence' in item:
        print('Confidence of Prediction: {}\n'.format(item['confidence']))

def prompt_for_relevance_classification():
    ''' prompts user for decision on relevance/wild status of photographed encounter

    Answers are trimmed and lower-cased before comparison, so "Y" or " y " count as
    yes instead of silently being treated as no.

    Returns (rel, wild):
        rel  -- True when the first answer is "y", otherwise False.
        wild -- True for "y", 'unknown' for "u", False for any other answer;
                0 when the image was not marked relevant (no wild prompt is shown).
    '''
    print("RELEVANT (enter y=yes/n=no):", end =" ")
    rel = input().strip().lower() == "y"

    if not rel:
        # irrelevant images get wild = 0 and are not asked the wild question
        return rel, 0

    wild_response = input("WILD (enter y=yes/u=unknown/n=no): ").strip().lower()
    if wild_response == 'y':
        wild = True
    elif wild_response == 'u':
        wild = 'unknown'
    else:
        wild = False

    return rel, wild


def update_wild_status(img_url, residing_collections):
    ''' asks the user for a classification and writes relevant, wild and double_checked for the image url in each given collection '''
    rel, wild = prompt_for_relevance_classification()
    new_labels = {'$set': {'relevant': rel, 'wild': wild, 'double_checked': True}}

    # touch only the collections the image resides in; each document is updated by its _id
    for name in residing_collections:
        doc = db_obj[name].find_one({'url': img_url})
        db_obj[name].update_one({'_id': doc['_id']}, new_labels)

    print('updated IMG REL/WILD STATUS in collections: ', residing_collections)
    print('Response Updated: {} and {}.\n'.format(rel, wild))

          
def check_wild_status(url, species_collections):
    ''' displays the image and updates its status in every collection it resides in

    Returns True after the update ran, or False when the url resides in none of
    the given collections (nothing is displayed or updated in that case).
    '''
    residing_collections = get_residing_collections_of_doc(url, species_collections)

    if residing_collections:
        # metadata is shown from the first residing collection; the update covers all of them
        display_image(url, residing_collections[0])
        update_wild_status(url, residing_collections)
        return True

    # safety net: the url is expected to exist in at least one collection
    return False

In [50]:
# get names of wild status columns from all collections
# (selected by name rather than by position, so an extra leading column — e.g. an
#  'Unnamed: 0' column after reloading the dataframe from csv — cannot shift the pairing)
wild_status_columns = [name for name in unique_urls_df.columns
                       if str(name).endswith('_wild_status')]

# zip() silently truncates to the shorter input, so make a length mismatch loud
if len(wild_status_columns) != len(reticulated_giraffe_collections):
    raise ValueError('expected {} wild status columns, found {}: {}'.format(
        len(reticulated_giraffe_collections), len(wild_status_columns), wild_status_columns))

#zip the collections and columns list (print out to make sure the correct corresponding column and collections names are in the same tuple)
zipped_columns_and_collections = list(zip(wild_status_columns, reticulated_giraffe_collections))

for column, collection in zipped_columns_and_collections:
    print(column, '|', collection)

ret_giraffe_general_wild_status | reticulated giraffe general
ret_giraffe_general_africa_bbox_wild_status | reticulated giraffe general africa bbox
ret_giraffe_general_multilingual_wild_status | reticulated giraffe general multilingual
ret_giraffe_general_multilingual_africa_bbox_wild_status | reticulated giraffe general multilingual africa bbox
ret_giraffe_specific_wild_status | reticulated giraffe specific
ret_giraffe_specific_africa_bbox_wild_status | reticulated giraffe specific africa bbox


In [71]:
unique_urls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 622 entries, 0 to 621
Data columns (total 8 columns):
 #   Column                                                    Non-Null Count  Dtype 
---  ------                                                    --------------  ----- 
 0   url                                                       622 non-null    object
 1   ret_giraffe_general_wild_status                           302 non-null    object
 2   ret_giraffe_general_africa_bbox_wild_status               309 non-null    object
 3   ret_giraffe_general_multilingual_wild_status              48 non-null     object
 4   ret_giraffe_general_multilingual_africa_bbox_wild_status  1 non-null      object
 5   ret_giraffe_specific_wild_status                          311 non-null    object
 6   ret_giraffe_specific_africa_bbox_wild_status              0 non-null      object
 7   wild_status_checked                                       622 non-null    bool  
dtypes: bool(1), object(7)
memory us

In [72]:
start = 601
end = 621

In [73]:
# cross-checking with human in the loop
for index, row in unique_urls_df[start:end+1].iterrows():
    # labels recorded for this url, leaving out collections without one (None/NaN or the text 'nan');
    # otherwise a set such as {NaN, True} would trigger a manual check although every existing label agrees
    statuses = [x for x in row[wild_status_columns] if not (pd.isnull(x) or x == 'nan')]
    distinct_statuses = set(statuses)

    # a manual check is needed only when at least one label differs from the rest
    if len(distinct_statuses) > 1:
        print('checking status at index:', index)
        print(distinct_statuses)
        checked_status = check_wild_status(row['url'], reticulated_giraffe_collections)

        # re-read the label from every collection so the row shows the new, consensus wild status
        for column, collection in zipped_columns_and_collections:
            unique_urls_df.at[index, column] = get_wild_status(row['url'], collection)

    # flag the row as processed whether or not a manual check was required
    unique_urls_df.at[index, 'wild_status_checked'] = True

In [74]:
unique_urls_df[start:end+2]

Unnamed: 0,url,ret_giraffe_general_wild_status,ret_giraffe_general_africa_bbox_wild_status,ret_giraffe_general_multilingual_wild_status,ret_giraffe_general_multilingual_africa_bbox_wild_status,ret_giraffe_specific_wild_status,ret_giraffe_specific_africa_bbox_wild_status,wild_status_checked
601,https://live.staticflickr.com/65535/5035629909...,,,,,False,,True
602,https://live.staticflickr.com/65535/5035629921...,,,,,False,,True
603,https://live.staticflickr.com/65535/5035700197...,,,,,False,,True
604,https://live.staticflickr.com/65535/5035716107...,,,,,False,,True
605,https://live.staticflickr.com/65535/5036017953...,,,,,unknown,,True
606,https://live.staticflickr.com/65535/5036018162...,,,,,unknown,,True
607,https://live.staticflickr.com/65535/5036538409...,,,,,False,,True
608,https://live.staticflickr.com/65535/5036830118...,,,,,False,,True
609,https://live.staticflickr.com/65535/5036984266...,,,,,False,,True
610,https://live.staticflickr.com/65535/5041940280...,,,,,True,,True


In [75]:
# optional: save updated csv
unique_urls_df.to_csv('reticulated_giraffe_unique_urls_df.csv')

In [None]:
db.close()