# MS Classifier for Plains Zebras Collection Classification
- MS classifier **does** distinguish between Grévy's zebra (*Equus grevyi*) and plains zebra (*Equus quagga*):
    - https://speciesclassification.westus2.cloudapp.azure.com/
- collections to run classifier on: 
   - plains zebras general
   - plains zebra general africa bbox
   

In [1]:
#to reflect changes made in modules
%load_ext autoreload
%autoreload 2

## Keys

In [2]:
import os

# SECURITY: the MongoDB connection string embeds a username and password, so it
# must never be committed with the notebook. It is read from the environment
# instead; the credential that used to be hard-coded here (and the API key that
# was left in a comment) should be rotated.
# Set it before launching Jupyter, e.g.
#   export WILDBOOK_DB_KEY="mongodb+srv://<user>:<password>@<host>/<db>?retryWrites=true&w=majority"
DB_KEY = os.environ.get("WILDBOOK_DB_KEY")  # connect to database here (see owners for access)
if not DB_KEY:
    # Warn right away so the failure is not first seen as an opaque connection error.
    print("WARNING: WILDBOOK_DB_KEY is not set; connecting to the database will fail.")

In [3]:
import os, sys
# sys.path.append(os.path.join(sys.path[0], '../'))
sys.path.append(os.path.join(os.path.abspath(os.getcwd()), '../'))

#distance visualization
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib.pyplot as plt

#import flickr and db modules
from wildbook_social import Flickr, Database

#set up
db = Database(DB_KEY, 'flickr_june_2019')
# db = Database(DB_KEY, 'imgs_for_species_classifier')
fr = Flickr(db)

In [4]:
## get the current mongoDB database collection object
db_obj = db.getDB()

In [None]:
from wildbook_social import SpeciesClassifier 
from wildbook_social import Image

## instance of the MS Species Classification API and Save Class to reformat Flickr data for API
sc = SpeciesClassifier()
img = Image()

# Select MongoDB Collection + MS Classifier Setup

In [None]:
# Select the MongoDB collection to work with by uncommenting exactly ONE of the
# `saveTo` assignments below before running this cell.
# NOTE(review): as written, every assignment is commented out, so on a fresh
# kernel `saveTo` is undefined and the print below raises NameError.
# saveTo = 'plains zebra general - 1000 demo'
# saveTo = 'plains zebra general - 1000 testing' #Vi-an

# saveTo = 'plains zebra general'
# saveTo = 'plains zebra general africa bbox'
# saveTo = 'plains zebra specific africa bbox'

# Echo the selection so the target collection is visible in the cell output.
print('You are working with the collection: ', saveTo)

In [None]:
## rename 'url_l' field in docs to just 'url'
db.renameField(saveTo, 'url_l', 'url')

## Demo - Classifying Images with the MS Classifier

In [None]:
# numToClassify = 20 #set number of images you want to classify
# species_keyword = 'Plains Zebra'
# confidence = 0.0 

# flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)
# sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)

# Batch Relevance Filtration with MS Classifier
- Automatically filter through unlabeled images and have the classifier mark each one as relevant if the species is in frame
- Run only one of the two options below: (A) classify the entire collection, or (B) classify smaller subsets of the collection

In [None]:
res = db_obj[saveTo].find({'relevant':None})
res_list = list(res)
len(res_list)

### (A) Classify Entire Collection ...

In [None]:
# numToClassify = len(res_list) #can also manually set to 100 or something if you don't want to classify the entire collection in one go
# confidence = 0.0
# species_keyword = 'Plains Zebra'

# #encode the metadata in a form that fits the MS classifier
# flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)
# print(len(flickr_img_dicts))

# #begin running the classifier on our images in the collection
# sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)

### (B) ... Or Classify Smaller Subsets of Collection to avoid Timeout

In [None]:
num_unfiltered = db_obj[saveTo].count_documents({'relevant': None})
num_unfiltered

In [None]:
# Classify the still-unlabeled images in fixed-size batches so that a single
# call to the classifier never has to process the whole collection.
numToClassify = 100  # number of images handed to the classifier per batch
confidence = 0.0
species_keyword = 'Plains Zebra'

# Ceiling division on the batch size actually used. The previous
# `int(num_unfiltered / 100) + 1` scheduled one extra batch whenever the count
# was an exact multiple of 100 (including 0) and ignored numToClassify.
num_batches = (num_unfiltered + numToClassify - 1) // numToClassify

print('Num. of Batches: ', num_batches)
for i in range(num_batches):
    print('Batch: ', i)
    # presumably returns up to numToClassify not-yet-labeled images -- TODO confirm
    flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)
    print(len(flickr_img_dicts))
    sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)
print('Done with all batches')

FIXME: currently, the humpback whale specific - 30 full collection has its relevant and wild bool values stored as strings, so our get_flickr_img_dicts
function does not return anything for it (we would need to query relevant: "null" instead).
We need to go back and convert the values in these fields back to bool values. This bool -> string conversion happened when we exported our data from
the Flickr db to a CSV and then into this dummy collection.


# Visualizing MS Species Classifier Results

In [None]:
import ipyplot

In [None]:
## get images labeled as relevant and irrelevant
images = db_obj[saveTo].find({"relevant": True}, {"url": 1})
images_irrel = db_obj[saveTo].find({"relevant": False}, {"url": 1})

In [None]:
list_of_imgs = list(images)
list_of_imgs_irrel = list(images_irrel)

In [None]:
imgs_url = [dic['url'] for dic in list_of_imgs]
imgs_url_irrel = [dic['url'] for dic in list_of_imgs_irrel]

In [None]:
labels = [dic['_id'] for dic in list_of_imgs]
labels_irrel = [dic['_id'] for dic in list_of_imgs_irrel]

## Compare counts
- plot count of relevant vs non-relevant for each collection

In [None]:
count_rel = len(imgs_url)
count_irrel = len(imgs_url_irrel)

data = {'relevant': count_rel, 'irrelevant': count_irrel}
df_counts = pd.DataFrame(data, index=[0])
print(df_counts)

## Plot Images in A Grid

In [None]:
ipyplot.plot_images(imgs_url, labels, max_images = 600, img_width=100)

## Double Checking Relevant Images and Labeling Truly Relevant Images as Wild/Not Wild

---- New Double Checking ----

## Select the Collection You Want to Filter

In [None]:
## saveTo = 'plains zebra general'
saveTo = 'plains zebra general africa bbox'
## saveTo = 'plains zebra specific'
## saveTo = 'plains zebra specific africa bbox'

print('You are working with the collection: ', saveTo)

In [None]:
## rename field and get the current mongoDB database collection object
db.renameField(saveTo, 'url_l', 'url')
db_obj = db.getDB()

In [None]:
## run this cell to see how many relevant images you have left to double check
amt_remaining_to_check = db_obj[saveTo].count_documents({"$and": [{"relevant": True}, {"double_checked": False}]})
print(amt_remaining_to_check)

### Run this cell to start double checking filtration. Below are the steps for the update filtration process:
1. Mark if the image is truly relevant (contains a real Plains zebra)
2. If the image is relevant, mark if it is a wild/unknown/captive encounter
    - **wild**: you can definitely tell that the Plains zebra is in the wild/national park. You can use the location coordinates (if available) to double check
    - **unknown**: you cannot tell if the Plains zebra is in the wild or a zoo. 
    - **captive**: you can definitely tell that the Plains zebra is in captivity/zoo. Look for "zoo" in the tags/description/title, and check whether the location coordinates point to an area where Plains zebras don't typically live

In [None]:
## run this cell to filter through the images
amount = 50
db.doubleCheckRelevantImages(saveTo, amount, first_round = False)

In [None]:
print(saveTo)

# Let MongoDB do the counting (as the other count cells in this notebook do)
# instead of downloading every matching document just to take len() of the list.
num_wild = db_obj[saveTo].count_documents({'relevant': True, 'wild': True})
num_unknown = db_obj[saveTo].count_documents({'relevant': True, 'wild': 'unknown'})
num_captive = db_obj[saveTo].count_documents({'relevant': True, 'wild': False})
num_rel = db_obj[saveTo].count_documents({'relevant': True})

print('num_wild: ', num_wild)
print('num_unknown: ', num_unknown)
print('num_captive: ', num_captive)

print('\nnum_rel: ', num_rel)

In [None]:
res = db_obj[saveTo].find({})
len(list(res))

## Cross-checking Wild/Relevant Status across collections

In [5]:
import numpy as np
def get_all_unique_urls(db_obj, species_collections):
    """Collect the distinct image urls of double-checked, relevant documents.

    Every collection named in ``species_collections`` is queried on ``db_obj``
    for documents with ``relevant == True`` and ``double_checked == True``;
    only the ``url`` field is requested.

    Returns:
        A numpy array holding each url once (``np.unique`` also sorts them).
    """
    checked_and_relevant = {'relevant': True, 'double_checked': True}

    # Gather the matching documents of every collection first ...
    matching_docs = []
    for name in species_collections:
        matching_docs.extend(db_obj[name].find(checked_and_relevant, {'url'}))

    # ... then keep just the url strings so np.unique can de-duplicate them.
    urls = [doc['url'] for doc in matching_docs]
    return np.unique(np.array(urls))

In [6]:
plains_zebra_collections = ['plains zebra general',
                            'plains zebra general africa bbox',
                            'plains zebra specific',
                            'plains zebra specific africa bbox']

unique_urls = get_all_unique_urls(db_obj, plains_zebra_collections)

In [7]:
len(unique_urls)

1963

### Create Dataframe with Wild Status for each Collection

In [8]:
# Build one row per unique url, with a placeholder wild-status column for each
# collection and a flag recording whether the row has been cross-checked yet.
import pandas as pd

# placeholder values: status not looked up yet / row not checked yet
dummy_vals = [None] * len(unique_urls)
wild_status_checked = [False] * len(unique_urls)

# dict insertion order determines the column order of the frame
frame_data = {'url': unique_urls}
for status_column_name in ('plains_zebra_general_wild_status',
                           'plains_zebra_general_africa_bbox_wild_status',
                           'plains_zebra_specific_wild_status',
                           'plains_zebra_specific_africa_bbox_wild_status'):
    frame_data[status_column_name] = dummy_vals
frame_data['wild_status_checked'] = wild_status_checked

unique_urls_df = pd.DataFrame(frame_data)
unique_urls_df.head()

Unnamed: 0,url,plains_zebra_general_wild_status,plains_zebra_general_africa_bbox_wild_status,plains_zebra_specific_wild_status,plains_zebra_specific_africa_bbox_wild_status,wild_status_checked
0,https://live.staticflickr.com/31337/5014614956...,,,,,False
1,https://live.staticflickr.com/31337/5014709301...,,,,,False
2,https://live.staticflickr.com/31337/5021048898...,,,,,False
3,https://live.staticflickr.com/65535/4815945808...,,,,,False
4,https://live.staticflickr.com/65535/4818260554...,,,,,False


In [9]:
def get_wild_status(url, saveTo):
    """Return the stored ``wild`` value of the document with ``url`` in ``saveTo``.

    Looks the document up in the module-level ``db_obj`` handle.

    Args:
        url: image url used as the lookup key.
        saveTo: name of the MongoDB collection to search.

    Returns:
        The document's ``wild`` value, or ``None`` when the collection has no
        document with that url or the document has no ``wild`` field.
    """
    # Only the 'wild' field (plus _id) is needed, so project the rest away.
    doc = db_obj[saveTo].find_one({'url': url}, {'wild': 1})
    if doc is None:
        return None
    # .get(): a document that was never labeled may lack 'wild' entirely;
    # indexing it directly raised KeyError.
    return doc.get('wild')

In [10]:
# Look up every url's wild status in each collection and store it in the
# matching per-collection column of unique_urls_df.
for status_column_name, collection_name in (
        ('plains_zebra_general_wild_status', 'plains zebra general'),
        ('plains_zebra_general_africa_bbox_wild_status', 'plains zebra general africa bbox'),
        ('plains_zebra_specific_wild_status', 'plains zebra specific'),
        ('plains_zebra_specific_africa_bbox_wild_status', 'plains zebra specific africa bbox')):
    # args=(...) passes the collection name as get_wild_status's second argument
    unique_urls_df[status_column_name] = unique_urls_df['url'].apply(
        get_wild_status, args=(collection_name,))

In [11]:
unique_urls_df.head()

Unnamed: 0,url,plains_zebra_general_wild_status,plains_zebra_general_africa_bbox_wild_status,plains_zebra_specific_wild_status,plains_zebra_specific_africa_bbox_wild_status,wild_status_checked
0,https://live.staticflickr.com/31337/5014614956...,True,True,,,False
1,https://live.staticflickr.com/31337/5014709301...,True,True,,,False
2,https://live.staticflickr.com/31337/5021048898...,unknown,False,,,False
3,https://live.staticflickr.com/65535/4815945808...,,True,,,False
4,https://live.staticflickr.com/65535/4818260554...,,True,,,False


In [12]:
unique_urls_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1963 entries, 0 to 1962
Data columns (total 6 columns):
 #   Column                                         Non-Null Count  Dtype 
---  ------                                         --------------  ----- 
 0   url                                            1963 non-null   object
 1   plains_zebra_general_wild_status               1033 non-null   object
 2   plains_zebra_general_africa_bbox_wild_status   1200 non-null   object
 3   plains_zebra_specific_wild_status              344 non-null    object
 4   plains_zebra_specific_africa_bbox_wild_status  190 non-null    object
 5   wild_status_checked                            1963 non-null   bool  
dtypes: bool(1), object(5)
memory usage: 78.7+ KB


In [13]:
# unique_urls_df.to_csv('plains_zebra_unique_urls_df.csv')

### Load in CSV (if already loaded)

In [None]:
# import pandas as pd
# unique_urls_df = pd.read_csv('plains_zebra_unique_urls_df.csv')

### Manually check URLS + update accordingly

In [14]:
# helper functions for cross-checking docs across collections
from IPython.display import Image, display

def get_residing_collections_of_doc(url, species_collections):
    ''' returns the collections (in the given order) for which get_wild_status(url, collection) is not None

    note: a collection is therefore only counted when a wild status could be read for the url
    '''
    return [name for name in species_collections
            if get_wild_status(url, name) is not None]

def display_image(img_url, collection):
    ''' shows the image at img_url and prints the metadata stored for it in the given collection

    The document is fetched from the module-level db_obj by url; Image/display are the
    IPython.display helpers imported in this cell.
    '''
    record = db_obj[collection].find_one({'url': img_url})
    display(Image(img_url, height=500, width=500))

    # resolve the stored coordinates through the Database helper before printing
    latitude = record['latitude']
    longitude = record['longitude']
    location = db.coordsToLocation(latitude, longitude)

    summary = 'ID: {}\n Title: {}\n Tags: {}\n Location: ({},{}) --> {}\n'.format(
        record['_id'], record['title'], record['tags'], latitude, longitude, location)
    print(summary)
    print('Url:{}\n'.format(img_url))

    # the classifier confidence is optional -- print it only when it was stored
    if 'confidence' in record:
        print('Confidence of Prediction: {}\n'.format(record['confidence']))

def _ask_choice(prompt, choices):
    ''' asks until the (trimmed, lower-cased) answer is a key of choices; returns the mapped value '''
    while True:
        answer = input(prompt).strip().lower()
        if answer in choices:
            return choices[answer]
        print("Unrecognized answer {!r}; please enter one of: {}".format(answer, '/'.join(choices)))


def prompt_for_relevance_classification():
    ''' prompts user for decision on relevance/wild status of photographed encounter

    Returns a (rel, wild) tuple:
      - rel:  True if the user answered "y" to the relevance question, else False
      - wild: True / 'unknown' / False for a relevant image, or 0 when the image is not relevant

    Answers are case-insensitive and surrounding whitespace is ignored. Anything that is
    not one of the offered options is asked again instead of silently counting as "no",
    because the result is written straight to the database by the caller.
    '''
    rel = _ask_choice("RELEVANT (enter y=yes/n=no): ", {'y': True, 'n': False})
    if not rel:
        # irrelevant images get wild = 0 (not False), exactly as before
        # NOTE(review): presumably 0 is the "not applicable" marker used elsewhere -- confirm
        return rel, 0

    wild = _ask_choice("WILD (enter y=yes/u=unknown/n=no): ",
                       {'y': True, 'u': 'unknown', 'n': False})
    return rel, wild


def update_wild_status(img_url, residing_collections):
    ''' prompts for classification and updates relevant, wild, and double_checked status of our revised image url in mongoDB

    img_url: url of the image that was just reviewed
    residing_collections: names of the collections (on the module-level db_obj) to update
    '''
    rel, wild = prompt_for_relevance_classification()
    new_values = {'$set': {'relevant': rel, 'wild': wild, 'double_checked': True}}

    #only want to update status in collections where the img exists (resides)
    for collection in residing_collections:
        # Filter on the url directly: one round trip instead of find_one + update_one,
        # and a url that is no longer present is a no-op instead of a TypeError on None.
        db_obj[collection].update_one({'url': img_url}, new_values)

    print('updated IMG REL/WILD STATUS in collections: ', residing_collections)
    print('Response Updated: {} and {}.\n'.format(rel, wild))

          
def check_wild_status(url, species_collections):
    '''displays the image and asks for / stores a new status in every collection the image (url) resides in

    returns True when the image was shown and the update was carried out,
    False when the url resides in none of species_collections
    '''
    residing_collections = get_residing_collections_of_doc(url, species_collections)

    if residing_collections:
        # the metadata is read from the first collection that holds the image
        display_image(url, residing_collections[0])
        update_wild_status(url, residing_collections)
        return True

    # safety net: nothing to show or update (the url should exist in at least one collection)
    return False

In [15]:
# Pair every collection with its wild-status column. The column name is derived
# from the collection name ('plains zebra general' -> 'plains_zebra_general_wild_status')
# rather than taken by position: slicing columns[1:-1] silently mis-pairs
# columns and collections when the frame carries an extra leading column, e.g.
# 'Unnamed: 0' after reloading a CSV that was saved with its index.
zipped_columns_and_collections = [
    (collection_name.replace(' ', '_') + '_wild_status', collection_name)
    for collection_name in plains_zebra_collections
]
wild_status_columns = [column_name for column_name, _ in zipped_columns_and_collections]

# Fail loudly if the dataframe does not have the expected columns.
_missing_columns = [name for name in wild_status_columns if name not in unique_urls_df.columns]
if _missing_columns:
    raise KeyError('unique_urls_df is missing wild status columns: {}'.format(_missing_columns))

# print out to make sure the correct corresponding column and collection names are paired
for column, collection in zipped_columns_and_collections:
    print(column, '|', collection)

plains_zebra_general_wild_status | plains zebra general
plains_zebra_general_africa_bbox_wild_status | plains zebra general africa bbox
plains_zebra_specific_wild_status | plains zebra specific
plains_zebra_specific_africa_bbox_wild_status | plains zebra specific africa bbox


In [81]:
start = 1901
end = 1962 #last index is 1962

In [82]:
# cross-checking with human in the loop
for index, row in unique_urls_df[start:end+1].iterrows():
    # Keep only the statuses that are actually present: a missing value just
    # means the image is not in that collection, which is not a disagreement.
    # ('nan' strings are dropped too -- presumably left over from a CSV round trip; confirm)
    statuses = [x for x in row[wild_status_columns] if not pd.isnull(x) and x != 'nan']

    # Python treats False == 0 and True == 1, so a plain set() would merge a
    # captive image (wild=False) with an irrelevant one (wild=0) and hide the
    # disagreement. Tagging each value with "is it a bool?" keeps them apart.
    distinct_statuses = {(isinstance(x, (bool, np.bool_)), x) for x in statuses}

    #determine if necessary to check wild_status (at least one wild status is different from the rest)
    if len(distinct_statuses) > 1:
        print('checking status at index:', index)
        print([value for _, value in distinct_statuses])
        checked_status = check_wild_status(row['url'], plains_zebra_collections)

        if not checked_status:
            # nothing was shown or updated, so do not mark the row as checked
            print('url not found in any collection, skipping index:', index)
            continue

        #update all rows in unique_urls_df to reflect new, consensus wild_status
        for column, collection in zipped_columns_and_collections:
            unique_urls_df.at[index, column] = get_wild_status(row['url'], collection)

    #update checked_wild_status column
    unique_urls_df.at[index, 'wild_status_checked'] = True

In [83]:
unique_urls_df[start:end+2]

Unnamed: 0,url,plains_zebra_general_wild_status,plains_zebra_general_africa_bbox_wild_status,plains_zebra_specific_wild_status,plains_zebra_specific_africa_bbox_wild_status,wild_status_checked
1901,https://live.staticflickr.com/65535/5101415096...,,,,True,True
1902,https://live.staticflickr.com/65535/5101415115...,,,,True,True
1903,https://live.staticflickr.com/65535/5101415129...,,,,True,True
1904,https://live.staticflickr.com/65535/5101570179...,,,,True,True
1905,https://live.staticflickr.com/65535/5101570200...,,,,True,True
...,...,...,...,...,...,...
1958,https://live.staticflickr.com/65535/5115256708...,,,,True,True
1959,https://live.staticflickr.com/65535/5115256730...,,,,True,True
1960,https://live.staticflickr.com/65535/5115256896...,,,,True,True
1961,https://live.staticflickr.com/65535/5118368965...,,,,True,True


In [84]:
# optional: save updated csv
# index=False: the frame only has a default RangeIndex, and writing it would add
# an 'Unnamed: 0' column when the file is read back with pd.read_csv, which
# shifts every position-based column lookup by one.
unique_urls_df.to_csv('plains_zebra_unique_urls_df.csv', index=False)

In [85]:
db.close()