# MS Classifier for Whale Shark Collection Classification
## Readme: Testing File for the Implementation of MS Classifier with Flickr Data to help speed up classification for ground truths

- will eventually implement these changes into original FlickrPlayground.ipynb
- use reticulated giraffes as test species

In [1]:
#to reflect changes made in modules
%load_ext autoreload
%autoreload 2

## Keys

In [2]:
# MongoDB connection string. It is read from the WILDBOOK_DB_KEY environment
# variable so that credentials are never stored in the notebook (and therefore
# never end up in version control). See the project owners for access, then set
# the variable before launching Jupyter, e.g.
#   export WILDBOOK_DB_KEY="mongodb+srv://<user>:<password>@<cluster>/<db>?retryWrites=true&w=majority"
# NOTE(review): the credentials that used to be hard-coded in this cell should
# be treated as leaked and rotated.
import os
import warnings

DB_KEY = os.environ.get("WILDBOOK_DB_KEY")
if not DB_KEY:
    # warn instead of raising so the rest of the notebook can still be loaded;
    # the database connection will fail until the variable is set
    warnings.warn("WILDBOOK_DB_KEY is not set; the database connection will fail until it is.")

In [3]:
import os, sys
sys.path.append(os.path.join(os.path.abspath(os.getcwd()), '../'))

#import flickr and db modules
from wildbook_social import Flickr, Database

#set up: build the Database wrapper from the DB_KEY connection string.
#'flickr_june_2019' is presumably the name of the database to open -- TODO
#confirm against wildbook_social.Database
db = Database(DB_KEY, 'flickr_june_2019')
#Flickr helper constructed on top of the database wrapper above
fr = Flickr(db)

In [4]:
## get the current mongoDB database collection object
db_obj = db.getDB()

# Select MongoDB Collection + MS Classifier Setup
Uncomment (remove the leading `#` from) the line for the collection you want to work on, and make sure every other `saveTo` line stays commented out.

In [None]:
saveTo = 'whale shark specific'
# saveTo = 'whale shark specific bbox'
# saveTo = 'whale shark specific multilingual' 
# saveTo = 'whale shark specific multilingual bbox'

In [None]:
saveTo

In [None]:
## rename 'url_l' field in docs to just 'url'
db.renameField(saveTo, 'url_l', 'url')

# Double checking relevant documents 

You can run this cell to check how many documents you have left to double check

In [None]:
# documents flagged as relevant that have not been double checked yet
# (listing both fields in one filter document is an implicit AND)
pending_filter = {"relevant": True, "double_checked": False}
amt_remaining_to_check = db_obj[saveTo].count_documents(pending_filter)
print(amt_remaining_to_check)

### Run this cell to start double checking filtration. Below are the steps for the update filtration process:
1. Mark if the image is truly relevant (contains a real whale shark)
2. If the image is relevant, mark if it is a wild/unknown/captive encounter
    - **wild**: you can definitely tell that the whale shark is in the ocean. You can use the location coordinates available to double check
    - **unknown**: you cannot tell if the whale shark is in an aquarium or the ocean. 
    - **captive**: you can definitely tell that the whale shark is in captivity/an aquarium. Look for "aquarium" in the tags/description/title, and check whether the location coordinates point to an area where whale sharks don't typically live

In [None]:
#this cell has you filter through the images
amount = 1
db.doubleCheckRelevantImages(saveTo, amount, first_round = False)

# Cross-Checking Urls for same 'relevant' and 'wild' status/labels across all species-specific (whale shark) collections

### Get a list of unique urls across all collections

In [5]:
whale_shark_collections = ['whale shark specific', 'whale shark specific bbox', 'whale shark specific multilingual', 'whale shark specific multilingual bbox']

In [6]:
#get lists of relevant, double checked urls (one list per collection)
checked_and_relevant = {'relevant': True, 'double_checked': True}


def _find_checked_relevant(collection_name):
    ''' returns the relevant + double checked docs of a collection, projected down to the 'url' field '''
    cursor = db_obj[collection_name].find(checked_and_relevant, {'url'})
    return list(cursor)


specific_urls = _find_checked_relevant(whale_shark_collections[0])
specific_bbox_urls = _find_checked_relevant(whale_shark_collections[1])
specific_multilingual_urls = _find_checked_relevant(whale_shark_collections[2])
specific_multilingual_bbox_urls = _find_checked_relevant(whale_shark_collections[3])

In [7]:
print(len(specific_urls))
print(len(specific_bbox_urls))
print(len(specific_multilingual_urls))
print(len(specific_multilingual_bbox_urls))

434
13
17
13


In [8]:
# combine all urls into one list
all_urls = specific_urls + specific_bbox_urls + specific_multilingual_urls + specific_multilingual_bbox_urls
print(len(all_urls))

477


In [9]:
# all_urls holds documents shaped like [{_id:..., url:...}]; keep only the url
# value of each document so that np.unique() can be applied to the plain list
# of urls afterwards
only_all_urls = [doc['url'] for doc in all_urls]
print(len(only_all_urls))

477


In [10]:
#get unique urls from only_all_urls
import numpy as np

only_all_urls_np = np.array(only_all_urls)
unique_urls = np.unique(only_all_urls_np)
print(len(unique_urls))

453


### Create Dataframe with wild status for each url/collection

In [11]:
whale_shark_collections

['whale shark specific',
 'whale shark specific bbox',
 'whale shark specific multilingual',
 'whale shark specific multilingual bbox']

In [12]:
# build a dataframe with one row per entry of unique_urls
import pandas as pd

# every whale shark collection gets its own wild_status column, filled with
# placeholder values for now; wild_status_checked starts out False for all rows
row_count = len(unique_urls)
dummy_vals = [None] * row_count
wild_status_checked = [False] * row_count

df_columns = {'url': unique_urls}
for status_column in ('whale_shark_specific_wild_status',
                      'whale_shark_specific_bbox_wild_status',
                      'whale_shark_specific_multilingual_wild_status',
                      'whale_shark_specific_multilingual_bbox_wild_status'):
    df_columns[status_column] = dummy_vals
df_columns['wild_status_checked'] = wild_status_checked

unique_urls_df = pd.DataFrame(df_columns)
unique_urls_df.head()

Unnamed: 0,url,whale_shark_specific_wild_status,whale_shark_specific_bbox_wild_status,whale_shark_specific_multilingual_wild_status,whale_shark_specific_multilingual_bbox_wild_status,wild_status_checked
0,https://live.staticflickr.com/31337/4826568480...,,,,,False
1,https://live.staticflickr.com/31337/4826576266...,,,,,False
2,https://live.staticflickr.com/31337/4833326019...,,,,,False
3,https://live.staticflickr.com/31337/4874675029...,,,,,False
4,https://live.staticflickr.com/31337/4874675865...,,,,,False


In [13]:
def get_wild_status(url, saveTo):
    ''' returns the 'wild' value stored for url in collection saveTo

    Returns None when the collection holds no document with that url, and also
    when the matching document has no 'wild' field (previously that case raised
    KeyError). Callers that treat None as "url not in this collection" will
    therefore also skip documents that were never given a wild label.
    '''
    # only the 'wild' field is needed, so project everything else away
    doc = db_obj[saveTo].find_one({'url': url}, {'wild': 1})
    if doc is None:
        return None
    return doc.get('wild')

In [14]:
unique_urls_df['whale_shark_specific_wild_status'] = unique_urls_df['url'].apply(lambda row: get_wild_status(row, 'whale shark specific'))
unique_urls_df['whale_shark_specific_bbox_wild_status'] = unique_urls_df['url'].apply(lambda row: get_wild_status(row, 'whale shark specific bbox'))
unique_urls_df['whale_shark_specific_multilingual_wild_status'] = unique_urls_df['url'].apply(lambda row: get_wild_status(row, 'whale shark specific multilingual'))
unique_urls_df['whale_shark_specific_multilingual_bbox_wild_status'] = unique_urls_df['url'].apply(lambda row: get_wild_status(row, 'whale shark specific multilingual bbox'))

In [15]:
unique_urls_df.head()

Unnamed: 0,url,whale_shark_specific_wild_status,whale_shark_specific_bbox_wild_status,whale_shark_specific_multilingual_wild_status,whale_shark_specific_multilingual_bbox_wild_status,wild_status_checked
0,https://live.staticflickr.com/31337/4826568480...,unknown,,,,False
1,https://live.staticflickr.com/31337/4826576266...,unknown,,,,False
2,https://live.staticflickr.com/31337/4833326019...,False,False,,False,False
3,https://live.staticflickr.com/31337/4874675029...,unknown,,,,False
4,https://live.staticflickr.com/31337/4874675865...,unknown,,,,False


In [16]:
# index=False: the row index carries no information, and writing it makes
# pd.read_csv() bring it back as an extra leading 'Unnamed: 0' column, which
# shifts every positional column lookup on the reloaded dataframe
unique_urls_df.to_csv('whale_shark_unique_urls_df.csv', index=False)

### Load In CSV

In [None]:
# import pandas as pd
# unique_urls_df = pd.read_csv('whale_shark_unique_urls_df.csv')

### Manually Check URLs with Different Wild Statuses + Update the Corresponding Collections

In [17]:
# helper functions for cross-checking docs across collections
from IPython.display import Image, display

def get_residing_collections_of_doc(url, species_collections):
    ''' returns the collections (in the given order) for which get_wild_status(url, collection) is not None '''
    return [
        name
        for name in species_collections
        if get_wild_status(url, name) is not None
    ]

def display_image(img_url, collection):
    ''' shows the image at img_url and prints the metadata stored for it in collection '''
    doc = db_obj[collection].find_one({'url': img_url})
    display(Image(img_url, height=500, width=500))

    lat = doc['latitude']
    lon = doc['longitude']
    place = db.coordsToLocation(lat, lon)

    summary = 'ID: {}\n Title: {}\n Tags: {}\n Location: ({},{}) --> {}\n'.format(
        doc['_id'], doc['title'], doc['tags'], lat, lon, place)
    print(summary)
    print('Url:{}\n'.format(img_url))

    # the confidence line is only printed for docs that carry a 'confidence' field
    if 'confidence' in doc:
        print('Confidence of Prediction: {}\n'.format(doc['confidence']))

def prompt_for_relevance_classification():
    ''' prompts user for decision on relevance/wild status of photographed encounter

    Answers are stripped of surrounding whitespace and lower-cased before they
    are compared, so "Y" or " y " counts as yes instead of silently falling
    through to "no".

    Returns a tuple (rel, wild):
        rel  -- True if the user answered y to the relevance prompt, else False
        wild -- True / 'unknown' / False for a relevant image (y / u / anything
                else); 0 when the image is not relevant (no wild prompt shown)
    '''
    print("RELEVANT (enter y=yes/n=no):", end =" ")
    rel = input().strip().lower() == "y"

    if not rel:
        # 0 (not False) is the value this notebook has always stored for
        # non-relevant images; kept so stored values stay unchanged
        return rel, 0

    wild_response = input("WILD (enter y=yes/u=unknown/n=no): ").strip().lower()
    # any answer other than y/u is treated as "not wild"
    wild = {'y': True, 'u': 'unknown'}.get(wild_response, False)
    return rel, wild


def update_wild_status(img_url, residing_collections):
    ''' asks the user for a classification, then writes relevant / wild / double_checked
    for img_url's document in each of the given mongoDB collections '''
    rel, wild = prompt_for_relevance_classification()
    new_labels = {'$set': {'relevant': rel, 'wild': wild, 'double_checked': True}}

    #touch only the collections in which the img exists (resides)
    for name in residing_collections:
        target = db_obj[name]
        doc = target.find_one({'url': img_url})
        target.update_one({'_id': doc['_id']}, new_labels)

    print('updated IMG REL/WILD STATUS in collections: ', residing_collections)
    print('Response Updated: {} and {}.\n'.format(rel, wild))

          
def check_wild_status(url, species_collections):
    ''' shows the image for url and records a fresh classification in every collection it resides in

    returns True when the update ran, False when the url was found in none of the collections
    '''
    homes = get_residing_collections_of_doc(url, species_collections)

    if homes:
        #metadata is read from the first collection that holds the url
        display_image(url, homes[0])
        update_wild_status(url, homes)
        return True

    #nothing to show or update (safety measure, the url should exist in at least one collection)
    return False

In [19]:
# pair every collection with its wild status column in unique_urls_df.
# The column name is derived from the collection name (spaces -> underscores,
# plus a '_wild_status' suffix) instead of being sliced out of
# unique_urls_df.columns by position: a dataframe reloaded from a csv that was
# written with its index has an extra leading 'Unnamed: 0' column, which shifts
# a positional slice and silently pairs columns with the wrong collections.
zipped_columns_and_collections = [
    (collection.replace(' ', '_') + '_wild_status', collection)
    for collection in whale_shark_collections
]
# names of the wild status columns, in the same order as whale_shark_collections
wild_status_columns = [column for column, _ in zipped_columns_and_collections]

# fail loudly if the dataframe does not follow the expected column naming
missing_columns = [c for c in wild_status_columns if c not in unique_urls_df.columns]
if missing_columns:
    raise KeyError('unique_urls_df is missing wild status columns: {}'.format(missing_columns))

print(zipped_columns_and_collections)

[('whale_shark_specific_wild_status', 'whale shark specific'), ('whale_shark_specific_bbox_wild_status', 'whale shark specific bbox'), ('whale_shark_specific_multilingual_wild_status', 'whale shark specific multilingual'), ('whale_shark_specific_multilingual_bbox_wild_status', 'whale shark specific multilingual bbox')]


### Begin Filtering...
0-452

In [68]:
# first and last row position (both inclusive) of unique_urls_df to cross-check
# in this run; the loop below slices unique_urls_df[start:end+1]
start = 401
end = 452

In [69]:
# cross-checking with human in the loop, over rows start..end (inclusive)
for index, row in unique_urls_df[start:end+1].iterrows():
    statuses = row[wild_status_columns]
    print(set(statuses))

    #drop missing entries (None/NaN, or the literal string 'nan') before comparing:
    #a collection that does not hold the url should not count as a disagreement
    statuses = [x for x in statuses if not pd.isnull(x) and x != 'nan']

    if len(set(statuses)) <= 1:
        #every collection that holds this url already agrees on its wild status
        print('No need to check url at index: ', index)
    else:
        #at least two collections disagree --> ask the user and write the answer to mongoDB
        checked_status = check_wild_status(row['url'], whale_shark_collections)

        #re-read the status from each collection so the row reflects the new consensus
        for column, collection in zipped_columns_and_collections:
            unique_urls_df.at[index, column] = get_wild_status(row['url'], collection)

    #mark the row as handled, whether or not it needed a manual check
    unique_urls_df.at[index, 'wild_status_checked'] = True

{True, None}
No need to check url at index:  401
{None, 'unknown'}
No need to check url at index:  402
{True, None}
No need to check url at index:  403
{None, 'unknown'}
No need to check url at index:  404
{True, None}
No need to check url at index:  405
{True, None}
No need to check url at index:  406
{True, None}
No need to check url at index:  407
{None, 'unknown'}
No need to check url at index:  408
{True, None}
No need to check url at index:  409
{None, 'unknown'}
No need to check url at index:  410
{None, 'unknown'}
No need to check url at index:  411
{True, None}
No need to check url at index:  412
{True, None}
No need to check url at index:  413
{None, 'unknown'}
No need to check url at index:  414
{True, None}
No need to check url at index:  415
{True, None}
No need to check url at index:  416
{True, None}
No need to check url at index:  417
{False, None}
No need to check url at index:  418
{None, 'unknown'}
No need to check url at index:  419
{None, 'unknown'}
No need to chec

In [70]:
unique_urls_df[start:end+2]

Unnamed: 0,url,whale_shark_specific_wild_status,whale_shark_specific_bbox_wild_status,whale_shark_specific_multilingual_wild_status,whale_shark_specific_multilingual_bbox_wild_status,wild_status_checked
401,https://live.staticflickr.com/65535/4968790650...,True,,,,True
402,https://live.staticflickr.com/65535/4970479443...,unknown,,,,True
403,https://live.staticflickr.com/65535/4971155002...,True,,,,True
404,https://live.staticflickr.com/65535/4972155522...,unknown,,,,True
405,https://live.staticflickr.com/65535/4972696863...,True,,,,True
406,https://live.staticflickr.com/65535/4977049870...,True,True,,True,True
407,https://live.staticflickr.com/65535/4977049939...,True,True,,True,True
408,https://live.staticflickr.com/65535/4988613343...,unknown,,,,True
409,https://live.staticflickr.com/65535/4991889039...,True,,,,True
410,https://live.staticflickr.com/65535/4993254501...,unknown,,,,True


In [71]:
# optional: save updated csv
# index=False: the row index carries no information, and writing it makes
# pd.read_csv() bring it back as an extra leading 'Unnamed: 0' column, which
# shifts every positional column lookup on the reloaded dataframe
unique_urls_df.to_csv('whale_shark_unique_urls_df.csv', index=False)

In [None]:
 db.close()