## README: Test notebook for integrating the MS Species Classifier with Flickr data, to help speed up classification for ground truths

- will eventually implement these changes into original FlickrPlayground.ipynb
- use reticulated giraffes as test species

In [1]:
# Reload imported modules automatically before each cell executes, so edits
# made to module source files are reflected without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Keys

In [2]:
import os

# MongoDB connection string (see owners for access). It embeds a username and
# password, so it is read from the environment instead of being stored in the
# notebook. NOTE(review): the value that used to be hard-coded here was
# committed in plain text and should be rotated.
DB_KEY = os.environ.get("WILDBOOK_DB_KEY")
if not DB_KEY:
    # Fail here with an actionable message rather than later inside the
    # database client with an opaque connection error.
    raise RuntimeError(
        "Set the WILDBOOK_DB_KEY environment variable to the MongoDB "
        "connection string before running this notebook (see owners for access)."
    )

# The MS classifier key (previously left here as a commented-out literal) must
# also come from the environment if it is needed again:
# MS_key = os.environ["MS_KEY"]

In [3]:
import os
import sys

# Append the parent of the current working directory to the import path
# (presumably so the local `wildbook_social` package resolves -- confirm).
# Earlier variant, kept for reference:
# sys.path.append(os.path.join(sys.path[0], '../'))
_cwd = os.path.abspath(os.getcwd())
sys.path.append(os.path.join(_cwd, '../'))

#distance visualization
import plotly.graph_objects as go
import plotly.io as pio
import numpy as np
from itertools import chain
import pandas as pd
import matplotlib.pyplot as plt

#import flickr and db modules
from wildbook_social import Flickr, Database

# Set up: build the Database wrapper from the connection string in DB_KEY.
# The second argument is presumably the MongoDB database name -- confirm
# against wildbook_social.Database.
db = Database(DB_KEY, 'flickr_june_2019')
# Alternative target, kept for reference:
# db = Database(DB_KEY, 'imgs_for_species_classifier')
# Flickr client constructed on top of the Database wrapper.
fr = Flickr(db)

## Query Terms

In [4]:
# query = 'giraffe'
# saveTo = 'sample giraffe general term'

# query = 'whale'
# saveTo = 'humpback whale general term'
# saveTo = 'humpback whale general term - 30'
# saveTo = 'humpback whale general term - 50'
# saveTo = 'humpback whale general term - 80'
# saveTo = 'humpback whale specific - 30 full collection json'

# Collection to work on (in the flickr_june_2019 db)
saveTo = 'humpback whale general'

# To clone a collection, the $out pipeline below was tried, but it only created
# an empty collection without any documents in it. Workaround that was used:
# download the data as a JSON file with Compass and import it into the new
# collection.
# pipeline = [ {"$match": {}}, 
#              {"$out": "destination_collection"}]

# db.source_collection.aggregate(pipeline)

In [5]:
# Echo the collection name currently held in `saveTo`.
saveTo

'humpback whale general'

## Get Results from Flickr Photos Search API

In [None]:
#query for images within time frame
# date_since = "2019-03-01"
# res = fr.search(query, date_since, False, saveTo)

In [None]:
# ## added conditions for 'imgs_for_species_classifier' in both methods below to test manual sorting as backup option on data

# amount = 1 # amount to filter through
# db.doManualFiltration(saveTo, amount) 
# db.convertToUTC(saveTo) #convert dateTaken strings of documents in database to datetime objects

## Filter through images stored in MongoDB collections

### Relevance Filtration
- Use MS Classifier to determine if species is in frame (wild and captive encounters)

In [8]:
# Rename the 'url_l' field in the Flickr docs to just 'url'; other cells in
# this notebook read the image address from the 'url' field.
db.renameField(saveTo, 'url_l', 'url')

In [9]:
from wildbook_social import SpeciesClassifier, Image

## `sc`: instance of the MS Species Classification API wrapper.
sc = SpeciesClassifier()
## `img`: helper class that reformats Flickr data into the input the API expects.
img = Image()

In [10]:
## Get the underlying MongoDB handle from the wrapper. Other cells index it by
## collection name (e.g. db_obj[saveTo]) to run queries directly.
db_obj = db.getDB()

### Wild / Captive Filtration
- Manually filter through predicted relevant images to determine if the image contains a wild or captive encounter
- We keep the option of determining if truly relevant in case the image is actually NOT relevant

In [None]:
# Count the documents still awaiting relevance filtration. In MongoDB a
# `None`/null filter value matches fields that are null or missing.
_pending_filter = {'relevant': None}
num_unfiltered = db_obj[saveTo].count_documents(_pending_filter)
print('Num. of Unfiltered imgs in {} : {}'.format(saveTo, num_unfiltered))

In [None]:
# Number of 100-image batches: the count of whole hundreds, plus one.
int(num_unfiltered/100) + 1

In [None]:
numToClassify = 100                 # documents handed to the classifier per batch
confidence = 30.00                  # threshold passed to predict_image_relevancy
species_keyword = 'Humpback Whale'

# Ceiling division on the batch size itself, so changing numToClassify keeps the
# batch count correct and an exact multiple does not schedule an extra batch.
num_batches = (num_unfiltered + numToClassify - 1) // numToClassify

# NOTE(review): this assumes each classified batch stops matching the
# "unfiltered" query, so successive calls return new documents -- confirm
# against Image.get_flickr_img_dicts / SpeciesClassifier.predict_image_relevancy.
for i in range(num_batches):
    print('Batch:', i)
    flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)
    print(len(flickr_img_dicts))
    if not flickr_img_dicts:
        # Nothing left to classify; skip the API call and stop early.
        break
    sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)
print('Done with all batches')

**FIXME:** The collection `humpback whale specific - 30 full collection` currently stores its `relevant` and `wild` boolean values as strings, so our `get_flickr_img_dicts`
function returns nothing (we would have to query `relevant: "null"` instead of a real null).
We need to go back and convert the values in these fields back to booleans. This bool -> string conversion happened when we exported our data from
the Flickr db to a CSV and imported it into this dummy collection.


In [None]:
## Automated Relevancy Filtration (single batch) - use the MS Species Classifier API
## to find the subset of images with the species of interest in frame. These images
## will be marked as 'relevant: true' in the corresponding mongoDB collection
## (saveTo parameter).

# species_keyword = 'Giraffe'

species_keyword = 'Humpback Whale'
confidence = 30.00
numToClassify = 100

# Fetch the batch inside this cell. Relying on a `flickr_img_dicts` left behind
# by another cell would either re-submit an already-classified batch or raise
# NameError on a fresh kernel.
flickr_img_dicts = img.get_flickr_img_dicts(db_obj, saveTo, numToClassify)

if flickr_img_dicts:
    sc.predict_image_relevancy(db_obj, saveTo, flickr_img_dicts, species_keyword, confidence)
else:
    print('No unfiltered images found in', saveTo)

In [None]:
## Manual Filtration - go through the images marked as 'relevant: true' and decide
## whether each image is truly relevant (e.g. an image marked as containing a giraffe
## shows a Reticulated Giraffe and not a Rothschild giraffe, which would be irrelevant).
## If an image is truly relevant, determine whether the encounter is wild or captive.


## TODO
# Number of images to review in this run.
amountOfImagesToLookThrough = 1
db.lookThroughRelevantImgsOnly(saveTo, amountOfImagesToLookThrough)

In [None]:
# db.doubleCheckWildImgs(saveTo)

## Visualizing MS Species Classifier Results

In [None]:
import ipyplot

In [None]:
## Open one cursor per collection / 'relevant' value, returning only the 'url'
## field (plus the default '_id'). For now we query collections with
## rel: true, false, or null, but the species classifier should now assign
## wild:false for docs with rel:false.
_url_only = {"url": 1}
_coll_30 = db_obj["humpback whale general term - 30"]

images_30 = _coll_30.find({"relevant": True}, _url_only)
images_30_irrel = _coll_30.find({"relevant": False}, _url_only)

images_50 = db_obj["humpback whale general term - 50"].find({"relevant": True}, _url_only)
images_80 = db_obj["humpback whale general term - 80"].find({"relevant": True}, _url_only)

In [None]:
# Drain each cursor into a list so the documents can be traversed repeatedly.
list_of_imgs_30 = [doc for doc in images_30]
list_of_imgs_30_irrel = [doc for doc in images_30_irrel]

list_of_imgs_50 = [doc for doc in images_50]
list_of_imgs_80 = [doc for doc in images_80]

In [None]:
def _urls_of(docs):
    """Return the 'url' value of every document, preserving order."""
    return [doc['url'] for doc in docs]


imgs_url_30 = _urls_of(list_of_imgs_30)
imgs_url_30_irrel = _urls_of(list_of_imgs_30_irrel)

imgs_url_50 = _urls_of(list_of_imgs_50)
imgs_url_80 = _urls_of(list_of_imgs_80)

In [None]:
def _ids_of(docs):
    """Return the '_id' value of every document, preserving order."""
    return [doc['_id'] for doc in docs]


labels_30 = _ids_of(list_of_imgs_30)
labels_30_irrel = _ids_of(list_of_imgs_30_irrel)

labels_50 = _ids_of(list_of_imgs_50)
labels_80 = _ids_of(list_of_imgs_80)

In [None]:
# Images from the "- 30" collection with relevant: False, labelled by document _id.
ipyplot.plot_images(
    imgs_url_30_irrel,
    labels_30_irrel,
    max_images=500,
    img_width=100,
)

In [None]:
# Images from the "- 30" collection with relevant: True, labelled by document _id.
ipyplot.plot_images(
    imgs_url_30,
    labels_30,
    max_images=200,
    img_width=150,
)

In [None]:
# Images from the "- 50" collection with relevant: True, labelled by document _id.
ipyplot.plot_images(
    imgs_url_50,
    labels_50,
    max_images=200,
    img_width=150,
)

In [None]:
# Images from the "- 80" collection with relevant: True, labelled by document _id.
ipyplot.plot_images(
    imgs_url_80,
    labels_80,
    max_images=200,
    img_width=150,
)

## Checking for and Getting rid of Duplicate Images in our Collection

In [23]:
# Switch the working collection for the duplicate check. Every cell that reads
# `saveTo` after this point operates on this collection.
saveTo = 'humpback whale specific'

In [24]:
# Show which collection is about to be modified.
print(saveTo)
## Rename the 'url_l' field in docs to just 'url'; the duplicate check in this
## section looks documents up by their 'url' value.
db.renameField(saveTo, 'url_l', 'url')

humpback whale specific


In [25]:
# Only the 'url' field is needed, so project it (and drop '_id') instead of
# transferring every full document. Documents that have no 'url' field are
# filtered out by the query rather than raising KeyError in the comprehension.
res = db_obj[saveTo].find({'url': {'$exists': True}}, {'url': 1, '_id': 0})
all_urls = [item['url'] for item in res]
len(all_urls)

20400

In [26]:
import numpy as np

# np.unique returns the distinct URLs as a sorted NumPy array.
unique_urls = np.unique(np.asarray(all_urls))
unique_urls.size

5661

In [27]:
# For each distinct URL keep one document and delete every other document that
# shares it.
# DELETES DOCUMENTS!!!  Destructive and not reversible.
_collection = db_obj[saveTo]
for url in unique_urls:
    # Only the _id is used, so request nothing else. No sort is applied, so
    # the document that survives is whichever the server returns first.
    id_list = [doc['_id'] for doc in _collection.find({'url': url}, {'_id': 1})]
    if not id_list:
        # The URL no longer matches anything (e.g. removed since the URL list
        # was built); nothing to keep or delete.
        continue
    # Only keep the image at id_list[0]; remove the rest in one round trip.
    duplicate_ids = id_list[1:]
    if duplicate_ids:
        _collection.delete_many({'_id': {'$in': duplicate_ids}})
    print('We kept: {}'.format(id_list[0]))

We kept: 5e796cef63cb38275b5377d1
We kept: 5f0c467d20e5d6b6736db9c4
We kept: 5f0c466f20e5d6b6736db8dc
We kept: 5f0c466b20e5d6b6736db88f
We kept: 5f0c466920e5d6b6736db86f
We kept: 5f0c466920e5d6b6736db873
We kept: 5f0c466a20e5d6b6736db87b
We kept: 5f0c466a20e5d6b6736db875
We kept: 5f0c466a20e5d6b6736db874
We kept: 5f0c466920e5d6b6736db872
We kept: 5f0c466920e5d6b6736db870
We kept: 5f0c466920e5d6b6736db871
We kept: 5f0c466520e5d6b6736db830
We kept: 5f0c466920e5d6b6736db86e
We kept: 5f0c466520e5d6b6736db831
We kept: 5f0c466520e5d6b6736db827
We kept: 5f0c466220e5d6b6736db7e9
We kept: 5f0c465c20e5d6b6736db77b
We kept: 5e95498fc43f8e6731a2f1d8
We kept: 5f0c465920e5d6b6736db74a
We kept: 5f0c465720e5d6b6736db712
We kept: 5f0c467e20e5d6b6736db9c7
We kept: 5f0c465620e5d6b6736db709
We kept: 5f0c463e20e5d6b6736db54c
We kept: 5f0c463e20e5d6b6736db54b
We kept: 5f0c463e20e5d6b6736db54d
We kept: 5f0c463e20e5d6b6736db54e
We kept: 5f0c463e20e5d6b6736db54f
We kept: 5f0c463a20e5d6b6736db504
We kept: 5f0c4