In [1]:
from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'  # default is 'last_expr'

# Automatically reload imported modules before executing code, so edits to local modules are picked up.
%load_ext autoreload
%autoreload 2

In [2]:
import json
import os
from collections import Counter

from tqdm import tqdm
import azure.cosmos.cosmos_client as cosmos_client

# Queries

Example queries against the document DB instance.

## Connect to the Cosmos DB instance

`COSMOS_ENDPOINT` and `COSMOS_KEY` must be set as environment variables before running this notebook.

In [6]:
# Cosmos DB config
# Fail fast with a clear message if a credential is missing; otherwise None would be
# handed to the Cosmos client and the failure would only surface later.
missing_vars = [name for name in ('COSMOS_ENDPOINT', 'COSMOS_KEY') if not os.environ.get(name)]
if missing_vars:
    raise EnvironmentError(
        'Missing required environment variable(s): ' + ', '.join(missing_vars))

config = {
    'ENDPOINT': os.environ['COSMOS_ENDPOINT'],
    'PRIMARYKEY': os.environ['COSMOS_KEY']
}

# Initialize the Cosmos client
client = cosmos_client.CosmosClient(url_connection=config['ENDPOINT'], auth={
                                    'masterKey': config['PRIMARYKEY']})

container_link = 'dbs/camera-trap/colls/images'  # database link + container link

## Examples

In the examples, we limit the selection to a few entries using the `TOP` keyword. When using the DB to create datasets, remove the `TOP` keyword and its numeric argument from the query.

In [16]:
# Name of a single dataset; can be passed as partition_key to QueryItems to restrict a query to it.
dataset = 'idfg'

### All images with bounding box annotation

In [26]:
%%time

query = {'query': '''
SELECT TOP 1020 im
FROM images im
WHERE ARRAY_LENGTH(im.annotations.bbox) > 0
'''}

options = {
    'enableCrossPartitionQuery': True
}

result_iterable = client.QueryItems(container_link, query, options)
# if you want to restrict to one dataset, pass in partition_key=dataset

results = []
for item in iter(result_iterable):
    res = item['im']
    results.append(res)

print('Length of results:', len(results))

Length of results: 1020
CPU times: user 115 ms, sys: 9.8 ms, total: 125 ms
Wall time: 1.44 s


In [27]:
# Inspect one returned document; index 1000 is in range because the query above returned 1020 items.
results[1000]

{'_attachments': 'attachments/',
 '_etag': '"a00028c7-0000-0500-0000-5d4bd7c30000"',
 '_rid': 'WjB+ALn4oN55NQAAAAAACA==',
 '_self': 'dbs/WjB+AA==/colls/WjB+ALn4oN4=/docs/WjB+ALn4oN55NQAAAAAACA==/',
 '_ts': 1565251523,
 'annotations': {'bbox': [{'bbox_abs': [558.5203531343667,
     476.6385112396647,
     66.36738764096717,
     27.581252006635005],
    'bbox_rel': [0.272, 0.31, 0.0324, 0.0179],
    'category': 'animal'}],
  'species': ['deer']},
 'dataset': 'idfg',
 'datetime': '19-Jan-2016 10:45:00',
 'file_name': 'Beaverhead_elk/AM179/Trip 1/100RECNX/IMG_2061.JPG',
 'height': 1536,
 'id': 'e7056c19-65ad-4bce-9958-865c3071b882',
 'image_id': 'Beaverhead_elk~AM179~Trip 1~100RECNX~IMG_2061',
 'location': 'Beaverhead_elk+AM179',
 'width': 2048}

### All images with the specified species

In [23]:
%%time

species_requested = 'elk'

query = {'query': '''
SELECT TOP 1020 im
FROM images im
WHERE ARRAY_LENGTH(im.annotations.species) > 0 AND ARRAY_CONTAINS(im.annotations.species, "{}")
'''.format(species_requested)}

options = {
    'enableCrossPartitionQuery': True
}

result_iterable = client.QueryItems(container_link, query, options)

results = []
for item in iter(result_iterable):
    res = item['im']
    results.append(res)
    
print('Length of results:', len(results))

Length of results: 1020
CPU times: user 113 ms, sys: 10.2 ms, total: 124 ms
Wall time: 1.92 s


In [25]:
# Inspect one returned document; index 1000 is in range because the query above returned 1020 items.
results[1000]

{'_attachments': 'attachments/',
 '_etag': '"a0004da7-0000-0500-0000-5d4bd7a60000"',
 '_rid': 'WjB+ALn4oN42GgAAAAAACA==',
 '_self': 'dbs/WjB+AA==/colls/WjB+ALn4oN4=/docs/WjB+ALn4oN42GgAAAAAACA==/',
 '_ts': 1565251494,
 'annotations': {'bbox': [{'bbox_abs': [0,
     984.3018501754368,
     591.4981615184691,
     522.7193055279513],
    'bbox_rel': [0, 0.64, 0.288, 0.34],
    'category': 'animal'},
   {'bbox_abs': [1558.9874024517837,
     965.9608219112909,
     163.54083535523634,
     276.6438429840845],
    'bbox_rel': [0.761, 0.628, 0.0798, 0.18],
    'category': 'animal'}],
  'species': ['elk']},
 'dataset': 'idfg',
 'datetime': '03-Feb-2016 00:28:50',
 'file_name': 'Beaverhead_elk/AM56/Trip 1/100RECNX/2016-02-03 00-28-50 M 5_5.JPG',
 'height': 1536,
 'id': 'b6c088b5-8f3e-4019-984d-ff7a9b340535',
 'image_id': 'Beaverhead_elk~AM56~Trip 1~100RECNX~2016-02-03 00-28-50 M 5_5',
 'location': 'Beaverhead_elk+AM56',
 'width': 2048}

### Species count

In [29]:
%%time

query = {'query': '''
SELECT TOP 1020 im.annotations.species
FROM images im
WHERE ARRAY_LENGTH(im.annotations.species) > 0
'''}

options = {
    'enableCrossPartitionQuery': True
}

result_iterable = client.QueryItems(container_link, query, options)

species = Counter()
for item in iter(result_iterable):
    res = item['species']
    species.update(res)

CPU times: user 92.1 ms, sys: 7.24 ms, total: 99.3 ms
Wall time: 1.64 s


In [30]:
# Display the per-species counts accumulated in the previous cell.
species

Counter({'Cattle': 3,
         'Moose': 1,
         'deer': 188,
         'elk': 156,
         'empty': 669,
         'human': 4})