# Rescan photos scripts

In [1]:
import time

import firebase_admin
import mpmath
from firebase_admin import credentials
from firebase_admin import firestore

cred = credentials.Certificate(r"C:\!Dev\projects\book_detection\keys\avea-biblosphere-210106-firebase-adminsdk-s3bjc-ff2c6ac275 (1).json")
firebase_admin.initialize_app(cred)

db = firestore.client()

#### Not valid photos

In [61]:
# parameter is missed: photo/reporter/bookplace/id/location/url

res = db.collection('photos').where(u'valid', u'==', False).stream()
not_valid = set()
photos = set()
reporters = set()
bookplaces = set()
ids = set()
locations = set()
urls = set()
for doc in res:
    data = doc.to_dict()

    not_valid.add(doc.id)
    if 'photo' not in data:
        photos.add(doc.id)
    if 'reporter' not in data:
        reporters.add(doc.id)
    if 'bookplace' not in data:
        bookplaces.add(doc.id)
    if 'id' not in data:
        ids.add(doc.id)
    if 'location' not in data:
        locations.add(doc.id)
    if 'url' not in data:
        urls.add(doc.id)

print('not valid:', len(not_valid))
print('parameter is missed:')
print('\tphoto:', len(photos))
print('\treporter:', len(reporters))
print('\tbookplace:', len(bookplaces))
print('\tid:', len(ids))
print('\tlocation:', len(locations))
print('\turl:', len(urls))
#finded

not valid: 469
parameter is missed:
	photo: 0
	reporter: 0
	bookplace: 208
	id: 362
	location: 91
	url: 363


In [49]:
res = db.collection('photos').where(u'recognition_stats.record_in_stats', u'==', False).stream()
finded = set()
count = 0
for doc in res:
    finded.add(doc.id)
    # update = {'valid': True}
    # db.collection('photos').document(doc.id).update(update)
    # update = {'valid': False}
    # db.collection('photos').document(doc.id).update(update)


print('records not in stats:', len(finded))
#finded

records not in stats: 0


In [50]:
res = db.collection('photos').where(u'status', u'==', 'failed').stream()
finded = set()
for doc in res:
    finded.add(doc.id)
print('status == failed:', len(finded))
#finded

status == failed: 22


#### Photos without block 'recognition_stats'

In [53]:
res = db.collection('photos').stream()
finded = set()
for doc in res:
    data = doc.to_dict()
    if 'valid' in data and not data['valid']:
        continue
    if 'recognition_stats' in data:
        continue
    finded.add(doc.id)

print('finded', len(finded))
#finded

finded 25


#### Rescan one photo

In [43]:
import requests
photo_id = '023YA6RIB7A3boAmZBOq'
url = 'https://us-central1-biblosphere-210106.cloudfunctions.net/rescan_photo'

doc = db.collection('photos').document(photo_id).get()
data = doc.to_dict()

res = requests.post(url, json={'photo_id': photo_id})
json = res.json()
print(res.status_code, res.reason)
print('duration:', json['duration'])
print('recognized:', len(json['recognized_books']))
for n, el in enumerate(json['recognized_books'], 1):
    print(f'{n}) {el}')

200 OK
duration: 0:00:00.057949


TypeError: object of type 'NoneType' has no len()

#### Rescan several photos

In [30]:
import requests
url = 'https://us-central1-biblosphere-210106.cloudfunctions.net/rescan_photo'
batch_size = 2

res = db.collection('photos').stream()
count = 1
for doc in res:
    if count > batch_size:
        break

    data = doc.to_dict()
    if 'valid' in data and not data['valid']:
        continue
    if 'recognition_stats' in data:
        continue

    print(f'{count}) Rescaning photo: {doc.id}')

    if not data.keys() >= {'photo', 'reporter', 'bookplace', 'id', 'location', 'url'}:
        print('ERROR: photo/reporter/bookplace/id/location/url parameters missing.', doc.id, '\n')
        update = {'valid': False}
        db.collection('photos').document(doc.id).update(update)
        continue

    old_recognized = data['recognized'] if 'recognized' in data else 0
    res = requests.post(url, json={'photo_id': doc.id})
    json = res.json()
    print(f"duration: {json['duration']}, recognized: {len(json['recognized_books'])}({old_recognized})\n")

    count += 1

1) Rescaning photo: 2sbdY7e3VD8hIVcoeh1s
duration: 0:00:27.369074, recognized: 0(10)

2) Rescaning photo: 3W82C9BUmTxM7D0agG2q
duration: 0:00:22.247033, recognized: 0(0)



#### Rescan several photos (threads)

In [45]:
import numpy as np
import time

def fill_array(size):
    res = db.collection('photos').stream()
    result = []
    count = 1
    for doc in res:

        if count > batch_size * threads:
            break

        data = doc.to_dict()
        if 'valid' in data and not data['valid']:
            continue
        if 'recognition_stats' in data:
            continue

        if not data.keys() >= {'photo', 'reporter', 'bookplace', 'id', 'location', 'url'}:
            print('ERROR: photo/reporter/bookplace/id/location/url parameters missing.', doc.id, '\n')
            update = {'valid': False}
            db.collection('photos').document(doc.id).update(update)
            continue
        result.append(doc.id)
        count += 1

    return result


def rescan_in_thread(name: str, photo_ids: list):
    url = 'https://us-central1-biblosphere-210106.cloudfunctions.net/rescan_photo'
    for n, id in enumerate(photo_ids):
        print(f'{name}: {n+1}) scanning photo: {id}')
        res = requests.post(url, json={'photo_id': id})
        print(res.status_code)



In [52]:
%%time

import requests
from threading import Thread

batch_size = 2
threads = 10

print('--- preparing photo ids ---')
ids = fill_array(batch_size * threads)
print(f'photos: {len(ids)} \n')

print('---starting threads ---')
threads_list = []
for i in range(threads):
    start_ind = i * batch_size
    end_ind = start_ind + batch_size
    thread = Thread(target=rescan_in_thread, args=(f'thread_{i}', ids[start_ind:end_ind]))
    thread.start()
    threads_list.append(thread)

for thread in threads_list:
    thread.join()

print('---finish----')

--- preparing photo ids ---
photos: 20 

---starting threads ---
thread_0: 1) scanning photo: 3NXpOnCp0gq62pxQmxRE
thread_1: 1) scanning photo: 7ov4FV2ixBO98jw3RQDT
thread_2: 1) scanning photo: BC819A7J9qv3hioMGaoK
thread_3: 1) scanning photo: JwI8jqh0VX2pH8tNKW8E
thread_4: 1) scanning photo: NvmuSTVtdqcosETeFURk
thread_5: 1) scanning photo: YHuXF8jOkKpmB7WkN2t4
thread_6: 1) scanning photo: cU1K2ip0eQe6U8chzpjg
thread_7: 1) scanning photo: fMwyBKODipSZ15cts9kl
thread_8: 1) scanning photo: iVM36GWVT5gSpHYkUvxm
thread_9: 1) scanning photo: nIZ4fVF7LEnzLLMDSGDR
200
thread_0: 2) scanning photo: 3s6Am4UxI1EAT8JzHplv
200
thread_2: 2) scanning photo: HPsk4sqTq2YSjqW5FzUX
200
thread_6: 2) scanning photo: e79opeRXT5jMVHk1mTVJ
200
thread_7: 2) scanning photo: gkH8OMD2OjqLVwSWwsNI
200
200
thread_8: 2) scanning photo: m8WzALslkDuriCWgztYf
200
200
200
thread_4: 2) scanning photo: WmOFpQlRTZtSZPzAUf50
200
200
200
thread_1: 2) scanning photo: 89WCVEWsvMP2CaXoWYGw
200
thread_9: 2) scanning photo: nb2Y

In [7]:
from PIL import Image
import requests
from io import BytesIO

photo_id = '2sbdY7e3VD8hIVcoeh1s'

doc = db.collection('photos').document(photo_id).get()
data = doc.to_dict()
print(data['url'])
response = requests.get(data['url'])
#response.content
img = Image.open(BytesIO(response.content))
img.show()

https://firebasestorage.googleapis.com/v0/b/biblosphere-210106.appspot.com/o/images%2FPChIJuxcSoobjoI8R1I9kKQU4n9w%2F1611266267833.jpg?alt=media&token=915ab76c-f645-44b5-a28f-6b5f688015fb


In [64]:
photo_id = '2sbdY7e3VD8hIVcoeh1s'
doc = db.collection('photos').document(photo_id).get()
data = doc.to_dict()
data

{'recognition_stats': {'record_in_stats': True,
  'date': '2021-9-30',
  'detectron_find_books': 27,
  'duration': 22489409,
  'known_books': 3059977,
  'algorithm': 'Detectron build 1.0.1 (2021-09-30)'},
 'count': 7,
 'bookplace': 'PChIJuxcSoobjoI8R1I9kKQU4n9w',
 'recognized': 7,
 'status': 'recognized',
 'total': 17,
 'reporter': 'Syiru36ehIglQqOk2wNpgpNqG982',
 'url': 'https://firebasestorage.googleapis.com/v0/b/biblosphere-210106.appspot.com/o/images%2FPChIJuxcSoobjoI8R1I9kKQU4n9w%2F1611266267833.jpg?alt=media&token=915ab76c-f645-44b5-a28f-6b5f688015fb',
 'id': '2sbdY7e3VD8hIVcoeh1s',
 'location': {'geopoint': <google.cloud.firestore_v1._helpers.GeoPoint at 0x16ac5b7cf10>,
  'geohash': 'd1u0x26gp2s0'},
 'thumbnail': 'https://storage.googleapis.com/biblosphere-210106.appspot.com/thumbnails/PChIJuxcSoobjoI8R1I9kKQU4n9w/1611266267833.jpg',
 'photo': 'images/PChIJuxcSoobjoI8R1I9kKQU4n9w/1611266267833.jpg'}

In [75]:
photo_id = '3s6Am4UxI1EAT8JzHplv'
update = {'valid': False}
db.collection('photos').document(photo_id).update(update)

update_time {
  seconds: 1632503203
  nanos: 298959000
}