# Rescan photos scripts

In [1]:
import time

import firebase_admin
import mpmath
from firebase_admin import credentials
from firebase_admin import firestore

cred = credentials.Certificate(r"C:\!Dev\projects\book_detection\keys\avea-biblosphere-210106-firebase-adminsdk-s3bjc-ff2c6ac275 (1).json")
firebase_admin.initialize_app(cred)

db = firestore.client()

#### Not valid photos

In [2]:
# parameter is missed: photo/reporter/bookplace/id/location/url

res = db.collection('photos').where(u'valid', u'==', False).stream()
not_valid = set()
photos = set()
reporters = set()
bookplaces = set()
ids = set()
locations = set()
urls = set()
for doc in res:
    data = doc.to_dict()

    not_valid.add(doc.id)
    if 'photo' not in data:
        photos.add(doc.id)
    if 'reporter' not in data:
        reporters.add(doc.id)
    if 'bookplace' not in data:
        bookplaces.add(doc.id)
    if 'id' not in data:
        ids.add(doc.id)
    if 'location' not in data:
        locations.add(doc.id)
    if 'url' not in data:
        urls.add(doc.id)

print('not valid:', len(not_valid))
print('parameter is missed:')
print('\tphoto:', len(photos))
print('\treporter:', len(reporters))
print('\tbookplace:', len(bookplaces))
print('\tid:', len(ids))
print('\tlocation:', len(locations))
print('\turl:', len(urls))
#finded

not valid: 469
parameter is missed:
	photo: 0
	reporter: 0
	bookplace: 208
	id: 362
	location: 91
	url: 363


In [4]:
res = db.collection('photos').where(u'recognition_stats.record_in_stats', u'==', False).stream()
finded = set()
count = 0
for doc in res:
    finded.add(doc.id)
    # update = {'valid': True}
    # db.collection('photos').document(doc.id).update(update)
    # update = {'valid': False}
    # db.collection('photos').document(doc.id).update(update)


print('records not in stats:', len(finded))
#finded

records not in stats: 0


In [5]:
res = db.collection('photos').where(u'status', u'==', 'failed').stream()
finded = set()
for doc in res:
    finded.add(doc.id)
print('status == failed:', len(finded))
#finded

status == failed: 20


#### Photos without block 'recognition_stats'

In [6]:
res = db.collection('photos').stream()
finded = set()
for doc in res:
    data = doc.to_dict()
    if 'valid' in data and not data['valid']:
        continue
    if 'recognition_stats' in data:
        continue
    finded.add(doc.id)

print('finded', len(finded))
#finded

finded 25


#### Rescan one photo

In [20]:
import requests
photo_id = 'tqPJHfsFyfq3Xnl4IRAA'
url = 'https://us-central1-biblosphere-210106.cloudfunctions.net/rescan_photo'

doc = db.collection('photos').document(photo_id).get()
data = doc.to_dict()

res = requests.post(url, json={'photo_id': photo_id})
json = res.json()
print(json)
print(res.status_code, res.reason)
print('duration:', json['duration'])
print('recognized:', len(json['recognized_books']))
for n, el in enumerate(json['recognized_books'], 1):
    print(f'{n}) {el}')

{'duration': '0:00:20.369259', 'photo_id': 'tqPJHfsFyfq3Xnl4IRAA', 'recognized_books': ['9785699410965 Как говорить с детьми, чтобы они учились Фабер Адель, Мазлиш Элейн']}
200 OK
duration: 0:00:20.369259
recognized: 1
1) 9785699410965 Как говорить с детьми, чтобы они учились Фабер Адель, Мазлиш Элейн


#### Rescan several photos

In [21]:
import requests
url = 'https://us-central1-biblosphere-210106.cloudfunctions.net/rescan_photo'
batch_size = 5

res = db.collection('photos').stream()
count = 1
for doc in res:
    if count > batch_size:
        break

    data = doc.to_dict()
    if 'valid' in data and not data['valid']:
        continue
    # if 'recognition_stats' in data:
    #    continue

    print(f'{count}) Rescaning photo: {doc.id}')

    if not data.keys() >= {'photo', 'reporter', 'bookplace', 'id', 'location', 'url'}:
        print('ERROR: photo/reporter/bookplace/id/location/url parameters missing.', doc.id, '\n')
        update = {'valid': False}
        db.collection('photos').document(doc.id).update(update)
        continue

    old_recognized = data['recognized'] if 'recognized' in data else 0
    res = requests.post(url, json={'photo_id': doc.id})
    json = res.json()
    print(f"duration: {json['duration']}, recognized: {len(json['recognized_books'])}({old_recognized})\n")

    count += 1

1) Rescaning photo: 03E2LrAHsJucumq0el4Q
duration: 0:00:31.936710, recognized: 12(12)

2) Rescaning photo: 0AypkMBEanU1N63WMPDb
duration: 0:00:25.020822, recognized: 12(12)

3) Rescaning photo: 0Ftx3WNzNuVOEYz6qjI7
duration: 0:00:20.258239, recognized: 9(9)

4) Rescaning photo: 0OpvuxYsnEKWrlCsYQOr
duration: 0:00:28.217935, recognized: 9(9)

5) Rescaning photo: 0TO9zoqBofca2EyVLAA8
duration: 0:00:20.480781, recognized: 5(5)



#### Rescan several photos (threads)

In [13]:
import numpy as np
import time

def fill_array(size):
    res = db.collection('photos').stream()
    result = []
    count = 1
    for doc in res:

        if count > batch_size * threads:
            break

        data = doc.to_dict()
        if 'valid' in data and not data['valid']:
            continue
        if 'recognition_stats' in data:
            continue

        if not data.keys() >= {'photo', 'reporter', 'bookplace', 'id', 'location', 'url'}:
            print('ERROR: photo/reporter/bookplace/id/location/url parameters missing.', doc.id, '\n')
            update = {'valid': False}
            db.collection('photos').document(doc.id).update(update)
            continue
        result.append(doc.id)
        count += 1

    return result


def rescan_in_thread(name: str, photo_ids: list):
    url = 'https://us-central1-biblosphere-210106.cloudfunctions.net/rescan_photo'
    for n, id in enumerate(photo_ids):
        print(f'{name}: {n+1}) scanning photo: {id}')
        res = requests.post(url, json={'photo_id': id})
        print(res.status_code)



In [14]:
%%time

import requests
from threading import Thread

batch_size =
threads = 10

print('--- preparing photo ids ---')
ids = fill_array(batch_size * threads)
print(f'photos: {len(ids)} \n')

print('---starting threads ---')
threads_list = []
for i in range(threads):
    start_ind = i * batch_size
    end_ind = start_ind + batch_size
    thread = Thread(target=rescan_in_thread, args=(f'thread_{i}', ids[start_ind:end_ind]))
    thread.start()
    threads_list.append(thread)

for thread in threads_list:
    thread.join()

print('---finish----')

--- preparing photo ids ---
photos: 20 

---starting threads ---
thread_0: 1) scanning photo: 3NXpOnCp0gq62pxQmxRE
thread_1: 1) scanning photo: 7ov4FV2ixBO98jw3RQDT
thread_2: 1) scanning photo: BC819A7J9qv3hioMGaoK
thread_3: 1) scanning photo: LxriX9M8cHCCEBvpU0zS
thread_4: 1) scanning photo: WmOFpQlRTZtSZPzAUf50
thread_5: 1) scanning photo: ZZ3tvctNHdzBxoLNtucG
thread_6: 1) scanning photo: e79opeRXT5jMVHk1mTVJ
thread_7: 1) scanning photo: gkH8OMD2OjqLVwSWwsNI
thread_8: 1) scanning photo: m8WzALslkDuriCWgztYfthread_9: 1) scanning photo: nb2YmhI43vaDGa2tPhBq

200
thread_1: 2) scanning photo: 89WCVEWsvMP2CaXoWYGw
200
200200

thread_5: 2) scanning photo: cU1K2ip0eQe6U8chzpjg
thread_8: 2) scanning photo: nIZ4fVF7LEnzLLMDSGDR
200
thread_3: 2) scanning photo: NvmuSTVtdqcosETeFURk
200
200
thread_2: 2) scanning photo: HPsk4sqTq2YSjqW5FzUXthread_0: 2) scanning photo: 3s6Am4UxI1EAT8JzHplv

200200
thread_7: 2) scanning photo: iVM36GWVT5gSpHYkUvxm

thread_9: 2) scanning photo: p0mu5NHxaz5FW895pZlQ

In [15]:
from PIL import Image
import requests
from io import BytesIO

photo_id = 'fMwyBKODipSZ15cts9kl'

doc = db.collection('photos').document(photo_id).get()
data = doc.to_dict()
print(data['url'])
response = requests.get(data['url'])
#response.content
img = Image.open(BytesIO(response.content))
img.show()

https://storage.cloud.google.com/biblosphere-210106.appspot.com/images/0000000000000000000000000000/0000000000034.jpg


UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x0000016904F59310>

In [16]:
photo_id = 'fMwyBKODipSZ15cts9kl'
doc = db.collection('photos').document(photo_id).get()
data = doc.to_dict()
data

{'bookplace': '0000000000000000000000000000',
 'status': 'failed',
 'id': 'fMwyBKODipSZ15cts9kl',
 'thumbnail': 'https://storage.googleapis.com/biblosphere-210106.appspot.com/thumbnails/0000000000000000000000000000/0000000000034.jpg',
 'reporter': '0000000000000000000000000000',
 'url': 'https://storage.cloud.google.com/biblosphere-210106.appspot.com/images/0000000000000000000000000000/0000000000034.jpg',
 'photo': 'images/0000000000000000000000000000/0000000000034.jpg',
 'count': 0,
 'location': {'geohash': 'yzzzzzzzz',
  'geopoint': <google.cloud.firestore_v1._helpers.GeoPoint at 0x16902a91e50>}}

In [17]:
photo_id = 'fMwyBKODipSZ15cts9kl'
update = {'valid': False}
db.collection('photos').document(photo_id).update(update)

update_time {
  seconds: 1642236303
  nanos: 724799000
}