## Import libraries

In [1]:
import os
from tqdm.notebook import tqdm
import face_recognition
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.max_columns', None)

# Load the dataframe and images

## Get system paths

In [2]:
PRADO_PATH = os.getenv("PRADO_PATH")
assert PRADO_PATH is not None, f"Please set the env variable PRADO_PATH"
assert os.path.isdir(PRADO_PATH), "PRADO_PATH must be a dir"

## Read Prado museum dataframe

In [3]:
df = pd.read_csv(os.path.join(PRADO_PATH, "prado.csv"))
df["work_id"] = df['work_image_url'].apply(lambda x: x.split('/')[-1])
print(f"The dataframe has {len(df)} rows")
df.tail()

The dataframe has 13487 rows


Unnamed: 0,work_url,work_image_url,author,author_bio,author_url,author_id,work_title,work_subtitle,work_exposed,work_description,work_tags,technical_sheet_numero_de_catalogo,technical_sheet_autor,technical_sheet_titulo,technical_sheet_fecha,technical_sheet_tecnica,technical_sheet_soporte,technical_sheet_dimension,technical_sheet_serie,technical_sheet_procedencia,bibliography,inventory,expositions,ubication,technical_sheet_autores,technical_sheet_edicion_/_estado,technical_sheet_materia,technical_sheet_ceca,technical_sheet_autora,technical_sheet_lugar_de_produccion,work_id
13482,https://www.museodelprado.es/coleccion/obra-de...,https://content3.cdnprado.net/imagenes/Documen...,"Atribuido Falck, Jeremías","Gdansk/Danzig (Polonia), 1609 - Gdansk/Danzig ...",https://www.museodelprado.es/coleccion/artista...,d351c3c1-e89b-49cf-900d-24085ed1908f,Vista del monasterio de El Escorial,"1662 - 1672. Aguafuerte, Buril, Estampa ilumin...",No expuesto,"Cervera Vera (Escorial en BN no C20-a, Real Bi...",Aguafuerte;Buril;Estampa iluminada;Papel verju...,G002860,"Atribuido Falck, Jeremías",Vista del monasterio de El Escorial,1662 - 1672,Aguafuerte; Buril; Estampa iluminada,"Papel verjurado, ahuesado",Alto: 474 mm;\n\n\n\n\nAncho: 756 mm,,"Colección José María Cervelló, 2003","Cervera Vera, Luis, Las Estampas y el Sumario...",Inv. Colección Cervelló.\nNúm. 166201.\n@@@\nI...,,,,,,,,,c62f7f3e-3ad3-4d9e-9586-b0b389b2d032.jpg
13483,https://www.museodelprado.es/coleccion/obra-de...,https://content3.cdnprado.net/imagenes/Documen...,"Atribuido Kiyonobu, Tori","1664, 1729",https://www.museodelprado.es/coleccion/artista...,9e420297-9fb9-4ef3-b128-96596d5dc191,Daifukucho [Sankai Nagoya],1697. Entalladura [madera a la fibra] sobre pa...,No expuesto,Antigua atribución a Hishikawa Moronobu. En es...,Entalladura [madera a la fibra];Papel japonés;...,G005639,"Atribuido Kiyonobu, Tori",Daifukucho [Sankai Nagoya],1697 (Genroku 10),Entalladura [madera a la fibra],Papel japonés,Alto: 172 mm;\n\n\n\n\nAncho: 240 mm,,Nippon Hanga Kyokai [Socidad de Pintores y Gra...,"Bru, R., Ukiyo-e en Madrid: las estampas del ...","Inv. Dibujos siglo XIX, Casón del Buen Retiro....",DOUANES / VI-G y Cruz de Suiza\nSello.\nAnverso,Estampas japonesas en el Museo del Prado\n ...,,,,,,,6c28accf-e0c0-4bc0-b4c6-3fbb282bcbd8.jpg
13484,https://www.museodelprado.es/coleccion/obra-de...,https://content3.cdnprado.net/imagenes/Documen...,"García, Sergio","Madrid, 1813 - Madrid, 1855\n\nHijo de Nicolás...",https://www.museodelprado.es/coleccion/artista...,d984cdeb-020c-41f9-ae68-3285aebfa25a,Retrato de dama,"Mediados del siglo XIX. , 5,8 x 5 cm",No expuesto,Este retrato femenino es una obra de calidad e...,"Aguada de pigmentos opacos [gouache, témpera];...",O003393,"García, Sergio",Retrato de dama,Mediados del siglo XIX,"Aguada de pigmentos opacos [gouache, témpera];...",Marfil,"Alto: 5,8 cm;\n\n\n\n\nAncho: 5 cm",,"Alcalá Subastas, Madrid 28 de mayo 2015; Donac...","Espinosa Martín, Mari Carmen., Iluminaciones,...",Inv. Nuevas Adquisiciones (iniciado en 1856).\...,"S.García\nManuscrito a pincel.\nAnverso, marge...",,,,,,,,b4126fb6-c5ac-40e3-89a1-d1578914c09b.jpg
13485,https://www.museodelprado.es/coleccion/obra-de...,https://content3.cdnprado.net/imagenes/Documen...,Anónimo,,https://www.museodelprado.es/coleccion/artista...,99d7590d-7ec8-4da4-bc2f-92064214c76c,Vista del río Buñol con algunas construcciones...,Antes de 1918. Gelatina / Colodión sobre plac...,No expuesto,Vista del río Buñol a su paso por la zona del ...,Buñol y alrededores;Gelatina / Colodión;Placa ...,HF05937,Anónimo -Fotógrafo-,Vista del río Buñol con algunas construcciones...,Antes de 1918,Gelatina / Colodión,Placa de vidrio,Alto: 44 mm;\n\n\n\n\nAncho: 106 mm,Buñol y alrededores,"Donación Ana y Cecilio Ellacuria Delgado, here...","Sánchez Torija, Beatriz, Cecilio Pla y su rel...",Inv. Nuevas Adquisiciones (iniciado en 1856).\...,Elemento de arquitectura:\n\n\n\n\n Chimenea d...,,,,,,,España,e7bf2481-522c-4071-9d97-7908aca45831.jpg
13486,https://www.museodelprado.es/coleccion/obra-de...,https://content3.cdnprado.net/imagenes/Documen...,"Goya y Lucientes, Francisco de","Fuendetodos, Zaragoza, 1746 - Burdeos (Francia...",https://www.museodelprado.es/coleccion/artista...,39568a17-81b5-4d6f-84fa-12db60780812,"Letras I B B, o IB unidas como un monograma en...",1771 - 1774. Lápiz negro sobre papel verjurado...,No expuesto,En la página 91 del Cuaderno (D06068/091) apar...,Cuaderno italiano de Goya;Lápiz negro;Papel ve...,D006068/156,"Goya y Lucientes, Francisco de","Letras I B B, o IB unidas como un monograma en...",1771 - 1774,Lápiz negro,Papel verjurado,Alto: 186 mm;\n\n\n\n\nAncho: 130 mm,Cuaderno italiano de Goya,"Colección particular, Palma de Mallorca; Edmun...","Wilson- Bareau, Juliet; Mena Marqués, Manuela,...","Inv. Dibujos, Fondo Antiguo.\nNúm. 1732.\n@@@\...",I B B // [...] o y [?]\nManuscrito con lápiz n...,Roma en el bolsillo. Cuadernos de dibujo y apr...,,,,,,,e8d785a8-1407-4203-b837-2d01e82a36cb.jpg


# Faces to embedding database

In [4]:
DETECTION_MODEL = "hog"  # hog - cnn
COLLECTION_NAME = f"{DETECTION_MODEL}_faces"

## Create ChromaDB collection

In [5]:
import uuid
import chromadb

In [6]:
from chromadb.config import Settings
client = chromadb.Client(Settings(
    chroma_db_impl="duckdb+parquet",
    persist_directory="chromadb" # Optional, defaults to .chromadb/ in the current directory
))
client.persist()

True

In [7]:
# Delete a collection and all associated embeddings, documents, and metadata.
# ⚠️ This is destructive and not reversible
#client.delete_collection(name=COLLECTION_NAME)

In [8]:
collection = client.get_or_create_collection(name=COLLECTION_NAME)
print(f"Items in collection: {collection.count()}")

Items in collection: 0


## Iterate over the dataframe/faces

In [None]:
errors = {"not_found": [], "location_embeddings_mismatch": []}
for _, row in tqdm(df.iterrows(), total=len(df)):
    
    image_id = row["work_id"]

    image_path = os.path.join(PRADO_PATH, "images", image_id)
    if not os.path.exists(image_path):
        # f"Image path not found: '{image_path}'"
        errors["not_found"].append(image_path)
        continue
    
    image = face_recognition.load_image_file(image_path)
    
    # https://face-recognition.readthedocs.io/en/latest/face_recognition.html#face_recognition.api.face_locations
    faces_locations = face_recognition.face_locations(
        image,
        model=DETECTION_MODEL,
        number_of_times_to_upsample=1
    )

    faces_embeddings = face_recognition.face_encodings(
        image,
        known_face_locations=faces_locations,
        num_jitters=1,
        model="large"
    )

    if not len(faces_locations) == len(faces_embeddings):
        # f"{len(faces_locations)} faces founds but {len(faces_embeddings)} embeddings generated"    
        errors["location_embeddings_mismatch"].append(image_path)
        continue
    
    for indx, (face_location, face_embedding) in enumerate(zip(faces_locations, faces_embeddings)):
    
        # Get the location of each face in this image
        top, right, bottom, left = face_location
        width = abs(right - left)
        height = abs(bottom - top)

        collection.add(
            embeddings=face_embedding.tolist(),
            metadatas={
                "image_id": image_id,
                "fl_top": top,
                "fl_right": right,
                "fl_bottom": bottom,
                "fl_left": left,
                "width": width,
                "height": height
            },
            ids=[uuid.uuid4().hex]
        )

  0%|          | 0/13487 [00:00<?, ?it/s]

In [None]:
collection.count()

In [36]:
collection.peek(limit=3)

{'ids': ['2301110bf2a4438a95311e31735520e7',
  '9fe8edec6ae54d4980be6a093fa536d9',
  'eee879fb47894414b3bf17f29569b390'],
 'embeddings': [[-0.11147989332675934,
   0.027456363663077354,
   0.09670601785182953,
   -0.08006956428289413,
   -0.18422652781009674,
   -0.05528806895017624,
   0.010502303950488567,
   -0.11511560529470444,
   0.057787615805864334,
   -0.18120616674423218,
   0.1493380218744278,
   -0.07502460479736328,
   -0.20317897200584412,
   0.12211953848600388,
   -0.02773684449493885,
   0.11262436956167221,
   -0.07674969732761383,
   -0.07970723509788513,
   -0.09246359020471573,
   -0.13784505426883698,
   0.017547380179166794,
   0.14324739575386047,
   -0.07297954708337784,
   0.05694309622049332,
   -0.06750456243753433,
   -0.21837878227233887,
   -0.04442638158798218,
   -0.01011237408965826,
   0.09819069504737854,
   -0.011019645258784294,
   -0.041496772319078445,
   0.026156168431043625,
   -0.15635812282562256,
   0.027410579845309258,
   0.035372231155633

### Query

In [75]:
collection.query(
    query_embeddings=face_embedding.tolist(),
    n_results=2,
    #where={"metadata_field": "is_equal_to_this"},
    #where_document={"$contains":"search_string"}
)

{'ids': [['240cd90bd086477f82c897d94f9c0524',
   '4b76e5bc992b47b8b08428331e15656e']],
 'embeddings': None,
 'documents': [[None, None]],
 'metadatas': [[{'image_id': '02e90b75-fce3-4cef-8d7e-36c716507a1d.jpg',
    'fl_top': 547,
    'fl_right': 561,
    'fl_bottom': 615,
    'fl_left': 493,
    'width': 68,
    'height': 68},
   {'image_id': '02e90b75-fce3-4cef-8d7e-36c716507a1d.jpg',
    'fl_top': 430,
    'fl_right': 264,
    'fl_bottom': 498,
    'fl_left': 196,
    'width': 68,
    'height': 68}]],
 'distances': [[0.0, 0.2524358332157135]]}