# Chunk-Query similarity with Embedding model

In [None]:
import dspy
import nest_asyncio
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

from archaeo_super_prompt.env import getenv_or_throw
# from archaeo_super_prompt.models.similarity import embed

In [None]:
# Patch asyncio for the notebook so that the asyncio.run call made inside the
# synchronous forward of dspy.Embedder is allowed.
nest_asyncio.apply()

ollama_localhost_port = getenv_or_throw("LOCAL_LLM_PORT")

# TODO: expose this as a transformer parameter
# Embedding client targeting the local Ollama server (nomic-embed-text model).
embedder = dspy.Embedder(
    "ollama/nomic-embed-text",
    batch_size=100,
    api_key="",
    api_base=f"http://localhost:{ollama_localhost_port}",
)


def embed_sync(text: str):
    """Embed a single text synchronously.

    The text is wrapped in a one-element batch before being handed to the
    module-level ``embedder``.

    Args:
        text: The string to embed.

    Returns:
        The embedding with shape (1, T), where T is the constant embedding
        size determined by the model.
    """
    single_item_batch = [text]
    return embedder(single_item_batch)


# Embed the query (an Italian description of the field to retrieve) and two
# sample chunks to compare against it.
query_embeddings = embed_sync(
    "L'istituzione è una descrizione del luogo con "
    "informazioni utili, non amministrative, per gli archeologi per comprendere "
    "meglio le caratteristiche del luogo per l'intervento."
)
chunk1_embeddings = embed_sync("Piazza Marco Vitteli, Pisa")
chunk2_embeddings = embed_sync("Villa vecchia con due piani")

In [None]:
# Sanity check: show the shape of the query and of each chunk embedding.
print(
    *(
        embedding.shape
        for embedding in (query_embeddings, chunk1_embeddings, chunk2_embeddings)
    )
)

In [None]:
def get_similarities(chunk_embeddings: np.ndarray,
                     query_embeddings: np.ndarray):
    """Compute the cosine similarity of every chunk with the query.

    Args:
        chunk_embeddings: Chunk embeddings, shape (N, T).
        query_embeddings: Query embedding, shape (1, T).

    Returns:
        Similarity scores of shape (N, 1), one row per chunk.
    """
    scores = cosine_similarity(chunk_embeddings, query_embeddings)
    return scores


# Stack the two chunk embeddings row-wise and score them against the query.
similarities = get_similarities(
    chunk_embeddings=np.concatenate(
        (chunk1_embeddings, chunk2_embeddings), axis=0
    ),
    query_embeddings=query_embeddings,
)

In [None]:
# Bare expression: displays the computed similarity scores as the cell output.
similarities