In [1]:
import pandas as pd

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

  from .autonotebook import tqdm as notebook_tqdm


In [27]:
df = pd.read_csv('data.csv')

  df = pd.read_csv('data.csv')


In [28]:
df2 = df[df['type'].isin(['Print','Photograph', 'Painting','Drawing'])]

In [29]:
df1 = df2[['id', 'description', 'image_web']].copy()

In [30]:
df1["embedding"] = df1["description"].fillna("").apply(model.encode)


In [31]:
df1.to_pickle('cleve_data_sample.pkl')

In [32]:
df1.isna().sum()

id                 0
description    27632
image_web      19464
embedding          0
dtype: int64

In [33]:
df1.dropna(inplace=True)

In [34]:
df1.shape

(4877, 4)

In [14]:
df["embed_text"] = (
    df["description"]
    .str.replace("\n", " ")
    .str.strip()
)


In [15]:
text_a = df1.loc[0, "description"]
text_b = df1.loc[4, "description"]

In [16]:
embedding_a = model.encode(text_a)
embedding_b = model.encode(text_b)

embedding_a.shape, embedding_b.shape


((384,), (384,))

In [17]:
from numpy import dot
from numpy.linalg import norm

cosine_similarity = dot(embedding_a, embedding_b) / (
    norm(embedding_a) * norm(embedding_b)
)

cosine_similarity


np.float32(0.3190035)

In [18]:
# Compare painting to itself
dot(embedding_a, embedding_a) / (norm(embedding_a) ** 2)


np.float32(1.0000001)

In [19]:
test_text_a = "lively bustling coastal impressionist"
test_text_b = "ominous oppressive mountainous baroque"

In [20]:
embedding_a = model.encode(text_a)
embedding_b = model.encode(test_text_b)

embedding_a.shape, embedding_b.shape


((384,), (384,))

In [21]:
cosine_similarity = dot(embedding_a, embedding_b) / (
    norm(embedding_a) * norm(embedding_b)
)

cosine_similarity

np.float32(0.2151523)

In [22]:
embedding_a = model.encode(text_a)
embedding_b = model.encode(test_text_a)

embedding_a.shape, embedding_b.shape


((384,), (384,))

In [23]:
cosine_similarity = dot(embedding_a, embedding_b) / (
    norm(embedding_a) * norm(embedding_b)
)

cosine_similarity

np.float32(0.46468592)

In [35]:
import numpy as np
from numpy.linalg import norm

def find_best_painting(
    mood,
    atmosphere,
    setting,
    style,
    df,
    model,
    text_col="description",
    image_col="image_web",
    embedding_col="embedding",
):
    if df.empty:
        raise ValueError("df is empty")

    user_text = " ".join([mood, atmosphere, setting, style]).strip()
    user_embedding = np.array(model.encode(user_text))
    user_norm = norm(user_embedding)

    if user_norm == 0:
        raise ValueError("User embedding has zero norm")

    if embedding_col in df.columns:
        embeddings = df[embedding_col].apply(lambda e: np.array(e))
    else:
        embeddings = df[text_col].fillna("").apply(lambda t: np.array(model.encode(t)))

    emb_matrix = np.vstack(embeddings.to_list())
    emb_norms = norm(emb_matrix, axis=1)
    denom = user_norm * emb_norms

    similarities = np.full(len(df), -1.0, dtype=float)
    valid = denom != 0
    similarities[valid] = (emb_matrix[valid] @ user_embedding) / denom[valid]

    best_index = int(np.argmax(similarities))
    best_row = df.iloc[best_index]

    return {
        "id": best_row.get("id"),
        "description": best_row.get(text_col),
        "image_url": best_row.get(image_col),
        "similarity": float(similarities[best_index]),
        "query": user_text,
    }


In [50]:
result = find_best_painting(
    mood="dark",
    atmosphere="stormy",
    setting="squirrel",
    style="modern",
    df=df1,
    model=model,
    embedding_col="embedding"
)

result


{'id': np.int64(137872),
 'description': 'Despite the chess-playing monkeyâ€™s grim end, the baby parrots continued to cavort with the fox cubs. One day the mother fox discovered that her cubs had been eaten by a panther. Blaming the parrots, she lured a hunter to their tree. The hunter climbed the tree and ensnared the mother parrot and her babies. The mother instructed her young to play dead, then pleaded with the hunter to take her alone, saying that since she knew the art of healing she would fetch a high price. Lush vegetation and a stream of water painted with a soft brush indicate a fresh new stylistic vision that represents a departure from Indian and Persian styles that came before.',
 'image_url': 'https://openaccess-cdn.clevelandart.org/1962.279.35.a/1962.279.35.a_web.jpg',
 'similarity': 0.42961329221725464,
 'query': 'dark stormy squirrel modern'}