In [5]:

from PIL import Image
from io import BytesIO
import numpy as np
import tempfile
import pickle
import uuid

In [20]:
for filename in tqdm(sorted(os.listdir(DATA_DIR))):
        if filename.endswith(".jpg"):
                print(filename)

100%|██████████| 7/7 [00:00<00:00, 129912.07it/s]


In [17]:
# import pickle

# with open("data/mock_profiles.pkl", "rb") as f:
#     profiles = pickle.load(f)


✅ Loaded 0 profiles



In [7]:

import os
import uuid
import pickle
import numpy as np
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
import faiss

In [8]:


# ---------------------------
# Config / paths
# ---------------------------
DATA_DIR = "data"
EMBEDDINGS_DIR = "embeddings"
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)

EMBEDDINGS_FILE = os.path.join(EMBEDDINGS_DIR, "embeddings.npy")
METADATA_FILE = os.path.join(EMBEDDINGS_DIR, "metadata.pkl")
FAISS_INDEX_FILE = os.path.join(EMBEDDINGS_DIR, "faiss_index.index")

# ---------------------------
# Sample image DB
# ---------------------------
IMAGE_DB = [
    {"path": "pic1.jpg", "label": "Wealthy", "networth": 500000},
    {"path": "pic2.jpg", "label": "Not Wealthy", "networth": 6000},
    {"path": "pic3.jpg", "label": "Wealthy", "networth": 700000},
    {"path": "pic4.jpg", "label": "Not Wealthy", "networth": 5000},
    {"path": "pic5.jpg", "label": "Wealthy", "networth": 600000},
    {"path": "pic6.jpg", "label": "Not Wealthy", "networth": 2000},
]

# ---------------------------
# Load CLIP model once
# ---------------------------
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def compute_clip_embedding(image: Image.Image) -> np.ndarray:
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)  # normalize for cosine
    return embedding[0].cpu().numpy().astype("float32")  # NumPy array, ready for FAISS


def load_and_resize_image(path, size=(224, 224)):
    img = Image.open(path).convert("RGB")
    img = img.resize(size, Image.BICUBIC)
    return img

# ---------------------------
# Generate embeddings & metadata
# ---------------------------
embedding_list = []
metadata_list = []

for item in IMAGE_DB:
    uid = str(uuid.uuid4())
    img_path = os.path.join(DATA_DIR, item["path"])
    img=load_and_resize_image(img_path)
    emb = compute_clip_embedding(img)

    embedding_list.append(emb)

    metadata_list.append({
        "id": uid,
        "filename": item["path"],
        "label": item["label"],
        "networth": item["networth"]
    })

# Save embeddings as NumPy array
embeddings_array = np.stack(embedding_list)
np.save(EMBEDDINGS_FILE, embeddings_array)

# Save metadata
with open(METADATA_FILE, "wb") as f:
    pickle.dump(metadata_list, f)

# ---------------------------
# Build FAISS index
# ---------------------------
dim = embeddings_array.shape[1]
index = faiss.IndexFlatIP(dim)  # cosine similarity using inner product
faiss.normalize_L2(embeddings_array)  # Important for cosine similarity
index.add(embeddings_array)


# Save FAISS index
faiss.write_index(index, FAISS_INDEX_FILE)

print("Embeddings, metadata, and FAISS index saved successfully.")


Embeddings, metadata, and FAISS index saved successfully.


In [10]:
embedding_list[0]['id']

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [53]:
import faiss
import numpy as np
import pickle

In [54]:
import faiss
import numpy as np
import pickle

# Load embedding store
with open("app/embedding_store.pkl", "rb") as f:
    data = pickle.load(f)

embeddings = np.array([item["embedding"] for item in data]).astype("float32")
ids = np.array(range(len(data)))

# Create FAISS index (L2 or cosine)
index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product for cosine
faiss.normalize_L2(embeddings)  # Important for cosine similarity
index.add(embeddings)

# Save
faiss.write_index(index, "app/faiss_index.index")

# # Save metadata separately
# with open("app/metadata.pkl", "wb") as f:
#     pickle.dump(data, f)

In [66]:
data[0]

{'id': 'pic1.jpeg',
 'embedding': array([ 2.48834074e-01,  2.00178176e-02, -6.67418420e-01,  2.21818894e-01,
         5.08008122e-01, -4.11142886e-01,  1.78222805e-02,  1.59614652e-01,
         2.14969277e-01, -3.07403088e-01,  4.44079041e-01, -1.41415760e-01,
         7.36602783e-01, -1.66664839e-01,  6.11325920e-01, -4.61693630e-02,
         1.50555968e-02,  4.41217542e-01,  7.64539301e-01, -9.08671021e-02,
        -5.89052677e-01,  1.67245537e-01,  2.01479256e-01, -2.37750024e-01,
        -4.45756972e-01,  1.99434549e-01, -1.13720000e-02,  1.00015640e-01,
         8.17388818e-02,  1.34276420e-01, -1.95572004e-01,  3.14170241e-01,
         8.65175501e-02, -1.45974249e-01,  2.00292110e-01,  1.68341845e-01,
        -4.40652668e-03,  4.98539597e-01, -2.54890025e-01, -1.82138652e-01,
        -7.99991846e-01,  1.83169782e-01, -2.62621462e-01, -1.17970377e-01,
        -4.05394733e-02,  6.76733196e-01,  2.20628083e-02, -4.06065471e-02,
         2.45170832e-01,  1.61430866e-01,  5.26924849e-

In [56]:
import faiss
import numpy as np
import pickle

In [47]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
# from app.database import embedding_store



def get_top_k_with_estimation(query_emb: np.ndarray, k: int = 3):
    scores = []
    
    for item in embedding_store:
        sim = cosine_similarity([query_emb], [item["embedding"]])[0][0]
        scores.append({
            "net_worth": item["networth"],
            "label": item['label'],
            "similarity_score_": float(sim)
        })

    # Get top-k similar profiles
    top_k = sorted(scores, key=lambda x: x["similarity_score_"], reverse=True)[:k]

    # Estimate net worth and label of input image
    est_net_worth = np.mean([x["net_worth"] for x in top_k])
    est_label = "Wealthy" if est_net_worth >= 1_000_000 else "Not Wealthy"

    # Final output
    result = {
        "estimated_wealth": round(est_net_worth, 2),
        "estimated_label": est_label,
        "top_3_similar_matches": top_k
    }

    return result


In [59]:
img = Image.open("data/pic1.jpeg").convert("RGB")
samplemb = compute_clip_embedding(img).numpy()
results = get_top_k_with_estimation(samplemb)


In [60]:
results

{'estimated_net_worth': np.float64(566666.67),
 'estimated_label': 'Not Wealthy',
 'top_3_similar_matches': [{'net_worth': 500000,
   'label': 'Wealthy',
   'similarity_score_': 1.0000001192092896},
  {'net_worth': 500000,
   'label': 'Not Wealthy',
   'similarity_score_': 0.6646890640258789},
  {'net_worth': 700000,
   'label': 'Wealthy',
   'similarity_score_': 0.6425244808197021}]}

In [92]:
index = faiss.read_index("app/faiss_index.index")


In [93]:
index

<faiss.swigfaiss.IndexFlatIP; proxy of <Swig Object of type 'faiss::IndexFlatIP *' at 0x1584d1930> >

In [90]:
# Load index and metadat
index = faiss.read_index("app/faiss_index.index")
with open("app/embedding_store.pkl", "rb") as f:
    metadata = pickle.load(f)

def get_top_k_with_estimation_faiss(query_emb: np.ndarray, k=3):
    query = query_emb.astype("float32").reshape(1, -1)
    faiss.normalize_L2(query)

    sim_scores, indices = index.search(query, k)
    top_k = []

    for i, idx in enumerate(indices[0]):
        meta = metadata[idx]
        
        top_k.append({
            "net_wealth": meta["networth"],
            "label": "Wealthy" if meta["networth"] >= 1_000_000 else "Not Wealthy",
            "similarity_score": float(sim_scores[0][i])

        })

    estimated = np.mean([x["net_wealth"] for x in top_k])
    mean_similarity = np.mean([x["similarity_score"] for x in top_k])

    #input profile is wealthy if mean similarity for top 3 mtaches>=0.6
    label = "Wealthy" if mean_similarity >=0.6 else "Not Wealthy"

    return {
        "estimated_net_worth": round(float(estimated), 2),
        "estimated_wealth_level": label,
        "top_3_matches": top_k
    }

In [91]:
results=get_top_k_with_estimation_faiss(samplemb)
results

{'estimated_net_worth': 566666.67,
 'estimated_wealth_level': 'Wealthy',
 'top_3_matches': [{'net_wealth': 500000,
   'label': 'Not Wealthy',
   'similarity_score': 1.000000238418579},
  {'net_wealth': 500000,
   'label': 'Not Wealthy',
   'similarity_score': 0.6646890640258789},
  {'net_wealth': 700000,
   'label': 'Not Wealthy',
   'similarity_score': 0.6425244808197021}]}

In [87]:
top_k=[]
for i, idx in enumerate(indices[0]):
    meta = metadata[idx]
    top_k.append({
        "id": meta["id"],
        "net_wealth": meta["networth"],
        "label": "Wealthy" if meta["networth"] >= 1_000_000 else "Not Wealthy",
        "similarity_score": float(sim_scores[0][i])

    })
    estimated = np.mean([x["networth"] for x in top_k])


KeyError: 'networth'

In [76]:
meta['networth']

700000

In [69]:
results = get_top_k_with_estimation_faiss(samplemb)


KeyError: 'networth'