In [13]:
import os
import pymongo
import torch
import chromadb
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
import requests
from io import BytesIO
from tqdm import tqdm

In [14]:
client = pymongo.MongoClient(os.getenv('MONGO_CONNECTION_STRING'))
db = client["monin"]
collection = db["drinks"]

In [15]:
collection.find({})[0]

{'_id': ObjectId('67cdeca2d61baaa7449cc9f0'),
 'name': 'Hawaiian Pearl Bubble Tea',
 'recipie': ['30 mL Monin Hawaiian Island Syrup',
  '150 mL tea',
  '60 mL club soda',
  '45 mL tapioca bobas',
  'Fill serving glass full of ice.',
  'Pour ingredients into serving glass in order listed.',
  'Stir gently to mix.',
  'Add garnish and serve.'],
 'category': 'bubble_teas',
 'image_url': 'https://monin.blob.core.windows.net/recipe/images/mrd/drink/5125b732-6dea-4287-a76e-fdb0588dc394-2.png',
 'drink_url': 'https://monin.us/products/hawaiian-pearl-bubble-tea',
 'taste': ['Citrusy', 'Sour']}

In [None]:
chroma_client = chromadb.PersistentClient(path="./chromadb")
collection_text = chroma_client.get_or_create_collection(name="drinks_text")
collection_image = chroma_client.get_or_create_collection(name="drinks_image")
collection_combined = chroma_client.get_or_create_collection(name="drinks_combined")

device = "cuda" if torch.cuda.is_available() else "cpu"
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def get_text_embedding(text_list):
    inputs = processor(text=text_list, return_tensors="pt", padding=True).to(device)
    with torch.no_grad():
        embeddings = model.get_text_features(**inputs)
    return embeddings.cpu().numpy().tolist()

def get_image_embedding(image_url):
    try:
        response = requests.get(image_url, timeout=5)
        image = Image.open(BytesIO(response.content)).convert("RGB")
        inputs = processor(images=image, return_tensors="pt").to(device)
        with torch.no_grad():
            embedding = model.get_image_features(**inputs)
        return embedding.cpu().numpy().tolist()[0]
    except Exception as e:
        print(f"Error processing image {image_url}: {e}")
        return None

existing_ids = set(collection_text.get(include=[])['ids']) & set(collection_image.get(include=[])['ids']) & set(collection_combined.get(include=[])['ids'])
mongo_docs = list(collection.find())
filtered_docs = [doc for doc in mongo_docs if str(doc["_id"]) not in existing_ids]

for doc in tqdm(filtered_docs, desc="Processing new documents"):
    doc_id = str(doc["_id"])
    drink_url = doc["drink_url"]
    taste_text = ", ".join(doc["taste"]) if "taste" in doc else ""
    image_url = doc.get("image_url", "")

    taste_embedding = get_text_embedding([taste_text])[0] if taste_text else None
    image_embedding = get_image_embedding(image_url) if image_url else None

    if taste_embedding:
        collection_text.add(
            ids=[doc_id],
            uris=[drink_url],
            embeddings=[taste_embedding]
        )

    if image_embedding:
        collection_image.add(
            ids=[doc_id],
            uris=[drink_url],
            embeddings=[image_embedding]
        )

    if taste_embedding and image_embedding:
        combined_embedding = taste_embedding + image_embedding
        collection_combined.add(
            ids=[doc_id],
            uris=[drink_url],
            embeddings=[combined_embedding]
        )
        

    # stored_data = collection_combined.get(ids=[doc_id], include=["embeddings", "uris", "documents", "metadatas"])
    # print(f"Stored in ChromaDB: {stored_data}")

Processing new documents:   0%|          | 0/2004 [00:00<?, ?it/s]Add of existing embedding ID: 67cdeca2d61baaa7449cc9f0
Add of existing embedding ID: 67cdeca2d61baaa7449cc9f0
Add of existing embedding ID: 67cdeca2d61baaa7449cc9f0
Processing new documents:   0%|          | 1/2004 [00:01<1:02:28,  1.87s/it]Add of existing embedding ID: 67cdeca4d61baaa7449cc9f1
Add of existing embedding ID: 67cdeca4d61baaa7449cc9f1
Add of existing embedding ID: 67cdeca4d61baaa7449cc9f1
Processing new documents:   0%|          | 2/2004 [00:03<55:00,  1.65s/it]  Add of existing embedding ID: 67cdeca6d61baaa7449cc9f2
Add of existing embedding ID: 67cdeca6d61baaa7449cc9f2
Add of existing embedding ID: 67cdeca6d61baaa7449cc9f2
Processing new documents:   0%|          | 3/2004 [00:05<56:04,  1.68s/it]Add of existing embedding ID: 67cdeca8d61baaa7449cc9f3
Add of existing embedding ID: 67cdeca8d61baaa7449cc9f3
Add of existing embedding ID: 67cdeca8d61baaa7449cc9f3
Processing new documents:   0%|          | 4/200

Data successfully added to ChromaDB.





### Getting from chromadb, be careful, by default it returns only ids, others will be shown as None.
Example, how to deal with it:
```
stored_data = collection_combined.get(ids=[doc_id], include=["embeddings", "uris", "documents", "metadatas"])
```


In [37]:
local_text_data = collection_text.get(include=["embeddings", "uris", "documents", "metadatas"])
local_image_data = collection_image.get(include=["embeddings", "uris", "documents", "metadatas"])
local_combined_data = collection_combined.get(include=["embeddings", "uris", "documents", "metadatas"])


chroma_client_remote = chromadb.HttpClient(host=os.environ.get('CHROMA_URL'), port=8000,
                         settings=chromadb.config.Settings(
                            chroma_client_auth_provider="chromadb.auth.token_authn.TokenAuthClientProvider", 
                            chroma_client_auth_credentials=os.environ.get('CHROMA_TOKEN')
                         ))

collection_text_remote = chroma_client_remote.get_or_create_collection(name="drinks_text")
collection_image_remote = chroma_client_remote.get_or_create_collection(name="drinks_image")
collection_combined_remote = chroma_client_remote.get_or_create_collection(name="drinks_combined")

collection_text_remote.add(
    ids=local_text_data["ids"],
    uris=local_text_data["uris"],
    embeddings=local_text_data["embeddings"]
)

collection_image_remote.add(
    ids=local_image_data["ids"],
    uris=local_image_data["uris"],
    embeddings=local_image_data["embeddings"]
)

collection_combined_remote.add(
    ids=local_combined_data["ids"],
    uris=local_combined_data["uris"],
    embeddings=local_combined_data["embeddings"]
)