In [1]:
!pip install "docarray[hnswlib]" torch torchvision aiohttp aiofiles IPython git+https://github.com/facebookresearch/ImageBind.git



Collecting docarray[hnswlib]
  Downloading docarray-0.40.0-py3-none-any.whl (270 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.2/270.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting aiofiles
  Downloading aiofiles-24.1.0-py3-none-any.whl (15 kB)
Collecting orjson>=3.8.2 (from docarray[hnswlib])
  Downloading orjson-3.10.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (141 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m141.1/141.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting types-requests>=2.28.11.6 (from docarray[hnswlib])
  Downloading types_requests-2.32.0.20240712-py3-none-any.whl (15 kB)
Collecting typing-inspect>=0.8.0 (from docarray[hnswlib])
  Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Collecting hnswlib>=0.7.0 (from docarray[hnswlib])
  Downloading hnswlib-0.8.0.tar.gz (36 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?2

In [2]:

import asyncio
import html
import os
from glob import glob
from urllib.parse import urlparse

import aiohttp
import nest_asyncio
import pandas as pd
import torch
from aiofiles import open as aioopen
from docarray import DocList, BaseDoc
from docarray.documents import TextDoc, ImageDoc, AudioDoc
from docarray.index import HnswDocumentIndex
from docarray.typing import NdArray
from imagebind import data as ibdata
from imagebind.models import imagebind_model
from imagebind.models.imagebind_model import ModalityType
from IPython.display import Image, display, HTML

# Patch asyncio so that asyncio.run() can be called from a notebook cell even
# though the kernel already has an event loop running (the download cell uses it).
nest_asyncio.apply()



In [3]:
# Use the first CUDA GPU when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Load the pretrained ImageBind "huge" model, switch it to eval mode and
# move it onto the selected device.
model = imagebind_model.imagebind_huge(pretrained=True)
model = model.eval().to(device)

Downloading imagebind weights to .checkpoints/imagebind_huge.pth ...


  0%|          | 0.00/4.47G [00:00<?, ?B/s]

In [4]:
def embed(doc):
    """Compute an ImageBind embedding for ``doc`` and store it on the document.

    Args:
        doc: A ``TextDoc`` (embedded from ``doc.text``), or an ``ImageDoc`` /
            ``AudioDoc`` (embedded from the file referenced by ``doc.url``).
            Subclasses of these document types are accepted as well.

    Returns:
        The same ``doc`` object, with ``doc.embedding`` set to the first row of
        the model output converted to a NumPy array.

    Raises:
        ValueError: If ``doc`` is not one of the supported document types, or
            if the field to embed (``text`` / ``url``) is ``None``.
    """
    # (document class, ImageBind modality key, loader function, payload attribute)
    dispatch = (
        (TextDoc, ModalityType.TEXT, ibdata.load_and_transform_text, 'text'),
        (ImageDoc, ModalityType.VISION, ibdata.load_and_transform_vision_data, 'url'),
        (AudioDoc, ModalityType.AUDIO, ibdata.load_and_transform_audio_data, 'url'),
    )

    # isinstance-based dispatch so subclasses of the supported documents work too.
    for doc_cls, modality_type, transform_func, field in dispatch:
        if isinstance(doc, doc_cls):
            break
    else:
        raise ValueError('Unsupported document type')

    payload = getattr(doc, field)
    if payload is None:
        # Fail early with a clear message instead of passing None to the loader.
        raise ValueError(f'{doc_cls.__name__}.{field} is empty; nothing to embed')

    # Inference only: no gradients are needed.
    with torch.no_grad():
        inputs = {modality_type: transform_func([payload], device)}
        embedding = model(inputs)[modality_type]
        # A single input was passed, so keep only the first (and only) row.
        doc.embedding = embedding.cpu().numpy()[0]
    return doc

In [6]:
# Directory (relative to the working directory) that receives the downloaded images.
save_directory = 'jendol_images'

# Load the product catalogue and extract the image URL column as a plain list.
data = pd.read_json('/content/jendol.json')
image_urls = data['Images'].to_list()

In [7]:
async def download_image(session, url, save_path):
    """Fetch ``url`` with ``session`` and write the response body to ``save_path``.

    Args:
        session: Object whose ``get(url)`` returns an async context manager
            yielding a response with a ``status`` attribute and an awaitable
            ``read()`` (an ``aiohttp.ClientSession`` in this notebook).
        url: Address of the image to download.
        save_path: Destination file path; opened in binary write mode.

    Returns:
        ``True`` when the body was written to ``save_path``; ``False`` when the
        server answered with a non-200 status or when the request or the file
        write raised a network, timeout or OS error. A message is printed in
        either case.
    """
    try:
        async with session.get(url) as response:
            if response.status != 200:
                print(f"Failed to download: {url} (HTTP {response.status})")
                return False
            # Read the whole body before opening the file so that a failed
            # transfer does not leave an empty file behind.
            payload = await response.read()
        async with aioopen(save_path, 'wb') as f:
            await f.write(payload)
    except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as exc:
        # Report the failure instead of raising, so one bad URL does not
        # abort every other download awaited in the same asyncio.gather().
        print(f"Failed to download: {url} ({exc!r})")
        return False
    print(f"Downloaded: {url}")
    return True

async def download_all_images(urls, save_dir):
    """Download every distinct URL in ``urls`` into ``save_dir`` concurrently.

    Each file is saved under the last path component of its URL, so two
    different URLs that end in the same file name map to the same file.

    Args:
        urls: Iterable of image URLs; a URL that appears several times is
            fetched only once.
        save_dir: Target directory; created if it does not already exist.
    """
    os.makedirs(save_dir, exist_ok=True)

    # De-duplicate while keeping first-seen order, so the same file is not
    # fetched twice and written by two tasks at the same time.
    unique_urls = list(dict.fromkeys(urls))
    if not unique_urls:
        return  # nothing to fetch; do not open an HTTP session

    async with aiohttp.ClientSession() as session:
        tasks = [
            download_image(
                session,
                url,
                os.path.join(save_dir, os.path.basename(urlparse(url).path)),
            )
            for url in unique_urls
        ]
        await asyncio.gather(*tasks)

# Download every catalogue image into save_directory. Calling asyncio.run() from
# a notebook cell relies on nest_asyncio.apply() having been executed earlier.
asyncio.run(download_all_images(image_urls, save_directory))


Downloaded: https://jendolstores.com/wp-content/uploads/2020/09/download-2-2.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/09/2018-6002234-300x300.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/09/download-30-4.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/09/download-31-2.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/07/chicco-300x300.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/09/51M1-osS17L._AC_SY400_-262x300.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/07/cream-1-300x300.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/09/11951764140846e58bfe06e853d7e873d6e0ff31-300x300.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/07/ccccc-300x300.jpg
Downloaded: https://jendolstores.com/wp-content/uploads/2021/06/8058664041220_chicco_baby_moments_lotion-550x550-1-300x300.jpeg
Downloaded: https://jendolstores.com/wp-content/uploads/2020/07/chicco-ba

In [8]:
# Add embeddings to the DataFrame
def _embed_local_image(url):
    """Return the embedding of the file in save_directory named after ``url``'s last path component."""
    file_name = os.path.basename(urlparse(url).path)
    local_path = os.path.join(save_directory, file_name)
    return embed(ImageDoc(url=local_path)).embedding


data['embedding'] = data['Images'].apply(_embed_local_image)



In [9]:
# Display the catalogue together with the newly added 'embedding' column.
data

Unnamed: 0,Names,Images,Prices,embedding
0,AVENGERS CHARACTER LUCH BAG,https://jendolstores.com/wp-content/uploads/20...,"₦1,700.00","[0.0012239747, -0.04389332, 0.0023488062, -0.0..."
1,BIC COLOURING PENCIL – 12Pcs,https://jendolstores.com/wp-content/uploads/20...,₦950.00,"[-0.04343031, -0.0032660211, 0.02805495, -0.01..."
2,BIC EVOLUTION HB PENCILS – 12Pcs,https://jendolstores.com/wp-content/uploads/20...,₦850.00,"[-0.053956322, -0.019853754, -0.012699741, 0.0..."
3,BIC EVOLUTION HB PENCILS – 4Pcs,https://jendolstores.com/wp-content/uploads/20...,₦400.00,"[0.0009875927, -0.008061172, -0.017589679, -0...."
4,CENTRUM PLASTIC COLOUR PENCILS – 18Pcs,https://jendolstores.com/wp-content/uploads/20...,₦950.00,"[0.027205933, -0.034107, -0.015803106, -0.0297..."
...,...,...,...,...
2681,TWISCO CHOCOLATE DRINK – 500g,https://jendolstores.com/wp-content/uploads/20...,"₦18,600.00","[-0.0065121674, -0.03461764, 0.0016597096, -0...."
2682,TWISCO CHOCOLATE DRINK Refill – 500g,https://jendolstores.com/wp-content/uploads/20...,"₦1,000.00","[0.0033355912, -0.013372767, 0.023826227, -0.0..."
2683,1 Piece Red Sensamite Luggage – Biggest Bag,https://jendolstores.com/wp-content/uploads/20...,"₦1,000.00","[0.013023899, -0.050826963, -0.007943656, 0.02..."
2684,1 Piece Red Sensamite Luggage – Medium Size,https://jendolstores.com/wp-content/uploads/20...,"₦1,000.00","[0.013023899, -0.050826963, -0.007943656, 0.02..."


In [10]:
# Save the DataFrame (now including the 'embedding' column) to a JSON file.
data.to_json('/content/embedded_jendol.json')

In [11]:
class ProductDoc(BaseDoc):
    """Schema of one catalogue product as stored in the HNSW document index."""
    # Filled from the 'Names', 'Prices' and 'Images' columns of the DataFrame.
    name: str
    price: str
    image_url: str
    # 1024-dimensional vector; the searches below use search_field='embedding'.
    # NOTE(review): annotated as NdArray[1024] but defaults to None — consider
    # Optional[NdArray[1024]] if documents without an embedding are expected.
    embedding: NdArray[1024] = None

# HNSW-backed index over ProductDoc, using '/store' as its working directory.
doc_index = HnswDocumentIndex[ProductDoc](work_dir='/store')

# Number of rows converted and indexed per call to doc_index.index().
batch_size = 1000

# Walk the DataFrame in consecutive slices of batch_size rows.
for i in range(0, len(data), batch_size):
    batch = data.iloc[i:i + batch_size]
    # Pair up the four columns row by row and build one ProductDoc per product.
    columns = zip(batch['Names'], batch['Prices'], batch['Images'], batch['embedding'])
    docs = DocList[ProductDoc](
        [
            ProductDoc(name=name, price=price, image_url=image_url, embedding=vector)
            for name, price, image_url, vector in columns
        ]
    )
    doc_index.index(docs)

print(f"Indexed {len(data)} documents in total.")

Indexed 2686 documents in total.


In [38]:
# Text search: embed the query string and ask the index for up to three matches.
query_doc = TextDoc(text='deodorant')
query_embedding = embed(query_doc).embedding
matches = doc_index.find(
    query_embedding,
    limit=3,
    search_field='embedding',
)

In [39]:
# Render each hit as its product image followed by its name and price.
for match in matches.documents:
    display(Image(url=match.image_url))
    # Name and price come from the external catalogue file, so escape them
    # before interpolating into HTML (e.g. a literal '&' or '<' in a name).
    display(HTML(f"<b>{html.escape(match.name)}</b><br>Price: {html.escape(match.price)}"))

In [28]:
# Image search: embed a local picture and ask the index for up to five matches.
query_doc = ImageDoc(url='/content/pepsi.jpeg')
query_embedding = embed(query_doc).embedding
matches = doc_index.find(
    query_embedding,
    limit=5,
    search_field='embedding',
)


In [29]:
# Render each hit as its product image followed by its name and price.
for match in matches.documents:
    display(Image(url=match.image_url))
    # Name and price come from the external catalogue file, so escape them
    # before interpolating into HTML (e.g. a literal '&' or '<' in a name).
    display(HTML(f"<b>{html.escape(match.name)}</b><br>Price: {html.escape(match.price)}"))