In [None]:
import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset
from pinecone_text.sparse import BM25Encoder
from pinecone import Pinecone, ServerlessSpec
from DLAIUtils import Utils

from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm
import torch
import os

# Course helper: supplies the Pinecone API key and a per-user index name.
# Instantiated once (the original created the same helper twice).
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()
INDEX_NAME = utils.create_dlai_index_name('dl-ai')

# Run the embedding model on the GPU when one is available.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

pinecone = Pinecone(api_key=PINECONE_API_KEY)

# Start from a clean index so that re-running the notebook is repeatable.
if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
  pinecone.delete_index(INDEX_NAME)
pinecone.create_index(
  INDEX_NAME,
  # Must equal the length of the dense vectors upserted later
  # (the CLIP ViT-B/32 text/image embeddings used below are 512-dimensional).
  dimension=512,
  # Sparse-dense (hybrid) queries in Pinecone require the dotproduct metric.
  metric="dotproduct",
  spec=ServerlessSpec(cloud='aws', region='us-west-2')
)
index = pinecone.Index(INDEX_NAME)

In [None]:
# Load only the "train" split of the fashion product image dataset.
fashion = load_dataset("ashraq/fashion-product-images-small", split="train")
fashion

In [None]:
# Separate the non-image columns (metadata) from the image column.
metadata = fashion.remove_columns('image')
images = fashion['image']
# Preview one product image.
images[900]

In [None]:
# Convert the metadata columns to a pandas DataFrame.
# The frame is derived from `fashion` rather than from `metadata` itself so
# that this cell is idempotent: re-running it does not call `.to_pandas()`
# on an object that is already a DataFrame.
metadata = fashion.remove_columns('image').to_pandas()
metadata.head()

In [None]:
# Create the sparse vectors using BM25.
# BM25 is a popular information-retrieval function that ranks documents by
# the query terms appearing in each document. It is widely used in search
# engines and recommendation systems because it captures term importance.
bm25 = BM25Encoder()
bm25.fit(metadata['productDisplayName'])

# Jupyter only renders a cell's *last* expression automatically, so the
# intermediate values are shown explicitly with display() (available as a
# built-in inside IPython kernels).
sample_name = metadata['productDisplayName'][0]
display(sample_name)

# The same text encoded as a query and as a document.
display(bm25.encode_queries(sample_name))
bm25.encode_documents(sample_name)

In [1]:
# Create the dense vectors using CLIP (Contrastive Language-Image Pretraining).

# A variant of the CLIP model, trained to embed both images and text.
model = SentenceTransformer('sentence-transformers/clip-ViT-B-32',
    device=device)

# Jupyter only renders a cell's *last* expression automatically, so the
# intermediate values are shown explicitly with display() (available as a
# built-in inside IPython kernels).
display(model)

# Encode one product name; the second dimension of the result is the
# embedding size the Pinecone index must be created with.
dense_vec = model.encode([metadata['productDisplayName'][0]])
display(dense_vec.shape)

len(fashion)

In [None]:
# Create a hybrid index that combines sparse (BM25) and dense (CLIP) vectors.

# Batch size for processing and the number of data items to index.
batch_size = 100
fashion_data_num = 1000

# Never index more items than requested, nor more than exist. Bounding the
# batch end by this total (rather than by len(fashion)) keeps the last batch
# from overshooting when fashion_data_num is not a multiple of batch_size.
total_items = min(fashion_data_num, len(fashion))

# Loop-invariant column mask: every metadata field except id and year
# contributes to the text that BM25 encodes.
text_column_mask = ~metadata.columns.isin(['id', 'year'])

for i in tqdm(range(0, total_items, batch_size)):
    # find end of batch
    i_end = min(i + batch_size, total_items)
    # extract metadata batch
    meta_batch = metadata.iloc[i:i_end]
    meta_dict = meta_batch.to_dict(orient="records")
    # concatenate the selected metadata fields of each row into a single string
    meta_texts = [" ".join(row) for row in meta_batch.loc[:, text_column_mask].values.tolist()]
    # extract image batch
    img_batch = images[i:i_end]
    # create sparse BM25 vectors from the text
    sparse_embeds = bm25.encode_documents(meta_texts)
    # create dense vectors from the images
    dense_embeds = model.encode(img_batch).tolist()
    # create unique IDs; the ID is the row position, which later cells use to
    # look the image up again via images[int(id)]
    ids = [str(x) for x in range(i, i_end)]

    # build the records to upload; a comprehension keeps the per-record names
    # out of the notebook's global scope
    upserts = [
        {
            'id': record_id,
            'sparse_values': sparse_vec,
            'values': dense_vec_i,
            'metadata': meta
        }
        for record_id, sparse_vec, dense_vec_i, meta
        in zip(ids, sparse_embeds, dense_embeds, meta_dict)
    ]
    # upload the documents to the new hybrid index
    index.upsert(upserts)

# show index description after uploading the documents
index.describe_index_stats()

#Run Your Query


In [None]:
# Hybrid search: query the Pinecone index with a sparse and a dense vector at
# once, combining the keyword precision of BM25 with the semantic matching of
# the CLIP embedding.

query = "dark blue french connection jeans for men"

# Dense encoding of the query text with the CLIP model loaded earlier.
dense = model.encode(query).tolist()
# Sparse encoding of the same text with the fitted BM25 encoder.
sparse = bm25.encode_queries(query)

# Send both vectors in a single query.
result = index.query(
    vector=dense,
    sparse_vector=sparse,
    top_k=14,
    include_metadata=True
)

# Each match id is the stringified row position assigned during upsert, so
# converting it back to int locates the matching entry in `images`.
matched_rows = [int(match["id"]) for match in result["matches"]]
imgs = [images[row] for row in matched_rows]
imgs

In [None]:
from IPython.core.display import HTML
from io import BytesIO
from base64 import b64encode

# Render a batch of product images as an inline HTML gallery in Jupyter
def display_result(image_batch):
    """Build an HTML gallery from a batch of images.

    Each image is written to an in-memory PNG, base64-encoded and embedded
    as a data URI inside a fixed-size <figure>; the figures are laid out in
    a wrapping flex row.

    Args:
        image_batch: iterable of image objects exposing
            ``save(file, format=...)`` (presumably PIL images - confirm
            against the dataset).

    Returns:
        An ``HTML`` display object wrapping the gallery markup.
    """
    def _as_figure(picture):
        # Serialize the image to PNG in memory and embed it base64-encoded
        buffer = BytesIO()
        picture.save(buffer, format='png')
        payload = b64encode(buffer.getvalue()).decode('utf-8')
        return f'''
            <figure style="margin: 5px !important;">
              <img src="data:image/png;base64,{payload}" style="width: 90px; height: 120px" >
            </figure>
        '''

    gallery = ''.join(_as_figure(picture) for picture in image_batch)
    return HTML(data=f'''
        <div style="display: flex; flex-flow: row wrap; text-align: center;">
        {gallery}
        </div>
    ''')

In [None]:
# Render the current `imgs` list as an HTML gallery; as the cell's last
# expression, the returned HTML object is displayed automatically.
display_result(imgs)


In [None]:
def hybrid_scale(dense, sparse, alpha: float):
    """Hybrid vector scaling using a convex combination.

    alpha * dense + (1 - alpha) * sparse

    Args:
        dense: sequence of floats representing the dense vector
        sparse: a dict of `indices` and `values` representing the sparse
                vector
        alpha: float between 0 and 1 where 0 == sparse only
               and 1 == dense only

    Returns:
        A tuple ``(hdense, hsparse)``: the dense values scaled by ``alpha``
        as a new list, and a new sparse dict with the same ``indices`` and
        the ``values`` scaled by ``1 - alpha``. The inputs are not modified.

    Raises:
        ValueError: if ``alpha`` is not within [0, 1] (this includes NaN).
    """
    # Written as a single chained comparison so that NaN, for which every
    # comparison is False, is rejected instead of slipping through.
    if not (0 <= alpha <= 1):
        raise ValueError("Alpha must be between 0 and 1")
    # scale sparse and dense vectors to create hybrid search vecs
    sparse_weight = 1 - alpha
    hsparse = {
        'indices': sparse['indices'],
        'values':  [v * sparse_weight for v in sparse['values']]
    }
    hdense = [v * alpha for v in dense]
    return hdense, hsparse

In [None]:
# More dense: alpha=1 puts all of the weight on the dense (CLIP) vector.

question = "dark blue french connection jeans for men"
# Encode `question` in this cell so that editing it actually changes the
# search, instead of silently reusing vectors left over from an earlier cell.
dense = model.encode(question).tolist()
sparse = bm25.encode_queries(question)

# Closer to 0 == more sparse, closer to 1 == more dense
hdense, hsparse = hybrid_scale(dense, sparse, alpha=1)
result = index.query(
    top_k=6,
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)
imgs = [images[int(r["id"])] for r in result["matches"]]
# Not the last statement of the cell, so the gallery must be displayed
# explicitly (display() is a built-in inside IPython kernels).
display(display_result(imgs))

for x in result["matches"]:
    print(x["metadata"]['productDisplayName'])

In [None]:
# More sparse: alpha=0 puts all of the weight on the sparse (BM25) vector.

question = "dark blue french connection jeans for men"
# Encode `question` in this cell so that editing it actually changes the
# search, instead of silently reusing vectors left over from an earlier cell.
dense = model.encode(question).tolist()
sparse = bm25.encode_queries(question)

# Closer to 0 == more sparse, closer to 1 == more dense
hdense, hsparse = hybrid_scale(dense, sparse, alpha=0)
result = index.query(
    top_k=6,
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)
imgs = [images[int(r["id"])] for r in result["matches"]]
# Not the last statement of the cell, so the gallery must be displayed
# explicitly (display() is a built-in inside IPython kernels).
display(display_result(imgs))

for x in result["matches"]:
    print(x["metadata"]['productDisplayName'])

In [None]:
# More dense or more sparse? Change `alpha` below to explore the trade-off.

question = "dark blue french connection jeans for men"
# Encode `question` in this cell so that editing it actually changes the
# search, instead of silently reusing vectors left over from an earlier cell.
dense = model.encode(question).tolist()
sparse = bm25.encode_queries(question)

# Closer to 0 == more sparse, closer to 1 == more dense
hdense, hsparse = hybrid_scale(dense, sparse, alpha=1)
result = index.query(
    top_k=6,
    vector=hdense,
    sparse_vector=hsparse,
    include_metadata=True
)
imgs = [images[int(r["id"])] for r in result["matches"]]
# Not the last statement of the cell, so the gallery must be displayed
# explicitly (display() is a built-in inside IPython kernels).
display(display_result(imgs))

for x in result["matches"]:
    print(x["metadata"]['productDisplayName'])
