## 1. Importación de librerías

In [60]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from langchain_core.documents import Document
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
from rank_bm25 import BM25Okapi
from langchain_core.documents import Document
from langchain_community.retrievers import BM25Retriever, QdrantSparseVectorRetriever
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain.retrievers import EnsembleRetriever

In [61]:
import json
from typing import List, Tuple

from qdrant_client.models import (
    Distance,
    NamedSparseVector,
    NamedVector,
    SparseVector,
    PointStruct,
    SearchRequest,
    SparseIndexParams,
    SparseVectorParams,
    VectorParams,
    ScoredPoint,
)
from transformers import AutoTokenizer

import fastembed
from fastembed import SparseEmbedding, SparseTextEmbedding, TextEmbedding

## 2. Review data

In [63]:
# Relative path from the notebook's location to the data folder.
path = os.path.join('..', 'data')

# Fail loudly when the folder is missing: silently skipping the load would
# leave `df_category` undefined and only surface later as a NameError.
if not os.path.isdir(path):
    raise FileNotFoundError(f"Data directory not found: {path!r}")

# Read every CSV file found directly inside the folder. The listing is sorted
# so the concatenation order does not depend on the OS directory order.
dataframes = []
for file in sorted(os.listdir(path)):
    file_path = os.path.join(path, file)
    if file.endswith('.csv') and os.path.isfile(file_path):
        df = pd.read_csv(file_path)
        dataframes.append(df)

# pd.concat raises an opaque "No objects to concatenate" on an empty list,
# so report the real cause instead.
if not dataframes:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Stack all files into a single frame with a fresh 0..n-1 index.
concatenated_df = pd.concat(dataframes, ignore_index=True)

# The per-category sampling below needs this column.
if 'main_category' not in concatenated_df.columns:
    raise KeyError("Expected a 'main_category' column in the loaded CSV data")

# Draw up to 20 random rows from each main category.
# NOTE(review): the recorded run shows a warning pointing at this line,
# presumably pandas' deprecation of groupby.apply operating on the grouping
# column - confirm before upgrading pandas.
grouped = concatenated_df.groupby('main_category')
df_category = grouped.apply(lambda x: x.sample(min(len(x), 20))).reset_index(drop=True)

# Cap the working set: if the per-category sample exceeds 1000 rows,
# keep a random 200 of them.
if len(df_category) > 1000:
    df_category = df_category.sample(200).reset_index(drop=True)

  df_category = grouped.apply(lambda x: x.sample(min(len(x), 20))).reset_index(drop=True)


In [64]:
# Label every sampled row as "<main_category>\n<sub_category>".
df_category['combined_text'] = df_category['main_category'] + "\n" + df_category['sub_category']
# Keep one row per distinct label, in order of first appearance.
sampled_df = pd.DataFrame({'combined_text': df_category['combined_text'].unique()})

In [65]:
# Expose the frame's index as an explicit "id" column.
sampled_df['id'] = sampled_df.index

In [66]:
# Display the table of distinct category labels and their ids.
sampled_df

Unnamed: 0,combined_text,id
0,accessories\nGold & Diamond Jewellery,0
1,accessories\nBags & Luggage,1
2,accessories\nWatches,2
3,accessories\nFashion & Silver Jewellery,3
4,accessories\nJewellery,4
...,...,...
93,women's clothing\nEthnic Wear,93
94,women's clothing\nLingerie & Nightwear,94
95,women's shoes\nShoes,95
96,women's shoes\nFashion Sandals,96


## Sparse Embeddings

In [67]:
# Model identifiers passed to fastembed.
# The sparse model id uses the spelling "prithivida"; the previous
# "prithvida" spelling triggered the warning recorded for this cell.
# NOTE(review): presumably fastembed's deprecation notice for the misspelled
# name - confirm against the installed fastembed version.
sparse_model_name = "prithivida/Splade_PP_en_v1"
dense_model_name = "BAAI/bge-large-en-v1.5"
# Instantiating the models triggers the model download.
sparse_model = SparseTextEmbedding(model_name=sparse_model_name, batch_size=32)
dense_model = TextEmbedding(model_name=dense_model_name, batch_size=32)

  sparse_model = SparseTextEmbedding(model_name=sparse_model_name, batch_size=32)
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]
Fetching 5 files: 100%|██████████| 5/5 [00:00<?, ?it/s]


In [68]:
def make_sparse_embedding(texts: List[str]):
    """Embed ``texts`` with the module-level ``sparse_model``.

    :param texts: Strings to embed.
    :return: List holding everything ``sparse_model.embed`` yields for
        ``texts`` when called with a batch size of 32.
    """
    embeddings = sparse_model.embed(texts, batch_size=32)
    return list(embeddings)


def get_tokens_and_weights(sparse_embedding, model_name):
    """Map each token of a sparse embedding to its weight, heaviest first.

    :param sparse_embedding: Object exposing parallel ``indices`` (token ids)
        and ``values`` (weights) sequences.
    :param model_name: Name of the sparse model, matched case-insensitively
        against ``SparseTextEmbedding.list_supported_models()``.
    :return: Dict of decoded token -> weight, ordered by descending weight.
        When several indices decode to the same token, the last one wins.
    :raises ValueError: If ``model_name`` is not among the supported models.
    """
    # Find the tokenizer source registered for the model.
    wanted = model_name.lower()
    for model_info in SparseTextEmbedding.list_supported_models():
        if model_info["model"].lower() == wanted:
            tokenizer_source = model_info["sources"]["hf"]
            break
    else:
        # for/else: reached only when the loop finished without a match.
        # Raising from inside the loop body would reject every model that is
        # not the first entry of the supported list.
        raise ValueError(f"Model {model_name} not found in the supported models.")

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_source)
    token_weight_dict = {
        tokenizer.decode([index]): weight
        for index, weight in zip(sparse_embedding.indices, sparse_embedding.values)
    }

    # Sort the dictionary by weights, largest first.
    return dict(
        sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True)
    )

## Dense Embeddings

In [69]:
def make_dense_embedding(texts: List[str]):
    """Embed ``texts`` with the module-level ``dense_model``.

    :param texts: Strings to embed.
    :return: List holding everything ``dense_model.embed`` yields for ``texts``.
    """
    embeddings = dense_model.embed(texts)
    return list(embeddings)

## Obtención de Sparse Vectors y Embeddings

In [70]:
# Plain Python list of the category labels to embed.
product_texts = sampled_df["combined_text"].tolist()

In [71]:
# Store the sparse embedding of each label next to its text.
sampled_df["sparse_embedding"] = make_sparse_embedding(product_texts)

In [72]:
# Store the dense embedding of each label next to its text.
sampled_df["dense_embedding"] = make_dense_embedding(product_texts)

## Connect to the vector database

In [73]:
# Connect to Qdrant.
# Read the endpoint and API key from the environment; each is None when the
# corresponding variable is unset.
QDRANT_URL, QDRANT_API_KEY = os.environ.get('QDRANT_URL'), os.environ.get('QDRANT_API_KEY')
# Build the Qdrant client from those settings.
client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

In [74]:

# Client created with the ":memory:" location. Rebinding `client` here
# replaces any client object created earlier in the notebook.
client = QdrantClient(":memory:")

# Name of the Qdrant collection that stores both vector kinds.
collection_name = "hybrid_search"

# Dense vectors: 1024 dimensions compared by cosine distance. The size must
# equal the dimensionality of the dense embeddings stored under this name
# (produced in this notebook by the fastembed model BAAI/bge-large-en-v1.5).
dense_vectors_config = {
    "text-dense": VectorParams(size=1024, distance=Distance.COSINE),
}
# Sparse vectors: index configured with on_disk=False.
sparse_vectors_config = {
    "text-sparse": SparseVectorParams(index=SparseIndexParams(on_disk=False)),
}

client.create_collection(
    collection_name,
    vectors_config=dense_vectors_config,
    sparse_vectors_config=sparse_vectors_config,
)

True

## Generación de puntos

In [75]:
def make_points(df: pd.DataFrame) -> List[PointStruct]:
    """Build one Qdrant point per row of ``df``.

    :param df: Frame with ``combined_text``, ``sparse_embedding``,
        ``dense_embedding`` and ``id`` columns. Each sparse embedding must
        expose ``indices`` and ``values`` objects supporting ``tolist()``;
        each dense embedding must support ``tolist()`` as well.
    :return: One ``PointStruct`` per row, in row order. The point id is the
        0-based row position; the payload carries the row's text and its
        ``id`` column value; the vectors are stored under the names
        ``"text-sparse"`` and ``"text-dense"``.
    """
    # Read only the four columns that are needed, instead of converting the
    # whole frame (embeddings included) into per-row dicts.
    columns = zip(
        df["combined_text"].tolist(),
        df["sparse_embedding"].tolist(),
        df["dense_embedding"].tolist(),
        df["id"].tolist(),
    )
    return [
        PointStruct(
            id=idx,
            payload={
                "text": text,
                "id": row_id,
            },  # Add any additional payload if necessary
            vector={
                "text-sparse": SparseVector(
                    indices=sparse_embedding.indices.tolist(),
                    values=sparse_embedding.values.tolist(),
                ),
                "text-dense": dense_embedding.tolist(),
            },
        )
        for idx, (text, sparse_embedding, dense_embedding, row_id) in enumerate(columns)
    ]


# Build the points for every row of the label table.
points: List[PointStruct] = make_points(sampled_df)

In [76]:
# Upsert the generated points into the collection.
client.upsert(collection_name, points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

## Search

In [77]:
from typing import List
import numpy as np

def search(query_text: str, limit: int = 10):
    """Run a dense and a sparse vector search for ``query_text`` in one batch.

    Uses the module-level ``client`` and ``collection_name``.

    :param query_text: Text to embed and search for.
    :param limit: Maximum number of hits requested from each of the two
        searches. Defaults to 10.
    :return: Whatever ``client.search_batch`` returns for the two requests,
        issued in this order: dense ("text-dense") first, sparse
        ("text-sparse") second.
    :raises ValueError: If either embedding helper returns no vectors.
    """
    # Embed the query once with each model.
    query_sparse_vectors: List[SparseEmbedding] = make_sparse_embedding([query_text])
    query_dense_vector: List[np.ndarray] = make_dense_embedding([query_text])

    # Guard against embedding helpers that produced nothing.
    if not query_sparse_vectors or not query_dense_vector:
        raise ValueError("Los vectores generados están vacíos.")

    sparse_query = query_sparse_vectors[0]
    dense_query = query_dense_vector[0]

    dense_request = SearchRequest(
        vector=NamedVector(
            name="text-dense",
            vector=dense_query.tolist(),  # plain list for the client
        ),
        limit=limit,
        with_payload=True,
    )
    sparse_request = SearchRequest(
        vector=NamedSparseVector(
            name="text-sparse",
            vector=SparseVector(
                indices=sparse_query.indices.tolist(),  # plain lists for the client
                values=sparse_query.values.tolist(),
            ),
        ),
        limit=limit,
        with_payload=True,
    )

    # Send both requests in a single batch call.
    return client.search_batch(
        collection_name=collection_name,
        requests=[dense_request, sparse_request],
    )

# Example usage
query_text = "Naturelix Detox Bath Natural Dog"
search_results = search(query_text)


In [78]:
# Display the raw search results (one list of hits per search request).
search_results

[[ScoredPoint(id=22, version=0, score=0.5735021308119368, payload={'text': 'beauty & health\nPersonal Care Appliances', 'id': 22}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=20, version=0, score=0.5615359937387494, payload={'text': 'beauty & health\nBeauty & Grooming', 'id': 20}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=19, version=0, score=0.5604631010204252, payload={'text': 'beauty & health\nHousehold Supplies', 'id': 19}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=21, version=0, score=0.5583277133834655, payload={'text': 'beauty & health\nLuxury Beauty', 'id': 21}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=42, version=0, score=0.5547079140584126, payload={'text': 'home, kitchen, pets\nRefurbished & Open Box', 'id': 42}, vector=None, shard_key=None, order_value=None),
  ScoredPoint(id=79, version=0, score=0.5539104130258146, payload={'text': 'toys & baby products\nBaby Bath, Skin & Grooming', 

## Ranking

In [79]:
def rrf(rank_lists, alpha=60, default_rank=1000):
    """
    Reciprocal Rank Fusion (RRF) over several rank lists, computed with NumPy.

    Each item's score is the sum, over all rank lists, of ``1 / (alpha + rank)``.
    An item missing from a rank list is scored in that list as if it had
    rank ``default_rank``.

    :param rank_lists: A list of rank lists. Each rank list should be a list of (item, rank) tuples.
        Items must be hashable. If an item occurs more than once in a rank
        list, its last rank is used.
    :param alpha: The parameter alpha used in the RRF formula. Default is 60.
    :param default_rank: The default rank assigned to items not present in a rank list. Default is 1000.
    :return: List of ``(item, score)`` tuples sorted by descending RRF score.
        Items with equal scores keep the order in which they were first seen.
    """
    # Consolidate the unique items. dict.fromkeys keeps first-seen order, so
    # the result does not depend on set iteration order.
    items = list(dict.fromkeys(item for rank_list in rank_lists for item, _ in rank_list))

    # Create a mapping of items to row indices
    item_to_index = {item: idx for idx, item in enumerate(items)}

    # Rank matrix (items x lists), filled with the default rank. A float
    # dtype keeps fractional ranks from being truncated on assignment.
    rank_matrix = np.full((len(items), len(rank_lists)), default_rank, dtype=float)

    # Fill in the actual ranks from the rank lists
    for list_idx, rank_list in enumerate(rank_lists):
        for item, rank in rank_list:
            rank_matrix[item_to_index[item], list_idx] = rank

    # Calculate RRF scores using NumPy operations
    rrf_scores = np.sum(1.0 / (alpha + rank_matrix), axis=1)

    # Negate for descending order; a stable sort makes ties deterministic.
    sorted_indices = np.argsort(-rrf_scores, kind="stable")

    # Index the prepared list directly instead of rebuilding the key list
    # for every result.
    return [(items[idx], rrf_scores[idx]) for idx in sorted_indices]

In [80]:
def rank_list(search_result: List[ScoredPoint]):
    """Pair each hit's id with its 1-based position in ``search_result``.

    :param search_result: Hits in ranked order; each must expose ``id``.
    :return: List of ``(id, rank)`` tuples, with rank starting at 1.
    """
    ranked = []
    for position, point in enumerate(search_result, start=1):
        ranked.append((point.id, position))
    return ranked


# Turn each result list into (id, rank) pairs: dense hits first, sparse second.
dense_rank_list = rank_list(search_results[0])
sparse_rank_list = rank_list(search_results[1])
# Fuse both rankings into a single ordering and display it.
rrf_rank_list = rrf([dense_rank_list, sparse_rank_list])
rrf_rank_list

[(62, 0.031099324975891997),
 (79, 0.031024531024531024),
 (42, 0.031009615384615385),
 (61, 0.030621785881252923),
 (22, 0.017336838849365915),
 (20, 0.01707242848447961),
 (19, 0.016816412099430966),
 (21, 0.016568396226415094),
 (7, 0.01632801161103048),
 (63, 0.016094911377930246),
 (18, 0.015868769360743454),
 (88, 0.015868769360743454),
 (38, 0.01564927857935627),
 (5, 0.0154361498496035),
 (15, 0.01522911051212938),
 (17, 0.01522911051212938)]

In [81]:
def find_point_by_id(
    client: QdrantClient, collection_name: str, rrf_rank_list: List[Tuple[int, float]]
):
    """Fetch the stored records for a fused ranking, in ranking order.

    :param client: Qdrant client used for the lookup.
    :param collection_name: Collection to read from.
    :param rrf_rank_list: ``(point_id, score)`` tuples, best first.
    :return: The records returned by ``client.retrieve`` for those ids,
        ordered by each id's position in ``rrf_rank_list``. Records whose id
        is not in the ranking (not expected) are placed last.
    """
    ids = [item[0] for item in rrf_rank_list]
    if not ids:
        # Nothing to look up; skip the client call.
        return []

    records = client.retrieve(collection_name=collection_name, ids=ids)

    # NOTE(review): retrieve is not assumed to return records in the order
    # the ids were requested - confirm against the Qdrant client docs. The
    # explicit reordering keeps the fused ranking in any case.
    position = {}
    for pos, point_id in enumerate(ids):
        position.setdefault(point_id, pos)
    return sorted(records, key=lambda record: position.get(record.id, len(ids)))


# Look up the stored records for the fused ranking and display them.
find_point_by_id(client, collection_name, rrf_rank_list)

[Record(id=62, payload={'text': 'pet supplies\nDog supplies', 'id': 62}, vector=None, shard_key=None, order_value=None),
 Record(id=79, payload={'text': 'toys & baby products\nBaby Bath, Skin & Grooming', 'id': 79}, vector=None, shard_key=None, order_value=None),
 Record(id=42, payload={'text': 'home, kitchen, pets\nRefurbished & Open Box', 'id': 42}, vector=None, shard_key=None, order_value=None),
 Record(id=61, payload={'text': 'pet supplies\nAll Pet Supplies', 'id': 61}, vector=None, shard_key=None, order_value=None),
 Record(id=22, payload={'text': 'beauty & health\nPersonal Care Appliances', 'id': 22}, vector=None, shard_key=None, order_value=None),
 Record(id=20, payload={'text': 'beauty & health\nBeauty & Grooming', 'id': 20}, vector=None, shard_key=None, order_value=None),
 Record(id=19, payload={'text': 'beauty & health\nHousehold Supplies', 'id': 19}, vector=None, shard_key=None, order_value=None),
 Record(id=21, payload={'text': 'beauty & health\nLuxury Beauty', 'id': 21}, v