In [11]:
import os
import json
import faiss
import tiktoken
import numpy as np
import pandas as pd

from openai import OpenAI
from bs4 import BeautifulSoup
from rich import print, inspect

# Maps each spreadsheet column that is read from the catalog to a
# (display label, snake_case key) pair. The keys select the columns loaded by
# load_excel; the display label (first tuple element) is what gets written into
# the text that is embedded and printed. The second element is not used by the
# code visible in this notebook.
relevant_columns = {
    "_SkuId (Not changeable)": ("ID SKU", "id_sku"),
    "_SkuName": ("Nombre", "nombre"),
    "_ProductShortDescription": ("Descripción Corta", "descripcion_corta"),
    "_ProductDescription": ("Descripción Larga", "descripcion_larga"),
    "_Keywords": ("Palabras Clave", "palabras_clave"),
    "_MetaTagDescription": ("Descripción Meta", "descripcion_meta"),
    "_DepartamentName": ("Departamento", "departamento"),
    "_CategoryName": ("Categoría", "categoria"),
    "_Brand": ("Marca", "marca"),
}

# On-disk locations of the FAISS index and of the SKU ids stored alongside it
# (the i-th id belongs to the i-th vector added to the index).
index_file = "embeddings.faiss"
ids_file = "ids_faiss.npy"


def load_env():
    """Load ``OPENAI_API_KEY`` from a local ``.env`` file into ``os.environ``.

    The file is scanned line by line for an assignment whose variable name is
    exactly ``OPENAI_API_KEY``. Whitespace around the name and the value is
    ignored, and one pair of matching surrounding quotes is removed from the
    value. The first matching line wins.

    Prints whether a key was found. Returns nothing; when no non-empty key is
    found (or the file does not exist) the environment is left untouched.
    """
    env_file_path = ".env"
    openai_api_key = None

    if os.path.exists(env_file_path):
        with open(env_file_path) as f:
            for line in f:
                # Split on the first "=" only, so a value that itself contains
                # "=" is kept whole and a line without "=" is simply skipped.
                name, separator, value = line.strip().partition("=")
                # Exact name match: "OPENAI_API_KEY_OLD=..." must not qualify.
                if not separator or name.strip() != "OPENAI_API_KEY":
                    continue

                value = value.strip()
                # Accept the common KEY="value" / KEY='value' spelling.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]

                openai_api_key = value
                break

    if openai_api_key:
        print("OpenAI API Key loaded successfully.")
        os.environ["OPENAI_API_KEY"] = openai_api_key
    else:
        print("OpenAI API Key not found.")


def load_excel(path):
    """Read the catalog spreadsheet at ``path`` into a DataFrame.

    Only the columns named in ``relevant_columns`` are loaded, every cell is
    read as a string, and missing cells are replaced by the sentinel
    ``"No Info"``.
    """
    wanted_columns = list(relevant_columns)
    catalog = pd.read_excel(path, usecols=wanted_columns, dtype=str)
    return catalog.fillna("No Info")


def get_string_from_row(row, verbose=False):
    """Build the text that represents one catalog row for embedding.

    Each column in ``relevant_columns`` except the SKU id contributes a
    ``"<label>: <value>"`` fragment; fragments are joined with ``"; "``.

    Per-column handling:
      * ``_ProductDescription`` is treated as HTML and reduced to its text,
        with text nodes joined by ``", "``.
      * ``_Keywords`` is split on commas; each keyword is stripped, and empty
        or purely numeric keywords are dropped.

    Values equal to the ``"No Info"`` sentinel, or empty after cleaning, are
    left out of the result.

    Args:
        row: Mapping-like row (e.g. a pandas Series) indexed by the source
            column names.
        verbose: When true, print every label/value pair as it is processed,
            including the ones that are left out of the result.

    Returns:
        The joined string (empty if no column had usable content).
    """
    final_string = []
    for column, value in relevant_columns.items():
        label = value[0]
        row_value = row[column]

        # The SKU id identifies the row; it is not part of the embedded text.
        if column == "_SkuId (Not changeable)":
            continue

        if column == "_ProductDescription":
            row_value = BeautifulSoup(row_value, "html.parser").get_text(separator=", ")

        if column == "_Keywords":
            # Strip BEFORE testing: " 123".isdigit() is False, so checking the
            # raw piece would let numeric keywords after ", " slip through.
            cleaned = (keyword.strip() for keyword in row_value.split(","))
            row_value = ", ".join(
                keyword for keyword in cleaned if keyword and not keyword.isdigit()
            )

        # Skip the sentinel and anything that became empty after cleaning, so
        # the result never contains a dangling "Label: " fragment.
        if row_value and row_value != "No Info":
            final_string.append(f"{label}: {row_value}")

        if verbose:
            print(f"{label}: {row_value}")

    return "; ".join(final_string)


def get_embedding(text, model="text-embedding-3-small"):
    """Return the embedding of ``text`` produced by ``model``.

    Uses the module-level ``client``. The text is sent as a one-element input
    list and the embedding of that single item is returned.
    """
    response = client.embeddings.create(input=[text], model=model)
    first_item = response.data[0]
    return first_item.embedding


def get_tokens_length(text, encoding="cl100k_base"):
    """Return how many tokens ``text`` encodes to under the named tiktoken encoding."""
    tokenizer = tiktoken.get_encoding(encoding)
    return len(tokenizer.encode(text))


def create_batch_file(data, output_file="batch.jsonl", model="text-embedding-3-small"):
    """Write one embeddings request per catalog row to a JSONL batch file.

    Each line is a JSON object with the row's SKU id as ``custom_id`` and the
    row text from ``get_string_from_row`` as the embedding input. Token counts
    from ``get_tokens_length`` are accumulated, and the total and per-row
    average are printed once the file has been written.

    Args:
        data: DataFrame holding the columns listed in ``relevant_columns``.
        output_file: Path of the JSONL file to create (overwritten if present).
        model: Embedding model name placed in every request body.
    """
    total_tokens = 0
    # ensure_ascii=True below keeps the payload pure ASCII; the explicit
    # encoding just makes the file independent of the platform default.
    with open(output_file, "w", encoding="utf-8") as f:
        for _, row in data.iterrows():
            string_row = get_string_from_row(row)
            total_tokens += get_tokens_length(string_row)

            payload = {
                "custom_id": row["_SkuId (Not changeable)"],
                "method": "POST",
                "url": "/v1/embeddings",
                "body": {
                    "model": model,
                    "input": string_row,
                },
            }
            f.write(json.dumps(payload, ensure_ascii=True) + "\n")

    # An empty frame produces an empty file; avoid dividing by zero for it.
    row_count = len(data)
    average_tokens = total_tokens / row_count if row_count else 0.0

    print("Batch file created successfully.")
    print(f"Total tokens: {total_tokens}")
    print(f"Promedio de tokens: {average_tokens:.2f}")


def upload_batch_file(batch_file="batch.jsonl", verbose=False):
    """Upload a JSONL batch file through the module-level ``client``.

    Args:
        batch_file: Path of the file to upload; it is sent with purpose
            ``"batch"``.
        verbose: When true, also print the object returned by the upload.

    Returns:
        The object returned by ``client.files.create``.
    """
    # Use a context manager so the handle is closed once the upload returns
    # (or fails) instead of being left open for the life of the kernel.
    with open(batch_file, "rb") as file_handle:
        batch_input_file = client.files.create(file=file_handle, purpose="batch")

    print("Batch file uploaded successfully.")
    if verbose:
        print(batch_input_file)
    return batch_input_file


def create_batch_online(batch_id, description="Normal batch"):
    """
    Submit a batch job against the embeddings endpoint.

    Note that ``batch_id`` is forwarded as ``input_file_id``: it is the id of
    the uploaded input file, not of an existing batch. ``description`` is
    stored in the batch metadata. The job is created with a ``"24h"``
    completion window.

    Limits noted by the original author:
    Max requests: 50 000
    Max file size: 200MB

    Returns the object returned by ``client.batches.create``.
    """
    request_options = dict(
        input_file_id=batch_id,
        endpoint="/v1/embeddings",
        completion_window="24h",
        metadata={"description": description},
    )
    created_batch = client.batches.create(**request_options)
    print("Batch created successfully.")

    return created_batch


def check_status_batch(batch_id):
    """Fetch and return the current batch object for ``batch_id`` via ``client``."""
    return client.batches.retrieve(batch_id)


def get_results(batch_id, output_file="results.jsonl"):
    """Download the output of a completed batch to a local JSONL file.

    Args:
        batch_id: Id of the batch to fetch.
        output_file: Path the downloaded content is written to (overwritten if
            present).

    Returns:
        None in every case. A message is printed when nothing is written,
        i.e. when the batch is not yet ``"completed"`` or has no output file.
    """
    batch_status = check_status_batch(batch_id)
    if batch_status.status != "completed":
        print("Batch not completed yet.")
        return None

    output_file_id = batch_status.output_file_id
    # NOTE(review): a completed batch can apparently carry no output file
    # (e.g. when no request succeeded) — confirm against the Batch API docs.
    # Guard here rather than hand an empty id to the download call.
    if not output_file_id:
        print("Batch completed without an output file.")
        return None

    results = client.files.content(output_file_id)

    with open(output_file, "w") as f:
        f.write(results.text)


def read_large_file(file_path):
    """Lazily yield the lines of ``file_path`` one by one, line endings included.

    The file stays open only while the generator is being consumed, so the
    whole content is never held in memory at once.
    """
    with open(file_path, "r") as handle:
        yield from handle


def save_embeddings(file_path):
    """Build a FAISS L2 index from a batch results file and persist it.

    Every non-blank line of ``file_path`` is parsed as JSON. For records that
    carry embedding data, the ``custom_id`` and the first embedding in the
    response body are collected. Blank lines are ignored, and records without
    embedding data (for example requests that failed) are skipped and counted
    instead of aborting the whole load.

    The index is written to ``index_file`` and the ids, in insertion order, to
    ``ids_file``.

    Args:
        file_path: Path of the JSONL results file.

    Returns:
        ``(index, ids_faiss)``: the FAISS index and the list of ids such that
        ``ids_faiss[i]`` belongs to the i-th vector in the index.
    """
    ids_faiss = []
    embeddings = []
    vector_dim = 0
    skipped = 0
    for line in read_large_file(file_path):
        if not line.strip():
            continue
        record = json.loads(line)

        # "response" / "body" may be missing or null on failed requests.
        response = record.get("response") or {}
        body = response.get("body") or {}
        items = body.get("data")
        if not items:
            skipped += 1
            continue

        embedding = items[0]["embedding"]
        # Append the id only once the embedding is in hand, so the two lists
        # stay aligned position by position.
        ids_faiss.append(record["custom_id"])
        vector_dim = len(embedding)
        embeddings.append(np.array(embedding, dtype=np.float32))

    if skipped:
        print(f"Skipped {skipped} result line(s) without embedding data.")

    index = faiss.IndexFlatL2(vector_dim)
    if embeddings:
        embeddings_matrix = np.vstack(embeddings)
        index.add(embeddings_matrix)

    faiss.write_index(index, index_file)
    np.save(ids_file, np.array(ids_faiss))

    print("Embeddings saved successfully.")
    return index, ids_faiss


def cargar_faiss_desde_disco(input_file=None):
    """Return ``(index, ids)`` from disk, building them first if necessary.

    When both ``index_file`` and ``ids_file`` exist they are loaded and
    returned. Otherwise the index is built from ``input_file`` through
    ``save_embeddings``; if no ``input_file`` was given, ``(None, None)`` is
    returned.
    """
    cache_present = os.path.exists(index_file) and os.path.exists(ids_file)

    if not cache_present:
        print("No se encontró un índice en disco. Creando uno nuevo.")
        if input_file:
            return save_embeddings(input_file)
        print("No se especificó un archivo de embeddings.")
        return None, None

    loaded_index = faiss.read_index(index_file)
    loaded_ids = np.load(ids_file).tolist()
    print("Índice FAISS cargado desde disco.")
    return loaded_index, loaded_ids


def buscar_faiss(query, top_k=5):
    """Print the catalog entries closest to ``query`` in embedding space.

    The query is embedded with ``get_embedding`` and searched against the
    module-level ``index``. Each hit is mapped back to a SKU id through
    ``ids_faiss`` and looked up in the module-level ``data`` frame. For every
    hit the distance is printed, followed by each column in
    ``relevant_columns``.

    Args:
        query: Free-text search string.
        top_k: Maximum number of neighbours to request.

    Returns:
        None; results are printed.
    """
    query_embedding = get_embedding(query)
    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)

    # FAISS pads the result with label -1 when the index holds fewer than
    # top_k vectors. Without this filter, ids_faiss[-1] would silently report
    # the LAST stored SKU as a hit.
    results = [
        (ids_faiss[idx], dist)
        for idx, dist in zip(indices[0], distances[0])
        if idx >= 0
    ]
    for sku_id, dist in results:
        row = data[data["_SkuId (Not changeable)"] == sku_id]
        print(f"Distancia: {dist:.2f}")
        if row.empty:
            # The index knows this SKU but the loaded catalog does not; show
            # the id alone rather than raising IndexError on .values[0].
            print(f"{relevant_columns['_SkuId (Not changeable)'][0]}: {sku_id}")
            continue
        for column, value in relevant_columns.items():
            print(f"{value[0]}: {row[column].values[0]}")


# Module-level state used by buscar_faiss and the OpenAI helpers above:
# the catalog frame, the FAISS index with its parallel id list, and the client.
data = load_excel("./text_search/wong_catalogo_small.xlsx")
# Loads the index from disk if present; otherwise builds it from results.jsonl.
index, ids_faiss = cargar_faiss_desde_disco("results.jsonl")
# NOTE(review): load_env() is defined above but never called here; presumably
# the API key is expected to already be in the environment — confirm.
client = OpenAI()

In [12]:
# Example search ("agua sin gas" = still water): prints the nearest catalog entries.
buscar_faiss("agua sin gas", top_k=5)

In [5]:
# Peek at the structure of one batch result record without dumping the
# (long) embedding vector.

result_iter = read_large_file("results.jsonl")
first_result = json.loads(next(result_iter))
# Replace the vector with a placeholder so the printed record stays readable.
first_result["response"]["body"]["data"][0]["embedding"] = "EMBEDDING"

print(first_result)