In [1]:
import os
import json
import glob
import faiss
import tiktoken
import numpy as np
import pandas as pd

from openai import OpenAI
from bs4 import BeautifulSoup
from datetime import datetime
from rich import print, inspect
from rich.progress import Progress

# Spreadsheet columns used by this notebook, keyed by the column header and
# mapped to a (display label, snake_case identifier) pair.
# Element 0 (the label) is what get_string_from_row and buscar_faiss print;
# element 1 is not read by any code visible in this notebook.
relevant_columns = {
    "_SkuId (Not changeable)": ("ID SKU", "id_sku"),
    "_SkuName": ("Nombre", "nombre"),
    "_ProductShortDescription": ("Descripción Corta", "descripcion_corta"),
    "_ProductDescription": ("Descripción Larga", "descripcion_larga"),
    "_Keywords": ("Palabras Clave", "palabras_clave"),
    "_MetaTagDescription": ("Descripción Meta", "descripcion_meta"),
    "_DepartamentName": ("Departamento", "departamento"),
    "_CategoryName": ("Categoría", "categoria"),
    "_Brand": ("Marca", "marca"),
}

# Path of the serialized FAISS index, and the global that holds the loaded index.
index_file = "embeddings.faiss"
index = None
# Path of the saved id array, and the global list mapping FAISS positions to SKU ids.
ids_file = "ids_faiss.npy"
ids_faiss = None
# Catalogue DataFrame; assigned by later cells and read by buscar_faiss.
data = None


def load_env():
    """Load ``OPENAI_API_KEY`` from a local ``.env`` file into ``os.environ``.

    The file is read line by line. Blank lines, ``#`` comments and entries
    for other variables (including ones that merely start with
    ``OPENAI_API_KEY``, such as ``OPENAI_API_KEY_OLD``) are ignored. An
    optional ``export `` prefix and one pair of surrounding quotes are
    stripped. Only the first ``=`` separates name and value, so values that
    themselves contain ``=`` are preserved in full.

    Returns:
        None. The outcome is reported with a printed message.
    """
    env_file_path = ".env"
    openai_api_key = None

    if os.path.exists(env_file_path):
        with open(env_file_path) as f:
            for line in f:
                entry = line.strip()
                if not entry or entry.startswith("#"):
                    continue
                if entry.startswith("export "):
                    entry = entry[len("export "):].lstrip()

                # partition() splits on the first "=" only and never raises.
                name, separator, value = entry.partition("=")
                if not separator or name.strip() != "OPENAI_API_KEY":
                    continue

                value = value.strip()
                # Drop one matching pair of surrounding quotes, if present.
                if len(value) >= 2 and value[0] == value[-1] and value[0] in "\"'":
                    value = value[1:-1]

                openai_api_key = value
                break

    if openai_api_key:
        print("OpenAI API Key loaded successfully.")
        os.environ["OPENAI_API_KEY"] = openai_api_key
    else:
        print("OpenAI API Key not found.")


# Run at definition time so OPENAI_API_KEY is set in os.environ before the
# OpenAI client is constructed at the end of this cell.
load_env()


def load_excel(path):
    """Read the workbook at ``path``, keeping only the relevant columns.

    Every column is read as ``str``; missing cells are replaced with the
    sentinel string ``"No Info"``.

    Returns:
        pandas.DataFrame with one column per key of ``relevant_columns``.
    """
    catalog = pd.read_excel(path, usecols=relevant_columns.keys(), dtype=str)
    return catalog.fillna("No Info")


def create_file_from_excel(data, samples=1000, output_file="catalogo.xlsx"):
    """Write a random sample of ``samples`` rows of ``data`` to ``output_file``.

    The DataFrame index is not written. A confirmation message is printed
    once the workbook has been saved.
    """
    subset = data.sample(samples)
    subset.to_excel(output_file, index=False)
    print(f"File {output_file} created successfully.")


def get_string_from_row(row, verbose=False):
    """Build the text that is embedded for one catalogue row.

    Each relevant column except the SKU id contributes a ``"Label: value"``
    fragment; fragments are joined with ``"; "``.

    Cleaning applied per column:
      * ``_ProductDescription`` / ``_MetaTagDescription``: HTML is reduced to
        its text, with text nodes joined by ``", "``.
      * ``_Keywords``: split on commas, trimmed, and purely numeric or empty
        keywords are dropped.

    Fields equal to the ``"No Info"`` sentinel, or left empty/blank by the
    cleaning above, are omitted so no bare ``"Label: "`` fragment is emitted.

    Args:
        row: A Series (e.g. from ``iterrows``) or a one-row DataFrame (e.g.
            from ``sample(1)``); anything whose cells expose ``.values`` is
            read through ``.values[0]``.
        verbose: When true, print every cleaned field, including omitted ones.

    Returns:
        str: The joined fragments (empty string when every field is omitted).
    """
    html_columns = ("_ProductDescription", "_MetaTagDescription")
    final_string = []

    for column, (label, _) in relevant_columns.items():
        # The SKU id identifies the row; it is not part of the embedded text.
        if column == "_SkuId (Not changeable)":
            continue

        cell = row[column]
        row_value = cell.values[0] if hasattr(cell, "values") else cell

        if column in html_columns:
            row_value = BeautifulSoup(row_value, "html.parser").get_text(separator=", ")

        if column == "_Keywords":
            keywords = (keyword.strip() for keyword in row_value.split(","))
            row_value = ", ".join(
                keyword for keyword in keywords if keyword and not keyword.isdigit()
            )

        # Skip the sentinel and anything that cleaning reduced to nothing.
        if row_value != "No Info" and row_value.strip():
            final_string.append(f"{label}: {row_value}")

        if verbose:
            print(f"{label}: {row_value}")

    return "; ".join(final_string)


def get_embedding(text, model="text-embedding-3-small"):
    """Request the embedding of ``text`` from the module-level ``client``.

    Returns:
        tuple: ``(embedding, total_tokens)`` taken from the first item of the
        response data and from the response usage, respectively.
    """
    result = client.embeddings.create(input=[text], model=model)
    return result.data[0].embedding, result.usage.total_tokens


def get_tokens_length(text, encoding="cl100k_base"):
    """Return how many tokens ``text`` encodes to under the named tiktoken encoding."""
    tokenizer = tiktoken.get_encoding(encoding)
    return len(tokenizer.encode(text))


def create_batch_files(
    data,
    output_file_prefix="batch",
    max_tokens=1_000_000,
    max_requests=50_000,
):
    """Write the catalogue as one or more JSONL files of embedding requests.

    Every row of ``data`` becomes one ``POST /v1/embeddings`` request line
    whose ``custom_id`` is the row's SKU id. Files are named
    ``{output_file_prefix}_{n}.jsonl`` with ``n`` starting at 0. A new file is
    started whenever adding the next row would push the current file over
    ``max_tokens`` or the file already holds ``max_requests`` requests.

    Args:
        data: DataFrame with the columns listed in ``relevant_columns``.
        output_file_prefix: Prefix of the generated file names.
        max_tokens: Token budget per file (the original hard-coded value).
        max_requests: Request budget per file; the default mirrors the
            "Max requests: 50 000" limit noted on ``create_batch_online``.

    Returns:
        None. Per-file and overall token counts are printed.
    """
    total_tokens = 0
    current_tokens = 0
    current_requests = 0
    file_count = 0

    with Progress() as progress:
        task = progress.add_task(f"[red]Batch N{file_count}", total=max_tokens)
        output_file = f"{output_file_prefix}_{file_count}.jsonl"
        f = open(output_file, "w")
        try:
            for _, row in data.iterrows():
                string_row = get_string_from_row(row)
                tokens = get_tokens_length(string_row)

                file_is_full = (
                    current_tokens + tokens > max_tokens
                    or current_requests >= max_requests
                )
                # Roll over before writing, but never leave an empty file
                # behind when a single row alone exceeds the token budget.
                if current_requests and file_is_full:
                    print(f"Batch {file_count} created successfully with {current_tokens} tokens.")
                    total_tokens += current_tokens
                    current_tokens = 0
                    current_requests = 0

                    f.close()
                    file_count += 1

                    task = progress.add_task(f"[red]Batch N{file_count}", total=max_tokens)
                    output_file = f"{output_file_prefix}_{file_count}.jsonl"
                    f = open(output_file, "w")

                payload = {
                    "custom_id": row["_SkuId (Not changeable)"],
                    "method": "POST",
                    "url": "/v1/embeddings",
                    "body": {
                        "model": "text-embedding-3-small",
                        "input": string_row,
                    },
                }

                f.write(json.dumps(payload, ensure_ascii=True) + "\n")
                current_tokens += tokens
                current_requests += 1
                progress.update(task, advance=tokens)
        finally:
            # Close the file that is currently open even if a row fails.
            f.close()

        # The last file never triggers a roll-over, so it is reported and
        # added to the grand total here.
        print(f"Batch {file_count} created successfully with {current_tokens} tokens.")
        total_tokens += current_tokens

        print("Batch files created successfully.")
        print(f"Total tokens: {total_tokens}")


def upload_batch_file(batch_file="batch.jsonl", verbose=False):
    """Upload a JSONL request file with purpose ``"batch"``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the upload call returns, instead of being left open.

    Args:
        batch_file: Path of the JSONL file to upload.
        verbose: When true, also print the object returned by the API.

    Returns:
        The object returned by ``client.files.create`` (the notebook reads
        its ``.id``).
    """
    with open(batch_file, "rb") as file_handle:
        batch_input_file = client.files.create(file=file_handle, purpose="batch")
    print(f"{batch_file} file uploaded successfully.")
    if verbose:
        print(batch_input_file)
    return batch_input_file


def create_batch_online(batch_id, description="Normal batch", verbose=False):
    """
    Submit an embeddings batch job for a previously uploaded input file.

    Note that ``batch_id`` is forwarded as ``input_file_id``: it is the id of
    the uploaded file, not of the batch that gets created.

    Limits noted by the original author:
    Max requests: 50 000
    Max file size: 200MB

    Returns the object produced by ``client.batches.create``.
    """
    request = {
        "input_file_id": batch_id,
        "endpoint": "/v1/embeddings",
        "completion_window": "24h",
        "metadata": {"description": description},
    }
    created = client.batches.create(**request)
    print(f"Batch {batch_id} created successfully.")

    if verbose:
        print(created.model_dump())

    return created


def check_status_batch(batch_id):
    """Fetch and return the current batch object for ``batch_id``."""
    return client.batches.retrieve(batch_id)


def get_results(batch_id, folder="results"):
    """Download the output of a completed batch to ``{folder}/{batch_id}.jsonl``.

    Args:
        batch_id: Id of the batch to fetch.
        folder: Destination directory; created if it does not exist.

    Returns:
        str | None: Path of the written file, or ``None`` when the batch is
        not completed yet or reports no output file.
    """
    batch_status = check_status_batch(batch_id)
    if batch_status.status != "completed":
        print("Batch not completed yet.")
        return None

    output_file_id = batch_status.output_file_id
    if not output_file_id:
        # Presumably happens when no request succeeded; nothing to download.
        print(f"Batch {batch_id} has no output file.")
        return None

    results = client.files.content(output_file_id)

    # Without this, the first download fails when the folder is missing.
    os.makedirs(folder, exist_ok=True)
    file_name = f"{folder}/{batch_id}.jsonl"
    with open(file_name, "w") as f:
        f.write(results.text)
    return file_name


def read_large_file(file_path):
    """Lazily yield the lines of ``file_path`` one by one, line endings included.

    The file stays open only while the generator is being consumed.
    """
    with open(file_path, "r") as handle:
        yield from handle


def save_embeddings(file_paths, return_index=False):
    """Build a FAISS L2 index from batch result files and persist it.

    Each non-blank line of every file is parsed as JSON; its ``custom_id``
    and ``response.body.data[0].embedding`` are collected. The index is
    written to ``index_file`` and the ids, in index order, to ``ids_file``.

    Args:
        file_paths: Iterable of JSONL result paths. A single path (``str`` or
            ``os.PathLike``) is also accepted and treated as a one-item list.
        return_index: When true, return the index and the ids.

    Returns:
        ``(index, ids_faiss)`` when ``return_index`` is true, otherwise
        ``None``. When no embedding is found nothing is written to disk and
        ``(None, None)`` / ``None`` is returned.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    ids_faiss = []
    embeddings = []
    skipped = 0

    for file_path in file_paths:
        for line in read_large_file(file_path):
            if not line.strip():
                continue
            record = json.loads(line)

            # Lines lacking the expected payload (presumably failed requests)
            # are counted and skipped instead of aborting the whole build.
            try:
                embedding = record["response"]["body"]["data"][0]["embedding"]
                custom_id = record["custom_id"]
            except (KeyError, IndexError, TypeError):
                skipped += 1
                continue

            ids_faiss.append(custom_id)
            embeddings.append(np.asarray(embedding, dtype=np.float32))

    if skipped:
        print(f"Skipped {skipped} result lines without an embedding.")

    if not embeddings:
        # Do not overwrite files on disk with an empty, 0-dimensional index.
        print("No embeddings found; nothing was saved.")
        return (None, None) if return_index else None

    embeddings_matrix = np.vstack(embeddings)
    index = faiss.IndexFlatL2(embeddings_matrix.shape[1])
    index.add(embeddings_matrix)

    faiss.write_index(index, index_file)
    np.save(ids_file, np.array(ids_faiss))

    print("Embeddings saved successfully.")
    if return_index:
        return index, ids_faiss


def cargar_faiss_desde_disco(input_file=None):
    """Load the FAISS index and id list from disk, building them if absent.

    When both ``index_file`` and ``ids_file`` exist they are loaded.
    Otherwise, if ``input_file`` is given, the index is built from it with
    ``save_embeddings``.

    Args:
        input_file: A result-file path, or an iterable of paths, used only
            when no index exists on disk.

    Returns:
        tuple: ``(index, ids_faiss)``, or ``(None, None)`` when nothing could
        be loaded or built.
    """
    if os.path.exists(index_file) and os.path.exists(ids_file):
        index = faiss.read_index(index_file)
        ids_faiss = np.load(ids_file).tolist()
        print("Índice FAISS cargado desde disco.")
        return index, ids_faiss

    print("No se encontró un índice en disco. Creando uno nuevo.")
    if not input_file:
        print("No se especificó una lista de embeddings para cargar.")
        return None, None

    # save_embeddings iterates over its argument, so wrap a single path.
    if isinstance(input_file, (str, os.PathLike)):
        file_paths = [input_file]
    else:
        file_paths = input_file

    # return_index=True is required: without it save_embeddings returns None
    # and the caller's ``index, ids_faiss = ...`` unpacking fails.
    built = save_embeddings(file_paths, return_index=True)
    return built if built is not None else (None, None)


def buscar_faiss(query, top_k=5):
    """Embed ``query``, search the global FAISS index and print the matches.

    Reads the module-level ``index``, ``ids_faiss`` and ``data``. For every
    hit the distance and all relevant columns of the matching catalogue row
    are printed; the token count and estimated cost of the query embedding
    are printed first.

    Args:
        query: Free-text search string.
        top_k: Number of neighbours requested from the index.

    Raises:
        RuntimeError: If the index, the id list or the catalogue has not been
            loaded yet (checked before the paid embedding request is made).
    """
    if index is None or ids_faiss is None or data is None:
        raise RuntimeError(
            "index, ids_faiss and data must be loaded before calling buscar_faiss."
        )

    query_embedding, tokens = get_embedding(query)
    price_tokens_1M = 0.02  # USD per 1M tokens
    price = price_tokens_1M * tokens / 1_000_000
    print(f"Tokens: {tokens}, Costo: ${price:.10f}")

    query_embedding = np.array(query_embedding, dtype=np.float32).reshape(1, -1)
    distances, indices = index.search(query_embedding, top_k)

    for idx, dist in zip(indices[0], distances[0]):
        # FAISS pads with -1 when fewer than top_k vectors are available;
        # indexing ids_faiss with -1 would silently return the last id.
        if idx < 0:
            continue

        sku_id = ids_faiss[idx]
        row = data[data["_SkuId (Not changeable)"] == sku_id]
        print(f"Distancia: {dist:.2f}")

        if row.empty:
            # The index may have been built from a different catalogue file.
            print(f"SKU {sku_id} no encontrado en el catálogo cargado.\n")
            continue

        out_str = ""
        for column, value in relevant_columns.items():
            out_str += f"{value[0]}: {row[column].values[0]}\n"
        print(out_str)


def unix_to_readable(unix_timestamp):
    """Format a Unix timestamp as ``YYYY-MM-DD HH:MM:SS`` in local time.

    Falsy input (``None``, ``0``) yields the string ``"No Info"``.
    """
    if unix_timestamp:
        moment = datetime.fromtimestamp(unix_timestamp)
        return moment.strftime('%Y-%m-%d %H:%M:%S')
    return "No Info"

def save_jobs(jobs, output_file="jobs.txt"):
    """Overwrite ``output_file`` with one comma-separated line per job.

    Only the first three items of each job are written.
    """
    rows = (f"{job[0]},{job[1]},{job[2]}\n" for job in jobs)
    with open(output_file, "w") as f:
        f.writelines(rows)

def load_jobs(input_file="jobs.txt"):
    """Read ``input_file`` and return each stripped line split on commas.

    Returns:
        list[list[str]]: One list of fields per line of the file.
    """
    with open(input_file, "r") as f:
        return [line.strip().split(",") for line in f]


# index, ids_faiss = cargar_faiss_desde_disco("results.jsonl")
# Shared OpenAI client used by the embedding, file-upload and batch helpers
# defined above. Presumably picks up OPENAI_API_KEY from the environment set
# by load_env() — confirm.
client = OpenAI()

In [2]:
# Load the source catalogue workbook into the global `data`
# (a later cell reassigns `data` to the 1000-row sample).
data = load_excel("./text_search/wong_catalogo_prueba.xlsx")

In [None]:
# Write a random 1000-row sample of the loaded catalogue to a new workbook.
create_file_from_excel(data, samples=1000, output_file="./text_search/wong_catalogo_1000.xlsx")

In [2]:
# Replace `data` with the 1000-row sample workbook; buscar_faiss looks rows up in this global.
data = load_excel("./text_search/wong_catalogo_1000.xlsx")

In [None]:
# Report the dimensions of the catalogue currently held in `data`.
num_rows, num_columns = data.shape
print(f"Number of rows: {num_rows}")
print(f"Number of columns: {num_columns}")

In [None]:
# Preview the text that would be embedded for one random product, and its token count.
# data.sample(1) returns a one-row DataFrame, which get_string_from_row reads via `.values[0]`.
random_row = data.sample(1)
string_row = get_string_from_row(random_row, verbose=True)
print(string_row)
tokens = get_tokens_length(string_row)
print(f"Tokens: {tokens}")

In [None]:
# Write batch_<n>.jsonl request files (default prefix "batch") for every row of `data`.
create_batch_files(data)

In [None]:
# Upload every generated batch file and record one job per file.
# Each job is [input_file_id, status, batch_id]; the batch id stays empty
# until the batch is actually created by the next cell.
# `glob` is already imported in the first cell, so it is not re-imported here.
batch_files = sorted(glob.glob("batch_*.jsonl"))  # sorted for a reproducible job order
print(batch_files)
jobs = []
for batch_file in batch_files:
    uploaded_file = upload_batch_file(batch_file)
    jobs.append([uploaded_file.id, 'No Started', ''])

save_jobs(jobs)

In [None]:
# Advance the job queue by one step and persist the result.
# Each job is [input_file_id, status, batch_id] as stored by save_jobs/load_jobs.
jobs = load_jobs('jobs.txt')

for job in jobs:
    # Finished jobs need no further action; move on to the next one.
    if job[1] == "Completed":
        continue

    # Job was already submitted: poll its batch and record the new state.
    elif job[1] != "No Started":
        status = check_status_batch(job[2])
        if status.status == "completed":
            print(f"Batch {job[2]} completed.")
            job[1] = "Completed"
        elif status.status == "finalizing":
            print(f"Batch {job[2]} is finalizing.")
            job[1] = "Finalizing"
        elif status.status == "failed":
            # Reset so a later run of this cell resubmits the same input file.
            print(f"Batch {job[2]} failed.")
            print("Changing status to No Started.")
            job[1] = "No Started"
        else:
            # Every other status is treated as "still running".
            # NOTE(review): if the API can report other terminal statuses
            # (e.g. expired or cancelled), such a job would never be
            # resubmitted by this branch — confirm.
            print(f"Batch {job[2]} not completed yet.")
            completed = status.request_counts.completed
            total = status.request_counts.total
            print(f"Completed: {completed}, Total: {total}")
            # print(status.model_dump())
        # Stop after the first unfinished job: one job is polled per run.
        break

    # Job not submitted yet: create its batch, then stop for this run.
    elif job[1] == "No Started":
        print(f"Starting batch with file {job[0]}")
        batch_online_id = create_batch_online(job[0], description=f"Batch {job[0]}")
        job[1] = "Started"
        job[2] = batch_online_id.id
        break

print(jobs)
save_jobs(jobs)


In [117]:
# Download the output file of every job already marked as completed.
jobs = load_jobs('jobs.txt')
for job in jobs:
    if job[1] == "Completed":
        get_results(job[2], folder="results")

In [None]:
# Collect all .jsonl files in the results folder and build the FAISS index
# and id array from them (written to index_file / ids_file).
results_files = glob.glob("results/*.jsonl")

save_embeddings(results_files)

In [3]:
# Populate the globals read by buscar_faiss. Called without an input file,
# so this yields (None, None) if the index files are not on disk.
index, ids_faiss = cargar_faiss_desde_disco()

In [None]:
# Report the on-disk size of the FAISS index (MB) and of the id array (KB).
# The paths come from the index_file / ids_file constants so this cell stays
# in sync with where save_embeddings actually writes.
file_size = os.path.getsize(index_file)
print(f"File size: {file_size / 1024 / 1024:.2f} MB")
file_size = os.path.getsize(ids_file)
print(f"File size: {file_size / 1024:.2f} KB")

In [6]:
# Example query: print the 5 nearest catalogue entries and the query's embedding cost.
buscar_faiss("Alcaparras Gourmet", top_k=5)

In [None]:
# random_row = data.sample(1)

# Inspect the structure of one batch result line, with the long embedding
# vector replaced by a placeholder so the printout stays readable.
# NOTE(review): get_results writes to "results/<batch_id>.jsonl", so this
# top-level "results.jsonl" may be a leftover path — confirm it exists.
result_iter = read_large_file("results.jsonl")
first_result = json.loads(next(result_iter))
first_result["response"]["body"]["data"][0]["embedding"] = "EMBEDDING"

print(first_result)