# Preparar embeddings de OpenAI

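Este notebook utiliza modelos de embeddings de OpenAI (a través de Azure OpenAI) para generar embeddings de un conjunto de títulos de películas y de un conjunto de sustantivos comunes en español, y guarda los resultados en archivos JSON.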

In [None]:
import csv
import logging
import json
import os

from azure.identity import AzureCliCredential, get_bearer_token_provider
import openai
import dotenv

# Turn on INFO-level logging, then load variables from a local .env file so
# the environment lookups below can see them.
logging.basicConfig(level=logging.INFO)
dotenv.load_dotenv()

# Authenticate with the identity of the logged-in Azure CLI user; the token
# provider hands out bearer tokens for the Cognitive Services scope.
azure_credential = AzureCliCredential()
token_provider = get_bearer_token_provider(
    azure_credential,
    "https://cognitiveservices.azure.com/.default",
)

# Azure OpenAI settings. Each value is None when its variable is not set.
# The dimensions value stays a string here and is converted to int at call time.
AZURE_OPENAI_ENDPOINT = os.environ.get("AZURE_OPENAI_ENDPOINT")
AZURE_OPENAI_EMBEDDING_DEPLOYMENT = os.environ.get("AZURE_OPENAI_EMBEDDING_DEPLOYMENT")
AZURE_OPENAI_EMBEDDING_DIMENSIONS = os.environ.get("AZURE_OPENAI_EMBEDDING_DIMENSIONS")

# NOTE(review): confirm that this API version accepts the optional
# `dimensions` argument that get_embeddings may send.
openai_client = openai.AzureOpenAI(
    azure_ad_token_provider=token_provider,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
    api_version="2023-05-15",
)

def get_embeddings(words, *, batch_size=2048):
    """Compute embeddings for ``words`` with the configured Azure OpenAI deployment.

    The words are sent to the embeddings endpoint in batches of at most
    ``batch_size`` items, so an input that fits in one batch still results in
    a single request.

    Args:
        words: Iterable of strings to embed. A single string is treated as
            one word rather than as a sequence of characters.
        batch_size: Maximum number of inputs per request. The default of 2048
            is assumed to match the service's per-request input limit --
            TODO confirm against the Azure OpenAI embeddings reference.

    Returns:
        A dict mapping each word to its embedding (a list of floats). If a
        word appears more than once, the last embedding received wins.
        Returns an empty dict, without calling the service, when ``words``
        is empty.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # A bare string would otherwise be paired with the response character by
    # character; any other iterable is materialised so it can be sliced.
    words = [words] if isinstance(words, str) else list(words)
    if not words:
        # The service rejects an empty input array, so skip the request.
        return {}

    # Only pass `dimensions` when it is configured; the value comes from the
    # environment as a string and must be converted.
    dimensions_args = (
        {"dimensions": int(AZURE_OPENAI_EMBEDDING_DIMENSIONS)}
        if AZURE_OPENAI_EMBEDDING_DIMENSIONS
        else {}
    )

    word_vectors = {}
    for start in range(0, len(words), batch_size):
        batch = words[start:start + batch_size]
        embeddings_response = openai_client.embeddings.create(
            model=AZURE_OPENAI_EMBEDDING_DEPLOYMENT,
            input=batch,
            **dimensions_args,
        )
        # Pair each input with the embedding at the same position in the
        # response.
        for word, embedding_object in zip(batch, embeddings_response.data):
            word_vectors[word] = embedding_object.embedding
    return word_vectors

In [None]:
# Open the existing movie embeddings file and compute a fresh embedding for
# every movie title with the currently configured deployment.
from tqdm import tqdm

# JSON text is UTF-8; state the encoding explicitly so the read does not
# depend on the platform default. The file is closed before any network call.
with open("embeddings/peliculas_text-embedding-ada-002.json", encoding="utf-8") as f:
    disney_vectors = json.load(f)

# Only the titles (the keys) are reused; the stored vectors are ignored.
# One request per title keeps the progress bar meaningful.
new_movie_vectors = {}
for movie in tqdm(disney_vectors, desc="Computing new embeddings"):
    new_movie_vectors[movie] = get_embeddings([movie])[movie]

# Write the new embeddings to a file named after the deployment and the
# configured dimensions.
filename = f"embeddings/openai_peliculas_{AZURE_OPENAI_EMBEDDING_DEPLOYMENT}-{AZURE_OPENAI_EMBEDDING_DIMENSIONS}.json"
with open(filename, "w", encoding="utf-8") as f:
    json.dump(new_movie_vectors, f, indent=4)

In [None]:
# Read the nouns from the first column of the CSV file.
# The encoding is explicit so accented characters and "ñ" decode the same on
# every platform, and newline="" is what the csv module expects of its input.
with open('embeddings/sustantivos-mas-usados.csv', encoding="utf-8", newline="") as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row; tolerate a completely empty file
    # Blank lines come back as empty rows; skip them and empty first cells
    # instead of failing on row[0] or sending an empty string to the service.
    words = [row[0] for row in reader if row and row[0]]

# Compute the embeddings for all nouns.
word_vectors = get_embeddings(words)

# Save the embeddings to a file named after the deployment and the configured
# dimensions.
filename = f"embeddings/sustantivos2_{AZURE_OPENAI_EMBEDDING_DEPLOYMENT}-{AZURE_OPENAI_EMBEDDING_DIMENSIONS}.json"

with open(filename, 'w', encoding="utf-8") as f:
    json.dump(word_vectors, f, indent=4)
