# Uso de modelos de embeddings de OpenAI

## Instalación y carga de librerías

In [1]:
!pip install openai
!pip install tiktoken

Collecting openai
  Downloading openai-1.69.0-py3-none-any.whl (599 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m599.1/599.1 KB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Collecting jiter<1,>=0.4.0
  Downloading jiter-0.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (352 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m352.9/352.9 KB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting sniffio
  Using cached sniffio-1.3.1-py3-none-any.whl (10 kB)
Collecting distro<2,>=1.7.0
  Using cached distro-1.9.0-py3-none-any.whl (20 kB)
Collecting anyio<5,>=3.5.0
  Downloading anyio-4.9.0-py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.9/100.9 KB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting httpx<1,>=0.23.0
  Using cached httpx-0.28.1-py3-none-any.whl (73 kB)
Collecting pydantic<3,>=1.9.0
  Downloading pydantic-2.11.0-py3-none-any.whl (442 

In [14]:
from openai import OpenAI
import os
import pandas as pd
from tqdm import tqdm

# Register tqdm with pandas so that `progress_apply` (used further down on a
# Series) is available and shows a progress bar.
tqdm.pandas()

In [2]:
# Read the API key from the environment once and reuse it for the client.
# Fail early with a clear message instead of surfacing an authentication
# error only when the first request is made.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError("The OPENAI_API_KEY environment variable is not set")
client_openai = OpenAI(api_key=OPENAI_API_KEY)

## Cargar dataset

In [3]:
# Load the food dataset; the relative path is resolved against the current
# working directory.
df = pd.read_csv('generic-food.csv')

In [4]:
# Preview the first rows to check which columns were loaded.
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables


## Evaluar cantidad de tokens a procesar

In [5]:
import tiktoken

def num_tokens_from_string(text, encodig_name):
    """Count how many tokens ``text`` occupies under a tiktoken encoding.

    Args:
        text: String to tokenize.
        encodig_name: Name of the tiktoken encoding, passed straight to
            ``tiktoken.get_encoding`` (the parameter name is misspelt but is
            kept as-is because it is part of the function's interface).

    Returns:
        The length of the token sequence produced by encoding ``text``.
    """
    tokenizer = tiktoken.get_encoding(encodig_name)
    return len(tokenizer.encode(text))


In [6]:
# Store, for every food name, its token count under the 'cl100k_base' encoding.
df['total_tokens'] = df['FOOD NAME'].apply(
    lambda food_name: num_tokens_from_string(food_name, 'cl100k_base')
)

In [7]:
# Total number of tokens across all food names; use the vectorized pandas
# reduction instead of iterating over the Series with the built-in sum().
df['total_tokens'].sum()

2947

## Generando embeddings

In [11]:
def get_embedding(text, model = "text-embedding-ada-002"):
    """Return the embedding vector for ``text`` from the OpenAI embeddings API.

    Args:
        text: String to embed.
        model: Name of the embedding model to request.

    Returns:
        The ``embedding`` attribute of the first item in the response data
        (a single input is sent, so only the first item is read).
    """
    # Replace newlines with a space rather than deleting them; deleting them
    # would glue together words that were separated only by a line break.
    text = text.replace('\n', ' ')
    response = client_openai.embeddings.create(
        input=text,
        model=model
    )
    return response.data[0].embedding

In [15]:
# Embed every food name with get_embedding's default model, showing a tqdm
# progress bar while the rows are processed.
df['ada_embeddings'] = df['FOOD NAME'].progress_apply(get_embedding)

  0%|          | 0/906 [00:00<?, ?it/s]

100%|██████████| 906/906 [05:19<00:00,  2.83it/s]  


In [16]:
# Preview the frame again to confirm the new columns were added.
df.head()

Unnamed: 0,FOOD NAME,SCIENTIFIC NAME,GROUP,SUB GROUP,total_tokens,ada_embeddings
0,Angelica,Angelica keiskei,Herbs and Spices,Herbs,2,"[0.006170166190713644, -0.010142647661268711, ..."
1,Savoy cabbage,Brassica oleracea var. sabauda,Vegetables,Cabbages,4,"[0.005480199586600065, -0.00497033866122365, 0..."
2,Silver linden,Tilia argentea,Herbs and Spices,Herbs,3,"[-0.004524254705756903, 0.020042214542627335, ..."
3,Kiwi,Actinidia chinensis,Fruits,Tropical fruits,2,"[-0.004660817328840494, -0.010033704340457916,..."
4,Allium (Onion),Allium,Vegetables,Onion-family vegetables,6,"[0.013233134523034096, -0.01992831565439701, 0..."


In [17]:
# Embed an arbitrary sentence so the shape of a single result can be inspected.
embedding_prueba = get_embedding('esto es una prueba de embeddings para openAI')

In [18]:
# Number of components in the returned embedding vector.
len(embedding_prueba)

1536

## Creando datasets para visualizar

In [19]:
# Expand the per-row embedding lists into a wide frame: one row per food,
# one column per embedding component.
embedding_rows = df['ada_embeddings'].tolist()
df_embeddings = pd.DataFrame(embedding_rows)

In [20]:
# Preview the expanded embedding matrix.
df_embeddings.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1526,1527,1528,1529,1530,1531,1532,1533,1534,1535
0,0.00617,-0.010143,0.004632,-0.028876,-0.005418,0.013093,-0.016356,0.007352,-0.010089,-0.028104,...,0.024281,-0.011475,0.004445,-0.022603,-0.008817,0.031007,0.008318,-0.004019,-0.02259,-0.015104
1,0.00548,-0.00497,0.014879,-0.010877,-0.002445,0.021254,-0.008902,-0.024307,-0.018034,-0.039711,...,0.017996,-0.019817,0.026833,-0.032631,-0.019509,0.020035,-0.025666,0.010473,-0.007734,0.000761
2,-0.004524,0.020042,-0.002014,-0.024094,-0.016662,0.020498,-0.041319,-0.00191,-0.001223,-0.035416,...,0.002936,-0.022309,0.038609,-0.026388,-0.032438,0.021142,-0.000291,-0.020525,-0.011577,-0.033457
3,-0.004661,-0.010034,-0.00914,-0.010014,-0.007865,0.023123,-0.01441,-0.016287,-0.001691,-0.036354,...,0.030994,-0.012144,0.022424,-0.01193,-0.005402,0.010668,-0.021854,-0.001701,-0.01406,-0.020715
4,0.013233,-0.019928,0.002172,-0.010036,-0.018671,0.022313,-0.018094,-0.015277,-0.008759,-0.020544,...,0.020138,-0.034537,0.031655,-0.01098,-0.019339,0.000338,0.010279,0.001162,-0.001145,0.012421


In [21]:
# Write the raw vectors as tab-separated values, with neither the index nor a
# header row.
df_embeddings.to_csv(
    'embedding_food.tsv',
    sep='\t',
    header=False,
    index=False,
)

In [22]:
# Write the descriptive columns as a tab-separated labels file; the header row
# is kept and the index is omitted.
df[['FOOD NAME', 'GROUP', 'SUB GROUP']].to_csv(
    'labels_food.tsv',
    sep='\t',
    header=True,
    index=False,
)