## Generate embeddings with the OpenAI SDK

### Set up the OpenAI client with GitHub Models

In [1]:
#%pip install python-dotenv
#%pip install openai

import os
import dotenv
import openai

# Embedding model requested from the endpoint in every embeddings call below.
MODEL_NAME = "text-embedding-3-small"

# Load variables from a local .env file (if present) into the process
# environment so that GITHUB_TOKEN can be read when building the client.
dotenv.load_dotenv()

# OpenAI SDK client pointed at the GitHub Models inference endpoint and
# authenticated with the GitHub token taken from the environment.
openai_client = openai.OpenAI(
    api_key=os.environ["GITHUB_TOKEN"],
    base_url="https://models.inference.ai.azure.com",
)

### Generate embeddings using the OpenAI client

In [11]:
# Reference text (in Spanish) that gets embedded below and later compared
# against the query embeddings.
content_input = "Hoja de vida: Lionel Messi. Futbolista argentino, considerado uno de los mejores jugadores de fútbol de todos los tiempos."

In [5]:
# Request an embedding for the reference text and keep the vector of the
# first (and only) item in the response.
embeddings_response = openai_client.embeddings.create(input=content_input, model=MODEL_NAME)
embedding = embeddings_response.data[0].embedding

# Show the vector's length on one line, followed by its raw values.
print(len(embedding), embedding, sep="\n")

1536
[0.01875680685043335, -0.01207759603857994, -0.014209441840648651, 0.0042787352576851845, 0.008226518519222736, -0.018825575709342957, 0.005970028694719076, 0.024499038234353065, -0.006876922678202391, -0.008492999710142612, -0.018688037991523743, 0.022315613925457, -0.002520821988582611, -0.07571491599082947, 0.0075646149925887585, 0.0547403059899807, -0.04641922935843468, 0.010727999731898308, 0.0068855187855660915, 0.0044356151483953, 0.01106324978172779, 0.030894575640559196, -0.009662076830863953, 0.00861764419823885, 0.055427998304367065, 0.007740836124867201, -0.016298307105898857, -0.0036898988764733076, -0.011536037549376488, 0.014415749348700047, 0.04745076596736908, -0.017794037237763405, 0.005316720809787512, -0.010650633834302425, -0.01947888359427452, -0.023605037480592728, 0.021060576662421227, -0.014020326547324657, -0.0013968748971819878, -0.02718103677034378, 0.0019556249026209116, -0.012980191968381405, 0.018636461347341537, 0.0577661506831646, 0.009894172661006

### Vector similarity

In [3]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity between two numeric vectors.

    Args:
        v1: First vector, a sized sequence of numbers (e.g. an embedding list).
        v2: Second vector; must have the same length as ``v1``.

    Returns:
        The dot product of the vectors divided by the product of their
        magnitudes. Returns 0.0 when either vector has zero magnitude
        (including empty vectors), since the angle is undefined there.

    Raises:
        ValueError: If the two vectors have different lengths.
    """
    # zip() would silently truncate to the shorter vector and yield a
    # meaningless score, so reject mismatched dimensions explicitly.
    if len(v1) != len(v2):
        raise ValueError(
            f"Vectors must have the same length, got {len(v1)} and {len(v2)}"
        )

    dot_product = sum(a * b for a, b in zip(v1, v2))

    # sqrt(|v1|^2 * |v2|^2) == |v1| * |v2|
    magnitude = (sum(a ** 2 for a in v1) * sum(b ** 2 for b in v2)) ** 0.5

    # Avoid ZeroDivisionError for all-zero or empty vectors.
    if magnitude == 0:
        return 0.0

    return dot_product / magnitude

In [12]:
# Embed the reference text; this vector is the one each query embedding is
# compared against in the cells that follow.
embeddings_response = openai_client.embeddings.create(input=content_input, model=MODEL_NAME)
content_embedding = embeddings_response.data[0].embedding
print(content_embedding)

[0.01874757558107376, -0.014915247447788715, -0.011708425357937813, 0.006682347971946001, 0.005184656009078026, -0.010924339294433594, -0.005924691911786795, 0.03307255730032921, -0.014457129873335361, 0.0005310748820193112, -0.022694434970617294, 0.018959015607833862, -0.004288243595510721, -0.07058532536029816, 0.01264228019863367, 0.04563554748892784, -0.04767945408821106, 0.022588714957237244, 0.012677519582211971, -0.003032825654372573, 0.011109348386526108, 0.021901538595557213, -0.014897627755999565, 0.014959297142922878, 0.06618035584688187, -0.0017619902500882745, -0.012977058067917824, -0.01072171051055193, -0.01226345170289278, 0.013514464721083641, 0.052190151065588, -0.013796383515000343, 0.0053035905584692955, -0.009016985073685646, -0.02380448766052723, -0.020403847098350525, 0.025566477328538895, -0.013250166550278664, -0.0010984907858073711, -0.025390278548002243, 0.0117172347381711, -0.016641996800899506, 0.013743523508310318, 0.05719420313835144, 0.016536278650164604

In [13]:
# First query: a bare person name with no surrounding context.
embeddings_response = openai_client.embeddings.create(input="Diego Zumárraga Mera", model=MODEL_NAME)
query_embedding1 = embeddings_response.data[0].embedding
print(query_embedding1)

[0.04424569383263588, 0.050422701984643936, -0.041180066764354706, -0.008556302636861801, -0.06373759359121323, -0.010935595259070396, 0.001229682588018477, 0.07865392416715622, -0.0027524870820343494, -0.04895852506160736, 0.004672793671488762, -0.025897685438394547, -0.007292303256690502, -0.025897685438394547, 0.031456995755434036, 0.05371711030602455, -0.041546110063791275, 0.008516266942024231, 0.012605675496160984, 0.03955573961138725, 0.03347024321556091, -0.01880556344985962, 0.02327817678451538, -0.03653587028384209, 0.017181238159537315, 0.00832180492579937, 0.03735947236418724, 0.03003857098519802, 0.024021705612540245, 0.014264317229390144, -0.0014420172665268183, -0.025074085220694542, 0.015728497877717018, 0.007635470479726791, -0.022935008630156517, -0.018016278743743896, -0.05417466536164284, 0.04124870151281357, -0.05284775048494339, -0.01046088058501482, 0.0041466038674116135, -0.009969008155167103, 0.011930780485272408, 0.0470367856323719, -0.0025666048750281334, 0.0

In [14]:
# Compare the two vectors
similarity = cosine_similarity(query_embedding1, content_embedding)
print(f"Similarity: {similarity:.4f}")

Similarity: 0.1799


In [15]:
# Second query: the same name, phrased as a full request to summarize a résumé.
embeddings_response = openai_client.embeddings.create(
    input="Resume la hoja de vida de Diego Zumárraga Mera", model=MODEL_NAME
)
query_embedding2 = embeddings_response.data[0].embedding
print(query_embedding2)

[0.04746444150805473, 0.07716769725084305, 0.013239097781479359, 0.010300321504473686, -0.07539157569408417, -0.005494867917150259, -0.0009596300660632551, 0.09721914678812027, -0.011153326369822025, -0.016931556165218353, 0.0010852437699213624, -0.02112646773457527, 0.0010107519337907434, -0.018754415214061737, 0.040219746530056, 0.0629120022058487, -0.027015704661607742, 0.03003043308854103, 0.031853292137384415, 0.04199586808681488, 0.011170853860676289, 0.00023388244153466076, -0.012643163092434406, 0.011094900779426098, 0.022341707721352577, -0.009347994811832905, 0.012666532769799232, 0.015576096251606941, 0.03961212933063507, -0.055106427520513535, -0.026151014491915703, -0.042323045432567596, 0.031900033354759216, 0.020729178562760353, -0.04440297558903694, -0.01840386539697647, -0.004300077445805073, 0.01998133957386017, -0.034657690674066544, -0.00618720380589366, 0.033185381442308426, 0.008565099909901619, 0.03813982009887695, 0.055760789662599564, 0.01728210598230362, 0.026

In [16]:
# Compare the two vectors
similarity = cosine_similarity(query_embedding2, content_embedding)
print(f"Similarity: {similarity:.4f}")

Similarity: 0.3429
