# Vector embeddings with OpenAI

## Setup OpenAI API

In [1]:
import os

import azure.identity
import dotenv
import openai

# Set up OpenAI client based on environment variables
dotenv.load_dotenv()
AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE")
AZURE_OPENAI_ADA_DEPLOYMENT = os.getenv("AZURE_OPENAI_ADA_DEPLOYMENT")

# Fail fast on missing configuration. os.getenv() returns None for an unset
# variable, which would otherwise be formatted into the endpoint URL below as
# the literal text "None" and only surface later as a confusing request failure.
_missing_settings = [
    name
    for name, value in (
        ("AZURE_OPENAI_SERVICE", AZURE_OPENAI_SERVICE),
        ("AZURE_OPENAI_ADA_DEPLOYMENT", AZURE_OPENAI_ADA_DEPLOYMENT),
    )
    if not value
]
if _missing_settings:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_settings)
        + ". Set them in the environment or in a .env file."
    )

# Authenticate with an Azure credential instead of an API key: the token
# provider hands the client bearer tokens for the Cognitive Services scope.
azure_credential = azure.identity.DefaultAzureCredential()
token_provider = azure.identity.get_bearer_token_provider(
    azure_credential,
    "https://cognitiveservices.azure.com/.default",
)

# Client used by every embeddings request in this notebook
openai_client = openai.AzureOpenAI(
    api_version="2024-06-01",
    azure_endpoint=f"https://{AZURE_OPENAI_SERVICE}.openai.azure.com",
    azure_ad_token_provider=token_provider,
)


## Vector representations

In [2]:
# Text to turn into an embedding
sentence = "A dog just walked past my house and yipped yipped like a Martian"

# Request an embedding for the sentence from the configured deployment
response = openai_client.embeddings.create(
    input=sentence,
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)

# Keep the embedding of the first entry in the response
vector = response.data[0].embedding

In [3]:
# Display the embedding values computed for `sentence`
vector

[-0.01423619594424963,
 -0.0071890209801495075,
 -0.02720867283642292,
 0.012327720411121845,
 -0.0037782657891511917,
 0.022089315578341484,
 0.009626195766031742,
 -0.018027357757091522,
 -0.0006479790317825973,
 -0.025016503408551216,
 0.01601572148501873,
 -0.0027934021782130003,
 0.01059332862496376,
 -0.005583580583333969,
 0.011515328660607338,
 0.011657175607979298,
 0.023946210741996765,
 0.01013555284589529,
 0.016699161380529404,
 0.02705393172800541,
 -0.011025315150618553,
 0.022463273257017136,
 0.019303971901535988,
 -0.023546462878584862,
 -0.015409651212394238,
 -0.002952979179099202,
 0.021392978727817535,
 -0.015074377879500389,
 0.0009292535251006484,
 -0.009465007111430168,
 0.014081454835832119,
 -0.005967210046947002,
 -0.03871110454201698,
 0.005361140239983797,
 -0.02168956771492958,
 -0.020722433924674988,
 0.016531525179743767,
 -0.010748070664703846,
 0.009323161095380783,
 -0.023198293522000313,
 -0.005083895288407803,
 0.007479161024093628,
 0.005751216784

In [4]:
# Count the values in the embedding, i.e. its number of dimensions
len(vector)

1536

### Document similarity modeled as cosine similarity

In [5]:
import numpy as np
import pandas as pd


def cosine_similarity(a, b):
    """Return the cosine similarity between vectors ``a`` and ``b``.

    The result is the dot product of the two inputs divided by the product
    of their norms (as computed by ``np.linalg.norm``).
    """
    dot_product = np.dot(a, b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

# Left-hand side of each comparison: the same sentence, three times over
sentences1 = ['The new movie is awesome'] * 3

# Right-hand side: an identical sentence, a reworded sentence, and a string
# of arbitrary characters
sentences2 = [
    'The new movie is awesome',
    'This recent movie is so good',
    'djkshsjdkhfsjdfkhsd',
]

def get_embeddings(sentences):
    """Embed ``sentences`` with a single embeddings request.

    ``sentences`` is passed as the ``input`` of one request against the
    ``AZURE_OPENAI_ADA_DEPLOYMENT`` deployment. Returns a list holding the
    ``embedding`` attribute of each entry in the response's ``data``, in the
    order the response lists them.
    """
    result = openai_client.embeddings.create(
        input=sentences,
        model=AZURE_OPENAI_ADA_DEPLOYMENT,
    )
    return [entry.embedding for entry in result.data]

# Embed each list of sentences
embeddings1 = get_embeddings(sentences1)
embeddings2 = get_embeddings(sentences2)

# Walk the sentence pairs in lockstep and print the similarity of each pair
for left, right, left_vec, right_vec in zip(sentences1, sentences2, embeddings1, embeddings2):
    score = cosine_similarity(left_vec, right_vec)
    print(f"{left} \t\t {right} \t\t Score: {score:.4f}")

The new movie is awesome 		 The new movie is awesome 		 Score: 1.0000
The new movie is awesome 		 This recent movie is so good 		 Score: 0.9191
The new movie is awesome 		 djkshsjdkhfsjdfkhsd 		 Score: 0.7461


### Vector search

In [6]:
import json

# Load in vectors for movie titles.
# The encoding is passed explicitly: without it, open() decodes the file with
# the platform's locale encoding, which is not UTF-8 on every system, while
# JSON text is UTF-8.
with open('openai_movies.json', encoding='utf-8') as json_file:
    movie_vectors = json.load(json_file)

In [7]:
# Embed the search query
query = "101 Dalmations"

embeddings_response = openai_client.embeddings.create(
    input=[query],
    model=AZURE_OPENAI_ADA_DEPLOYMENT,
)
vector = embeddings_response.data[0].embedding

# Score every entry in movie_vectors against the query vector
scores = [
    (title, cosine_similarity(vector, title_vector))
    for title, title_vector in movie_vectors.items()
]

# Rank by score, highest first, and show the ten best matches
df = pd.DataFrame(scores, columns=['Movie', 'Score']).sort_values('Score', ascending=False)
df.head(10)

Unnamed: 0,Movie,Score
8,101 Dalmatians,0.979849
335,102 Dalmatians,0.948893
28,The Fox and the Hound,0.86151
6,Lady and the Tramp,0.851573
48,The Great Mouse Detective,0.84171
468,Beverly Hills Chihuahua,0.840783
15,The Aristocats,0.83997
558,The Good Dinosaur,0.838789
135,Aladdin,0.837235
391,Teacher's Pet: The Movie,0.836816
