# Multi-modal vector embeddings

A vector embedding can also represent non-textual data, such as images.

In [None]:
import os

import requests
from PIL import Image
import dotenv
import matplotlib.pyplot as plt

# Load variables from a local .env file (if one exists) into the process
# environment, so the key below can be supplied without editing this cell.
dotenv.load_dotenv()

# API Key authentication
# Take the key from the environment / .env file when it is provided there;
# the empty-string fallback keeps the previous value when it is not set.
AZURE_AIVISION_API_KEY = os.environ.get("AZURE_AIVISION_API_KEY", "")
# Base endpoint of the vision resource. No trailing slash: request URLs are
# built as f"{AZURE_COMPUTER_VISION_URL}/computervision/...", so a trailing
# slash here would put a double slash into the request path.
AZURE_COMPUTER_VISION_URL = "https://multimodalaiservice.cognitiveservices.azure.com"

def get_model_params():
    """Return a new dict of the query parameters sent with each request."""
    params = {}
    params["api-version"] = "2024-02-01"
    params["model-version"] = "2023-04-15"
    return params

def get_auth_headers():
    """Return a new header dict carrying the configured subscription key."""
    auth_headers = {}
    auth_headers["Ocp-Apim-Subscription-Key"] = AZURE_AIVISION_API_KEY
    return auth_headers

def get_image_embedding(image_file):
    """Return the embedding the vectorizeImage endpoint produces for an image.

    Args:
        image_file: Path of the image file whose bytes are uploaded.

    Returns:
        The value stored under "vector" in the JSON response.

    Raises:
        RuntimeError: If the service answers with a status other than 200.
        requests.RequestException: On connection failure or timeout.
        OSError: If the image file cannot be opened.
    """
    # Strip any trailing slash from the configured endpoint so the path
    # never starts with "//".
    base_url = AZURE_COMPUTER_VISION_URL.rstrip("/")
    url = f"{base_url}/computervision/retrieval:vectorizeImage"
    headers = get_auth_headers()
    headers["Content-Type"] = "application/octet-stream"

    # Send the file's raw bytes as the request body.
    with open(image_file, "rb") as image_data:
        response = requests.post(
            url,
            headers=headers,
            params=get_model_params(),
            data=image_data,
            timeout=60,  # do not hang forever on an unresponsive service
        )

    if response.status_code != 200:
        # Fail here with the status and body instead of falling through to
        # a misleading KeyError on "vector". response.text is used because
        # an error body is not guaranteed to be JSON.
        raise RuntimeError(
            f"vectorizeImage failed for {image_file}: "
            f"HTTP {response.status_code} {response.text}"
        )
    return response.json()["vector"]


In [None]:
import json

# Maps each processed image file name to its embedding
vectors = {}
for image_file in os.listdir("./data"):
    # Skip anything that is not a .png file (extension check ignores case)
    if not image_file.lower().endswith('.png'):
        continue
    image_path = f"./data/{image_file}"
    try:
        vectors[image_file] = get_image_embedding(image_path)
        print(f"✓ Processed: {image_file}")
    except Exception as e:
        # Report the failure and carry on with the remaining files
        print(f"✗ Error with {image_file}: {e}")

# Persist every embedding that was collected as JSON
with open("./data/images_ai-vision.json", "w") as f:
    json.dump(vectors, f)
    

In [None]:
# Open one of the source images; as the cell's final expression its value is
# what the notebook shows.
Image.open("./data/planeMotor.png")

In [None]:
# Look up the embedding stored for that image file name
vectors["planeMotor.png"]

In [None]:
# Number of components in that image's embedding
len(vectors["planeMotor.png"])

# Multi-modal Vectors Analysis


In [None]:
# Read the saved embeddings back from the JSON file
with open('./data/images_ai-vision.json') as embeddings_file:
    image_vectors = json.load(embeddings_file)


## Most similar to a target image


In [None]:
import pandas as pd

def cosine_similarity(v1, v2):
    """Calculate the cosine similarity between two vectors.

    Args:
        v1: Sequence of numbers.
        v2: Sequence of numbers with the same length as ``v1``.

    Returns:
        The dot product of the vectors divided by the product of their
        magnitudes, or 0.0 when that product is zero (a zero-magnitude or
        empty vector has no direction to compare).

    Raises:
        ValueError: If the two vectors differ in length.
    """
    # zip() would silently drop the extra components of the longer vector
    # and yield a meaningless score, so reject mismatched inputs up front.
    if len(v1) != len(v2):
        raise ValueError(
            f"vectors must have the same length, got {len(v1)} and {len(v2)}"
        )
    dot_product = sum(a * b for a, b in zip(v1, v2))
    magnitude = (sum(a**2 for a in v1) * sum(b**2 for b in v2)) ** 0.5
    if magnitude == 0:
        # Avoid ZeroDivisionError for all-zero or empty vectors.
        return 0.0
    return dot_product / magnitude

def most_similar(target_vector: list[float], vectors: dict[str, list[float]]) -> pd.DataFrame:
    """Rank the given vectors by cosine similarity to the target vector.

    Args:
        target_vector: The vector every entry of ``vectors`` is compared to.
        vectors: Mapping of a key to its vector.

    Returns:
        A DataFrame with the columns 'vector key' and 'similarity', holding
        one row per entry of ``vectors`` ordered from most to least similar.
    """
    similarities = {
        key: cosine_similarity(target_vector, vector)
        for key, vector in vectors.items()
    }
    # Sort the (key, similarity) pairs by score, highest first.
    ranked = sorted(similarities.items(), key=lambda item: item[1], reverse=True)
    return pd.DataFrame(ranked, columns=['vector key', 'similarity'])


In [None]:
# Pick the target image and draw it
target_image = "planeMotor.png"
target_pixels = plt.imread(f"./data/{target_image}")
plt.imshow(target_pixels)


In [None]:
# Rank all stored image vectors against the target and keep the first three rows
ranked_images = most_similar(image_vectors[target_image], image_vectors)
most_similar_df = ranked_images[0:3]
most_similar_df


In [None]:
# Draw every image in the ranking after its first row
neighbor_names = most_similar_df['vector key'][1:]
for image_name in neighbor_names:
    neighbor_pixels = plt.imread(f'./data/{image_name}')
    plt.imshow(neighbor_pixels)
    plt.axis('off')
    plt.show()


## Search with text


In [None]:
def get_text_embedding(text):
    """Return the embedding the vectorizeText endpoint produces for a text.

    Args:
        text: The text sent to the service in the "text" field.

    Returns:
        The value stored under "vector" in the JSON response.

    Raises:
        RuntimeError: If the service answers with a status other than 200.
        requests.RequestException: On connection failure or timeout.
    """
    # Strip any trailing slash from the configured endpoint so the path
    # never starts with "//".
    base_url = AZURE_COMPUTER_VISION_URL.rstrip("/")
    url = f"{base_url}/computervision/retrieval:vectorizeText"
    headers = get_auth_headers()
    headers["Content-Type"] = "application/json"
    response = requests.post(
        url,
        headers=headers,
        params=get_model_params(),
        json={"text": text},
        timeout=60,  # do not hang forever on an unresponsive service
    )
    if response.status_code != 200:
        # Report the status and body rather than failing later with a
        # KeyError on "vector"; an error body is not guaranteed to be JSON.
        raise RuntimeError(
            f"vectorizeText failed: HTTP {response.status_code} {response.text}"
        )
    return response.json()["vector"]


In [None]:
# Embed a text query and rank the stored image vectors against it
query_text = "Wing"  # Examples: "Motor" "Shoe" "Plane"
embedding = get_text_embedding(query_text)
most_similar_df = most_similar(embedding, image_vectors)

# Draw the first three images of the ranking
top_matches = most_similar_df['vector key'][0:3]
for image_name in top_matches:
    plt.imshow(plt.imread(f'./data/{image_name}'))
    plt.axis('off')
    plt.show()
