# Azure Cognitive Search Vector Search Code Sample with Azure AI Vision Image Embedding API
This code demonstrates how to create an Azure Cognitive Search index using the Azure AI Vision Image Embedding API and the Azure SDK for Python.
## Prerequisites
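To run the code, install the following packages. This sample is written against the pre-release version `11.4.0b11` of `azure-search-documents`, which the cell below pins. To try the latest pre-release instead, run `pip install azure-search-documents --pre`; be aware that class names in other SDK versions may differ from the ones imported in this sample.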

In [None]:
! pip install azure-search-documents==11.4.0b11 --quiet
! pip install python-dotenv ipython --quiet
! pip install azure-storage-blob --quiet

## Import required libraries and environment variables

In [None]:
# Import required libraries  
import os  
import json  
import requests
import http.client, urllib.parse
import shutil
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  

from azure.search.documents.indexes.models import (  
 
    ExhaustiveKnnParameters,  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,
    IndexingParameters,
    SimpleField,
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    SearchIndexer,
    SearchIndexerDataSourceConnection, 
    SearchIndexerDataContainer,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchProfile,  
)

from azure.storage.blob import BlobServiceClient
  
# Read settings from a .env file (when one is present) before looking them up.
load_dotenv()

# Azure Cognitive Search: service endpoint, target index name and admin key.
service_endpoint = os.environ.get("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.environ.get("AZURE_SEARCH_INDEX_NAME")
key = os.environ.get("AZURE_SEARCH_ADMIN_KEY")

# Azure AI Vision: key and region used by the image embedding calls.
aiVisionApiKey = os.environ.get("AZURE_AI_VISION_API_KEY")
aiVisionRegion = os.environ.get("AZURE_AI_VISION_REGION")

# Azure Blob Storage: where the generated vector files are uploaded.
blob_connection_string = os.environ.get("BLOB_CONNECTION_STRING")
container_name = os.environ.get("BLOB_CONTAINER_NAME")

# Any setting that is not defined comes back as None; the credential is built
# last so every other setting is already assigned if the admin key is rejected.
credential = AzureKeyCredential(key)

# Image Embedding

Vectorize images with the Azure AI Vision image embedding API.

In [None]:
def get_image_vector(image_path, key, region, *, max_retries=5, timeout=3):
    """Request an embedding vector for one image from the vectorizeImage endpoint.

    Args:
        image_path: Either an ``http://`` / ``https://`` URL, which is sent to
            the service as a JSON ``{"url": ...}`` body, or a local file path,
            whose bytes are uploaded as ``application/octet-stream``.
        key: Subscription key sent in the ``Ocp-Apim-Subscription-Key`` header.
        region: Region used to build the host name
            ``{region}.api.cognitive.microsoft.com``.
        max_retries: Maximum number of attempts when the request fails at the
            network/HTTP-protocol level. Values below 1 make no request.
        timeout: Connection timeout in seconds for each attempt.

    Returns:
        The value stored under ``"vector"`` in the JSON response, or ``None``
        when the service answers with a non-200 status or when every attempt
        fails with a network/protocol error.

    Raises:
        OSError: If ``image_path`` is a local path that cannot be read
            (for example ``FileNotFoundError``). This is not retried.
        ValueError: If the response body is not valid JSON.
    """
    headers = {
        'Ocp-Apim-Subscription-Key': key,
    }
    params = urllib.parse.urlencode({
        'model-version': '2023-04-15',
    })

    # The body is the same for every attempt, so build it once. Doing the file
    # read here also keeps local I/O errors out of the network retry loop.
    if image_path.startswith(('http://', 'https://')):
        headers['Content-Type'] = 'application/json'
        body = json.dumps({"url": image_path})
    else:
        headers['Content-Type'] = 'application/octet-stream'
        with open(image_path, "rb") as filehandler:
            body = filehandler.read()

    host = f'{region}.api.cognitive.microsoft.com'
    request_target = (
        "/computervision/retrieval:vectorizeImage"
        f"?api-version=2023-04-01-preview&{params}"
    )

    for attempt in range(max_retries):
        conn = None
        try:
            conn = http.client.HTTPSConnection(host, timeout=timeout)
            conn.request("POST", request_target, body, headers)
            response = conn.getresponse()
            status = response.status
            data = json.load(response)
        except (OSError, http.client.HTTPException):
            # http.client reports timeouts and connection failures as OSError
            # subclasses (socket.timeout / TimeoutError, ConnectionError, ...),
            # and protocol problems as HTTPException.
            if attempt < max_retries - 1:
                print(f"Timeout/Error for {image_path}. Retrying {attempt+1}/{max_retries}...")
                continue
            print(f"Error after {max_retries} attempts for {image_path}.")
            return None
        finally:
            # Release the socket on success and on every failure path.
            if conn is not None:
                conn.close()

        if status != 200:
            message = data.get('message', '')
            if not message:
                # NOTE(review): some error payloads appear to nest the text
                # under an "error" object -- confirm against the API reference.
                nested = data.get('error')
                if isinstance(nested, dict):
                    message = nested.get('message', '')
            print(f"Error processing image {image_path}: {message}")
            return None

        return data.get("vector")

    return None

# Manifest listing the images, and the folder that receives the vector batches.
input_json_file = '../data/images/apples/input.json'
output_folder = '../data/images/apples/output'

# Always start from an empty output folder so files from earlier runs are gone.
if os.path.exists(output_folder):
    shutil.rmtree(output_folder)
os.makedirs(output_folder)

with open(input_json_file, 'r') as infile:
    images = json.load(infile)

results = []
file_count = 1
images_per_file = 50

# Image paths in the manifest are resolved relative to the manifest's folder.
input_dir = os.path.dirname(input_json_file)
total_images = len(images)

for idx, image_data in enumerate(images):
    processed = idx + 1
    relative_path = image_data.get('image_path', None)

    if relative_path:
        try:
            # get_image_vector is the synchronous helper defined earlier in this file.
            vector = get_image_vector(
                os.path.join(input_dir, relative_path),
                aiVisionApiKey,
                aiVisionRegion
            )
        except Exception as e:
            print(f"Error processing image at index {idx}: {e}")
            vector = None

        description_present = image_data.get('description', '')

        # A record is kept only when both the vector and a non-empty
        # description are available; anything else is dropped.
        if vector and description_present:
            results.append({
                "id": idx,
                "image_vector": vector,
                "description": description_present
            })

    print(f"Processed image {processed}/{total_images}")

    # Flush the batch after every `images_per_file` entries and after the last
    # entry. The counter follows manifest entries, not kept records, so a
    # batch file may hold fewer records (or none).
    batch_complete = processed % images_per_file == 0
    if batch_complete or processed == total_images:
        print(f"Saving results to file {file_count}")
        output_file_path = os.path.join(output_folder, f"output_{file_count}.json")
        with open(output_file_path, 'w') as outfile:
            json.dump(results, outfile, indent=4)
        results = []
        file_count += 1

print(f"Results are saved to {output_folder}")


In [None]:
# Connect to the storage account and get a client for the target container.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)

# The last component of the local output path doubles as the folder prefix
# under which the blobs are stored.
folder_name = os.path.basename(output_folder)

# Upload every file below output_folder, mirroring its relative location.
for root, _dirs, files in os.walk(output_folder):
    for filename in files:
        file_path = os.path.join(root, filename)
        relative_path = os.path.relpath(file_path, output_folder)

        # Blob names use forward slashes, so normalise Windows separators.
        blob_name = os.path.join(folder_name, relative_path).replace("\\", "/")
        blob_client = container_client.get_blob_client(blob_name)

        print(f"Uploading file to blob: {blob_name}")
        with open(file_path, "rb") as data:
            # overwrite=True replaces a blob of the same name from an earlier run.
            blob_client.upload_blob(data, overwrite=True)

print(f"All files from {output_folder} have been uploaded to the {container_name} container within the {folder_name} folder.")

In [None]:
# Client used to manage data source connections on the search service.
ds_client = SearchIndexerClient(service_endpoint, credential)

# Point the data source at the container and limit it to the uploaded
# virtual folder via the container query.
blob_container = SearchIndexerDataContainer(name=container_name, query=folder_name)

data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=blob_container,
)

# Register the connection, replacing an existing one with the same name.
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

In [None]:
# Client used to manage indexes on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# --- Index schema -----------------------------------------------------------
# String document key.
id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)

# Description text, usable for sorting, filtering and faceting.
description_field = SearchField(
    name="description",
    type=SearchFieldDataType.String,
    sortable=True,
    filterable=True,
    facetable=True,
)

# Vector field searched through the HNSW profile and hidden from results.
# NOTE(review): 1024 dimensions presumably matches the size of the image
# embedding vectors stored in the uploaded files -- confirm.
image_vector_field = SearchField(
    name="image_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    hidden=True,
    vector_search_dimensions=1024,
    vector_search_profile="myHnswProfile",
)

fields = [id_field, description_field, image_vector_field]

# --- Vector search configuration -------------------------------------------
# Two algorithms are registered, each exposed through its own profile.
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=1000,
        metric="cosine",
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnVectorSearchAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(
        metric="cosine",
    ),
)

vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[
        VectorSearchProfile(name="myHnswProfile", algorithm="myHnsw"),
        VectorSearchProfile(name="myExhaustiveKnnProfile", algorithm="myExhaustiveKnn"),
    ],
)

# --- Create (or update) the index ------------------------------------------
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

In [None]:
# The indexer name is derived from the index it feeds.
indexer_name = f"{index_name}-indexer"

# Indexer settings: extract content and metadata, and parse each blob with
# the "jsonArray" parsing mode.
indexer_configuration = {
    "dataToExtract": "contentAndMetadata",
    "parsingMode": "jsonArray",
}
indexing_parameters = IndexingParameters(configuration=indexer_configuration)

# Wire the blob data source to the target index.
indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to process images",
    data_source_name=data_source.name,
    target_index_name=index_name,
    parameters=indexing_parameters,
)

# Register the indexer (replacing one with the same name), then start a run.
indexer_client = SearchIndexerClient(service_endpoint, credential)
indexer_result = indexer_client.create_or_update_indexer(indexer)
indexer_client.run_indexer(indexer_name)

print(f' {indexer_name} created')