# Load Image File vectors from Storage Account into CogSearch 
### Images from the Storage Account are loaded into Cognitive Search by following the steps below
- Establish a connection with Storage Account using the Python SDK.
- Retrieve the required image files from Storage Account container using the file stream download method.
- Use Azure AI Vision to vectorize the Image files from Storage Account.
- Index the vector chunks into Azure Cognitive Search.
- Repeat the process for all the required files.

#### Use the Azure Storage Python SDK to list the image blobs, use AI Vision to embed each image, and create a vector index
Inspired by the repositories below:
- Azure Python SDK https://github.com/Azure/azure-sdk-for-python/tree/main/sdk/storage/azure-storage-blob
- Refer to https://github.com/MSUSAzureAccelerators/Azure-Cognitive-Search-Azure-OpenAI-Accelerator/blob/main/04-Complex-Docs.ipynb
- Refer to https://github.com/Azure/azureml-examples

In [None]:
!pip install -r requirements.txt

In [1]:
!az login

[
  {
    "cloudName": "AzureCloud",
    "homeTenantId": "d02378ec-1688-46d5-8540-1c28b5f470f6",
    "id": "c17a0f1e-eff5-4d9a-b56b-20d006070ed2",
    "isDefault": true,
    "managedByTenants": [],
    "name": "Azure for Students",
    "state": "Enabled",
    "tenantId": "d02378ec-1688-46d5-8540-1c28b5f470f6",
    "user": {
      "name": "s222521972@deakin.edu.au",
      "type": "user"
    }
  }
]




### Import required libraries and environment variables

In [10]:
# Import required libraries  
import os
import json
import requests
from config import *
import http.client, urllib.parse
from tenacity import retry, stop_after_attempt, wait_fixed
from dotenv import load_dotenv  
from azure.core.credentials import AzureKeyCredential
# from azure.identity import DefaultAzureCredential
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient  
from azure.search.documents.models import (
    RawVectorQuery,
)
from azure.search.documents.indexes.models import (  
 
    ExhaustiveKnnParameters,  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,
    SimpleField,
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchProfile,  
)
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient

from IPython.display import Image, display
from tqdm import tqdm
import requests
  
# Load any .env entries into the process environment.
load_dotenv()

# Azure Cognitive Search connection settings. The AZ_* names are not defined
# in this notebook; presumably they come from `from config import *`.
index_name = "bot_index"
service_endpoint, api_version, key = (
    AZ_SEARCH_ENDPOINT,
    AZ_SEARCH_VERSION,
    AZ_SEARCH_KEY,
)
credential = AzureKeyCredential(key)

# Azure AI Vision credentials used by the image vectorization call.
aiVisionApiKey, aiVisionRegion = AZ_VISION_KEY, AZ_VISION_REGION

# Header and query-string payloads for direct REST calls to the search service.
headers = {
    'Content-Type': 'application/json',
    'api-key': key,
}
params = {
    'api-version': api_version,
}

### Get Image vectors using AI Vision

In [4]:
# NOTE: the tenacity retry decorator is currently disabled; to retry transient
# failures, re-enable it with:
#   @retry(stop=stop_after_attempt(5), wait=wait_fixed(1))
def get_image_vector(image_path, key, region):
    """Return the embedding vector for one image from the vectorizeImage endpoint.

    Args:
        image_path: Either an http(s) URL (sent to the service as JSON) or a
            local file path (read and sent as raw bytes).
        key: Subscription key sent in the ``Ocp-Apim-Subscription-Key`` header.
        region: Region prefix used to build the
            ``{region}.api.cognitive.microsoft.com`` host name.

    Returns:
        The value of the ``"vector"`` field of the JSON response, or ``None``
        if the response has no such field.

    Raises:
        Exception: If the service answers with a status other than 200.
        ValueError: If a 200 response body is not valid JSON.
        OSError: If the local file cannot be read or the connection fails.
    """
    print("Processing image: ",image_path)
    headers = {
        'Ocp-Apim-Subscription-Key': key,
    }

    params = urllib.parse.urlencode({
        'model-version': '2023-04-15',
    })

    if image_path.startswith(('http://', 'https://')):
        # Remote image: let the service fetch it from the URL.
        headers['Content-Type'] = 'application/json'
        body = json.dumps({"url": image_path})
    else:
        # Local image: upload the raw bytes.
        headers['Content-Type'] = 'application/octet-stream'
        with open(image_path, "rb") as filehandler:
            body = filehandler.read()

    conn = http.client.HTTPSConnection(f'{region}.api.cognitive.microsoft.com', timeout=3)
    try:
        conn.request("POST", "/computervision/retrieval:vectorizeImage?api-version=2023-04-01-preview&%s" % params, body, headers)
        response = conn.getresponse()
        status = response.status
        raw = response.read()
    finally:
        # Always release the socket, even when the request or read fails.
        conn.close()

    try:
        data = json.loads(raw)
    except ValueError:
        if status == 200:
            raise
        # A non-JSON error body (e.g. a gateway error page) must not hide the
        # HTTP failure; carry the raw text as the error message instead.
        data = {"message": raw.decode("utf-8", errors="replace")}

    print(data)
    if status != 200:
        message = ''
        if isinstance(data, dict):
            # Accept both a top-level "message" and a nested
            # {"error": {"message": ...}} envelope.
            error = data.get('error')
            nested = error.get('message', '') if isinstance(error, dict) else ''
            message = data.get('message') or nested or ''
        raise Exception(f"Error processing image {image_path}: {message}")
    return data.get("vector")


### Build the index record (vector and description) for each image

In [5]:
import json

# Build the index record for a single image.
def read_json_blob(filename, file_url, idx):
    """Vectorize one image and pair the vector with its description.

    Args:
        filename: Name of the image; accepted but not used by this function.
        file_url: Location of the image, passed to ``get_image_vector`` and
            used as the lookup key into ``IMG_DESCRIPTION``.
        idx: Value stored under the ``"id"`` key of the record.

    Returns:
        A dict with ``"id"``, ``"image_vector"`` and ``"description"`` when
        both the vector and the description are non-empty; otherwise ``{}``.
    """
    embedding = get_image_vector(
        file_url,
        aiVisionApiKey,
        aiVisionRegion
    )

    # A missing entry falls back to '' and is treated like "no description".
    caption = IMG_DESCRIPTION.get(file_url, '')

    # Records are only produced when BOTH pieces are present.
    if not (embedding and caption):
        return {}

    return {
        "id": idx,
        "image_vector": embedding,
        "description": caption,
    }


### List the blobs in your container using the ContainerClient

In [6]:
from azure.storage.blob import BlobClient

# Connect to the storage container that holds the images.
container = ContainerClient.from_connection_string(conn_str=AZ_STORAGE_CONNECTION_STRING, container_name=AZ_STORAGE_CONTAINER_NAME)

blob_list = container.list_blobs()

# One entry per blob: a running index, the blob name, and a URL built by
# joining the storage base URL, the container name and the blob name.
files_to_index = [
    {
        "idx": position,
        "file_name": entry.name,
        "file_url": (AZ_STORAGE_BASE_URL + AZ_STORAGE_CONTAINER_NAME + "/" + entry.name),
    }
    for position, entry in enumerate(blob_list)
]

# Keep the counter available afterwards: it equals the number of blobs listed.
idx = len(files_to_index)

### Vectorize each blob image with Azure AI Vision

In [7]:
# Attach the index record (or {} when none could be built) to every file entry.
for item in files_to_index:
    item.update(
        image_map=read_json_blob(
            filename=item["file_name"],
            file_url=item["file_url"],
            idx=item["idx"],
        )
    )

Processing image:  https://sit788felixstorage.blob.core.windows.net/bot-image/img_1.jpg
{'vector': [-0.7402344, 0.49536133, 0.35351562, -1.8935547, -2.109375, -0.65722656, 0.23864746, 0.0096206665, -1.3193359, -0.2705078, 0.93408203, -1.7177734, 1.1962891, 0.16833496, 1.2050781, -6.34375, 2.1464844, -0.296875, 0.9301758, 0.6254883, -0.45825195, 1.3974609, -1.8242188, 2.4902344, 1.3066406, 0.6621094, -40.5, -0.26464844, 0.14526367, 1.40625, -1.1220703, 0.001461029, -2.0058594, -1.5390625, -0.9741211, -0.39331055, -1.3095703, -0.13757324, 0.15466309, -3.203125, 0.09442139, 2.828125, -0.9243164, -1.7402344, -0.78222656, 0.8955078, -1.3154297, -0.13964844, 0.17260742, -0.55908203, 0.2709961, 0.67089844, -1.7246094, 1.1787109, -1.4619141, -0.62109375, -1.5986328, -1.109375, -0.007820129, -2.0332031, -0.5541992, 0.45996094, -0.15209961, -0.49145508, 0.7290039, -0.17358398, -0.39331055, -0.55029297, 1.1113281, 0.69628906, 3.671875, 0.012557983, 2.0371094, -0.3552246, 2.2441406, 0.119384766, 3

In [8]:
# Print every file entry for inspection. Each entry includes its full
# "image_map" record, so the output contains the complete image vector.
for item in files_to_index:
    print(item)

{'idx': 0, 'file_name': 'img_1.jpg', 'file_url': 'https://sit788felixstorage.blob.core.windows.net/bot-image/img_1.jpg', 'image_map': {'id': 0, 'image_vector': [-0.7402344, 0.49536133, 0.35351562, -1.8935547, -2.109375, -0.65722656, 0.23864746, 0.0096206665, -1.3193359, -0.2705078, 0.93408203, -1.7177734, 1.1962891, 0.16833496, 1.2050781, -6.34375, 2.1464844, -0.296875, 0.9301758, 0.6254883, -0.45825195, 1.3974609, -1.8242188, 2.4902344, 1.3066406, 0.6621094, -40.5, -0.26464844, 0.14526367, 1.40625, -1.1220703, 0.001461029, -2.0058594, -1.5390625, -0.9741211, -0.39331055, -1.3095703, -0.13757324, 0.15466309, -3.203125, 0.09442139, 2.828125, -0.9243164, -1.7402344, -0.78222656, 0.8955078, -1.3154297, -0.13964844, 0.17260742, -0.55908203, 0.2709961, 0.67089844, -1.7246094, 1.1787109, -1.4619141, -0.62109375, -1.5986328, -1.109375, -0.007820129, -2.0332031, -0.5541992, 0.45996094, -0.15209961, -0.49145508, 0.7290039, -0.17358398, -0.39331055, -0.55029297, 1.1113281, 0.69628906, 3.671875, 

### Create an index

Create your search index schema and vector search configuration:

In [11]:
# Client used to manage index definitions on the search service.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# --- Index schema -----------------------------------------------------------
# Document key.
id_field = SimpleField(name="id", type=SearchFieldDataType.String, key=True)
# Image description; sortable, filterable and facetable.
description_field = SearchField(
    name="description",
    type=SearchFieldDataType.String,
    sortable=True,
    filterable=True,
    facetable=True,
)
# 1024-dimension image embedding, searched through the HNSW profile and
# marked hidden.
image_vector_field = SearchField(
    name="image_vector",
    hidden=True,
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1024,
    vector_search_profile="myHnswProfile",
)
fields = [id_field, description_field, image_vector_field]

# --- Vector search configuration --------------------------------------------
# Approximate nearest-neighbour algorithm (HNSW) with cosine similarity.
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=1000,
        metric="cosine",
    ),
)
# Exhaustive KNN algorithm with cosine similarity.
exhaustive_knn_algorithm = ExhaustiveKnnVectorSearchAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(
        metric="cosine",
    ),
)
# Profiles bind a profile name (referenced by fields) to an algorithm name.
hnsw_profile = VectorSearchProfile(
    name="myHnswProfile",
    algorithm="myHnsw",
)
exhaustive_knn_profile = VectorSearchProfile(
    name="myExhaustiveKnnProfile",
    algorithm="myExhaustiveKnn",
)
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[hnsw_profile, exhaustive_knn_profile],
)

# Create the index, or update it if one with this name already exists.
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

bot_index created


### Push Image vector data to the index

In [16]:
# Upload one document per image to the search index through the REST
# "docs/index" endpoint.
for item in files_to_index:
    img = item.get('image_map')
    # read_json_blob returns {} when the image has no vector or no description.
    # Such entries lack the "id"/"image_vector"/"description" keys, so skip
    # them instead of failing the whole upload with a KeyError.
    if not img:
        print("Skipping", item["file_name"], "- no vector/description to upload")
        continue

    print("Uploading chunks from",item["file_name"])
    upload_payload = {
        "value": [
            {
                # The index declares "id" as a string key field.
                "id": str(img["id"]),
                "image_vector": img["image_vector"],
                "description": img["description"],
                "@search.action": "upload"
            },
        ]
    }
    # A timeout keeps a stalled connection from hanging the loop forever.
    r = requests.post(AZ_SEARCH_ENDPOINT + "/indexes/" + index_name + "/docs/index",
                         data=json.dumps(upload_payload), headers=headers, params=params,
                         timeout=30)
    # Report failures and carry on with the remaining images.
    if r.status_code != 200:
        print(r.status_code)
        print(r.text)

Uploading chunks from img_1.jpg
{"@odata.context":"https://sit788-felix-cognitive-search.search.windows.net/indexes('bot_index')/$metadata#Collection(Microsoft.Azure.Search.V2024_03_01_Preview.IndexResult)","value":[{"key":"0","status":true,"errorMessage":null,"statusCode":200}]}
Uploading chunks from img_2.jpg
{"@odata.context":"https://sit788-felix-cognitive-search.search.windows.net/indexes('bot_index')/$metadata#Collection(Microsoft.Azure.Search.V2024_03_01_Preview.IndexResult)","value":[{"key":"1","status":true,"errorMessage":null,"statusCode":200}]}
Uploading chunks from img_3.jpg
{"@odata.context":"https://sit788-felix-cognitive-search.search.windows.net/indexes('bot_index')/$metadata#Collection(Microsoft.Azure.Search.V2024_03_01_Preview.IndexResult)","value":[{"key":"2","status":true,"errorMessage":null,"statusCode":200}]}
Uploading chunks from img_4.jpg
{"@odata.context":"https://sit788-felix-cognitive-search.search.windows.net/indexes('bot_index')/$metadata#Collection(Microso