# Azure AI Search integrated vectorization sample
This code demonstrates how to use Azure AI Search as a vector store. Documents are chunked automatically and their embeddings are generated by the AzureOpenAIEmbedding skill, both as part of the skillset pipeline in Azure AI Search.
## Prerequisites
To run the code, install the following packages. This sample currently uses version `11.4.0b12` of `azure-search-documents`. Please note that the integrated vectorization feature is in preview and has not yet been published to [azure-search-documents](https://pypi.org/project/azure-search-documents/#description) on PyPI. If you'd like to use this feature, please install the package from the whl file. We hope to publish an updated version soon!

In [1]:
# Install the preview build (11.4.0b12) of azure-search-documents from the local wheel file.
! pip install whl/azure_search_documents-11.4.0b12-py3-none-any.whl  --quiet
# Install the other packages this notebook imports (openai, azure.storage.blob, dotenv).
! pip install openai azure-storage-blob python-dotenv --quiet

## Import required libraries and environment variables

In [2]:
# Import required libraries  
from azure.core.credentials import AzureKeyCredential  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient  
from azure.search.documents.models import (
    QueryAnswerType,
    QueryCaptionType,
    QueryLanguage,
    QueryType,
    RawVectorQuery,
    VectorizableTextQuery,
    VectorFilterMode,    
)
from azure.search.documents.indexes.models import (  
    AzureOpenAIEmbeddingSkill,  
    AzureOpenAIParameters,  
    AzureOpenAIVectorizer,  
    ExhaustiveKnnParameters,  
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    FieldMapping,  
    HnswParameters,  
    HnswVectorSearchAlgorithmConfiguration,  
    IndexProjectionMode,  
    InputFieldMappingEntry,  
    OutputFieldMappingEntry,  
    PrioritizedFields,    
    SearchField,  
    SearchFieldDataType,  
    SearchIndex,  
    SearchIndexer,  
    SearchIndexerDataContainer,  
    SearchIndexerDataSourceConnection,  
    SearchIndexerIndexProjectionSelector,  
    SearchIndexerIndexProjections,  
    SearchIndexerIndexProjectionsParameters,  
    SearchIndexerSkillset,  
    SemanticConfiguration,  
    SemanticField,  
    SemanticSettings,  
    SplitSkill,  
    VectorSearch,  
    VectorSearchAlgorithmKind,  
    VectorSearchAlgorithmMetric,  
    VectorSearchProfile,  
)  

from azure.storage.blob import BlobServiceClient  
import openai  


In [3]:
# Name of the search index; the data source, skillset and indexer names
# used by the later cells are derived from it.
index_name = "cogsrch-index-files"
# Name of the blob container.
# NOTE(review): this constant is not referenced by the visible cells; the
# configuration cell assigns the same literal to `container_name` instead.
BLOB_CONTAINER_NAME = "demo-vbd-mercedes"

# NOTE(review): the lines below look like a credentials.env template, but the
# configuration cell reads AZURE_SEARCH_ENDPOINT and AZURE_SEARCH_KEY rather
# than these names - confirm which names your credentials.env defines.
#AZURE_SEARCH_SERVICE_ENDPOINT=YOUR-SEARCH-SERVICE-ENDPOINT
#AZURE_SEARCH_INDEX_NAME=YOUR-SEARCH-SERVICE-INDEX-NAME
#AZURE_SEARCH_ADMIN_KEY=YOUR-SEARCH-SERVICE-ADMIN-KEY

In [4]:
import os

from dotenv import load_dotenv

# Load the settings stored in credentials.env into the process environment.
load_dotenv("credentials.env")

# Azure AI Search connection settings.
service_endpoint = os.getenv("AZURE_SEARCH_ENDPOINT")
# The index name is hard-coded here rather than read from AZURE_SEARCH_INDEX_NAME.
index_name = "cogsrch-index-files"
# Note: the variable is AZURE_SEARCH_KEY (no _ADMIN_ in the name).
key = os.getenv("AZURE_SEARCH_KEY")

# Module-level settings of the openai package, taken from the environment.
openai.api_type = "azure"
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION")
# Used below as the Azure OpenAI deployment id for the vectorizer and the embedding skill.
model: str = "text-embedding-ada-002"

# Blob Storage settings; the container name is hard-coded rather than read
# from BLOB_CONTAINER_NAME.
blob_connection_string = os.getenv("BLOB_CONNECTION_STRING")
container_name = "demo-vbd-mercedes"

# Fail fast with a readable message: without this check a missing variable only
# shows up later as an opaque error inside one of the SDK clients.
missing_settings = [
    setting_name
    for setting_name, setting_value in (
        ("AZURE_SEARCH_ENDPOINT", service_endpoint),
        ("AZURE_SEARCH_KEY", key),
        ("BLOB_CONNECTION_STRING", blob_connection_string),
    )
    if not setting_value
]
if missing_settings:
    raise ValueError(
        "Missing required setting(s) in credentials.env or the environment: "
        + ", ".join(missing_settings)
    )

credential = AzureKeyCredential(key)

## Connect to Blob Storage  
Retrieve documents from Blob Storage. You can use the sample documents in the [documents](../data/documents) folder.  

In [5]:
# Connect to Blob Storage and show the URL of the first blob as a sanity check.
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs()

# A bare next() would raise StopIteration on an empty container, so ask for a
# default and report the situation explicitly instead.
first_blob = next(blobs, None)
if first_blob is None:
    print(f"Container '{container_name}' contains no blobs")
else:
    blob_url = container_client.get_blob_client(first_blob).url
    print(f"URL of the first blob: {blob_url}")

URL of the first blob: https://storagedemoopenai.blob.core.windows.net/demo-vbd-mercedes/2602036007%20%5BOriginal%20en%5D%20Canada%20Federal%20Gaz%20I%202011-02-26%20mercury.pdf.pdf


In [6]:
# Preview the name that the data source connection will be given.
print("{}-blob".format(index_name))

cogsrch-index-files-blob


## Connect your Blob storage to a data source in Azure AI Search

In [7]:
# Register the blob container as a data source in Azure AI Search.
container = SearchIndexerDataContainer(name=container_name)
ds_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))

# The connection is named after the index and points at the blob container.
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container,
)

# Create the data source, or update it when one with this name already exists.
data_source = ds_client.create_or_update_data_source_connection(
    data_source_connection
)

print(f"Data source '{data_source.name}' created or updated")

Data source 'cogsrch-index-files-blob' created or updated


## Create a search index

In [8]:
# Build the index definition piece by piece, then create or update it.
index_client = SearchIndexClient(endpoint=service_endpoint, credential=credential)

# Index schema; chunk_id is the key field and "vector" holds the embeddings.
fields = [
    SearchField(
        name="parent_id",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
    SearchField(name="title", type=SearchFieldDataType.String),
    SearchField(
        name="chunk_id",
        type=SearchFieldDataType.String,
        key=True,
        sortable=True,
        filterable=True,
        facetable=True,
        analyzer_name="keyword",
    ),
    SearchField(
        name="chunk",
        type=SearchFieldDataType.String,
        sortable=False,
        filterable=False,
        facetable=False,
    ),
    SearchField(
        name="vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        vector_search_dimensions=1536,
        vector_search_profile="myHnswProfile",
    ),
    SearchField(
        name="filter",
        type=SearchFieldDataType.String,
        sortable=True,
        filterable=True,
        facetable=True,
    ),
]

# Vector search algorithms: one HNSW and one exhaustive KNN, both using cosine.
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="myHnsw",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=HnswParameters(
        m=4,
        ef_construction=400,
        ef_search=500,
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)
exhaustive_knn_algorithm = ExhaustiveKnnVectorSearchAlgorithmConfiguration(
    name="myExhaustiveKnn",
    kind=VectorSearchAlgorithmKind.EXHAUSTIVE_KNN,
    parameters=ExhaustiveKnnParameters(
        metric=VectorSearchAlgorithmMetric.COSINE,
    ),
)

# Vectorizer referenced by both profiles; it points at the Azure OpenAI deployment.
openai_vectorizer = AzureOpenAIVectorizer(
    name="myOpenAI",
    kind="azureOpenAI",
    azure_open_ai_parameters=AzureOpenAIParameters(
        resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),
        deployment_id=model,
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    ),
)

# Each profile pairs one algorithm with the vectorizer.
vector_search = VectorSearch(
    algorithms=[hnsw_algorithm, exhaustive_knn_algorithm],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm="myHnsw",
            vectorizer="myOpenAI",
        ),
        VectorSearchProfile(
            name="myExhaustiveKnnProfile",
            algorithm="myExhaustiveKnn",
            vectorizer="myOpenAI",
        ),
    ],
    vectorizers=[openai_vectorizer],
)

# Semantic configuration that uses the "chunk" field as its content field.
semantic_config = SemanticConfiguration(
    name="my-semantic-config",
    prioritized_fields=PrioritizedFields(
        prioritized_content_fields=[SemanticField(field_name="chunk")]
    ),
)
semantic_settings = SemanticSettings(configurations=[semantic_config])

# Assemble the index and send it to the service.
index = SearchIndex(
    name=index_name,
    fields=fields,
    vector_search=vector_search,
    semantic_settings=semantic_settings,
)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")


cogsrch-index-files created


## Create a skillset

In [9]:
# Define the skillset: a split skill, an embedding skill and index projections.
skillset_name = f"{index_name}-skillset"

# Split skill: reads /document/content and emits the result as "pages".
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=2048,
    page_overlap_length=20,
    inputs=[InputFieldMappingEntry(name="text", source="/document/content")],
    outputs=[OutputFieldMappingEntry(name="textItems", target_name="pages")],
)

# Embedding skill: takes each /document/pages/* entry as text and outputs "vector".
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_uri=os.getenv("AZURE_OPENAI_ENDPOINT"),
    deployment_id=model,
    api_key=os.getenv("AZURE_OPENAI_API_KEY"),
    inputs=[InputFieldMappingEntry(name="text", source="/document/pages/*")],
    outputs=[OutputFieldMappingEntry(name="embedding", target_name="vector")],
)

# Index projections: map each page, its vector and the blob name onto the
# chunk, vector and title fields of the target index.
index_projections = SearchIndexerIndexProjections(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(
                    name="chunk", source="/document/pages/*"
                ),
                InputFieldMappingEntry(
                    name="vector", source="/document/pages/*/vector"
                ),
                InputFieldMappingEntry(
                    name="title", source="/document/metadata_storage_name"
                ),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=[split_skill, embedding_skill],
    index_projections=index_projections,
)

# Create the skillset, or update it when one with this name already exists.
client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")


cogsrch-index-files-skillset created


## Create an indexer

In [10]:
# Define the indexer that ties data source, skillset and index together.
indexer_name = f"{index_name}-indexer"

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    # metadata_storage_name is mapped to the title field so the PDF title
    # can be displayed in the search results.
    field_mappings=[
        FieldMapping(
            source_field_name="metadata_storage_name",
            target_field_name="title",
        )
    ],
)

# Create or update the indexer on the service.
indexer_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
indexer_result = indexer_client.create_or_update_indexer(indexer)

# Request an indexer run.
indexer_client.run_indexer(indexer_name)
print(f' {indexer_name} created')


 cogsrch-index-files-indexer created


## Perform a vector similarity search

This example shows a pure vector search using the vectorizable text query. All you need to do is pass in text, and your vectorizer will handle the query vectorization.

In [12]:
# Pure vector search: no search text, only a vector query built from the text.
query = "Welche Regeln gelten für Motoren bei Land Transporten?"

search_client = SearchClient(service_endpoint, index_name, credential=credential)

# The query text is passed as-is; the sample relies on the index's vectorizer
# to turn it into a vector.
# Alternative (needs a generate_embeddings helper, which this notebook does not define):
# vector_query = RawVectorQuery(vector=generate_embeddings(query), k=3, fields="vector")
vector_query = VectorizableTextQuery(
    text=query,
    k=2,
    fields="vector",
    exhaustive=True,
)

# Optional filter examples that can be added to the search call:
#   filter="filter eq 'id1' "
#   filter="filter/any(filter: search.in(filter, 'group_id1, group_id2'))"
results = search_client.search(
    search_text=None,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    top=1,
)

# Label / result-key pairs, printed in this order for every hit.
printed_fields = (
    ("parent_id", "parent_id"),
    ("chunk_id", "chunk_id"),
    ("Score", "@search.score"),
    ("Content", "chunk"),
)
for result in results:
    for label, field_key in printed_fields:
        print(f"{label}: {result[field_key]}")


parent_id: aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwNTI0MTYzOSUyMFtPcmlnaW5hbCUyMGVuXSUyMENBTiUyMFRTRCUyMDIwNiUyMFJldiUyMDIlMjBFTiUyMDIwMTAtMDYtMDMucGRmLnBkZg2
chunk_id: 7f98ad2134ee_aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwNTI0MTYzOSUyMFtPcmlnaW5hbCUyMGVuXSUyMENBTiUyMFRTRCUyMDIwNiUyMFJldiUyMDIlMjBFTiUyMDIwMTAtMDYtMDMucGRmLnBkZg2_pages_0
Score: 0.80743897
Content: Transport Canada Transports Canada 
Safety and Security Sécurité et sûreté 

 
Road Safety  Sécurité routière 
 
 

Standards and Regulations Division 
 

 

 

TECHNICAL STANDARDS DOCUMENT 

No. 206, Revision 2 

 

 

Door Locks and Door Retention 
Components 

 
 

The text of this document is based on Federal Motor 
Vehicle Safety Standard No. 206, Door Locks and Door 

Retention Components, as published in the Federal 
Register on February 19, 2010 (Vol. 75, No. 33, p. 7370). 

 
 
 

Effective Date:  March 11, 2008 
Man

## Perform a hybrid search

In [13]:
# Hybrid search: the same text is sent both as search text and as a vector query.
query = "Welche Regeln gelten für Motoren bei Land Transporten?"

search_client = SearchClient(service_endpoint, index_name, credential=credential)
vector_query = VectorizableTextQuery(
    text=query,
    k=3,
    fields="vector",
    exhaustive=True,
)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    top=3,
)

# Print the identifiers and score of every hit; the chunk text is selected
# but intentionally not printed (add ("Content", "chunk") to show it).
hit_summary_fields = (
    ("parent_id", "parent_id"),
    ("chunk_id", "chunk_id"),
    ("Score", "@search.score"),
)
for result in results:
    for label, field_key in hit_summary_fields:
        print(f"{label}: {result[field_key]}")


parent_id: aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwMjAzNjAwNyUyMFtPcmlnaW5hbCUyMGVuXSUyMENhbmFkYSUyMEZlZGVyYWwlMjBHYXolMjBJJTIwMjAxMS0wMi0yNiUyMG1lcmN1cnkucGRmLnBkZg2
chunk_id: 77e6ceb46773_aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwMjAzNjAwNyUyMFtPcmlnaW5hbCUyMGVuXSUyMENhbmFkYSUyMEZlZGVyYWwlMjBHYXolMjBJJTIwMjAxMS0wMi0yNiUyMG1lcmN1cnkucGRmLnBkZg2_pages_45
Score: 0.01666666753590107
parent_id: aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwNTI0MTYzOSUyMFtPcmlnaW5hbCUyMGVuXSUyMENBTiUyMFRTRCUyMDIwNiUyMFJldiUyMDIlMjBFTiUyMDIwMTAtMDYtMDMucGRmLnBkZg2
chunk_id: 7f98ad2134ee_aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwNTI0MTYzOSUyMFtPcmlnaW5hbCUyMGVuXSUyMENBTiUyMFRTRCUyMDIwNiUyMFJldiUyMDIlMjBFTiUyMDIwMTAtMDYtMDMucGRmLnBkZg2_pages_0
Score: 0.01666666753590107
parent_id: aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS

## Perform a hybrid search + semantic reranking

In [14]:
# Semantic hybrid search: text + vector query, with semantic query type,
# extractive captions and extractive answers requested.
query = "Welche Regeln gelten für Motoren bei Land Transporten?"

search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
vector_query = VectorizableTextQuery(text=query, k=2, fields="vector", exhaustive=True)

results = search_client.search(
    search_text=query,
    vector_queries=[vector_query],
    select=["parent_id", "chunk_id", "chunk"],
    query_type=QueryType.SEMANTIC,
    # NOTE(review): the query text above is German while EN_US is requested
    # here - confirm this combination is intended.
    query_language=QueryLanguage.EN_US,
    semantic_configuration_name='my-semantic-config',
    query_caption=QueryCaptionType.EXTRACTIVE,
    query_answer=QueryAnswerType.EXTRACTIVE,
    top=2
)

# get_answers() can return None when the service produced no answers;
# fall back to an empty list so the loop below is simply skipped.
semantic_answers = results.get_answers() or []
for answer in semantic_answers:
    # Prefer the highlighted form of the answer when it is available.
    if answer.highlights:
        print(f"Semantic Answer: {answer.highlights}")
    else:
        print(f"Semantic Answer: {answer.text}")
    print(f"Semantic Answer Score: {answer.score}\n")

for result in results:
    print(f"parent_id: {result['parent_id']}")
    print(f"chunk_id: {result['chunk_id']}")
    print(f"Score: {result['@search.score']}")
    print(f"Reranker Score: {result['@search.reranker_score']}")
    #print(f"Content: {result['chunk']}")

    # Show only the first caption, highlighted when possible.
    captions = result["@search.captions"]
    if captions:
        caption = captions[0]
        if caption.highlights:
            print(f"Caption: {caption.highlights}\n")
        else:
            print(f"Caption: {caption.text}\n")


parent_id: aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwNTI0MTYzOSUyMFtPcmlnaW5hbCUyMGVuXSUyMENBTiUyMFRTRCUyMDIwNiUyMFJldiUyMDIlMjBFTiUyMDIwMTAtMDYtMDMucGRmLnBkZg2
chunk_id: 7f98ad2134ee_aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwNTI0MTYzOSUyMFtPcmlnaW5hbCUyMGVuXSUyMENBTiUyMFRTRCUyMDIwNiUyMFJldiUyMDIlMjBFTiUyMDIwMTAtMDYtMDMucGRmLnBkZg2_pages_0
Score: 0.01666666753590107
Reranker Score: 1.558976650238037
Caption: effective date:  march 11, 2008  mandatory compliance date: september 1, 20111         standards research and development branch  road safety and<em> motor vehicle regulation</em> directorate<em>   transport</em> canada  ottawa, ontario   k1a 0n5                         1 note: the previous mandatory compliance date was september 1, 2009.   hkl linien  hkl linien …

parent_id: aHR0cHM6Ly9zdG9yYWdlZGVtb29wZW5haS5ibG9iLmNvcmUud2luZG93cy5uZXQvZGVtby12YmQtbWVyY2VkZXMvMjYwMjAzNjAwNyUyMFtP