## Ingestion to Azure Cognitive Search via Python SDK

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 

#### Set environment variables

In [1]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if present) before reading them.
load_dotenv()

# Every Azure Cognitive Search setting below is mandatory: when a value is
# missing or empty, report which one and stop execution.
acs_endpoint = os.environ.get("ACS_ENDPOINT")
if not acs_endpoint:
    print("ACS_ENDPOINT environment variable not set.")
    exit()

acs_api_version = os.environ.get("ACS_API_VERSION")
if not acs_api_version:
    print("ACS_API_VERSION environment variable not set.")
    exit()

acs_key = os.environ.get("ACS_KEY")
if not acs_key:
    print("ACS_KEY environment variable not set.")
    exit()

# Names of the three sample indexes created and populated by later cells.
text_index_name = 'text-sample'
doc_index_name = 'doc-sample'
image_index_name = 'image-sample'

#### Helper methods

In [2]:
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes.models import (
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings,  
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmConfiguration,
    VectorSearchAlgorithmKind,
    VectorSearchProfile,
    VectorSearchVectorizer,
    VectorSearchVectorizerKind,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    HnswVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnVectorSearchAlgorithmConfiguration,
    ExhaustiveKnnParameters,
    SearchIndex,  
    SearchField,  
    SearchFieldDataType,  
    SimpleField,  
    SearchableField,  
    SearchIndex,  
    SemanticConfiguration,  
    PrioritizedFields,  
    SemanticField,  
    SearchField,  
    SemanticSettings, 
    VectorSearch,  
    HnswVectorSearchAlgorithmConfiguration,
    HnswParameters,  
    VectorSearch,
    VectorSearchAlgorithmKind,
    VectorSearchAlgorithmMetric,
    VectorSearchProfile,
    AzureOpenAIParameters,
    AzureOpenAIVectorizer,
)

def get_index_client() -> SearchIndexClient:
    """Return a SearchIndexClient for the configured search service.

    The endpoint and API key come from the module-level ``acs_endpoint`` and
    ``acs_key`` variables.
    """
    credential = AzureKeyCredential(acs_key)
    return SearchIndexClient(acs_endpoint, credential)

def create_index(index_name, fields, vector_search, semantic_title_field_name, semantic_content_field_names):
    """Create a search index with vector search and one semantic configuration.

    Args:
        index_name: Name of the index to create.
        fields: Field definitions passed straight to ``SearchIndex``.
        vector_search: ``VectorSearch`` configuration passed to ``SearchIndex``.
        semantic_title_field_name: Field used as the semantic title field.
        semantic_content_field_names: Iterable of field names used as the
            prioritized semantic content fields, in the given order.

    Returns:
        The value returned by ``SearchIndexClient.create_index``.
    """
    title_field = SemanticField(field_name=semantic_title_field_name)
    content_fields = []
    for content_field_name in semantic_content_field_names:
        content_fields.append(SemanticField(field_name=content_field_name))

    prioritized = PrioritizedFields(
        title_field=title_field,
        prioritized_content_fields=content_fields,
    )
    # The single semantic configuration is registered under the name 'default'.
    default_configuration = SemanticConfiguration(
        name='default',
        prioritized_fields=prioritized,
    )
    semantic_settings = SemanticSettings(configurations=[default_configuration])

    index_definition = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
        semantic_settings=semantic_settings,
    )
    return get_index_client().create_index(index_definition)

#### Create text-sample ACS index

In [5]:
# HNSW algorithm settings shared by every vector field of this index.
hnsw_parameters = HnswParameters(m=4, ef_construction=400, ef_search=500, metric="cosine")
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="hnsw_config",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=hnsw_parameters,
)
# Vector fields refer to this profile by name; no vectorizer is attached to it.
hnsw_profile = VectorSearchProfile(name="hnsw_profile", algorithm="hnsw_config")
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

# Options common to both embedding fields.
# NOTE(review): 1536 presumably matches the embedding size produced by the
# generate_embeddings notebook - confirm.
vector_field_options = dict(
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_profile="hnsw_profile",
)

fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    SearchField(name="title_vector", **vector_field_options),
    SearchField(name="content_vector", **vector_field_options),
]

# Semantic ranking uses "title" as the title field and "content" as content.
index = create_index(
    text_index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="title",
    semantic_content_field_names=["content"],
)

#### Create doc-sample ACS index

In [6]:
# HNSW algorithm settings used by the chunk embedding field of this index.
hnsw_parameters = HnswParameters(m=4, ef_construction=400, ef_search=500, metric="cosine")
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="hnsw_config",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=hnsw_parameters,
)
# The vector field refers to this profile by name; no vectorizer is attached.
hnsw_profile = VectorSearchProfile(name="hnsw_profile", algorithm="hnsw_config")
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

# NOTE(review): 1536 presumably matches the embedding size produced by the
# generate_embeddings notebook - confirm.
chunk_vector_field = SearchField(
    name="chunk_content_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1536,
    vector_search_profile="hnsw_profile",
)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="chunk_content", type=SearchFieldDataType.String),
    chunk_vector_field,
]

# "chunk_content" serves as both the semantic title and the semantic content.
index = create_index(
    doc_index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="chunk_content",
    semantic_content_field_names=["chunk_content"],
)

#### Create image-sample ACS index

In [7]:
# HNSW algorithm settings used by the image embedding field of this index.
hnsw_parameters = HnswParameters(m=4, ef_construction=400, ef_search=500, metric="cosine")
hnsw_algorithm = HnswVectorSearchAlgorithmConfiguration(
    name="hnsw_config",
    kind=VectorSearchAlgorithmKind.HNSW,
    parameters=hnsw_parameters,
)
# The vector field refers to this profile by name; no vectorizer is attached.
hnsw_profile = VectorSearchProfile(name="hnsw_profile", algorithm="hnsw_config")
vector_search = VectorSearch(algorithms=[hnsw_algorithm], profiles=[hnsw_profile])

# NOTE(review): 1024 presumably matches the image embedding size produced by
# the generate_embeddings notebook - confirm.
image_vector_field = SearchField(
    name="image_vector",
    type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
    searchable=True,
    vector_search_dimensions=1024,
    vector_search_profile="hnsw_profile",
)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="image", type=SearchFieldDataType.String),
    image_vector_field,
]

# "image" serves as both the semantic title and the semantic content.
index = create_index(
    image_index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="image",
    semantic_content_field_names=["image"],
)

#### Ingest to text-sample

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns (names or count) differ from the fields of the ACS index, add a preprocessing step that restructures the dataframe to match the index.

In [8]:
import requests
import json
import pandas as pd
from azure.search.documents import SearchClient  

# Load the pre-computed text embeddings; each row becomes one search document.
text_df = pd.read_json('../../data/text/product_docs_embeddings.json')

batch_size = 10
total_records = text_df.shape[0]
fields = text_df.columns.to_numpy()
# The index key field "id" is declared as a string, so cast it up front.
text_df['id'] = text_df['id'].astype(str)

# One client is enough for every batch; no need to rebuild it per upload.
search_client = SearchClient(acs_endpoint, text_index_name, AzureKeyCredential(acs_key))

records = []

# Count rows by position (1-based) rather than by DataFrame index label, so
# batching stays correct even when the index is not 0..n-1.
for position, (_, row) in enumerate(text_df.iterrows(), start=1):
    records.append({field: row[field] for field in fields})

    # Flush on every full batch and once more for the trailing partial batch.
    if len(records) == batch_size or position == total_records:
        result = search_client.upload_documents(documents=records)
        # upload_documents reports success per document; surface any failures
        # instead of dropping them silently.
        failed_keys = [item.key for item in result if not item.succeeded]
        if failed_keys:
            print(f"{len(failed_keys)} document(s) failed to index into '{text_index_name}': {failed_keys}")
        records = []

#### Ingest to doc-sample

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns (names or count) differ from the fields of the ACS index, add a preprocessing step that restructures the dataframe to match the index.

In [9]:
import requests
import json
from azure.search.documents import SearchClient  

# Load the pre-computed document-chunk embeddings; each row becomes one search document.
doc_df = pd.read_json('../../data/docs/employee_handbook_embeddings.json')

batch_size = 10
total_records = doc_df.shape[0]
fields = doc_df.columns.to_numpy()
# The index key field "id" is declared as a string, so cast it up front.
doc_df['id'] = doc_df['id'].astype(str)

# One client is enough for every batch; no need to rebuild it per upload.
search_client = SearchClient(acs_endpoint, doc_index_name, AzureKeyCredential(acs_key))

records = []

# Count rows by position (1-based) rather than by DataFrame index label, so
# batching stays correct even when the index is not 0..n-1.
for position, (_, row) in enumerate(doc_df.iterrows(), start=1):
    records.append({field: row[field] for field in fields})

    # Flush on every full batch and once more for the trailing partial batch.
    if len(records) == batch_size or position == total_records:
        result = search_client.upload_documents(documents=records)
        # upload_documents reports success per document; surface any failures
        # instead of dropping them silently.
        failed_keys = [item.key for item in result if not item.succeeded]
        if failed_keys:
            print(f"{len(failed_keys)} document(s) failed to index into '{doc_index_name}': {failed_keys}")
        records = []

#### Ingest to image-sample

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns (names or count) differ from the fields of the ACS index, add a preprocessing step that restructures the dataframe to match the index.

In [10]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# Load the pre-computed image embeddings; each row becomes one search document.
image_df = pd.read_json('../../data/images/images_embeddings.json')

batch_size = 10
total_records = image_df.shape[0]
fields = image_df.columns.to_numpy()
# The index key field "id" is declared as a string, so cast it up front.
image_df['id'] = image_df['id'].astype(str)

# One client is enough for every batch; no need to rebuild it per upload.
search_client = SearchClient(acs_endpoint, image_index_name, AzureKeyCredential(acs_key))

records = []

# Count rows by position (1-based) rather than by DataFrame index label, so
# batching stays correct even when the index is not 0..n-1.
for position, (_, row) in enumerate(image_df.iterrows(), start=1):
    records.append({field: row[field] for field in fields})

    # Flush on every full batch and once more for the trailing partial batch.
    if len(records) == batch_size or position == total_records:
        result = search_client.upload_documents(documents=records)
        # upload_documents reports success per document; surface any failures
        # instead of dropping them silently.
        failed_keys = [item.key for item in result if not item.succeeded]
        if failed_keys:
            print(f"{len(failed_keys)} document(s) failed to index into '{image_index_name}': {failed_keys}")
        records = []