## Data Pipeline - Azure AI Search using Python SDK

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 

#### Set environment variables

In [1]:
import os
from dotenv import load_dotenv

load_dotenv()

def _require_env(name: str) -> str:
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.getenv(name)
    if not value:
        # Raise instead of calling exit(): inside a notebook, exit() asks the
        # kernel to shut down, whereas an exception simply stops the run and
        # leaves the message visible in the cell output.
        raise RuntimeError(f"{name} environment variable not set.")
    return value


# Connection settings for the Azure AI Search service (loaded from .env / the environment).
ais_endpoint = _require_env("AIS_ENDPOINT")
# NOTE(review): ais_api_version is required here but not referenced by the cells
# visible in this notebook; the SDK clients below are built without it.
ais_api_version = _require_env("AIS_API_VERSION")
ais_key = _require_env("AIS_KEY")

# Names of the three indexes created and populated by this notebook.
text_index_name = 'text-sample'
doc_index_name = 'doc-sample'
image_index_name = 'image-sample'

#### Helper methods

In [20]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient

from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration
)

def get_index_client() -> SearchIndexClient:
    """Build a SearchIndexClient for `ais_endpoint`, authenticated with `ais_key`."""
    credential = AzureKeyCredential(ais_key)
    client = SearchIndexClient(ais_endpoint, credential)
    return client

def create_index(index_name, fields, vector_search, semantic_title_field_name, semantic_content_field_names):
    """Create the search index, or update it if it already exists.

    Args:
        index_name: Name of the index to create.
        fields: List of field definitions that make up the index schema.
        vector_search: VectorSearch configuration referenced by the vector fields.
        semantic_title_field_name: Currently unused; kept so existing call
            sites keep working. No semantic configuration is added to the index.
        semantic_content_field_names: Currently unused; see above.

    Returns:
        The index definition returned by the search service.
    """
    index = SearchIndex(
        name=index_name,
        fields=fields,
        vector_search=vector_search,
    )
    # create_or_update_index keeps the cell re-runnable: plain create_index
    # fails once the index exists, which breaks "Restart & Run All".
    return get_index_client().create_or_update_index(index)

#### Create text-sample AI Search index

In [14]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.indexes import SearchIndexClient

from azure.search.documents.indexes.models import (
    SearchIndex,
    SearchField,
    SearchFieldDataType,
    SimpleField,
    SearchableField,
    VectorSearch,
    VectorSearchProfile,
    HnswAlgorithmConfiguration
)

# Vector search setup: a single HNSW algorithm configuration exposed through
# one profile, which both vector fields below reference by name.
hnsw_config = HnswAlgorithmConfiguration(name="hnsw-algorithms-config")
hnsw_profile = VectorSearchProfile(
    name="hnsw_profile",
    algorithm_configuration_name="hnsw-algorithms-config",
)
vector_search = VectorSearch(profiles=[hnsw_profile], algorithms=[hnsw_config])

# Element type shared by the embedding fields: a collection of single-precision floats.
float_vector = SearchFieldDataType.Collection(SearchFieldDataType.Single)

# Schema: string key, three searchable text fields, and a 1536-dimension
# vector each for the title and the content.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="title", type=SearchFieldDataType.String),
    SearchableField(name="content", type=SearchFieldDataType.String),
    SearchableField(name="category", type=SearchFieldDataType.String, filterable=True),
    SearchField(
        name="title_vector",
        type=float_vector,
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="hnsw_profile",
    ),
    SearchField(
        name="content_vector",
        type=float_vector,
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="hnsw_profile",
    ),
]

index = create_index(
    text_index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="title",
    semantic_content_field_names=["content"],
)

#### Create doc-sample AI Search index

In [15]:
# Vector search setup: a single HNSW algorithm configuration exposed through
# one profile, referenced by name from the vector field below.
hnsw_config = HnswAlgorithmConfiguration(name="hnsw-algorithms-config")
hnsw_profile = VectorSearchProfile(
    name="hnsw_profile",
    algorithm_configuration_name="hnsw-algorithms-config",
)
vector_search = VectorSearch(profiles=[hnsw_profile], algorithms=[hnsw_config])

# Schema: string key, the searchable chunk text, and its 1536-dimension vector.
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="chunk_content", type=SearchFieldDataType.String),
    SearchField(
        name="chunk_content_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1536,
        vector_search_profile_name="hnsw_profile",
    ),
]

index = create_index(
    doc_index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="chunk_content",
    semantic_content_field_names=["chunk_content"],
)

#### Create image-sample AI Search index

In [16]:
# Vector search setup: a single HNSW algorithm configuration exposed through
# one profile, referenced by name from the vector field below.
hnsw_config = HnswAlgorithmConfiguration(name="hnsw-algorithms-config")
hnsw_profile = VectorSearchProfile(
    name="hnsw_profile",
    algorithm_configuration_name="hnsw-algorithms-config",
)
vector_search = VectorSearch(profiles=[hnsw_profile], algorithms=[hnsw_config])

# Schema: string key, a searchable "image" string field, and a 1024-dimension
# image vector (note: smaller than the 1536 used by the text indexes).
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True),
    SearchableField(name="image", type=SearchFieldDataType.String),
    SearchField(
        name="image_vector",
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
        searchable=True,
        vector_search_dimensions=1024,
        vector_search_profile_name="hnsw_profile",
    ),
]

index = create_index(
    image_index_name,
    fields,
    vector_search=vector_search,
    semantic_title_field_name="image",
    semantic_content_field_names=["image"],
)

#### Ingest to text-sample

This cell first loads a dataframe whose columns match the fields of the AI Search index. If the dataframe's columns differ from the index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index schema before uploading.

In [17]:
import requests
import json
import pandas as pd
from azure.search.documents import SearchClient  

text_df = pd.read_json('../../data/text/product_docs_embeddings.json')
# The index key field "id" is declared as a string, so cast before uploading.
text_df['id'] = text_df['id'].astype(str)

batch_size = 10
total_records = text_df.shape[0]

# One dict per row, keyed by column name; the column names must match the index fields.
records = text_df.to_dict(orient="records")

# Build the client once and reuse it for every batch.
search_client = SearchClient(ais_endpoint, text_index_name, AzureKeyCredential(ais_key))

# Slice by position rather than by dataframe index label: every batch then holds
# up to `batch_size` records and the last partial batch is always sent, whatever
# index the dataframe carries. An empty dataframe results in no upload calls.
for start in range(0, total_records, batch_size):
    batch = records[start:start + batch_size]
    result = search_client.upload_documents(documents=batch)

    # upload_documents reports per-document outcomes; surface any failures
    # instead of silently discarding them.
    failed_keys = [item.key for item in result if not item.succeeded]
    if failed_keys:
        print(f"Failed to upload {len(failed_keys)} document(s) to {text_index_name}: {failed_keys}")

#### Ingest to doc-sample

This cell first loads a dataframe whose columns match the fields of the AI Search index. If the dataframe's columns differ from the index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index schema before uploading.

In [18]:
import requests
import json
from azure.search.documents import SearchClient  
import pandas as pd

doc_df = pd.read_json('../../data/docs/employee_handbook_embeddings.json')
# The index key field "id" is declared as a string, so cast before uploading.
doc_df['id'] = doc_df['id'].astype(str)

batch_size = 10
total_records = doc_df.shape[0]

# One dict per row, keyed by column name; the column names must match the index fields.
records = doc_df.to_dict(orient="records")

# Build the client once and reuse it for every batch.
search_client = SearchClient(ais_endpoint, doc_index_name, AzureKeyCredential(ais_key))

# Slice by position rather than by dataframe index label: every batch then holds
# up to `batch_size` records and the last partial batch is always sent, whatever
# index the dataframe carries. An empty dataframe results in no upload calls.
for start in range(0, total_records, batch_size):
    batch = records[start:start + batch_size]
    result = search_client.upload_documents(documents=batch)

    # upload_documents reports per-document outcomes; surface any failures
    # instead of silently discarding them.
    failed_keys = [item.key for item in result if not item.succeeded]
    if failed_keys:
        print(f"Failed to upload {len(failed_keys)} document(s) to {doc_index_name}: {failed_keys}")

#### Ingest to image-sample

This cell first loads a dataframe whose columns match the fields of the AI Search index. If the dataframe's columns differ from the index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index schema before uploading.

In [19]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

# NOTE: `pd` is not imported in this cell; it relies on the pandas import from an earlier cell.
image_df = pd.read_json('../../data/images/images_embeddings.json')
# The index key field "id" is declared as a string, so cast before uploading.
image_df['id'] = image_df['id'].astype(str)

batch_size = 10
total_records = image_df.shape[0]

# One dict per row, keyed by column name; the column names must match the index fields.
records = image_df.to_dict(orient="records")

# Build the client once and reuse it for every batch.
search_client = SearchClient(ais_endpoint, image_index_name, AzureKeyCredential(ais_key))

# Slice by position rather than by dataframe index label: every batch then holds
# up to `batch_size` records and the last partial batch is always sent, whatever
# index the dataframe carries. An empty dataframe results in no upload calls.
for start in range(0, total_records, batch_size):
    batch = records[start:start + batch_size]
    result = search_client.upload_documents(documents=batch)

    # upload_documents reports per-document outcomes; surface any failures
    # instead of silently discarding them.
    failed_keys = [item.key for item in result if not item.succeeded]
    if failed_keys:
        print(f"Failed to upload {len(failed_keys)} document(s) to {image_index_name}: {failed_keys}")