## Data Pipeline - Azure AI Search using REST Endpoint

This sample uses the Azure AI Search REST API, version _2023-11-01_.

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 

#### Set environment variables

In [None]:
import os
from dotenv import load_dotenv

# Pull settings from a local .env file (if any) into the process environment.
load_dotenv()

# Azure AI Search connection settings. Each one is mandatory: a missing or
# empty value prints a message and stops execution with exit code 1.
ais_endpoint = os.getenv("AIS_ENDPOINT")
if not ais_endpoint:
    print("AIS_ENDPOINT environment variable not set.")
    exit(1)

ais_api_version = os.getenv("AIS_API_VERSION")
if not ais_api_version:
    print("AIS_API_VERSION environment variable not set.")
    exit(1)

ais_key = os.getenv("AIS_KEY")
if not ais_key:
    print("AIS_KEY environment variable not set.")
    exit(1)

# Relative paths of the JSON index definitions, one per sample type.
ais_index_definition_text = 'index_definition/index_definition_text.json'
ais_index_definition_doc = 'index_definition/index_definition_doc.json'
ais_index_definition_image = 'index_definition/index_definition_image.json'

#### Helper methods

In [None]:
import requests
import json

def insert_record(ais_endpoint, ais_index, data, ais_key, ais_api_version, timeout=60):
    """POST a batch of documents to an Azure AI Search index.

    Args:
        ais_endpoint: Base URL of the search service. A trailing slash is
            tolerated and stripped before the path is appended.
        ais_index: Name of the index that receives the documents.
        data: Request body, sent as-is. Callers in this notebook pass a JSON
            string of the form ``{"value": [...]}``.
        ais_key: Key sent in the ``api-key`` request header.
        ais_api_version: Value of the ``api-version`` query parameter.
        timeout: Seconds to wait for the service before giving up. Without
            a timeout a stalled connection would block the notebook forever.

    Returns:
        None. The HTTP status code and raw response body are printed.

    Raises:
        requests.exceptions.Timeout: If the service does not answer within
            ``timeout`` seconds.
    """
    url = f"{ais_endpoint.rstrip('/')}/indexes/{ais_index}/docs/index?api-version={ais_api_version}"
    headers = {
        "Content-Type": "application/json",
        "api-key": ais_key,
    }
    response = requests.post(url, data=data, headers=headers, timeout=timeout)
    print(response.status_code)
    print(response.content)

def create_index(ais_endpoint, json_content, index_name, api_key, ais_api_version, timeout=60):
    """PUT an index definition to Azure AI Search.

    Args:
        ais_endpoint: Base URL of the search service. A trailing slash is
            tolerated and stripped before the path is appended.
        json_content: Index definition, sent as the request body as-is
            (callers in this notebook pass the raw text of a JSON file).
        index_name: Name of the index; used in the request path.
        api_key: Key sent in the ``api-key`` request header.
        ais_api_version: Value of the ``api-version`` query parameter.
        timeout: Seconds to wait for the service before giving up. Without
            a timeout a stalled connection would block the notebook forever.

    Returns:
        None. The HTTP status code and raw response body are printed.

    Raises:
        requests.exceptions.Timeout: If the service does not answer within
            ``timeout`` seconds.
    """
    url = f"{ais_endpoint.rstrip('/')}/indexes/{index_name}?api-version={ais_api_version}"
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key,
    }
    response = requests.request('PUT', url, headers=headers, data=json_content, timeout=timeout)
    print(response.status_code)
    print(response.content)

def get_ais_index_name(ais_index_definition):
    """Return the value of the top-level ``name`` key of an index definition.

    Args:
        ais_index_definition: Path of the JSON index definition file.

    Returns:
        Whatever is stored under ``name`` in the parsed JSON document.

    Raises:
        KeyError: If the definition has no ``name`` key.
    """
    definition = json.loads(read_json_file(ais_index_definition))
    return definition['name']

def read_json_file(file_path):
    """Return the full text content of the file at ``file_path``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    locale encoding, so a JSON file with non-ASCII characters reads the
    same on every operating system.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file content as a string (not parsed).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

#### Create AI Search index

- [Add vector fields to a search index](https://learn.microsoft.com/en-us/azure/search/vector-search-how-to-create-index)

In [None]:
## Create the text_sample, doc_sample and image_sample indexes, in that order.
## Each definition file supplies both the request body and the index name.
for definition_path in (
    ais_index_definition_text,
    ais_index_definition_doc,
    ais_index_definition_image,
):
    index_definition = read_json_file(definition_path)
    index_name = get_ais_index_name(definition_path)

    create_index(ais_endpoint, index_definition, index_name, ais_key, ais_api_version)

#### Ingest text sample with embeddings

This cell first loads the embeddings into a dataframe whose columns are expected to match the fields of the AI Search index. If the dataframe's columns differ from the index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index.

In [None]:
import requests
import json
import pandas as pd

# Load the text sample (with embeddings) and resolve the target index name
# from the text index definition file.
text_df = pd.read_json('../../data/text/product_docs_embeddings.json')
index_name = get_ais_index_name(ais_index_definition_text)

batch_size = 10
total_records = text_df.shape[0]
fields = text_df.columns.to_numpy()
# The id column is sent as a string.
# NOTE(review): presumably the index declares `id` as a string key field;
# confirm against the index definition.
text_df['id'] = text_df['id'].astype(str)

# Request body for the documents endpoint: {"value": [doc, doc, ...]}.
records = {
    'value': []
}

# Count rows by position instead of relying on the dataframe index labels,
# so batching works whatever index the dataframe carries.
for position, (_, row) in enumerate(text_df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush once a full batch has accumulated, or on the final row so the
    # trailing partial batch is not lost.
    if len(records['value']) >= batch_size or position == total_records:
        json_data = json.dumps(records)
        insert_record(ais_endpoint, index_name, json_data, ais_key, ais_api_version)
        records['value'] = []

#### Ingest doc sample with embeddings

This cell first loads the embeddings into a dataframe whose columns are expected to match the fields of the AI Search index. If the dataframe's columns differ from the index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index.

In [None]:
# Load the doc sample (with embeddings) and resolve the target index name
# from the doc index definition file.
doc_df = pd.read_json('../../data/docs/employee_handbook_embeddings.json')
index_name = get_ais_index_name(ais_index_definition_doc)

batch_size = 10
total_records = doc_df.shape[0]
fields = doc_df.columns.to_numpy()
# The id column is sent as a string.
# NOTE(review): presumably the index declares `id` as a string key field;
# confirm against the index definition.
doc_df['id'] = doc_df['id'].astype(str)

# Request body for the documents endpoint: {"value": [doc, doc, ...]}.
records = {
    'value': []
}

# Count rows by position instead of relying on the dataframe index labels,
# so batching works whatever index the dataframe carries.
for position, (_, row) in enumerate(doc_df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush once a full batch has accumulated, or on the final row so the
    # trailing partial batch is not lost.
    if len(records['value']) >= batch_size or position == total_records:
        json_data = json.dumps(records)
        insert_record(ais_endpoint, index_name, json_data, ais_key, ais_api_version)
        records['value'] = []

#### Ingest image sample with embeddings

This cell first loads the embeddings into a dataframe whose columns are expected to match the fields of the AI Search index. If the dataframe's columns differ from the index fields (in name or in number), add a preprocessing step that restructures the dataframe to match the index.

In [None]:
# Load the image sample (with embeddings) and resolve the target index name
# from the image index definition file.
image_df = pd.read_json('../../data/images/images_embeddings.json')
index_name = get_ais_index_name(ais_index_definition_image)

batch_size = 10
# Row count of the image dataframe itself. Taking it from another dataframe
# would make the final-row check below fire at the wrong row (or never),
# dropping the trailing partial batch.
total_records = image_df.shape[0]
fields = image_df.columns.to_numpy()
# The id column is sent as a string.
# NOTE(review): presumably the index declares `id` as a string key field;
# confirm against the index definition.
image_df['id'] = image_df['id'].astype(str)

# Request body for the documents endpoint: {"value": [doc, doc, ...]}.
records = {
    'value': []
}

# Count rows by position instead of relying on the dataframe index labels,
# so batching works whatever index the dataframe carries.
for position, (_, row) in enumerate(image_df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush once a full batch has accumulated, or on the final row so the
    # trailing partial batch is not lost.
    if len(records['value']) >= batch_size or position == total_records:
        json_data = json.dumps(records)
        insert_record(ais_endpoint, index_name, json_data, ais_key, ais_api_version)
        records['value'] = []