## Ingestion to Azure Cognitive Search via REST Endpoint

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 

### Set environment variables

In [None]:
import os
from dotenv import load_dotenv

load_dotenv()


def _require_env(name):
    """Return the value of the environment variable ``name``.

    Args:
        name: Name of the environment variable to read.

    Returns:
        The variable's non-empty string value.

    Raises:
        RuntimeError: If the variable is unset or set to an empty string.
    """
    value = os.getenv(name)
    if not value:
        # Raise instead of calling exit(): exit() is the interactive-shell
        # helper, which in a notebook asks the kernel to shut down rather than
        # surfacing an error, and in a script ends the process with a success
        # status. An exception stops execution and shows the reason in place.
        raise RuntimeError(f"{name} environment variable not set.")
    return value


# Azure Cognitive Search connection settings.
acs_endpoint = _require_env("ACS_ENDPOINT")
acs_api_version = _require_env("ACS_API_VERSION")
acs_key = _require_env("ACS_KEY")

# Azure OpenAI settings; validated here so a missing value is reported up front.
aoai_endpoint = _require_env("AOAI_ENDPOINT")
aoai_api_version = _require_env("AOAI_API_VERSION")
aoai_embedding_deployed_model = _require_env("AOAI_EMBEDDING_DEPLOYED_MODEL")

# Index definition files, relative to the notebook's working directory.
acs_index_definition_text = 'index_definition/index_definition_text.json'
acs_index_definition_doc = 'index_definition/index_definition_doc.json'
acs_index_definition_image = 'index_definition/index_definition_image.json'

#### Helper methods

In [None]:
import requests
import json

def insert_record(acs_endpoint, acs_index, data, acs_key, acs_api_version, timeout=60):
    """Post a batch of documents to an Azure Cognitive Search index.

    Sends ``data`` to the index's ``docs/index`` REST endpoint, then prints the
    HTTP status code and the raw response body.

    Args:
        acs_endpoint: Base URL of the search service; a trailing slash is tolerated.
        acs_index: Name of the target index.
        data: Request body, already serialized by the caller.
        acs_key: Value sent in the ``api-key`` header.
        acs_api_version: Value of the ``api-version`` query parameter.
        timeout: Seconds to wait for the service before giving up.
    """
    # Strip a trailing slash so the path does not start with "//".
    base_url = acs_endpoint.rstrip("/")
    url = f"{base_url}/indexes/{acs_index}/docs/index?api-version={acs_api_version}"
    headers = {
        "Content-Type": "application/json",
        "api-key": acs_key
    }
    # requests has no default timeout; without one an unresponsive service
    # would block the notebook indefinitely.
    response = requests.post(url, data=data, headers=headers, timeout=timeout)
    print(response.status_code)
    print(response.content)

def create_index(acs_endpoint, json_content, index_name, api_key, acs_api_version, timeout=60):
    """Create or update an Azure Cognitive Search index via an HTTP PUT.

    Sends ``json_content`` to ``/indexes/{index_name}``, then prints the HTTP
    status code and the raw response body.

    Args:
        acs_endpoint: Base URL of the search service; a trailing slash is tolerated.
        json_content: Index definition, already serialized by the caller.
        index_name: Name of the index, used in the request path.
        api_key: Value sent in the ``api-key`` header.
        acs_api_version: Value of the ``api-version`` query parameter.
        timeout: Seconds to wait for the service before giving up.
    """
    # Strip a trailing slash so the path does not start with "//".
    base_url = acs_endpoint.rstrip("/")
    url = f"{base_url}/indexes/{index_name}?api-version={acs_api_version}"
    headers = {
        "Content-Type": "application/json",
        "api-key": api_key
    }
    # requests has no default timeout; without one an unresponsive service
    # would block the notebook indefinitely.
    response = requests.put(url, headers=headers, data=json_content, timeout=timeout)
    print(response.status_code)
    print(response.content)

def get_acs_index_name(acs_index_definition):
    """Return the ``name`` entry of an index definition file.

    Args:
        acs_index_definition: Path to a JSON index definition file.

    Returns:
        The value stored under the top-level ``name`` key.
    """
    definition = json.loads(read_json_file(acs_index_definition))
    return definition['name']

def read_json_file(file_path):
    """Return the raw text of the file at ``file_path``.

    The content is returned as a string and is not parsed.

    Args:
        file_path: Path of the file to read.

    Returns:
        The file's full content as a ``str``.
    """
    # Decode explicitly as UTF-8: the platform default encoding would
    # mis-decode non-ASCII characters on non-UTF-8 locales.
    with open(file_path, "r", encoding="utf-8") as file:
        return file.read()

#### Create ACS index

In [None]:
## Create the text, doc and image sample indexes, in that order
for definition_path in (
    acs_index_definition_text,
    acs_index_definition_doc,
    acs_index_definition_image,
):
    # The raw definition is the request body; the index name it declares
    # goes into the request path.
    index_definition = read_json_file(definition_path)
    index_name = get_acs_index_name(definition_path)

    create_index(acs_endpoint, index_definition, index_name, acs_key, acs_api_version)

#### Ingest text sample with embeddings

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns differ from the index's fields (in name or in number), add a preprocessing step that restructures the dataframe to match the ACS index columns.

In [None]:
import requests
import json
import pandas as pd

text_df = pd.read_json('../../data/text/product_docs_embeddings.json')
index_name = get_acs_index_name(acs_index_definition_text)

batch_size = 10
total_records = text_df.shape[0]
fields = text_df.columns.to_numpy()
# Send ids as strings (presumably the index key field is a string; confirm
# against the index definition).
text_df['id'] = text_df['id'].astype(str)

records = {
    'value': []
}

# Count rows by position, not by DataFrame index label: labels are not
# guaranteed to be 0..n-1, and keying the flush on them could skip the final
# partial batch or send a one-record first batch.
for position, (_, row) in enumerate(text_df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush when the batch is full, or on the last row so the tail is not lost.
    if len(records['value']) == batch_size or position == total_records:
        json_data = json.dumps(records)
        insert_record(acs_endpoint, index_name, json_data, acs_key, acs_api_version)
        records['value'] = []

#### Ingest doc sample with embeddings

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns differ from the index's fields (in name or in number), add a preprocessing step that restructures the dataframe to match the ACS index columns.

In [None]:
doc_df = pd.read_json('../../data/docs/employee_handbook_embeddings.json')
index_name = get_acs_index_name(acs_index_definition_doc)

batch_size = 10
total_records = doc_df.shape[0]
fields = doc_df.columns.to_numpy()
# Send ids as strings (presumably the index key field is a string; confirm
# against the index definition).
doc_df['id'] = doc_df['id'].astype(str)

records = {
    'value': []
}

# Count rows by position, not by DataFrame index label: labels are not
# guaranteed to be 0..n-1, and keying the flush on them could skip the final
# partial batch or send a one-record first batch.
for position, (_, row) in enumerate(doc_df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush when the batch is full, or on the last row so the tail is not lost.
    if len(records['value']) == batch_size or position == total_records:
        json_data = json.dumps(records)
        insert_record(acs_endpoint, index_name, json_data, acs_key, acs_api_version)
        records['value'] = []

#### Ingest image sample with embeddings

This cell works because the dataframe and the ACS index have the same columns. If the dataframe's columns differ from the index's fields (in name or in number), add a preprocessing step that restructures the dataframe to match the ACS index columns.

In [None]:
image_df = pd.read_json('../../data/images/images_embeddings.json')
index_name = get_acs_index_name(acs_index_definition_image)

batch_size = 10
# Row count must come from image_df itself: using another dataframe's length
# means the final partial batch is only flushed when the two row counts
# happen to match.
total_records = image_df.shape[0]
fields = image_df.columns.to_numpy()
# Send ids as strings (presumably the index key field is a string; confirm
# against the index definition).
image_df['id'] = image_df['id'].astype(str)

records = {
    'value': []
}

# Count rows by position, not by DataFrame index label: labels are not
# guaranteed to be 0..n-1, and keying the flush on them could skip the final
# partial batch or send a one-record first batch.
for position, (_, row) in enumerate(image_df.iterrows(), start=1):
    records['value'].append({field: row[field] for field in fields})

    # Flush when the batch is full, or on the last row so the tail is not lost.
    if len(records['value']) == batch_size or position == total_records:
        json_data = json.dumps(records)
        insert_record(acs_endpoint, index_name, json_data, acs_key, acs_api_version)
        records['value'] = []