## Data Pipeline - Fabric Real-Time Analytics (Kusto) using Python SDK

### Prerequisites
  
- Generate embeddings - [generate_embeddings.ipynb](../../common/generate_embeddings.ipynb) 

#### Set environment variables

In [None]:
import os
from dotenv import load_dotenv

# Load variables from the local `.env` file into the process environment
# before they are read below.
load_dotenv()

# Cluster name; the query and ingestion endpoint URLs are derived from it.
kdb_cluster_name = os.getenv("KUSTO_CLUSTER_NAME")
if not kdb_cluster_name:
    # Raise SystemExit directly instead of calling the interactive-only
    # exit() helper: the cell stops with the message attached, the notebook
    # kernel stays alive, and a script run ends with a non-zero status.
    raise SystemExit("KUSTO_CLUSTER_NAME environment variable not set.")

# Database targeted by the management commands and the ingestion calls.
kdb_database_name = os.getenv("KUSTO_DATABASE_NAME")
if not kdb_database_name:
    raise SystemExit("KUSTO_DATABASE_NAME environment variable not set.")

# Destination tables, one per embedding data set.
text_vectors_table = 'text-sample'
doc_vectors_table = 'doc-sample'
image_vectors_table = 'image-sample'


In [None]:
# (Optional) Run this cell only when authenticating with a service principal
# (client-secret). All three variables below must then be set.
#
# A missing value raises SystemExit directly instead of calling the
# interactive-only exit() helper, so the cell stops with the message attached
# and the notebook kernel stays alive.
kdb_client_id = os.getenv("KUSTO_CLIENT_ID")
if not kdb_client_id:
    raise SystemExit("KUSTO_CLIENT_ID environment variable not set.")

kdb_client_secret = os.getenv("KUSTO_CLIENT_SECRET")
if not kdb_client_secret:
    raise SystemExit("KUSTO_CLIENT_SECRET environment variable not set.")

kdb_authority_id = os.getenv("KUSTO_AUTHORITY_ID")
if not kdb_authority_id:
    raise SystemExit("KUSTO_AUTHORITY_ID environment variable not set.")

In [None]:
from azure.kusto.data import KustoClient, KustoConnectionStringBuilder
from azure.kusto.data.data_format import DataFormat
from azure.kusto.data.helpers import dataframe_from_result_table

from azure.kusto.ingest import (
    ReportLevel,
    IngestionProperties,
    QueuedIngestClient,
)

# Query endpoint and its "ingest-"-prefixed counterpart, both derived from the
# configured cluster name.
cluster, ingest_cluster = (
    f"https://{kdb_cluster_name}.kusto.fabric.microsoft.com/",
    f"https://ingest-{kdb_cluster_name}.kusto.fabric.microsoft.com/",
)

#### [Option 1] AAD Device Authentication

In [None]:
# Build AAD device-authentication connection strings for the query and the
# ingestion endpoints, then wrap each one in its client.
_with_device_auth = KustoConnectionStringBuilder.with_aad_device_authentication

kcsb = _with_device_auth(cluster)
client = KustoClient(kcsb)

ingestion_kcsb = _with_device_auth(ingest_cluster)
ingestion_client = QueuedIngestClient(ingestion_kcsb)

#### [Option 2] Client-Secret Authentication 
To use client-secret authentication, you need to set the following environment variables in the `.env` file:
```bash
KUSTO_CLIENT_ID=<client id of the service principal>
KUSTO_CLIENT_SECRET=<client secret of the service principal>
KUSTO_AUTHORITY_ID=<authority id>
```

In [None]:
# Service-principal credentials shared by both connection strings.
_sp_credentials = (kdb_client_id, kdb_client_secret, kdb_authority_id)
_with_app_key = KustoConnectionStringBuilder.with_aad_application_key_authentication

# Query client.
kcsb = _with_app_key(cluster, *_sp_credentials)
client = KustoClient(kcsb)

# Ingestion client, pointed at the ingestion endpoint.
ingestion_kcsb = _with_app_key(ingest_cluster, *_sp_credentials)
ingestion_client = QueuedIngestClient(ingestion_kcsb)

#### Create Tables for Storing Vectors

Kusto databases currently do not support HNSW indexes or other indexes suited to similarity search, so we store the vectors in tables and use the `series_cosine_similarity_fl` function from the KQL (Kusto Query Language) functions library to perform the similarity search.

The similarity is calculated during the query:
```kql
similarity=series_cosine_similarity_fl('{query_vector}', ['{target_vector_column}'], 1, 1)
```

In [None]:
# Table definitions. Each embedding is stored in a `dynamic` column next to
# the fields it was computed from.

CREATE_TEXT_TABLE_COMMAND = f"""
.create table ['{text_vectors_table}'] (
    ['id']:int, 
    ['title']:string, 
    ['content']:string, 
    ['category']:string, 
    ['title_vector']:dynamic, 
    ['content_vector']:dynamic
)
"""

CREATE_DOCS_TABLE_COMMAND = f"""
.create table ['{doc_vectors_table}'] (
    ['id']:int, 
    ['chunk_content']:string, 
    ['chunk_content_vector']:dynamic
)
"""

CREATE_IMAGE_TABLE_COMMAND = f"""
.create table ['{image_vectors_table}'] (
    ['id']:int, 
    ['image']:string, 
    ['image_vector']:dynamic
)
"""

# Run the management commands in order: text, docs, image.
for _create_command in (
    CREATE_TEXT_TABLE_COMMAND,
    CREATE_DOCS_TABLE_COMMAND,
    CREATE_IMAGE_TABLE_COMMAND,
):
    result = client.execute_mgmt(kdb_database_name, _create_command)
    _created_table = dataframe_from_result_table(result.primary_results[0])

# Trailing expression so the notebook renders the last command's result.
_created_table


#### Ingest to text-sample

In [None]:
# JSON file with the text sample's embeddings; the path is relative to the
# notebook's working directory.
text_vectors_path = "../data/text/product_docs_embeddings.json"

# Ingestion target and format. The database and table names are already plain
# strings, so they are passed directly rather than re-wrapped in f-strings.
ingestion_props = IngestionProperties(
    database=kdb_database_name,
    table=text_vectors_table,
    data_format=DataFormat.MULTIJSON,
    report_level=ReportLevel.FailuresAndSuccesses,
)

# Submit the file through the queued ingest client.
ingestion_client.ingest_from_file(text_vectors_path, ingestion_properties=ingestion_props)

#### Ingest to doc-sample

In [None]:
# JSON file with the document sample's embeddings; the path is relative to
# the notebook's working directory.
doc_vectors_path = "../data/docs/employee_handbook_embeddings.json"

# Ingestion target and format. The database and table names are already plain
# strings, so they are passed directly rather than re-wrapped in f-strings.
ingestion_props = IngestionProperties(
    database=kdb_database_name,
    table=doc_vectors_table,
    data_format=DataFormat.MULTIJSON,
    report_level=ReportLevel.FailuresAndSuccesses,
)

# Submit the file through the queued ingest client.
ingestion_client.ingest_from_file(doc_vectors_path, ingestion_properties=ingestion_props)

#### Ingest to image-sample

In [None]:
# JSON file with the image sample's embeddings; the path is relative to the
# notebook's working directory.
image_vectors_path = "../data/images/images_embeddings.json"

# Ingestion target and format. The database and table names are already plain
# strings, so they are passed directly rather than re-wrapped in f-strings.
ingestion_props = IngestionProperties(
    database=kdb_database_name,
    table=image_vectors_table,
    data_format=DataFormat.MULTIJSON,
    report_level=ReportLevel.FailuresAndSuccesses,
)

# Submit the file through the queued ingest client.
ingestion_client.ingest_from_file(image_vectors_path, ingestion_properties=ingestion_props)