In [None]:
# Azure ML Python SDK v2; provides the `azure.ai.ml` package imported below.
%pip install azure-ai-ml
# AzureML RAG utilities with the FAISS extra; -U upgrades to a release >= 0.1.11.
%pip install -U 'azureml-rag[faiss]>=0.1.11'

In [None]:
# If `import win32file` fails with a DLL error then run the following and restart kernel:
# %pip uninstall -y pywin32
# %conda install -y --force-reinstall pywin32

# Create a FAISS based Vector Index for DBCopilot with AzureML
We'll walk through setting up an AzureML Pipeline which grounds a database into a LangChain-compatible FAISS Vector Index, and creating the promptflow that consumes this index to serve as a DBCopilot chatbot.

## Get client for AzureML Workspace

The workspace is the top-level resource for Azure Machine Learning, providing a centralized place to work with all the artifacts you create when you use Azure Machine Learning. In this section we will connect to the workspace in which the job will be run.

Enter your Workspace details below; running this cell will write a `workspace.json` file to the current folder.

In [None]:
%%writefile workspace.json
{
    "subscription_id": "<subscription_id>",
    "resource_group": "<resource_group_name>",
    "workspace_name": "<workspace_name>"
}

`MLClient` is how you interact with AzureML

In [None]:
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.ai.ml import MLClient
from azureml.core import Workspace

# Pick a credential: prefer the default credential chain and fall back to an
# interactive browser login when no token can be obtained from it.
try:
    credential = DefaultAzureCredential()
    # Requesting a management token proves the credential is actually usable.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Build the client from the workspace.json written by the previous cell.
try:
    ml_client = MLClient.from_config(path="workspace.json", credential=credential)
except Exception as config_error:
    raise Exception(
        "Failed to create MLClient from config file. Please modify and then run the above cell with your AzureML Workspace details."
    ) from config_error

# Workspace handle for the same workspace; the azureml.rag connection helpers
# used later in this notebook take it as their first argument.
ws = Workspace(
    workspace_name=ml_client.workspace_name,
    resource_group=ml_client.resource_group_name,
    subscription_id=ml_client.subscription_id,
)
print(ml_client)

## For SQL Server Create a SQL Datastore
1. Go to workspace in Azure Portal
2. Click **Data** -> **Datastore** -> **+ Create**
3. Fill in the datastore form

| Field | Value |
| --- | --- |
| Datastore name | the name for the datastore |
| Datastore type | Azure SQL Database |
| Account information | can be found in the Azure SQL Database overview page |
| Authentication type | SQL Authentication/ Service principal |
| Authentication info | SQL Authentication: username and password; Service principal: tenant id, client id, client secret |

In [None]:
# Name of the Azure SQL datastore registered in the workspace (replace the placeholder).
datastore_name = "<test_db_datastore_name>"
# Datastore URI handed to the pipeline as its `db_datastore` input.
db_datastore_uri = "azureml://datastores/{}".format(datastore_name)

<H3>For Tabular Data</H3>



In [None]:
# pandas backs the DataFrame that is written to sqlite in the cells below.
%pip install pandas
# mltable reads the CSV; the azureml-dataprep[pandas] extra is presumably what
# `to_pandas_dataframe()` relies on -- confirm against the mltable docs.
%pip install -U mltable azureml-dataprep[pandas]

In [None]:
%%writefile test.csv
id,text
0,Hello world!

In [None]:
import sqlite3
import mltable
from azure.ai.ml.entities import Data
from azure.ai.ml.constants import AssetTypes

# Load the sample CSV as an MLTable and materialise it as a pandas DataFrame.
tbl = mltable.from_delimited_files([{"file": "test.csv"}])
df = tbl.to_pandas_dataframe()

# Convert tabular data to sqlite db. try/finally guarantees the connection is
# closed (and the file handle released) even if writing the table fails.
conn = sqlite3.connect("test.db")
try:
    df.to_sql("test_table", conn, if_exists="replace")
finally:
    conn.close()

# Upload sqlite db to AzureML as a URI_FILE data asset.
my_data = Data(
    name="test_db",
    description="test db",
    version="1",
    path="test.db",
    type=AssetTypes.URI_FILE,
)
my_data = ml_client.data.create_or_update(my_data)
print(f"Data asset created. Name: {my_data.name}, version: {my_data.version}")
# Use the uploaded asset as the pipeline's database input. This replaces any
# `db_datastore_uri` assigned earlier in the notebook.
db_datastore_uri = my_data.path

## Azure OpenAI

We recommend using the gpt-35-turbo model to get good quality QAs. [Follow these instructions](https://learn.microsoft.com/en-us/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal) to set up an Azure OpenAI instance and deploy the model. Once you have the model deployed in AOAI you can specify your Model name and Deployment name below.

We will use the automatically created `Default_AzureOpenAI` connection, change `aoai_connection_name` to use your own.

In [None]:
# Name of the workspace connection to Azure OpenAI; the next cell looks the
# connection up by this name.
aoai_connection_name = "Default_AzureOpenAI"

In [None]:
from azureml.rag.utils.connections import (
    get_connection_by_name_v2,
    create_connection_v2,
)

# Look up the Azure OpenAI connection by name. When the lookup fails, the
# except branch is a template for creating the connection: fill in the
# placeholders, then delete the `raise` statement so the creation call runs.
try:
    aoai_connection = get_connection_by_name_v2(ws, aoai_connection_name)
except Exception as ex:
    # Create New Connection
    # Modify the details below to match the `Endpoint` and API key of your AOAI resource, these details can be found in Azure Portal
    raise RuntimeError(
        "Have you entered your AOAI resource details below? If so, delete me!"
    ) from ex
    aoai_connection = create_connection_v2(
        workspace=ws,
        # `aoai_connection` is unbound here because the lookup failed; the
        # new connection must be created under the configured name.
        name=aoai_connection_name,
        category="AzureOpenAI",
        # 'Endpoint' from Azure OpenAI resource overview
        target="https://<endpoint_name>.openai.azure.com/",
        auth_type="ApiKey",
        credentials={
            # Either `Key` from the `Keys and Endpoint` tab of your Azure OpenAI resource, will be stored in your Workspace associated Azure Key Vault.
            "key": "<api-key>"
        },
        metadata={"ApiType": "azure", "ApiVersion": "2023-05-15"},
    )

# Connection id passed to the pipeline as its `aoai_connection` input.
aoai_connection_id = aoai_connection["id"]

Now that your Workspace has a connection to Azure OpenAI, we will make sure the `text-embedding-ada-002` embeddings model and the `gpt-35-turbo` model have been deployed and are ready for inference. The cells below will fail if there is no deployment for the corresponding model; [follow these instructions](https://learn.microsoft.com/azure/cognitive-services/openai/how-to/create-resource?pivots=web-portal#deploy-a-model) to deploy a model with Azure OpenAI.

In [None]:
from azureml.rag.utils.deployment import infer_deployment

# Resolve the deployment that serves the embeddings model on the connected
# Azure OpenAI resource.
aoai_embedding_model_name = "text-embedding-ada-002"
try:
    aoai_embedding_deployment_name = infer_deployment(
        aoai_connection, aoai_embedding_model_name
    )
    print(
        f"Deployment name in AOAI workspace for model '{aoai_embedding_model_name}' is '{aoai_embedding_deployment_name}'"
    )
except Exception:
    # Report the model that was actually looked up; the previous code used an
    # undefined name here, so a NameError hid the real failure.
    print(
        f"Deployment name in AOAI workspace for model '{aoai_embedding_model_name}' is not found."
    )
    print(
        f"Please create a deployment for this model by following the deploy instructions on the resource page for '{aoai_connection['properties']['target']}' in Azure Portal."
    )
    # Later cells need `aoai_embedding_deployment_name`, so stop here with the
    # original error instead of failing further down on an unbound name.
    raise

In [None]:
from azureml.rag.utils.deployment import infer_deployment

# Resolve the deployment that serves the chat/completion model on the
# connected Azure OpenAI resource.
aoai_completion_model_name = "gpt-35-turbo"

try:
    aoai_completion_deployment_name = infer_deployment(
        aoai_connection, aoai_completion_model_name
    )
except Exception:
    print(
        f"Deployment name in AOAI workspace for model '{aoai_completion_model_name}' is not found."
    )
    print(
        f"Please create a deployment for this model by following the deploy instructions on the resource page for '{aoai_connection['properties']['target']}' in Azure Portal."
    )
    # Surface the real lookup error. Printing the success message after a
    # failure would only hit a NameError on the unbound deployment name.
    raise
else:
    # Only report a deployment name when the lookup succeeded.
    print(
        f"Deployment name in AOAI workspace for model '{aoai_completion_model_name}' is '{aoai_completion_deployment_name}'"
    )

Finally we will combine the deployment and model information into a uri form which the AzureML embeddings components expect as input.

In [None]:
# JSON-formatted completion settings, assembled from adjacent string literals
# so each field sits on its own line; the resulting string is unchanged.
llm_completion_config = (
    '{"type":"azure_open_ai",'
    f'"model_name":"{aoai_completion_model_name}",'
    f'"deployment_name":"{aoai_completion_deployment_name}",'
    '"temperature":0,"max_tokens":"1500"}'
)

### Setup Pipeline

The Components are published to a [Registry](https://learn.microsoft.com/azure/machine-learning/how-to-manage-registries?view=azureml-api-2&tabs=cli), `azureml`, which you should have access to by default; it can be accessed from any Workspace.
In the below cell we get the Component Definitions from the `azureml` registry.

In [None]:
# Client scoped to the shared `azureml` registry rather than to a workspace.
ml_registry = MLClient(credential=credential, registry_name="azureml")

# Fetch the version labelled "latest" of the DB-to-FAISS ingestion component.
db_copilot_component = ml_registry.components.get(
    "llm_ingest_db_to_faiss", label="latest"
)

# Print the component definition so its inputs and outputs can be inspected.
print(db_copilot_component)

In [None]:
from azure.ai.ml.dsl import pipeline


# Single-step pipeline around `db_copilot_component`.
#
# Inputs:
#   aoai_connection: used for both the component's `embedding_connection` and
#       `llm_connection` inputs.
#   db_datastore: forwarded as the component's `db_datastore` input.
#   embeddings_model: forwarded as the component's `embeddings_model` input.
#   chat_aoai_deployment_name / embedding_aoai_deployment_name: forwarded
#       unchanged under the same names.
#   mlindex_dataset_name: forwarded as the component's `embeddings_dataset_name`.
#   selected_tables: optional table selection, forwarded unchanged (default None).
#   max_sampling_rows: forwarded unchanged (default 3).
#
# Returns an empty dict, i.e. the pipeline declares no outputs of its own.
@pipeline(name="db_copilot_vector_pipeline_faiss", default_compute="serverless")
def db_copilot_vector_pipeline_faiss(
    aoai_connection: str,
    db_datastore: str,
    embeddings_model: str,
    chat_aoai_deployment_name: str,
    embedding_aoai_deployment_name: str,
    mlindex_dataset_name: str,
    selected_tables: str = None,
    max_sampling_rows: int = 3,
):
    db_copilot_component(
        db_datastore=db_datastore,
        embeddings_model=embeddings_model,
        chat_aoai_deployment_name=chat_aoai_deployment_name,
        embedding_aoai_deployment_name=embedding_aoai_deployment_name,
        embeddings_dataset_name=mlindex_dataset_name,
        # The same Azure OpenAI connection serves embeddings and chat.
        embedding_connection=aoai_connection,
        llm_connection=aoai_connection,
        selected_tables=selected_tables,
        max_sampling_rows=max_sampling_rows,
    )
    return {}

In [None]:
# Instantiate the pipeline with the connection, database and deployment
# details gathered in the cells above.
aoai_embedding_model_name = "text-embedding-ada-002"
# Name under which the resulting index asset is registered.
asset_name = "llm_index_db_dataset"
pipeline_job = db_copilot_vector_pipeline_faiss(
    aoai_connection=aoai_connection_id,
    db_datastore=db_datastore_uri,
    # The embeddings URI must pair the embedding deployment with the embedding
    # model name; it previously used the completion model name.
    embeddings_model=f"azure_open_ai://deployment/{aoai_embedding_deployment_name}/model/{aoai_embedding_model_name}",
    chat_aoai_deployment_name=aoai_completion_deployment_name,
    embedding_aoai_deployment_name=aoai_embedding_deployment_name,
    mlindex_dataset_name=asset_name,
    selected_tables='["[dbo].[jobs]"]',  # '["jobs"]' for sqlite
    max_sampling_rows=3,
)

In [None]:
# Tag the job so that an in-progress index generation can be listed in the UI.
# Jobs submitted from the UI get these properties automatically; a job
# submitted from a notebook has to set them itself.
pipeline_job.properties["azureml.mlIndexAssetName"] = asset_name
pipeline_job.properties["azureml.mlIndexAssetKind"] = "faiss"
pipeline_job.properties["azureml.mlIndexAssetSource"] = "Database"

In [None]:
# Submit the pipeline job to the workspace under the "db_copilot_pipeline" experiment.
running_pipeline_job = ml_client.jobs.create_or_update(
    pipeline_job, experiment_name="db_copilot_pipeline"
)
# Bare last expression so the notebook displays the returned job object.
running_pipeline_job

In [None]:
# Stream the submitted job's logs into the notebook output
# (presumably blocks until the job finishes -- confirm against the SDK docs).
ml_client.jobs.stream(running_pipeline_job.name)

## Use DBCopilot with Promptflow
After the pipeline completes, it will create a promptflow which can be used to chat with the database.