# End to end process

This code demonstrates the complete process, which consists of the following tasks:
1. Convert HTML files to markdown format
2. Chunk the markdown content with a specified maximum number of tokens (in this example, 512 tokens with a 25% overlap)
3. Create the index and upload the chunks (in this example, the embeddings are created with the text-embedding-ada-002 model)
4. Test the search and answer generation, creating the Excel files with the results of the answer evaluation

## Prerequisites
+ An Azure subscription, with [access to Azure OpenAI](https://aka.ms/oai/access).
+ A Document Intelligence service with its endpoint and API key.
+ An Azure OpenAI service with the service name and an API key.
+ A deployment of the text-embedding-ada-002 embedding model on the Azure OpenAI Service.
+ An Azure AI Search service with its endpoint, API key, and the name of the index to create.

We used Python 3.12.5, [Visual Studio Code with the Python extension](https://code.visualstudio.com/docs/python/python-tutorial), and the [Jupyter extension](https://marketplace.visualstudio.com/items?itemName=ms-toolsai.jupyter) to test this example.

### Set up a Python virtual environment in Visual Studio Code

1. Open the Command Palette (Ctrl+Shift+P).
1. Search for **Python: Create Environment**.
1. Select **Venv**.
1. Select a Python interpreter. Choose 3.10 or later.

It can take a minute to set up. If you run into problems, see [Python environments in VS Code](https://code.visualstudio.com/docs/python/environments).

In [None]:
!pip install openai
!pip install azure-search-documents
!pip install nbimporter
!pip install nbformat

## Import packages and create AOAI and AI Search clients

In [1]:
import os
import sys
from dotenv import load_dotenv
from openai import AzureOpenAI
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.search.documents import SearchClient
import pandas as pd

sys.path.append('../..')
from rag_utils import load_files, get_markdown_with_doc_intel, chunk_with_max_tokens, create_index, index_documents, generate_answers_and_questions, execute_test, generate_chunks_with_aoai, generate_topics_and_documents

# Load environment variables from .env; override=True lets values from the file
# replace variables already set in the process environment.
load_dotenv(override=True)

# DOCUMENT INTELLIGENCE
# Read with os.environ[...] like every other setting in this cell, so a missing
# variable fails right here with a KeyError naming it instead of passing None
# on to AzureKeyCredential / DocumentIntelligenceClient.
doc_intel_endpoint = os.environ["DOC_INTEL_ENDPOINT"]
doc_intel_key = os.environ["DOC_INTEL_KEY"]
doc_intel_client = DocumentIntelligenceClient(endpoint=doc_intel_endpoint, credential=AzureKeyCredential(doc_intel_key))

# AZURE AI SEARCH
ai_search_endpoint = os.environ["SEARCH_SERVICE_ENDPOINT"]
# NOTE(review): the variable is named as a *query* key, but the credential built
# from it is also passed to create_index / index_documents further down. Azure AI
# Search query keys are read-only - confirm the value stored here is an admin key.
ai_search_apikey = os.environ["SEARCH_SERVICE_QUERY_KEY"]
ai_search_index_name = os.environ["SEARCH_INDEX_NAME"]
ai_search_credential = AzureKeyCredential(ai_search_apikey)

# CREATE AZURE AI SEARCH CLIENT
# Bound to the index named by SEARCH_INDEX_NAME.
ai_search_client = SearchClient(endpoint=ai_search_endpoint, index_name=ai_search_index_name, credential=ai_search_credential)

# API version shared by every Azure OpenAI client created below
aoai_api_version = '2024-02-15-preview'


def _new_aoai_client(deployment, endpoint, api_key):
    """Return an AzureOpenAI client for *deployment* hosted at *endpoint*."""
    return AzureOpenAI(
        api_key=api_key,
        azure_endpoint=endpoint,
        api_version=aoai_api_version,
        azure_deployment=deployment,
    )


# --- Answer generation ---
aoai_answer_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
aoai_answer_apikey = os.environ["AZURE_OPENAI_API_KEY"]
aoai_answer_model_name = os.environ["AZURE_OPENAI_DEPLOYMENT_NAME"]
aoai_answer_client = _new_aoai_client(aoai_answer_model_name, aoai_answer_endpoint, aoai_answer_apikey)

# --- Reranking ---
aoai_rerank_endpoint = os.environ["AZURE_OPENAI_RERANK_ENDPOINT"]
azure_openai_rerank_key = os.environ["AZURE_OPENAI_RERANK_API_KEY"]
rerank_model_name = os.environ["AZURE_OPENAI_RERANK_DEPLOYMENT_NAME"]
aoai_rerank_client = _new_aoai_client(rerank_model_name, aoai_rerank_endpoint, azure_openai_rerank_key)

# --- Embeddings ---
# Both embedding deployments share one endpoint and one API key.
aoai_embedding_endpoint = os.environ["AZURE_OPENAI_EMBEDDING_ENDPOINT"]
azure_openai_embedding_key = os.environ["AZURE_OPENAI_EMBEDDING_API_KEY"]
embedding_model_name_ada = os.environ["AZURE_OPENAI_EMBEDDING_NAME_ADA"]
embedding_model_name_large_3 = os.environ["AZURE_OPENAI_EMBEDDING_NAME_LARGE_3"]
# Client for embedding creation (ADA)
aoai_embedding_client_ada = _new_aoai_client(embedding_model_name_ada, aoai_embedding_endpoint, azure_openai_embedding_key)
# Client for embedding creation (LARGE-3)
aoai_embedding_client_large_3 = _new_aoai_client(embedding_model_name_large_3, aoai_embedding_endpoint, azure_openai_embedding_key)

# Prepare the tests
# Each entry maps a test name to a 7-tuple. The test loop unpacks it in this order:
#   (embedding_fields, case, embedding_model, embedding_client, index_name, max_retrieve, max_generate)
#   - embedding_fields: comma-separated names of the embedding fields
#   - case: "upper" or "lower"
#   - embedding_model / embedding_client: model label and the AzureOpenAI client used for it
#   - index_name: the AI Search index the test runs against; it must already exist and be populated
#   - max_retrieve / max_generate: presumably how many chunks are retrieved and how many
#     are passed to answer generation - TODO confirm in rag_utils.execute_test
# Every active entry ends with a comma so the large-3 entries below can be enabled
# just by uncommenting them.
TESTS = {
    "title_content_ada_512_search_upper_20_10": ("embeddingTitle, embeddingContent", "upper", "ada", aoai_embedding_client_ada, "project_assurance_ada_512", 20, 10),
    "title_content_ada_512_search_upper_20_20": ("embeddingTitle, embeddingContent", "upper", "ada", aoai_embedding_client_ada, "project_assurance_ada_512", 20, 20),
    "title_content_ada_512_search_lower_20_10": ("embeddingTitle, embeddingContent", "lower", "ada", aoai_embedding_client_ada, "project_assurance_ada_512", 20, 10),
    "title_content_ada_512_search_lower_20_20": ("embeddingTitle, embeddingContent", "lower", "ada", aoai_embedding_client_ada, "project_assurance_ada_512", 20, 20),

    # "title_content_large_3_512_search_upper_20_10": ("embeddingTitle, embeddingContent", "upper", "large-3", aoai_embedding_client_large_3, "project_assurance_large_3_512", 20, 10),
    # "title_content_large_3_512_search_upper_20_20": ("embeddingTitle, embeddingContent", "upper", "large-3", aoai_embedding_client_large_3, "project_assurance_large_3_512", 20, 20),
    # "title_content_large_3_512_search_lower_20_10": ("embeddingTitle, embeddingContent", "lower", "large-3", aoai_embedding_client_large_3, "project_assurance_large_3_512", 20, 10),
    # "title_content_large_3_512_search_lower_20_20": ("embeddingTitle, embeddingContent", "lower", "large-3", aoai_embedding_client_large_3, "project_assurance_large_3_512", 20, 20),
}

# CONSTANTS
# Max number of tokens for chunking
MAX_CHUNK_TOKEN_SIZE = 512
# 25% of overlapping between chunks
CHUNK_OVERLAPPING = 0.25
# Fields to retrieve in the search
SELECT_FIELDS = ["id", "title", "content"]


### Generate synthetic HTML content and, for every HTML file in the input directory, convert it to markdown, generate synthetic Q&A pairs and chunk the markdown; then index the chunks in AI Search, execute the tests and evaluate the generated answers

In [None]:
# STEP 1: Generate synthetic content about a customer's sector. Example: Telco
# The helper receives the answer-generation client and deployment name, a company
# name ("QuickConnect"), a sector and input_dir.
# Presumably it writes the generated HTML files into input_dir - TODO confirm in rag_utils.
input_dir="../output_telco"
generate_topics_and_documents(aoai_answer_client, aoai_answer_model_name, "QuickConnect", "telecommunications", input_dir)

# To use the HTML about a Telco company that already ships in the data_in folder,
# comment out STEP 1 above and uncomment the next line:
#input_dir = '../data_in'
# Load the .html files from input_dir; each item is later read via ['title'] and ['content'].
html_files = load_files(input_dir, '.html')

# Maximum number of HTML files to process, so the test finishes faster
MAX_FILES = 10

# Rows to index, one {"title", "content"} dict per chunk, across all files
all_chunks = []
# Column-oriented question/answer pairs, later saved to an Excel file
qa_data = {'question': [], 'answer': []}
# Read the html files
for file_number, html_file in enumerate(html_files, start=1):
    print(f"[{file_number}]: {html_file['title']}")
    # print(f"\t[{html_file['content']}]")

    # STEP 2: Convert the html files to markdown format
    print('\tConverting to markdown...')
    markdown = get_markdown_with_doc_intel(doc_intel_client, html_file['content'])
    # print(f'markdown: [{markdown}]')
    # Strip only a trailing '.html' extension; str.replace would also remove
    # any '.html' that happens to appear inside the title.
    title = html_file['title'].removesuffix('.html')

    # STEP 3: Generate question and answer pairs from the markdown content and prepare them to be saved in an Excel file
    qa_pairs = generate_answers_and_questions(aoai_answer_client, aoai_answer_model_name, title + '. ' + markdown)
    for qa in qa_pairs:
        qa_data['question'].append(qa['question'])
        qa_data['answer'].append(qa['answer'])

    # STEP 4: Chunk the markdown content with a maximum number of tokens and a percentage of overlapping
    # In this example, 512 tokens with 25% of overlapping
    chunks = chunk_with_max_tokens(markdown, max_tokens=MAX_CHUNK_TOKEN_SIZE, overlap=CHUNK_OVERLAPPING)

    # Chunk the markdown content with Semantic Chunking with GPT-4o with a maximum of tokens
    # To test this chunking method comment the previous line and uncomment the following one:
    # chunks = generate_chunks_with_aoai(aoai_answer_client, aoai_answer_model_name, markdown, MAX_CHUNK_TOKEN_SIZE)

    # Prepare the list of chunks to be indexed
    all_chunks.extend({"title": title, "content": chunk} for chunk in chunks)

    # Stop after MAX_FILES files to finish the test faster
    if file_number == MAX_FILES:
        break

# Write the collected question/answer pairs to an Excel workbook
qa_output_file = 'qa_pairs.xlsx'
df = pd.DataFrame(data=qa_data)
# index=False keeps the DataFrame row index out of the sheet
df.to_excel(excel_writer=qa_output_file, index=False)
print(f'File {qa_output_file} saved')

# STEP 5: Create the index for embeddings with ada-02
# The name must be the one the TESTS entries query ('project_assurance_ada_512').
# With the previous value 'project_assurance_ada', the tests ran against an index
# that this notebook never created or populated.
index_name = 'project_assurance_ada_512'
create_index(ai_search_endpoint, ai_search_credential, index_name, embedding_model_name_ada)

# STEP 6: Index the chunks
# In this example, the embeddings are created with the model ada-02
index_documents(ai_search_endpoint, ai_search_credential, index_name, aoai_embedding_client_ada, embedding_model_name_ada, all_chunks)

# STEP 7: Execute the tests with ada-02
# Each TESTS value is unpacked into its seven per-test settings. The index name is
# bound to test_index_name so the loop does not overwrite the module-level
# index_name that identifies the index created in the previous steps.
for test_name, (embedding_fields, case, embedding_model, embedding_client, test_index_name, max_retrieve, max_generate) in TESTS.items():
    execute_test(
        ai_search_endpoint, ai_search_credential, SELECT_FIELDS,
        aoai_rerank_client, rerank_model_name,
        aoai_answer_client, aoai_answer_model_name,
        test_name, embedding_fields, case, embedding_model, embedding_client,
        test_index_name, max_retrieve, max_generate,
        qa_output_file,
    )
