### Set up a Python virtual environment in Visual Studio Code

1. Open the Command Palette (Ctrl+Shift+P).
1. Search for **Python: Create Environment**.
1. Select **Venv**.
1. Select a Python interpreter. Choose 3.10 or later.

It can take a minute to set up. If you run into problems, see [Python environments in VS Code](https://code.visualstudio.com/docs/python/environments).

### Install packages

In [None]:
# Install the packages listed in requirements.txt. The %pip magic (rather than
# !pip) targets the environment of the running kernel; --quiet trims pip's output.
%pip install --quiet -r requirements.txt

### Load .env file (Copy .env-sample to .env and update accordingly)

In [17]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

load_dotenv()  # take environment variables from .env.

# Required settings: indexing os.environ raises KeyError right here if one is missing.
# Variables not used here do not need to be updated in your .env file
search_endpoint = os.environ["AZURE_SEARCH_SERVICE_ENDPOINT"]
azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
azure_openai_embedding_deployment_id = os.environ["AZURE_OPENAI_EMBEDDING_DEPLOYMENT_ID"]

# Two indexes for different text splitting strategies
recursivetextsplitter_searchindex = os.environ["AZURE_SEARCH_LANGCHAIN_RECURSIVETEXTSPLITTER_INDEX"]
spacytextsplitter_searchindex = os.environ["AZURE_SEARCH_LANGCHAIN_SPACYTEXTSPLITTER_INDEX"]

# Optional settings: the API keys. They are read with os.getenv so that an
# unset variable behaves the same as an empty one and selects the keyless
# fallback instead of raising KeyError.
search_admin_key = os.getenv("AZURE_SEARCH_ADMIN_KEY")
search_credential = AzureKeyCredential(search_admin_key) if search_admin_key else DefaultAzureCredential()

# None tells the later cells that no Azure OpenAI key is configured.
azure_openai_key = os.getenv("AZURE_OPENAI_KEY") or None

### Set up sample resources for embedding chunks

In [9]:
from openai import AzureOpenAI
from azure.identity import get_bearer_token_provider

# Choose the authentication arguments: the API key when one is configured,
# otherwise an Azure AD bearer-token provider built on DefaultAzureCredential.
if azure_openai_key:
    openai_auth_kwargs = {"api_key": azure_openai_key}
else:
    openai_auth_kwargs = {
        "azure_ad_token_provider": get_bearer_token_provider(
            DefaultAzureCredential(), "https://cognitiveservices.azure.com/.default"
        )
    }

# API version, deployment and endpoint are the same for both authentication modes.
azure_openai_client = AzureOpenAI(
    api_version="2023-05-15",
    azure_deployment=azure_openai_embedding_deployment_id,
    azure_endpoint=azure_openai_endpoint,
    **openai_auth_kwargs,
)

### Set up sample resources for recursive text splitter chunking

In [15]:
from azure.search.documents.indexes import SearchIndexClient
from lib.common import (
    create_search_index,
)

# Client used to manage index definitions on the search service.
search_index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
# Build the index definition for the recursive text splitter chunks.
# NOTE(review): create_search_index comes from lib.common and is not shown here;
# presumably the Azure OpenAI endpoint/deployment/key configure the index's
# vector settings - confirm against lib/common.
rts_searchindex = create_search_index(
    recursivetextsplitter_searchindex,
    azure_openai_endpoint,
    azure_openai_embedding_deployment_id,
    azure_openai_key
)
# Send the definition to the service (create, or update if it already exists).
search_index_client.create_or_update_index(rts_searchindex)

print("Created recursive text splitter index")


Created recursive text splitter index


### Load PDF

In [4]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

# Path of the sample PDF, relative to the notebook's working directory.
pdf_path = os.path.join("data", "earth_at_night_508.pdf")

# Load the PDF into LangChain documents (presumably one per page - confirm
# against the PyPDFLoader documentation).
loader = PyPDFLoader(pdf_path)
pages = loader.load()


### Chunk PDF using Recursive text splitter

In [None]:
# Splitter configuration:
#   chunk_size / chunk_overlap are measured with length_function, which is
#   len() here, i.e. in characters of text.
#   is_separator_regex=False: separators are not interpreted as regular expressions.
recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=500,
    length_function=len,
    is_separator_regex=False,
)

# Split the loaded PDF documents into chunks.
recursive_text_splitter_chunks = recursive_text_splitter.split_documents(pages)

### Embed Recursive text splitter chunks

In [17]:
# Embed the text of every chunk with a single request to the embedding deployment.
recursive_chunk_texts = [chunk.page_content for chunk in recursive_text_splitter_chunks]
recursive_embedding_response = azure_openai_client.embeddings.create(
    input=recursive_chunk_texts,
    model=azure_openai_embedding_deployment_id,
)

# Keep only the vectors, in the order the response lists them.
recursive_text_splitter_embeddings = [item.embedding for item in recursive_embedding_response.data]

### Upload chunks to search index

In [21]:
# Client bound to the recursive text splitter index.
recursive_search_client = search_index_client.get_search_client(recursivetextsplitter_searchindex)

# One search document per chunk; chunk i is paired with embedding i.
docs = [
    {
        "parent_id": "0",
        "chunk_id": f"{i}_earth_at_night_508_pdf",
        "chunk": chunk.page_content,
        "title": "earth_at_night_508.pdf",
        "vector": recursive_text_splitter_embeddings[i]
    }
    for i, chunk in enumerate(recursive_text_splitter_chunks)
]

# upload_documents reports the outcome per document instead of raising for
# individual failures, so inspect the results before announcing success.
upload_results = recursive_search_client.upload_documents(docs)
failed_uploads = [
    f"{result.key}: {result.error_message}"
    for result in upload_results
    if not result.succeeded
]
if failed_uploads:
    raise RuntimeError(
        f"Failed to upload {len(failed_uploads)} of {len(docs)} chunks "
        f"to index '{recursivetextsplitter_searchindex}': {failed_uploads}"
    )

print("Uploaded chunks and embeddings for recursive text splitter")

Uploaded chunks and embeddings for recursive text splitter


### (Optional) Chunk PDF using spaCy text splitter

In [5]:
from langchain.text_splitter import SpacyTextSplitter

# Chunk size and overlap match the recursive text splitter configuration above.
spacey_text_splitter = SpacyTextSplitter(
    # See https://spacy.io/models/en/ for available models
    pipeline="sentencizer",
    chunk_size=2000,
    chunk_overlap=500,
)

# Split the loaded PDF documents into chunks.
spacey_text_splitter_chunks = spacey_text_splitter.split_documents(pages)

### (Optional) Embed spaCy text splitter chunks

In [13]:
# Embed the text of every chunk with a single request to the embedding deployment.
spacy_chunk_texts = [chunk.page_content for chunk in spacey_text_splitter_chunks]
spacy_embedding_response = azure_openai_client.embeddings.create(
    input=spacy_chunk_texts,
    model=azure_openai_embedding_deployment_id,
)

# Keep only the vectors, in the order the response lists them.
spacy_text_splitter_embeddings = [item.embedding for item in spacy_embedding_response.data]

### (Optional) Create the search index and upload chunks

In [18]:
# Client used to manage index definitions on the search service.
search_index_client = SearchIndexClient(endpoint=search_endpoint, credential=search_credential)
# Build the index definition for the spaCy text splitter chunks.
# NOTE(review): create_search_index comes from lib.common and is not shown here;
# presumably the Azure OpenAI endpoint/deployment/key configure the index's
# vector settings - confirm against lib/common.
sts_searchindex = create_search_index(
    spacytextsplitter_searchindex,
    azure_openai_endpoint,
    azure_openai_embedding_deployment_id,
    azure_openai_key
)
# Send the definition to the service (create, or update if it already exists).
search_index_client.create_or_update_index(sts_searchindex)

print("Created spacy text splitter index")

# Client bound to the spaCy text splitter index.
spacy_search_client = search_index_client.get_search_client(spacytextsplitter_searchindex)

# One search document per chunk; chunk i is paired with embedding i.
docs = [
    {
        "parent_id": "0",
        "chunk_id": f"{i}_earth_at_night_508_pdf",
        "chunk": chunk.page_content,
        "title": "earth_at_night_508.pdf",
        "vector": spacy_text_splitter_embeddings[i]
    }
    for i, chunk in enumerate(spacey_text_splitter_chunks)
]

# upload_documents reports the outcome per document instead of raising for
# individual failures, so inspect the results before announcing success.
upload_results = spacy_search_client.upload_documents(docs)
failed_uploads = [
    f"{result.key}: {result.error_message}"
    for result in upload_results
    if not result.succeeded
]
if failed_uploads:
    raise RuntimeError(
        f"Failed to upload {len(failed_uploads)} of {len(docs)} chunks "
        f"to index '{spacytextsplitter_searchindex}': {failed_uploads}"
    )

print("Uploaded chunks and embeddings for spacy text splitter")

Created spacy text splitter index
Uploaded chunks and embeddings for spacy text splitter
