In [1]:
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.core.credentials import AzureKeyCredential
import os

# Read settings from a local .env file. override=True asks python-dotenv to let
# values from .env replace variables already present in the process environment.
load_dotenv(override=True)

# Variables not used here do not need to be updated in your .env file
# endpoint = os.environ["AZURE_COGNITIVE_SEARCH_ENDPOINT"]
# credential = AzureKeyCredential(os.getenv("AZURE_COGNITIVE_SEARCH_KEY")) if os.getenv("AZURE_COGNITIVE_SEARCH_KEY") else DefaultAzureCredential()
# index_name = os.getenv("AZURE_SEARCH_INDEX_NAME", "int-vec")

# The container name is required for the upload step. Fail here with a clear
# message instead of carrying None (or an empty string) into the storage client.
blob_container_name = os.getenv("BLOB_CONTAINER_NAME")
if not blob_container_name:
    raise ValueError(
        "BLOB_CONTAINER_NAME is not set; add it to your .env file or environment."
    )
# Required as well: a missing variable raises KeyError naming the variable.
blob_connection_string = os.environ["BLOB_CONNECTION_STRING"]

# azure_openai_embedding_deployment = os.getenv("EMBEDDING_MODEL_NAME", "text-embedding-3-large")
# azure_openai_model_name = os.getenv("EMBEDDING_MODEL_NAME", "text-embedding-3-large")
# azure_openai_model_dimensions = int(os.getenv("AZURE_OPENAI_EMBEDDING_DIMENSIONS", 1536))
# azure_openai_endpoint = os.environ["AZURE_OPENAI_ENDPOINT"]
# azure_openai_key = os.getenv("AZURE_OPENAI_KEY")

def env_flag(name: str, default: str = "false") -> bool:
    """Return True when environment variable ``name`` holds the word "true".

    Letter case and surrounding whitespace are ignored, so values such as
    ``True`` or ``TRUE `` enable the option as well. Any other value, including
    an unset variable with the default ``"false"``, yields False.

    Args:
        name: Environment variable to read.
        default: Text used when the variable is not set.
    """
    return os.getenv(name, default).strip().lower() == "true"


# set USE_OCR to enable OCR to add page numbers. It cannot be combined with the document layout skill
use_ocr = env_flag("USE_OCR")
# set USE_LAYOUT to enable Document Intelligence Layout skill for chunking by markdown. It cannot be combined with the built-in OCR
use_document_layout = env_flag("USE_LAYOUT")
# set USE_MARKDOWN to enable parsing markdown files in the blob container. It cannot be combined with the built-in OCR or document layout skill
# NOTE: this option defaults to enabled, so it must be set to false before turning on OCR or Layout.
use_markdown = env_flag("USE_MARKDOWN", "true")
# Deepest nesting level in markdown that should be considered. See https://learn.microsoft.com/azure/search/cognitive-search-skill-document-intelligence-layout to learn more
document_layout_depth = os.getenv("LAYOUT_MARKDOWN_HEADER_DEPTH", "h3")
# OCR must be used to add page numbers
add_page_numbers = use_ocr

# The three modes are mutually exclusive; collect the enabled ones so the error
# can name exactly which settings collide.
enabled_modes = [
    env_name
    for env_name, enabled in (
        ("USE_OCR", use_ocr),
        ("USE_LAYOUT", use_document_layout),
        ("USE_MARKDOWN", use_markdown),
    )
    if enabled
]
count_enabled = len(enabled_modes)
if count_enabled >= 2:
    raise ValueError(
        "Please enable only one of OCR, Layout or Markdown. "
        f"Currently enabled: {', '.join(enabled_modes)} "
        "(USE_MARKDOWN defaults to true; set it to false to use another mode)."
    )

In [16]:
# Root folder whose contents are listed here. Set DOCUMENTS_DIRECTORY in the
# environment (or .env) to point at your own copy; the fallback is the path
# that was previously hard-coded in this cell.
directory_path = os.getenv(
    "DOCUMENTS_DIRECTORY",
    r"C:\Users\User\GitHub\website\content\enCopy",
)
if not os.path.isdir(directory_path):
    raise FileNotFoundError(
        f"Documents directory not found: {directory_path!r}. "
        "Set DOCUMENTS_DIRECTORY to an existing folder."
    )

# os.listdir() returns entries in arbitrary order; sort them so the printed
# listing is stable between runs.
all_files_and_dirs = sorted(os.listdir(directory_path))

# Print the top-level entries, one per line
for entry in all_files_and_dirs:
    print(entry)

case-studies
community
docs
examples
includes
partners
releases
training


In [17]:
from azure.storage.blob import BlobServiceClient  
import glob

# sample_docs_directory = os.path.join( "data", "documents")
# sample_ocr_docs_directory = os.path.join( "data", "documents")
# sample_layout_docs_directory = os.path.join( "data", "documents")
# sample_markdown_docs_directory = os.path.join( "data", "documents")

# Every ingestion mode (default, OCR, layout, markdown) reads from the same
# local folder in this notebook, so all four names are bound to directory_path.
sample_docs_directory = sample_ocr_docs_directory = directory_path
sample_layout_docs_directory = sample_markdown_docs_directory = directory_path


In [4]:
# def upload_sample_documents(
#         blob_connection_string: str,
#         blob_container_name: str,
#         documents_directory: str,
#         # Set to false if you want to use credentials included in the blob connection string
#         # Otherwise your identity will be used as credentials
#         use_user_identity: bool = True
#     ):
#         # Connect to Blob Storage
#         blob_service_client = BlobServiceClient.from_connection_string(logging_enable=True, conn_str=blob_connection_string, credential=DefaultAzureCredential() if use_user_identity else None)
#         container_client = blob_service_client.get_container_client(blob_container_name)
#         if not container_client.exists():
#             container_client.create_container()

#         files = glob.glob(os.path.join(documents_directory, '*'))
#         for file in files:
#             with open(file, "rb") as data:
#                 name = os.path.basename(file)
#                 if not container_client.get_blob_client(name).exists():
#                     container_client.upload_blob(name=name, data=data)


def upload_sample_documents(
        blob_connection_string: str,
        blob_container_name: str,
        documents_directory: str,
        # Set to false if you want to use credentials included in the blob connection string
        # Otherwise your identity will be used as credentials
        use_user_identity: bool = True
    ):
    """Upload every file under ``documents_directory`` to a blob container.

    The directory tree is walked recursively. Each file is uploaded under a
    blob name equal to its path relative to ``documents_directory``, always
    written with ``/`` separators so the folder hierarchy is expressed the same
    way on Windows and POSIX systems. The container is created when it does not
    exist, and blobs that already exist are left untouched.

    Args:
        blob_connection_string: Connection string of the storage account.
        blob_container_name: Name of the target container.
        documents_directory: Local root directory to upload from.
        use_user_identity: When True, a ``DefaultAzureCredential`` is passed as
            the credential; when False, no credential is passed and the client
            relies on the connection string.
    """
    # Connect to Blob Storage
    credential = DefaultAzureCredential() if use_user_identity else None
    blob_service_client = BlobServiceClient.from_connection_string(
        logging_enable=True,
        conn_str=blob_connection_string,
        credential=credential,
    )
    container_client = blob_service_client.get_container_client(blob_container_name)
    if not container_client.exists():
        container_client.create_container()

    # Iterate over all files and subdirectories
    for root, _dirs, files in os.walk(documents_directory):
        for file_name in files:
            local_file_path = os.path.join(root, file_name)
            # The blob name mirrors the directory structure. os.path.relpath uses
            # the platform separator (a backslash on Windows), so normalise it to
            # "/", the conventional virtual-directory delimiter for blob names.
            blob_name = os.path.relpath(local_file_path, documents_directory).replace(os.sep, "/")
            # Check before opening so files that are skipped are never read.
            if container_client.get_blob_client(blob_name).exists():
                continue
            with open(local_file_path, "rb") as data:
                container_client.upload_blob(name=blob_name, data=data)

In [18]:
# Choose the source folder for the active ingestion mode. The first enabled
# mode wins (OCR, then layout, then markdown); with none enabled the generic
# sample folder is used.
docs_directory = (
    sample_ocr_docs_directory if use_ocr
    else sample_layout_docs_directory if use_document_layout
    else sample_markdown_docs_directory if use_markdown
    else sample_docs_directory
)

In [None]:
# for root, dirs, files in os.walk(docs_directory):
#     for file_name in files:
#         local_file_path = os.path.join(root, file_name)
#         print(local_file_path)

C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\kube-apiserver.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\kube-controller-manager.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\kube-proxy.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\kube-scheduler.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\kubelet.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\feature-gates\Accelerators.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\feature-gates\AdmissionWebhookMatchConditions.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\feature-gates\AdvancedAuditing.md
C:\Users\User\GitHub\website\content\enCopy\docs\reference\command-line-tools-reference\feature

In [None]:
# Push the selected local documents into the blob container, then report which
# container received them.
upload_sample_documents(blob_connection_string, blob_container_name, docs_directory)

print(f"Setup sample data in {blob_container_name}")

Setup sample data in azopenaisummer6768blob
