In [None]:
! pip install azure-search-documents==11.4.0a20230509004
! pip install openai
! pip install python-dotenv
! pip install azure-storage-blob

In [15]:
import os
import time
from dotenv import load_dotenv
import requests
import openai
import os
import re
import logging
from azure.core.credentials import AzureKeyCredential
from azure.storage.blob import BlobServiceClient
from azure.core.exceptions import ResourceNotFoundError
from azure.search.documents.models import Vector  
from azure.search.documents import SearchClient  
from azure.search.documents.indexes import SearchIndexClient, SearchIndexerClient
from azure.search.documents.indexes.models import (
    SimpleField,
    SearchField,
    SearchableField,
    SearchFieldDataType,
    SearchIndexer,
    IndexingParameters,
    FieldMapping,
    FieldMappingFunction,
    InputFieldMappingEntry, 
    OutputFieldMappingEntry, 
    SearchIndexerSkillset,
    SearchIndexerKnowledgeStore,
    SearchIndexerKnowledgeStoreProjection,
    SearchIndexerKnowledgeStoreFileProjectionSelector,
    IndexingParameters, 
    WebApiSkill,
    SearchIndex,
    SemanticSettings,
    SemanticConfiguration,
    PrioritizedFields,
    SemanticField,
    SearchIndexerDataSourceConnection,  
    SearchIndexerDataContainer,
    VectorSearch,  
    VectorSearchAlgorithmConfiguration,
    SearchIndexerDataContainer, SearchIndex, SearchIndexer, SimpleField, SearchFieldDataType,
    EntityRecognitionSkill, InputFieldMappingEntry, OutputFieldMappingEntry, SearchIndexerSkillset,
    CorsOptions, IndexingSchedule, SearchableField, IndexingParameters, SearchIndexerDataSourceConnection,
    DocumentExtractionSkill, Skill
)

# Configure environment variables  
load_dotenv()  
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  
index_name = "python-chunk"
key = os.getenv("AZURE_SEARCH_ADMIN_KEY")  
openai.api_type = "azure"  
openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")  
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")  
openai.api_version = os.getenv("AZURE_OPENAI_API_VERSION") 
blob_connection_string = os.getenv("BLOB_CONNECTION_STRING")  
container_name = "spe"
credential = AzureKeyCredential(key)

In [7]:
# Connect to Blob Storage
blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
container_client = blob_service_client.get_container_client(container_name)
blobs = container_client.list_blobs()

first_blob = next(blobs)
blob_url = container_client.get_blob_client(first_blob).url
print(f"URL of the first blob: {blob_url}")

URL of the first blob: https://fsunavalastorage.blob.core.windows.net/spe/Chevron%20frac%20containment%20and%20broaching%20wcd.pdf


In [8]:
# Create a data source 
ds_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))
container = SearchIndexerDataContainer(name=container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=f"{index_name}-blob",
    type="azureblob",
    connection_string=blob_connection_string,
    container=container
)
data_source = ds_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

Data source 'python-chunk-blob' created or updated


In [11]:
# Create a search index
index_client = SearchIndexClient(
    endpoint=service_endpoint, credential=credential)
fields = [
    SimpleField(name="id", type=SearchFieldDataType.String, key=True, sortable=True),  
    SimpleField(name="url", type=SearchFieldDataType.String, retrievable=True),  
    SearchableField(name="title", type=SearchFieldDataType.String, searchable=True, retrievable=True),  
    SearchableField(name="content", type=SearchFieldDataType.String, searchable=True, retrievable=True),  
    SearchField(  
        name="contentVector",  
        type=SearchFieldDataType.Collection(SearchFieldDataType.Single),  
        searchable=True,  
        dimensions=1536,  
        vector_search_configuration="my-vector-config",  
    ),  
]

vector_search = VectorSearch(
    algorithm_configurations=[
        VectorSearchAlgorithmConfiguration(
            name="my-vector-config",
            kind="hnsw",
            hnsw_parameters={
                "m": 4,
                "efConstruction": 400,
                "efSearch": 1000,
                "metric": "cosine"
            }
        )
    ]
)


# Create the search index 
index = SearchIndex(name=f"{index_name}-index", fields=fields, vector_search=vector_search,)
result = index_client.create_or_update_index(index)
print(f' {result.name} created')


 python-chunk-index created


In [17]:
# Create a skillset  
skillset_name = f"{index_name}-skillset"  
  
# Add a document cracker skill for PDFs  
pdf_cracker_skill = Skill(  
    odata_type="#Microsoft.Skills.Util.DocumentExtractionSkill",  
    name="pdfCracker",  
    description="Extract text from PDFs",  
    context="/document",  
    inputs=[  
        InputFieldMappingEntry(name="file_data", source="/document/content")  
    ],  
    outputs=[  
        OutputFieldMappingEntry(name="text", target_name="content")  
    ]  
)  
  
skillset = SearchIndexerSkillset(  
    name=skillset_name,  
    description="Skillset to process PDF files",  
    skills=[pdf_cracker_skill],  
)  
  
skillset_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
skillset_result = skillset_client.create_or_update_skillset(skillset)  
print(f'Skillset {skillset_name} created')  


NameError: name 'Skill' is not defined

In [None]:
# Create an indexer  
indexer_name = f"{index_name}-indexer"  
indexer = SearchIndexer(  
    name=indexer_name,  
    description="Indexer to process PDFs",  
    skillset_name=skillset_name,  
    target_index_name=index_name,  
    data_source_name=data_source.name,  
    field_mappings=[  
        FieldMapping(source_field_name="metadata_storage_path", target_field_name="url"),  
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="title")  
    ],  
    output_field_mappings=[  
        FieldMapping(source_field_name="/document/content", target_field_name="content")  
    ]  
)  
  
indexer_client = SearchIndexerClient(service_endpoint, AzureKeyCredential(key))  
indexer_result = indexer_client.create_or_update_indexer(indexer)  
  
# Run the indexer  
indexer_client.run_indexer(indexer_name)  
print(f' {indexer_name} created')  