In [1]:
# azure-ai-documentintelligence==1.0.0
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest
from openai import OpenAI, AzureOpenAI
import re

In [None]:
### KEYS AND ENDPOINTS

In [3]:
# Azure AI Document Intelligence endpoint and key, read by analyze_read().
# NOTE(review): the string values in this cell look like placeholders (each one
# just repeats its own name) - supply real values, preferably from environment
# variables rather than committing them to the notebook.
endpoint = "AZURE_AI_DOCUMENT_INTELLIGENCE_ENDPOINT"
key = "AZURE_AI_DOCUMENT_INTELLIGENCE_key"

# Key for the OpenAI client used for the chat completions.
OPENAI_API_KEY = "OPENAI_API_KEY"

# Azure OpenAI settings used for the embeddings client.
AZURE_OPENAI_KEY = "AZURE_OPENAI_KEY"
AZURE_OPENAI_ENDPOINT = "AZURE_OPENAI_ENDPOINT"
AZURE_OPENAI_API_VERSION = "2024-02-15-preview"
EMBEDDING_MODEL = "text-embedding-3-large"

# Azure AI Search settings used by the REST calls further down.
AZURE_SEARCH_KEY = "AZURE_SEARCH_KEY"
AZURE_SEARCH_ENDPOINT = "AZURE_SEARCH_ENDPOINT"
INDEX_NAME = "keyence-info-index"
AZURE_SEARCH_API_VERSION = "2024-11-01-preview"

# Headers sent with every Azure AI Search REST request (the key goes in "api-key").
headers = {
    "Content-Type": "application/json",
    "api-key": AZURE_SEARCH_KEY}

In [5]:
# Shared OpenAI client; normalize_with_GPT() and chat_GPT() read this global.
openai_client = OpenAI(api_key=OPENAI_API_KEY)

In [7]:
def analyze_read(url):
    """
    Run the "prebuilt-read" Document Intelligence model on a document URL.

    A new DocumentIntelligenceClient is built on every call from the
    notebook-level `endpoint` and `key` variables.

    Args:
        url: URL of the document, passed as `url_source` of the request.

    Returns:
        The `content` attribute of the finished analysis result.
    """
    credential = AzureKeyCredential(key)
    client = DocumentIntelligenceClient(endpoint=endpoint, credential=credential)

    request = AnalyzeDocumentRequest(url_source=url)
    # begin_analyze_document returns a poller; .result() waits for the analysis.
    analysis = client.begin_analyze_document("prebuilt-read", request).result()
    return analysis.content

In [None]:
def normalize_with_GPT(text, query):
    """
    Ask the chat model to rewrite `text` following the structure described in `query`.

    The source document is embedded in the system prompt and `query` is sent
    as the single user message. Uses the notebook-level `openai_client`.

    Args:
        text: Document text used as context.
        query: Instructions describing the requested output structure.

    Returns:
        The message content of the first completion choice.
    """
    system_prompt = f'''Generate an informative document with the structure requested by the user.
                    Use the following document as context: {text}'''

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": query},
    ]

    completion = openai_client.chat.completions.create(
        model="gpt-4.1",
        messages=messages,
        max_tokens=5000,
        temperature=0.7,
        top_p=0.7,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        stream=False,
    )

    return completion.choices[0].message.content

In [None]:
def segment_text_as_json(text):
    """
    Split outline-formatted text into a two-level hierarchy and return it as JSON.

    The text is read line by line (blank lines are skipped, other lines are
    stripped). A line starting with a capital letter and a period ("A. ...")
    opens a section; a line starting with digits and a period ("1. ...") opens
    a subsection of the current section; any other line is appended to the
    current subsection. The result has this shape:

        {
          "A. ...": {
            "1. ...": [...],
            "2. ...": [...]
          },
          "B. ...": {
            "1. ...": [...]
          }
        }

    Lines that appear before the first section header, numbered headers that
    appear before any section header, and content lines that appear before the
    first subsection of a section are discarded.

    Args:
        text: Outline text. None or an empty string yields an empty hierarchy.

    Returns:
        str: The hierarchy serialized as JSON (indent=2, non-ASCII characters
        kept as-is).
    """
    # Imported locally so the function works even though the notebook's first
    # import cell does not import json (the cell that does comes later).
    import json

    # Header patterns, applied to stripped lines:
    #   "X. Something" where X is a single capital letter -> section header
    #   "N. Something" where N is one or more digits      -> subsection header
    letter_pattern = re.compile(r'^([A-Z]\.\s+.*)')
    number_pattern = re.compile(r'^(\d+\.\s+.*)')

    def create_hierarchical_chunking(text):
        """Build the {section: {subsection: [lines]}} dictionary from the text."""
        chunk_dict = {}
        current_letter = None
        current_number = None

        # `text or ""` lets a None input behave like empty text.
        for raw_line in (text or "").splitlines():
            line = raw_line.strip()
            if not line:
                # Blank lines carry no content.
                continue

            letter_match = letter_pattern.match(line)
            number_match = number_pattern.match(line)

            if letter_match:
                # New section (e.g. "A. Names"): start an empty mapping and
                # forget the previous subsection.
                current_letter = letter_match.group(1)
                chunk_dict[current_letter] = {}
                current_number = None
            elif number_match:
                # New subsection (e.g. "1. General info"); ignored when no
                # section has been opened yet.
                if current_letter is not None:
                    current_number = number_match.group(1)
                    chunk_dict[current_letter][current_number] = []
            elif current_letter is not None and current_number is not None:
                # Plain content line: belongs to the latest section/subsection.
                chunk_dict[current_letter][current_number].append(line)
            # Otherwise the line is not under any valid header and is dropped.

        return chunk_dict

    hierarchical_chunking = create_hierarchical_chunking(text)

    # ensure_ascii=False keeps accented characters readable in the output.
    return json.dumps(
        hierarchical_chunking,
        ensure_ascii=False,
        indent=2)

In [31]:
def string_to_json(json_string):
    """
    Parse a JSON string into the corresponding Python object.

    Args:
        json_string: Text expected to contain valid JSON.

    Returns:
        The parsed object, or None when the text is not valid JSON (the
        decode error is printed in that case).
    """
    try:
        parsed = json.loads(json_string)
    except json.JSONDecodeError as e:
        # Invalid JSON is reported on stdout instead of being raised.
        print(f"Error al convertir string a JSON: {e}")
        return None
    else:
        return parsed

In [None]:
from azure.storage.blob import BlobServiceClient

# Storage connection string and name of the container to inspect.
# NOTE(review): the connection string value looks like a placeholder - confirm.
connection_string = "AZURE_STORAGE_ENDPOINT"
container_name = "files"

# Create the BlobService client from the connection string
blob_service_client = BlobServiceClient.from_connection_string(connection_string)

# Get a client for the target container
container_client = blob_service_client.get_container_client(container_name)

# Collect the names of the blobs in the container
blobs = [blob.name for blob in container_client.list_blobs()]

['AP-N.pdf',
 'AS_124515_LR-X_C_613B35_KMX_MX_2104_5.pdf',
 'AS_145349_IV4-CP70_IM_17963MX_KMX_MX_2074_2.pdf',
 'BT-600.pdf',
 'BT-A500.pdf',
 'BT-A600.pdf',
 'BT-A700.pdf',
 'CU.pdf',
 'DL.pdf',
 'EA-300.pdf',
 'FD-H.pdf',
 'FD_R.pdf',
 'FR.pdf',
 'GL-R.pdf',
 'GL-V.pdf',
 'GP-M.pdf',
 'IM-8000.pdf',
 'IV4-CP70.pdf',
 'KV-8000.pdf',
 'KV-8000_programming.pdf',
 'KV-X_MOTION.pdf',
 'KV_Nano.pdf',
 'LJ-S8000.pdf',
 'LJ-X8000.pdf',
 'LK-G5000.pdf',
 'LM.pdf',
 'LMX.pdf',
 'LR-T.pdf',
 'LR-Z.pdf',
 'LR_Series.pdf',
 'LS-9000.pdf',
 'MS2.pdf',
 'Medicion_Dimensional.pdf',
 'Medicion_Espesores.pdf',
 'NQ.pdf',
 'NU.pdf',
 'PLC_KV.pdf',
 'QCD.pdf',
 'SJ-F700.pdf',
 'SR-2000.pdf',
 'SR-G100.pdf',
 'SR-X.pdf',
 'Serie_DL.pdf',
 'Serie_FS-N.pdf',
 'Serie_GT2.pdf',
 'Serie_IG.pdf',
 'Serie_IL.pdf',
 'Serie_IV4.pdf',
 'Serie_IX.pdf',
 'Serie_LM-X.pdf',
 'Serie_LV-N.pdf',
 'Serie_MKG.pdf',
 'Serie_PR-G.pdf',
 'Serie_PS-N.pdf',
 'Serie_PX.pdf',
 'Serie_PZ-G.pdf',
 'Series_FD-Q.pdf',
 'TG.pdf',
 'TM

In [35]:
url = "https://sa19920811dv.blob.core.windows.net/files/AP-N.pdf"

In [39]:
text = analyze_read(url)

In [None]:
# Instructions for the normalization step. The requested outline format
# ("A." topics, "1." subtopics, "-" descriptions) is the format that
# segment_text_as_json() parses with its letter/number header patterns.
query = '''1. Write the provided document in a structured way dividing the text hierarchically with bullets 
              placing alphabet letters (A., B., C., etc) for topics, numbers (1., 2., 3., etc.) 
              for subtopics and descriptions(-).
           2. Develop all descriptions in detail.
           3. Only respond with what is requested.'''

# Rewrite the extracted document text following the outline instructions.
normalized_text = normalize_with_GPT(text, query)

In [45]:
print(normalized_text)

A. Sensores de Presión KEYENCE AP-N Series

   1. Compatibilidad con Múltiples Protocolos de Red
      - Los sensores de presión de la Serie AP-N de KEYENCE están diseñados para integrarse fácilmente en una amplia variedad de redes industriales abiertas. 
      - Son compatibles con protocolos como EtherNet/IP™, CC-Link, DeviceNet™ y EtherCAT®, lo que facilita su implementación en diferentes sistemas de automatización alrededor del mundo.

   2. Ventajas de la Red de Sensores
      - Simplificación del Cableado
         - Solo se requiere un cable para conectar y comunicar el sensor con un PLC, eliminando la complejidad del manejo y gestión de múltiples cables.
         - Esto reduce significativamente el tiempo de instalación y los errores asociados al cableado tradicional.
      - Reducción del Tiempo de Inactividad
         - La monitorización continua de los valores de presión permite predecir necesidades de mantenimiento, identificando rápidamente sensores con fallos.
         - E

In [47]:
chunks = segment_text_as_json(normalized_text)
print(chunks)

{
  "A. Sensores de Presión KEYENCE AP-N Series": {
    "1. Compatibilidad con Múltiples Protocolos de Red": [
      "- Los sensores de presión de la Serie AP-N de KEYENCE están diseñados para integrarse fácilmente en una amplia variedad de redes industriales abiertas.",
      "- Son compatibles con protocolos como EtherNet/IP™, CC-Link, DeviceNet™ y EtherCAT®, lo que facilita su implementación en diferentes sistemas de automatización alrededor del mundo."
    ],
    "2. Ventajas de la Red de Sensores": [
      "- Simplificación del Cableado",
      "- Solo se requiere un cable para conectar y comunicar el sensor con un PLC, eliminando la complejidad del manejo y gestión de múltiples cables.",
      "- Esto reduce significativamente el tiempo de instalación y los errores asociados al cableado tradicional.",
      "- Reducción del Tiempo de Inactividad",
      "- La monitorización continua de los valores de presión permite predecir necesidades de mantenimiento, identificando rápidamente

In [11]:
import json
import requests
import os
import uuid

In [9]:
import os
import warnings

# Secrets must never be committed to the notebook: the keys are read from
# environment variables of the same name. The keys that were previously
# hard-coded in this cell should be considered leaked and be rotated.
AZURE_OPENAI_KEY = os.environ.get("AZURE_OPENAI_KEY", "")
# Endpoints are not secrets; the previous values remain the defaults and can
# be overridden through the environment.
AZURE_OPENAI_ENDPOINT = os.environ.get(
    "AZURE_OPENAI_ENDPOINT", "https://openai19920811.openai.azure.com/")
AZURE_OPENAI_API_VERSION = "2024-02-15-preview"
EMBEDDING_MODEL = "text-embedding-3-large"

AZURE_SEARCH_KEY = os.environ.get("AZURE_SEARCH_KEY", "")
AZURE_SEARCH_ENDPOINT = os.environ.get(
    "AZURE_SEARCH_ENDPOINT", "https://aisearch19920811dv.search.windows.net")
INDEX_NAME = "keyence-info-index"
AZURE_SEARCH_API_VERSION = "2024-11-01-preview"

# Warn early (instead of failing later with an authentication error) when a
# required key is missing from the environment.
for _env_name, _env_value in (("AZURE_OPENAI_KEY", AZURE_OPENAI_KEY),
                              ("AZURE_SEARCH_KEY", AZURE_SEARCH_KEY)):
    if not _env_value:
        warnings.warn(f"Environment variable {_env_name} is not set; "
                      "requests that need it will be rejected.")

# Headers sent with every Azure AI Search REST request.
headers = {
    "Content-Type": "application/json",
    "api-key": AZURE_SEARCH_KEY}

In [17]:
# Azure OpenAI client used to generate embeddings.
azure_openai_client = AzureOpenAI(
    api_key=AZURE_OPENAI_KEY,
    api_version=AZURE_OPENAI_API_VERSION,
    azure_endpoint=AZURE_OPENAI_ENDPOINT,
)

def generate_embedding(client, text, embedding_model):
    """
    Return the embedding vector of `text`.

    Args:
        client: Object exposing `embeddings.create(input=..., model=...)`.
        text: Text to embed.
        embedding_model: Model name passed to the embeddings call.

    Returns:
        The `embedding` entry of the first item in the response's `data` list.
    """
    # The response is converted to a plain dict before the vector is read.
    payload = client.embeddings.create(input=text, model=embedding_model).model_dump()
    first_item = payload['data'][0]
    return first_item['embedding']

# generate_embedding(azure_openai_client, 'hOLA MUNDO, COMO ESTAS?', EMBEDDING_MODEL)

In [75]:
def get_index_document(url, client, chunks, embedding_model):
    """
    Build the list of search-index documents for one chunked manual.

    One document is produced per (section, subsection) pair of the hierarchy.

    Args:
        url: Source URL of the manual, stored as "manual=<url>" in
            `additionalMetadata`.
        client: Embeddings client forwarded to generate_embedding().
        chunks: The hierarchy as a JSON string (as returned by
            segment_text_as_json) or as an already-parsed dict of
            {section: {subsection: [lines]}}.
        embedding_model: Model name forwarded to generate_embedding().

    Returns:
        list[dict]: Documents with keys id, title, subtitle, content,
        contentVector, category and additionalMetadata.

    Raises:
        ValueError: If `chunks` is a string that cannot be parsed as JSON.
    """
    # Accept a pre-parsed dict as well as the JSON text.
    data = chunks if isinstance(chunks, dict) else string_to_json(chunks)
    if data is None:
        # string_to_json() signals invalid JSON with None; fail with a clear
        # message instead of an AttributeError on `.items()`.
        raise ValueError("chunks is not valid JSON; cannot build index documents")

    documents = []
    for section_title, subsections in data.items():
        for subsection_title, content_list in subsections.items():
            content_text = " ".join(content_list) if isinstance(content_list, list) else content_list
            # NOTE(review): the vector is computed from the two titles only,
            # not from the content text - confirm this is intended given the
            # field is named "contentVector".
            text_4_vector = section_title + " " + subsection_title
            vector = generate_embedding(client, text_4_vector, embedding_model)

            documents.append({
                "id": str(uuid.uuid4()),
                "title": section_title,
                "subtitle": subsection_title,
                "content": content_text,
                "contentVector": vector,
                "category": "Keyence",
                "additionalMetadata": f"manual={url}"
            })

    return documents

documents = get_index_document(url, azure_openai_client, chunks, EMBEDDING_MODEL)

In [59]:
len(documents)

15

In [81]:
import json

# Path to the JSON file holding the index definition
path = "index_definition.json"

# Read the file and load the JSON as a Python object (list or dict)
with open(path, "r", encoding="utf-8") as f:
    index_definition = json.load(f)

In [83]:
index_definition

{'name': 'keyence-info-index',
 'fields': [{'name': 'id',
   'type': 'Edm.String',
   'searchable': False,
   'filterable': False,
   'retrievable': True,
   'sortable': False,
   'facetable': False,
   'key': True},
  {'name': 'title',
   'type': 'Edm.String',
   'searchable': True,
   'filterable': True,
   'retrievable': True,
   'sortable': False,
   'facetable': False,
   'analyzer': 'standard.lucene'},
  {'name': 'subtitle',
   'type': 'Edm.String',
   'searchable': True,
   'filterable': True,
   'retrievable': True,
   'sortable': False,
   'facetable': False,
   'analyzer': 'standard.lucene'},
  {'name': 'content',
   'type': 'Edm.String',
   'searchable': True,
   'filterable': False,
   'retrievable': True,
   'sortable': False,
   'facetable': False,
   'analyzer': 'standard.lucene'},
  {'name': 'contentVector',
   'type': 'Collection(Edm.Single)',
   'searchable': True,
   'dimensions': 3072,
   'vectorSearchProfile': 'vector-profile-19920811'},
  {'name': 'category',
   '

In [85]:
headers

{'Content-Type': 'application/json',
 'api-key': '<REDACTED>'}

In [87]:
INDEX_NAME

'keyence-info-index'

In [89]:
AZURE_SEARCH_API_VERSION

'2024-11-01-preview'

In [91]:
AZURE_SEARCH_ENDPOINT

'https://aisearch19920811dv.search.windows.net'

In [None]:
# URL of the index.
# Note: this rebinds the notebook-level `url`, which earlier held the source
# PDF URL passed to analyze_read() and get_index_document().
url = f"{AZURE_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}?api-version={AZURE_SEARCH_API_VERSION}"

# Send the `DELETE` request to delete the previous index
response = requests.delete(url, headers=headers)

# Report the outcome: 204 is reported as deleted, 404 as not found,
# and any other status prints the response body as an error
if response.status_code == 204:
    print(f"Índice '{INDEX_NAME}' eliminado correctamente en Azure AI Search.")
elif response.status_code == 404:
    print(f"Índice '{INDEX_NAME}' no encontrado. Puede que ya haya sido eliminado.")
else:
    print(f"Error al eliminar el índice: {response.text}")

Índice 'keyence-info-index' eliminado correctamente en Azure AI Search.


In [None]:
# Send the `PUT` request to create the index from `index_definition`
# (the dict loaded from index_definition.json)
url = f"{AZURE_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}?api-version={AZURE_SEARCH_API_VERSION}"
response = requests.put(url, headers=headers, json=index_definition)

# Report the outcome: 201 is reported as created, 204 as an update of an
# existing index, and any other status prints the response body as an error
if response.status_code == 201:
    print(f"Índice '{INDEX_NAME}' creado correctamente en Azure AI Search.")
elif response.status_code == 204:
    print(f"Índice '{INDEX_NAME}' ya existía y se actualizó correctamente.")
else:
    print(f"Error al crear el índice: {response.text}")

Índice 'keyence-info-index' creado correctamente en Azure AI Search.


In [None]:
# Upload data to index in batches.
# With batch_size = 1 every document is sent in its own HTTP request.
batch_size = 1
for i in range(0, len(documents), batch_size):
    # The request body wraps the slice of documents under the "value" key
    batch = {"value": documents[i:i+batch_size]}
    # The URL does not depend on the loop variable; it is rebuilt on every iteration
    url = f"{AZURE_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}/docs/index?api-version={AZURE_SEARCH_API_VERSION}"
    response = requests.post(url, headers=headers, json=batch)
    # Only HTTP 200 is reported as success; any other status prints the response body
    if response.status_code == 200:
        print(f"Batch {i}-{i+len(batch['value'])} subido correctamente.")
    else:
        print(f"Error en batch {i}: {response.text}")

Batch 0-1 subido correctamente.
Batch 1-2 subido correctamente.
Batch 2-3 subido correctamente.
Batch 3-4 subido correctamente.
Batch 4-5 subido correctamente.
Batch 5-6 subido correctamente.
Batch 6-7 subido correctamente.
Batch 7-8 subido correctamente.
Batch 8-9 subido correctamente.
Batch 9-10 subido correctamente.
Batch 10-11 subido correctamente.
Batch 11-12 subido correctamente.
Batch 12-13 subido correctamente.
Batch 13-14 subido correctamente.
Batch 14-15 subido correctamente.


In [None]:
def chat_GPT(query):
    """
    Answer a user question with retrieval-augmented generation.

    The query is embedded, sent to the Azure AI Search index as a combined
    text + vector search, and the retrieved documents are injected as context
    into the system prompt of a chat completion. Uses the notebook-level
    `azure_openai_client`, `openai_client`, `headers` and search constants.

    Args:
        query: The user's question.

    Returns:
        The message content of the first completion choice.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    query_embedding = generate_embedding(azure_openai_client, query, EMBEDDING_MODEL)

    # Hybrid query: the raw text goes in "search", the embedding in "vectorQueries".
    payload = {
      "search": query,
      "select": "title, subtitle, content, category, additionalMetadata",
      "queryLanguage": "en-us",
      "vectorQueries": [
        {
          "kind": "vector",
          "vector": query_embedding,
          "fields": "contentVector",
          "k": 3
        }
      ],
      "top": 10
    }

    # Send the query to Azure AI Search.
    url = f"{AZURE_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}/docs/search?api-version={AZURE_SEARCH_API_VERSION}"
    response = requests.post(url, headers=headers, json=payload)
    # Surface HTTP errors here instead of failing later with a KeyError on 'value'.
    response.raise_for_status()
    results = response.json().get("value", [])

    # Each hit becomes "category, title, subtitle, content, additionalMetadata";
    # missing or null fields become empty strings instead of raising.
    context_fields = ("category", "title", "subtitle", "content", "additionalMetadata")
    context = " ".join(
        ", ".join(str(doc.get(field) or "") for field in context_fields)
        for doc in results
    )

    system_prompt = f''' 
                        Your name is Alberto and you are an expert sales agent who works for Keyence.
                        Your function is to respond exclusively about Keyence sensors and products using the information contained in the provided knowledge base, including technical characteristics, applications, benefits, compatibility, approximate prices and product manuals.
                        
                        🎯 Main objective:
                        
                        Provide clear, professional, warm and convincing answers that help the customer make purchasing decisions.
                        
                        Keep each response to a maximum of 1600 characters, summarizing the information if necessary without losing key details.
                        
                        At the end of each response, suggest EXACTLY 2 concrete and relevant topics for the user to continue asking about.
                        
                        📋 Instructions:
                        
                        Respond only based on the information in the knowledge base.
                        If the answer is not in the base, respond exactly:
                        "That information is not available at the moment, but I can verify it with a specialized advisor."
                        
                        Use a professional, close and sales-oriented tone, like a technical-commercial advisor who understands the customer's needs.
                        
                        Prioritize benefits, practical applications and added value of each sensor or product, avoiding excess technical language when not necessary.
                        
                        If the text exceeds 1600 characters, summarize while maintaining clarity and relevance.
                        
                        At the end of each response, write a block in this format:
                        
                        📌 You can continue asking about:
                        
                        [Suggested topic 1 related to the query or context]
                        
                        [Suggested topic 2 related to the query or context]
                        
                        🧠 Response strategies:
                        
                        If the user asks for product comparison, use only available information and present brief pros and cons.
                        
                        If the user asks something general, offer concrete examples from the knowledge base.
                        
                        If the user asks for price, mention the range or approximate price and refer to the corresponding advisor for the exact figure.
                        
                        If the user asks for detailed technical information, include the link to the product manual when available.
                        
                        Always keep in mind that the goal is to guide the conversation towards additional information so the user continues interacting.
                        
                        Respond with emojis when appropriate to maintain a close tone.
                        
                        Don't invent information or make assumptions outside the knowledge base.
                        
                        Use the following document as context: {context}
                                                        
                        '''

    # Generate the answer with the chat model.
    user_prompt = query
    gpt_response  = openai_client.chat.completions.create(
        model = "gpt-4.1",
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ],
        max_tokens=1000,
        temperature=0.8,
        top_p=0.9,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None,
        stream=False
    )

    return gpt_response.choices[0].message.content

In [None]:
# Ask a sample question through the retrieval + chat pipeline and show the answer
query = "what products do you sell"
response = chat_GPT(query)
print(response)

¡Por supuesto! Las cortinas de luz de seguridad KEYENCE, como la Serie GL-R y GL-V, son soluciones avanzadas diseñadas para proteger al personal en zonas peligrosas de maquinaria, sin obstaculizar la productividad. Estas cortinas generan un campo invisible de haces de luz; si alguno se interrumpe, la máquina se detiene de inmediato, evitando accidentes.

**Principales beneficios:**
- **Máxima protección:** Cumplen con los estándares internacionales más altos de seguridad (Type 4, SIL3, PLe, CE, TÜV, cULus).
- **Facilidad de instalación:** Integran tecnologías y herrajes que simplifican el montaje y ajuste, reduciendo tiempo y costos.
- **Robustez:** Fabricadas con materiales resistentes como aluminio y policarbonato, ideales para ambientes industriales exigentes.
- **Compatibilidad:** Diferentes modelos se adaptan a distancias de detección y entornos diversos, desde 0.3 hasta 15 metros.
- **Mantenimiento sencillo:** Indicadores visuales ayudan a detectar alineación incorrecta o fallos 

In [None]:
def generate_embedding(client, text, embedding_model=None):
    """
    Return the embedding vector of `text`.

    This definition replaces the earlier three-argument one in the notebook;
    the optional third parameter keeps callers that pass the model explicitly
    working after this cell has run.

    Args:
        client: Object exposing `embeddings.create(input=..., model=...)`.
        text: Text to embed.
        embedding_model: Model name; when None the notebook-level
            EMBEDDING_MODEL is used.

    Returns:
        The `embedding` entry of the first item in the response's `data` list.
    """
    model_name = EMBEDDING_MODEL if embedding_model is None else embedding_model
    response = client.embeddings.create(
        input=text,
        model = model_name
    )
    # The response is converted to a plain dict before the vector is read.
    embeddings=response.model_dump()
    return embeddings['data'][0]['embedding']

def search_query(query):
    """
    Retrieve context for `query` from the Azure AI Search index.

    The query is embedded and sent as a combined text + vector search. Uses
    the notebook-level `azure_openai_client`, `headers` and search constants.

    Args:
        query: The user's question.

    Returns:
        str: One "category, title, subtitle, content, additionalMetadata"
        entry per hit, joined with single spaces (empty when there are no hits).

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    query_embedding = generate_embedding(azure_openai_client, query)

    # Hybrid query: the raw text goes in "search", the embedding in "vectorQueries".
    payload = {
      "search": query,
      "select": "title, subtitle, content, category, additionalMetadata",
      "queryLanguage": "en-us",
      "vectorQueries": [
        {
          "kind": "vector",
          "vector": query_embedding,
          "fields": "contentVector",
          "k": 3
        }
      ],
      "top": 10
    }

    # Send the query to Azure AI Search.
    url = f"{AZURE_SEARCH_ENDPOINT}/indexes/{INDEX_NAME}/docs/search?api-version={AZURE_SEARCH_API_VERSION}"
    response = requests.post(url, headers=headers, json=payload)
    # Surface HTTP errors here instead of failing later with a KeyError on 'value'.
    response.raise_for_status()
    results = response.json().get("value", [])

    # Missing or null fields become empty strings instead of raising.
    context_fields = ("category", "title", "subtitle", "content", "additionalMetadata")
    context = " ".join(
        ", ".join(str(doc.get(field) or "") for field in context_fields)
        for doc in results
    )

    return context

    
# Demo: run one sample query through the retrieval step and print the context
if __name__ == "__main__":
    
    query = "que producto vendes"
    context = search_query(query)
    print(context)