<a href="https://colab.research.google.com/github/Chediak/common-master-ai/blob/main/redhat_prodesan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
!pip install pdfplumber sentence-transformers faiss-cpu
!python -m spacy download pt_core_news_sm

Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m40.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pt-core-news-sm
Successfully installed pt-core-news-sm-3.7.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [11]:
!python -m spacy download pt_core_news_sm
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import spacy
import json
import re
from datetime import datetime

# Step 1: Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

# Step 2: Preprocess and split into individual news items
def preprocess_and_split_text(pages):
    news_items = [item.strip() for page in pages for item in page.split("\n\n") if item.strip()]
    return news_items

# Step 3: Generate embeddings using SentenceTransformer
def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(news_items)
    return embeddings, model

# Step 4: Store embeddings in a FAISS vector store
def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Step 5: Add metadata for later retrieval
def add_metadata(news_items):
    metadata = [{"id": i, "content": news_items[i], "length": len(news_items[i])} for i in range(len(news_items))]
    return metadata

# Step 6: Extract entities using spaCy
def extract_entities(news_items):
    nlp = spacy.load("pt_core_news_sm")  # Use Portuguese model
    extracted_data = []
    for idx, news in enumerate(news_items):
        doc = nlp(news)
        entities = {"id": idx, "content": news, "dates": [], "names": [], "organizations": []}

        for ent in doc.ents:
            if ent.label_ == "DATE":
                entities["dates"].append(ent.text)
            elif ent.label_ == "PERSON":
                entities["names"].append(ent.text)
            elif ent.label_ == "ORG":
                entities["organizations"].append(ent.text)

        # Extract CNPJ using regex
        cnpj_matches = re.findall(r"\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}", news)
        entities["cnpjs"] = cnpj_matches
        extracted_data.append(entities)
    return extracted_data

# Step 7: Create a structured index for fast retrieval
def create_structured_index(entities):
    index = {"dates": {}, "names": {}, "organizations": {}, "cnpjs": {}}
    for item in entities:
        for date in item["dates"]:
            index["dates"].setdefault(date, []).append(item)
        for name in item["names"]:
            index["names"].setdefault(name, []).append(item)
        for org in item["organizations"]:
            index["organizations"].setdefault(org, []).append(item)
        for cnpj in item["cnpjs"]:
            index["cnpjs"].setdefault(cnpj, []).append(item)
    return index

# Step 8: Query the structured index
def query_index(index, query_type, query_value):
    if query_type in index:
        return index[query_type].get(query_value, [])
    return []

# Step 9: Reorganize for optimization of space
def optimize_layout(metadata, top_k=5):
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)
    layout = []
    for i, item in enumerate(sorted_news):
        layout.append({
            "type": "news",
            "content": item["content"],
            "page": i // top_k + 1,
            "position": i % top_k
        })
        if (i + 1) % 3 == 0:
            layout.append({
                "type": "advertisement",
                "content": "Ad Placeholder",
                "page": (i + 1) // top_k + 1,
                "position": "bottom"
            })
    return layout

# Full Pipeline Test
def main():
    pdf_path = "/content/DOM-1947.pdf"  # Replace with your PDF path

    # Extract text
    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    # Preprocess and split
    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)

    # Generate embeddings
    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    # Create FAISS index
    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    # Add metadata
    print("Adding metadata...")
    metadata = add_metadata(news_items)

    # Extract entities
    print("Extracting entities...")
    entities = extract_entities(news_items)

    # Create structured index
    print("Creating structured index...")
    structured_index = create_structured_index(entities)

    # Query examples
    print("Querying by date '13/11/2024'...")
    date_results = query_index(structured_index, "dates", "13/11/2024")
    print(json.dumps(date_results, indent=2))

    print("Querying by organization 'ARAUJO E REPLANDE LTDA'...")
    org_results = query_index(structured_index, "organizations", "ARAUJO E REPLANDE LTDA")
    print(json.dumps(org_results, indent=2))

    # Optimize layout
    print("Optimizing layout...")
    layout = optimize_layout(metadata)
    print(json.dumps(layout, indent=2))

if __name__ == "__main__":
    main()


Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m73.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...
Creating FAISS index...
Adding metadata...
Extracting entities...
Creating structured index...
Querying by date '13/11/2024'...
[]
Querying by organization 'ARAUJO E REPLANDE LTDA'...
[]
Optimizing layout...
[
  {
    "type": "news",
    "content":

In [5]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

# Step 1: Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

# Step 2: Preprocess and split into individual news items
def preprocess_and_split_text(pages):
    news_items = [item.strip() for page in pages for item in page.split("\n\n") if item.strip()]
    return news_items

# Step 3: Generate embeddings using SentenceTransformer
def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(news_items)
    return embeddings, model

# Step 4: Store embeddings in a FAISS vector store
def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

# Step 5: Add metadata for later retrieval
def add_metadata(news_items):
    metadata = [{"id": i, "content": news_items[i], "length": len(news_items[i])} for i in range(len(news_items))]
    return metadata

# Step 6: Query the vector store
def search_similar_news(query, model, index, metadata, top_k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = [
        {"content": metadata[idx]["content"], "distance": float(dist), "length": metadata[idx]["length"]}
        for dist, idx in zip(distances[0], indices[0])
    ]
    return results

# Step 7: Reorganize for Optimization of Space
def optimize_layout(metadata, top_k=5):
    # Sort news items by length (longer news first)
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)

    # Create layout with strategic placement of advertisements
    layout = []
    for i, item in enumerate(sorted_news):
        layout.append({"type": "news", "content": item["content"], "page": i // top_k + 1, "position": i % top_k})
        # Add an advertisement after every 3 news items
        if (i + 1) % 3 == 0:
            layout.append({"type": "advertisement", "content": "Ad Placeholder", "page": (i + 1) // top_k + 1, "position": "bottom"})

    return layout

# Full Pipeline Test
def main():
    # File path to the test PDF
    pdf_path = "/content/DOM-1947.pdf"  # Replace with your PDF path

    # Extract text
    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    # Preprocess and split
    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)

    # Generate embeddings
    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    # Create FAISS index
    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    # Add metadata
    print("Adding metadata...")
    metadata = add_metadata(news_items)

    # Test a query
    query = "example query about a topic"  # Replace with your query
    print("Searching for similar news...")
    results = search_similar_news(query, model, index, metadata, top_k=5)

    # Print search results
    print("Search Results:")
    print(json.dumps(results, indent=2))

    # Optimize layout
    print("Optimizing layout...")
    layout = optimize_layout(metadata)

    # Print optimized layout
    print("Layout:")
    print(json.dumps(layout, indent=2))

if __name__ == "__main__":
    main()


Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...
Creating FAISS index...
Adding metadata...
Searching for similar news...
Search Results:
[
  {
    "content": "Quarta-feira, 13 de novembro de 2024 03 Ano X \u2022 N\u00ba 1.947 \u2022 Prefeitura Municipal de Guara\u00ed/TO\nSECRETARIA MUNICIPAL DE SA\u00daDE Instala\u00e7\u00e3o, desinstala\u00e7\u00e3o, manuten\u00e7\u00e3o,\nlimpeza, reparo em geladeira,\n100 60,00 6.000,00\nHORA bebedouro, refrigerador, frigobar,\nfreezer, filtro e purificador de \u00e1gua\nEXTRATO DO PRIMEIRO TERMO ADITIVO 01 Valor global estimado para aquisi\u00e7\u00e3o\nde pe\u00e7as e/ou componentes originais\nem geladeira, bebedouro, refrigerador, 6.000,00\nCONTRATO N.\u00ba 073/2023\nfrigobar, freezer, filtro e purificador\nProcesso: 3243/2023 de \u00e1gua\nPreg\u00e3o Eletr\u00f4nico: 028/2023\nDESCONTO NO VALOR DAS PE\u00c7AS - 40 %\n\u00d3rg\u00e3o: Fundo Municipal da Sa\u00fade de Guara\u00ed - TO.\nContratada: ARAUJ

In [12]:
import spacy
from datetime import datetime
import re

# Load spaCy model for NER
nlp = spacy.load("en_core_web_sm")  # Use a language-specific model, e.g., 'pt_core_news_sm' for Portuguese

# Extract entities from text
def extract_entities(news_items):
    extracted_data = []
    for idx, news in enumerate(news_items):
        doc = nlp(news)
        entities = {"id": idx, "content": news, "dates": [], "names": [], "organizations": []}

        for ent in doc.ents:
            if ent.label_ in ["DATE"]:
                entities["dates"].append(ent.text)
            elif ent.label_ in ["PERSON"]:
                entities["names"].append(ent.text)
            elif ent.label_ in ["ORG"]:
                entities["organizations"].append(ent.text)

        # Regex for CNPJ or specific patterns
        cnpj_matches = re.findall(r"\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}", news)
        entities["cnpjs"] = cnpj_matches
        extracted_data.append(entities)
    return extracted_data

# Create a structured index for fast retrieval
def create_structured_index(entities):
    index = {}
    for item in entities:
        for date in item["dates"]:
            date_key = datetime.strptime(date, "%d/%m/%Y").date() if re.match(r"\d{2}/\d{2}/\d{4}", date) else date
            index.setdefault("dates", {}).setdefault(date_key, []).append(item)
        for name in item["names"]:
            index.setdefault("names", {}).setdefault(name, []).append(item)
        for org in item["organizations"]:
            index.setdefault("organizations", {}).setdefault(org, []).append(item)
        for cnpj in item["cnpjs"]:
            index.setdefault("cnpjs", {}).setdefault(cnpj, []).append(item)
    return index

# Query structured index
def query_index(index, query_type, query_value):
    if query_type not in index:
        return []
    return index.get(query_type, {}).get(query_value, [])

# Example integration
def main_with_structured_search():
    # Example news items
    news_items = [
        "Em 13/11/2024, a Prefeitura de Guaraí contratou ARAUJO E REPLANDE LTDA para manutenção de ar condicionado.",
        "A empresa PONTUAL REFRIGERAÇÃO assinou contrato em 12/11/2024 com a Prefeitura Municipal de Guaraí.",
    ]

    # Extract entities
    print("Extracting entities...")
    entities = extract_entities(news_items)

    # Create structured index
    print("Creating structured index...")
    index = create_structured_index(entities)

    # Query examples
    print("Querying for '13/11/2024' in dates...")
    results = query_index(index, "dates", datetime.strptime("13/11/2024", "%d/%m/%Y").date())
    print(results)

    print("Querying for 'ARAUJO E REPLANDE LTDA' in organizations...")
    results = query_index(index, "organizations", "ARAUJO E REPLANDE LTDA")
    print(results)

if __name__ == "__main__":
    main_with_structured_search()


Extracting entities...
Creating structured index...
Querying for '13/11/2024' in dates...
[{'id': 0, 'content': 'Em 13/11/2024, a Prefeitura de Guaraí contratou ARAUJO E REPLANDE LTDA para manutenção de ar condicionado.', 'dates': ['13/11/2024'], 'names': ['Prefeitura de Guaraí'], 'organizations': ['REPLANDE LTDA', 'manutenção de'], 'cnpjs': []}]
Querying for 'ARAUJO E REPLANDE LTDA' in organizations...
[]


In [13]:
import json

# Define layout constants
PAGE_WIDTH = 420  # A3 width in mm
PAGE_HEIGHT = 297  # A3 height in mm
MAX_PAGES = 20  # Limit to 20 pages

def adjust_layout(input_json):
    pdf_data = input_json["pdf_data"]
    output_data = {"pdf_data": []}
    current_page = 1
    current_x, current_y = 10, 10  # Starting coordinates with margin
    margin = 10  # Margin in mm

    for item in pdf_data:
        # Extract item dimensions
        item_width = item.get("largura", 0)
        item_height = item.get("altura", 0)
        agrupamento = item.get("agrupamento", 0)

        # Check if item fits in current page
        if current_y + item_height + margin > PAGE_HEIGHT:
            # Move to next page
            current_page += 1
            current_x, current_y = 10, 10
            if current_page > MAX_PAGES:
                raise ValueError("Exceeded maximum number of pages (20).")

        # Add item to output
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": current_x,
            "y": current_y,
            "pagina": current_page,
            "tipo": f"agrupamento-{agrupamento}",
        })

        # Update cursor for next item
        current_y += item_height + margin

    return output_data

# Example input JSON
input_json = {
    "pdf_data": [
        {"id": "item1", "altura": 125, "largura": 35, "agrupamento": 1},
        {"id": "item2", "altura": 70, "largura": 40, "agrupamento": 2},
        {"id": "item3", "altura": 80, "largura": 50, "agrupamento": 1},
    ]
}

# Process layout
try:
    output_json = adjust_layout(input_json)
    print(json.dumps(output_json, indent=2))
except ValueError as e:
    print(f"Error: {e}")


{
  "pdf_data": [
    {
      "id": "item1",
      "x": 10,
      "y": 10,
      "pagina": 1,
      "tipo": "agrupamento-1"
    },
    {
      "id": "item2",
      "x": 10,
      "y": 145,
      "pagina": 1,
      "tipo": "agrupamento-2"
    },
    {
      "id": "item3",
      "x": 10,
      "y": 10,
      "pagina": 2,
      "tipo": "agrupamento-1"
    }
  ]
}


In [14]:
# Instalar bibliotecas necessárias para o Docker
!apt-get update
!apt-get install -y docker.io

# Iniciar o serviço do Docker
!service docker start

# Baixar e executar o contêiner Elasticsearch
!docker run -d -p 9200:9200 -e "discovery.type=single-node" elasticsearch:8.10.2

Get:1 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1,581 B]
Get:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:3 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:4 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:5 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Get:6 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [1,110 kB]
Get:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]
Get:9 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ Packages [59.5 kB]
Get:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease [24.3 kB]
Get:11 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:12 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:13 https://r2u.stat.

In [15]:
!curl -X GET "localhost:9200/"

curl: (7) Failed to connect to localhost port 9200 after 0 ms: Connection refused


In [16]:
!pip install elasticsearch
!pip install spacy
!python -m spacy download pt_core_news_sm

Collecting elasticsearch
  Downloading elasticsearch-8.16.0-py3-none-any.whl.metadata (8.8 kB)
Collecting elastic-transport<9,>=8.15.1 (from elasticsearch)
  Downloading elastic_transport-8.15.1-py3-none-any.whl.metadata (3.7 kB)
Downloading elasticsearch-8.16.0-py3-none-any.whl (543 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m543.1/543.1 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading elastic_transport-8.15.1-py3-none-any.whl (64 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.4/64.4 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: elastic-transport, elasticsearch
Successfully installed elastic-transport-8.15.1 elasticsearch-8.16.0
Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m

In [17]:
from elasticsearch import Elasticsearch
import spacy
import re
from datetime import datetime

# Conectar ao Elasticsearch
es = Elasticsearch("http://localhost:9200")

# Função para extrair entidades do texto
def extract_entities(text):
    nlp = spacy.load("pt_core_news_sm")
    doc = nlp(text)
    entities = {
        "dates": [],
        "names": [],
        "organizations": [],
        "cnpjs": []
    }
    for ent in doc.ents:
        if ent.label_ == "DATE":
            entities["dates"].append(ent.text)
        elif ent.label_ == "PERSON":
            entities["names"].append(ent.text)
        elif ent.label_ == "ORG":
            entities["organizations"].append(ent.text)
    # Regex para encontrar CNPJs
    cnpjs = re.findall(r"\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}", text)
    entities["cnpjs"] = cnpjs
    return entities

# Indexar um documento no Elasticsearch
def index_document(es_client, index_name, doc_id, content, entities):
    document = {
        "content": content,
        "dates": entities["dates"],
        "names": entities["names"],
        "organizations": entities["organizations"],
        "cnpjs": entities["cnpjs"],
        "timestamp": datetime.now()  # Adiciona timestamp para rastreamento
    }
    es_client.index(index=index_name, id=doc_id, document=document)

# Processar e indexar todos os textos extraídos
def process_and_index_texts(texts, index_name="diario_oficial"):
    for i, text in enumerate(texts):
        print(f"Indexando documento {i}...")
        entities = extract_entities(text)
        index_document(es, index_name, i, text, entities)

# Substituir pelo texto extraído do PDF
example_texts = [
    "Em 13/11/2024, a Prefeitura de Guaraí contratou ARAUJO E REPLANDE LTDA para manutenção de ar condicionado.",
    "A empresa PONTUAL REFRIGERAÇÃO assinou contrato em 12/11/2024 com a Prefeitura Municipal de Guaraí.",
]

# Indexar os textos
process_and_index_texts(example_texts)


Indexando documento 0...


Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connection.py", line 199, in _new_conn
    sock = connection.create_connection(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 85, in create_connection
    raise err
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/connection.py", line 73, in create_connection
    sock.connect(sa)
ConnectionRefusedError: [Errno 111] Connection refused

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/elastic_transport/_node/_http_urllib3.py", line 167, in perform_request
    response = self.pool.urlopen(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/connectionpool.py", line 843, in urlopen
    retries = retries.increment(
  File "/usr/local/lib/python3.10/dist-packages/urllib3/util/retry.py", line 449, in increment
    raise reraise(type(error), error, _s

ConnectionError: Connection error caused by: ConnectionError(Connection error caused by: NewConnectionError(<urllib3.connection.HTTPConnection object at 0x7a5da565ba00>: Failed to establish a new connection: [Errno 111] Connection refused))

In [None]:
# Consultar por entidades específicas
def search_by_field(es_client, index_name, field, value):
    query = {
        "query": {
            "match": {
                field: value
            }
        }
    }
    response = es_client.search(index=index_name, body=query)
    return response["hits"]["hits"]

# Exemplo: Buscar por uma data
results = search_by_field(es, "diario_oficial", "dates", "13/11/2024")
print("Resultados da busca por data:")
for hit in results:
    print(hit["_source"])

# Exemplo: Buscar por uma organização
results = search_by_field(es, "diario_oficial", "organizations", "ARAUJO E REPLANDE LTDA")
print("Resultados da busca por organização:")
for hit in results:
    print(hit["_source"])


In [None]:
import pdfplumber

# Extrair texto do PDF
def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

# Pipeline completo de extração e indexação
def process_pdf_and_index(pdf_path, index_name="diario_oficial"):
    texts = extract_text_from_pdf(pdf_path)
    process_and_index_texts(texts, index_name=index_name)

# Substitua pelo caminho do seu PDF
pdf_path = "/content/exemplo.pdf"
process_pdf_and_index(pdf_path)


In [None]:
!docker run -d -p 5601:5601 --link elasticsearch:elasticsearch docker.elastic.co/kibana/kibana:8.10.2


In [None]:
http://localhost:5601
