<a href="https://colab.research.google.com/github/Chediak/common-master-ai/blob/main/prodesan_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
!pip install pdfplumber sentence-transformers faiss-cpu spacy requests elasticsearch requests
!python -m spacy download pt_core_news_sm
!pip install fastapi uvicorn
!pip install python-multipart

Collecting pt-core-news-sm==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/pt_core_news_sm-3.7.0/pt_core_news_sm-3.7.0-py3-none-any.whl (13.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.0/13.0 MB[0m [31m92.4 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('pt_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [23]:
import os

TITLE = os.getenv("TITLE", "Título do Documento")
CALHAU_SIZES = [(81, 100), (250, 210)]
PAGE_SIZE = (297, 420)
MARGINS = {"top": 17, "bottom": 13, "left": 13, "right": 13}
MIN_SPACING_MM = 5

In [24]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import spacy
import json
import re
from datetime import datetime

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

def preprocess_and_split_text(pages):
    news_items = [item.strip() for page in pages for item in page.split("\n\n") if item.strip()]
    return news_items

def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(news_items)
    return embeddings, model

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

def add_metadata(news_items):
    metadata = [{"id": i, "content": news_items[i], "length": len(news_items[i])} for i in range(len(news_items))]
    return metadata

def extract_entities(news_items):
    nlp = spacy.load("pt_core_news_sm")
    extracted_data = []
    for idx, news in enumerate(news_items):
        doc = nlp(news)
        entities = {"id": idx, "content": news, "dates": [], "names": [], "organizations": []}

        for ent in doc.ents:
            if ent.label_ == "DATE":
                entities["dates"].append(ent.text)
            elif ent.label_ == "PERSON":
                entities["names"].append(ent.text)
            elif ent.label_ == "ORG":
                entities["organizations"].append(ent.text)

        cnpj_matches = re.findall(r"\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}", news)
        entities["cnpjs"] = cnpj_matches
        extracted_data.append(entities)
    return extracted_data

def create_structured_index(entities):
    index = {"dates": {}, "names": {}, "organizations": {}, "cnpjs": {}}
    for item in entities:
        for date in item["dates"]:
            index["dates"].setdefault(date, []).append(item)
        for name in item["names"]:
            index["names"].setdefault(name, []).append(item)
        for org in item["organizations"]:
            index["organizations"].setdefault(org, []).append(item)
        for cnpj in item["cnpjs"]:
            index["cnpjs"].setdefault(cnpj, []).append(item)
    return index

def query_index(index, query_type, query_value):
    if query_type in index:
        return index[query_type].get(query_value, [])
    return []

def optimize_layout(metadata, top_k=5):
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)
    layout = []
    for i, item in enumerate(sorted_news):
        layout.append({
            "type": "news",
            "content": item["content"],
            "page": i // top_k + 1,
            "position": i % top_k
        })
        if (i + 1) % 3 == 0:
            layout.append({
                "type": "advertisement",
                "content": "Ad Placeholder",
                "page": (i + 1) // top_k + 1,
                "position": "bottom"
            })
    return layout

def main():
    pdf_path = "/content/DOM-1947.pdf"

    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)

    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    print("Adding metadata...")
    metadata = add_metadata(news_items)

    print("Extracting entities...")
    entities = extract_entities(news_items)

    print("Creating structured index...")
    structured_index = create_structured_index(entities)

    print("Querying by date '13/11/2024'...")
    date_results = query_index(structured_index, "dates", "13/11/2024")
    print(json.dumps(date_results, indent=2))

    print("Querying by organization 'ARAUJO E REPLANDE LTDA'...")
    org_results = query_index(structured_index, "organizations", "ARAUJO E REPLANDE LTDA")
    print(json.dumps(org_results, indent=2))

    print("Optimizing layout...")
    layout = optimize_layout(metadata)
    print(json.dumps(layout, indent=2))

if __name__ == "__main__":
    main()

Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...
Creating FAISS index...
Adding metadata...
Extracting entities...
Creating structured index...
Querying by date '13/11/2024'...
[]
Querying by organization 'ARAUJO E REPLANDE LTDA'...
[]
Optimizing layout...
[
  {
    "type": "news",
    "content": "Quarta-feira, 13 de novembro de 2024 03 Ano X \u2022 N\u00ba 1.947 \u2022 Prefeitura Municipal de Guara\u00ed/TO\nSECRETARIA MUNICIPAL DE SA\u00daDE Instala\u00e7\u00e3o, desinstala\u00e7\u00e3o, manuten\u00e7\u00e3o,\nlimpeza, reparo em geladeira,\n100 60,00 6.000,00\nHORA bebedouro, refrigerador, frigobar,\nfreezer, filtro e purificador de \u00e1gua\nEXTRATO DO PRIMEIRO TERMO ADITIVO 01 Valor global estimado para aquisi\u00e7\u00e3o\nde pe\u00e7as e/ou componentes originais\nem geladeira, bebedouro, refrigerador, 6.000,00\nCONTRATO N.\u00ba 073/2023\nfrigobar, freezer, filtro e purificador\nProcesso: 3243/2023 de \u00e1gua\nPreg\u00e3o Eletr\u00f4nico

In [25]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

def preprocess_and_split_text(pages):
    news_items = [item.strip() for page in pages for item in page.split("\n\n") if item.strip()]
    return news_items

def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(news_items)
    return embeddings, model

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

def add_metadata(news_items):
    metadata = [{"id": i, "content": news_items[i], "length": len(news_items[i])} for i in range(len(news_items))]
    return metadata

def search_similar_news(query, model, index, metadata, top_k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = [
        {"content": metadata[idx]["content"], "distance": float(dist), "length": metadata[idx]["length"]}
        for dist, idx in zip(distances[0], indices[0])
    ]
    return results

def optimize_layout(metadata, top_k=5):
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)

    layout = []
    for i, item in enumerate(sorted_news):
        layout.append({"type": "news", "content": item["content"], "page": i // top_k + 1, "position": i % top_k})
        if (i + 1) % 3 == 0:
            layout.append({"type": "advertisement", "content": "Ad Placeholder", "page": (i + 1) // top_k + 1, "position": "bottom"})

    return layout

def main():
    pdf_path = "/content/DOM-1947.pdf"

    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)

    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    print("Adding metadata...")
    metadata = add_metadata(news_items)

    query = "example query about a topic"
    print("Searching for similar news...")
    results = search_similar_news(query, model, index, metadata, top_k=5)

    print("Search Results:")
    print(json.dumps(results, indent=2))

    print("Optimizing layout...")
    layout = optimize_layout(metadata)

    print("Layout:")
    print(json.dumps(layout, indent=2))

if __name__ == "__main__":
    main()


Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...
Creating FAISS index...
Adding metadata...
Searching for similar news...
Search Results:
[
  {
    "content": "Quarta-feira, 13 de novembro de 2024 03 Ano X \u2022 N\u00ba 1.947 \u2022 Prefeitura Municipal de Guara\u00ed/TO\nSECRETARIA MUNICIPAL DE SA\u00daDE Instala\u00e7\u00e3o, desinstala\u00e7\u00e3o, manuten\u00e7\u00e3o,\nlimpeza, reparo em geladeira,\n100 60,00 6.000,00\nHORA bebedouro, refrigerador, frigobar,\nfreezer, filtro e purificador de \u00e1gua\nEXTRATO DO PRIMEIRO TERMO ADITIVO 01 Valor global estimado para aquisi\u00e7\u00e3o\nde pe\u00e7as e/ou componentes originais\nem geladeira, bebedouro, refrigerador, 6.000,00\nCONTRATO N.\u00ba 073/2023\nfrigobar, freezer, filtro e purificador\nProcesso: 3243/2023 de \u00e1gua\nPreg\u00e3o Eletr\u00f4nico: 028/2023\nDESCONTO NO VALOR DAS PE\u00c7AS - 40 %\n\u00d3rg\u00e3o: Fundo Municipal da Sa\u00fade de Guara\u00ed - TO.\nContratada: ARAUJ

In [26]:
import json

input_data = {
    "pdf_data": [
        {"id": "f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc", "altura": 125, "largura": 35, "agrupamento": 1},
        {"id": "f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc", "altura": 70, "largura": 40, "agrupamento": 2}
    ]
}

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420
ad_sizes = [(81, 100), (250, 210)]

def generate_output(input_data):
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in input_data["pdf_data"]:
        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

structured_output = generate_output(input_data)

output_path = "structured_output.json"
with open(output_path, "w") as f:
    json.dump(structured_output, f, indent=2)

print(f"Structured JSON output saved to {output_path}")
structured_output


Structured JSON output saved to structured_output.json


{'pdf_data': [{'id': 'f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc',
   'x': 13,
   'y': 17,
   'pagina': 1,
   'tipo': 'matéria'},
  {'id': 'calhau-placeholder',
   'x': 13,
   'y': 147,
   'pagina': 1,
   'tipo': 'calhau-81x100'},
  {'id': 'f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc',
   'x': 13,
   'y': 252,
   'pagina': 1,
   'tipo': 'titulo'}]}

In [27]:
import pdfplumber
import json

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420
ad_sizes = [(81, 100), (250, 210)]

def extract_pdf_data(pdf_path):
    """Extrai blocos de conteúdo do PDF."""
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_data.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,]
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    """Gera layout baseado em agrupamentos e minimiza espaço em branco."""
    output_data = {"pdf_data": []}
    page = 1
    current_y = MARGINS["top"]
    current_x = MARGINS["left"]
    column_width = (PAGE_SIZE[0] - MARGINS["left"] - MARGINS["right"]) / 2

    # Ordenar por agrupamento
    pdf_data.sort(key=lambda x: x["agrupamento"])

    for agrupamento in [1, 2]:
        agrupamento_data = [block for block in pdf_data if block["agrupamento"] == agrupamento]

        for block in agrupamento_data:
            # Verificar se o bloco cabe na coluna atual
            if current_y + block["altura"] + MARGINS["bottom"] > PAGE_SIZE[1]:
                # Passar para próxima coluna ou página
                if current_x + column_width + MARGINS["right"] > PAGE_SIZE[0]:
                    page += 1
                    current_x = MARGINS["left"]
                    current_y = MARGINS["top"]
                else:
                    current_x += column_width
                    current_y = MARGINS["top"]

            # Adicionar bloco ao layout
            output_data["pdf_data"].append({
                "id": block["id"],
                "x": current_x,
                "y": current_y,
                "pagina": page,
                "tipo": "matéria" if agrupamento == 1 else "título",
                "altura": block["altura"]
            })
            current_y += block["altura"] + MIN_SPACING_MM

        # Adicionar calhau no final de cada coluna
        for ad_width, ad_height in CALHAU_SIZES:
            if current_y + ad_height + MARGINS["bottom"] <= PAGE_SIZE[1]:
                output_data["pdf_data"].append({
                    "id": f"calhau-{ad_width}x{ad_height}",
                    "x": current_x,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + MIN_SPACING_MM
                break

    return output_data

pdf_path = "/content/DOM-1947.pdf"

print("Extracting data from the PDF...")
pdf_data = extract_pdf_data(pdf_path)

print("Generating structured JSON output...")
structured_output = generate_output(pdf_data)

output_path = "structured_output.json"
with open(output_path, "w") as f:
    json.dump(structured_output, f, indent=2)

print(f"Structured JSON output saved to {output_path}")
structured_output


SyntaxError: closing parenthesis ']' does not match opening parenthesis '{' on line 21 (<ipython-input-27-cf5672746dee>, line 24)

In [None]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import pdfplumber
import json
import os

app = FastAPI()

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420
ad_sizes = [(81, 100), (250, 210)]

def extract_pdf_data(pdf_path):
    """Extrai blocos de conteúdo do PDF."""
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_data.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    """Gera layout baseado em agrupamentos e minimiza espaço em branco."""
    output_data = {"pdf_data": []}
    page = 1
    current_y = MARGINS["top"]
    current_x = MARGINS["left"]
    column_width = (PAGE_SIZE[0] - MARGINS["left"] - MARGINS["right"]) / 2

    # Ordenar por agrupamento
    pdf_data.sort(key=lambda x: x["agrupamento"])

    for agrupamento in [1, 2]:
        agrupamento_data = [block for block in pdf_data if block["agrupamento"] == agrupamento]

        for block in agrupamento_data:
            # Verificar se o bloco cabe na coluna atual
            if current_y + block["altura"] + MARGINS["bottom"] > PAGE_SIZE[1]:
                # Passar para próxima coluna ou página
                if current_x + column_width + MARGINS["right"] > PAGE_SIZE[0]:
                    page += 1
                    current_x = MARGINS["left"]
                    current_y = MARGINS["top"]
                else:
                    current_x += column_width
                    current_y = MARGINS["top"]

            # Adicionar bloco ao layout
            output_data["pdf_data"].append({
                "id": block["id"],
                "x": current_x,
                "y": current_y,
                "pagina": page,
                "tipo": "matéria" if agrupamento == 1 else "título",
                "altura": block["altura"]
            })
            current_y += block["altura"] + MIN_SPACING_MM

        # Adicionar calhau no final de cada coluna
        for ad_width, ad_height in CALHAU_SIZES:
            if current_y + ad_height + MARGINS["bottom"] <= PAGE_SIZE[1]:
                output_data["pdf_data"].append({
                    "id": f"calhau-{ad_width}x{ad_height}",
                    "x": current_x,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + MIN_SPACING_MM
                break

    return output_data

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    pdf_path = f"./{file.filename}"
    with open(pdf_path, "wb") as f:
        f.write(await file.read())

    try:
        pdf_data = extract_pdf_data(pdf_path)

        structured_output = generate_output(pdf_data)

        output_path = f"./{os.path.splitext(file.filename)[0]}_output.json"
        with open(output_path, "w") as f:
            json.dump(structured_output, f, indent=2)

        return JSONResponse(content=structured_output)

    except Exception as e:
        return {"error": str(e)}

    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

In [None]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import pdfplumber
import json
import os
import nest_asyncio
import uvicorn

app = FastAPI()

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420  #
ad_sizes = [(81, 100), (250, 210)]

def extract_pdf_data(pdf_path):
    """Extrai blocos de conteúdo do PDF."""
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_data.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    """Gera layout baseado em agrupamentos e minimiza espaço em branco."""
    output_data = {"pdf_data": []}
    page = 1
    current_y = MARGINS["top"]
    current_x = MARGINS["left"]
    column_width = (PAGE_SIZE[0] - MARGINS["left"] - MARGINS["right"]) / 2

    # Ordenar por agrupamento
    pdf_data.sort(key=lambda x: x["agrupamento"])

    for agrupamento in [1, 2]:
        agrupamento_data = [block for block in pdf_data if block["agrupamento"] == agrupamento]

        for block in agrupamento_data:
            # Verificar se o bloco cabe na coluna atual
            if current_y + block["altura"] + MARGINS["bottom"] > PAGE_SIZE[1]:
                # Passar para próxima coluna ou página
                if current_x + column_width + MARGINS["right"] > PAGE_SIZE[0]:
                    page += 1
                    current_x = MARGINS["left"]
                    current_y = MARGINS["top"]
                else:
                    current_x += column_width
                    current_y = MARGINS["top"]

            # Adicionar bloco ao layout
            output_data["pdf_data"].append({
                "id": block["id"],
                "x": current_x,
                "y": current_y,
                "pagina": page,
                "tipo": "matéria" if agrupamento == 1 else "título",
                "altura": block["altura"]
            })
            current_y += block["altura"] + MIN_SPACING_MM

        # Adicionar calhau no final de cada coluna
        for ad_width, ad_height in CALHAU_SIZES:
            if current_y + ad_height + MARGINS["bottom"] <= PAGE_SIZE[1]:
                output_data["pdf_data"].append({
                    "id": f"calhau-{ad_width}x{ad_height}",
                    "x": current_x,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + MIN_SPACING_MM
                break

    return output_data

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    pdf_path = f"./{file.filename}"
    with open(pdf_path, "wb") as f:
        f.write(await file.read())

    try:
        pdf_data = extract_pdf_data(pdf_path)

        structured_output = generate_output(pdf_data)

        output_path = f"./{os.path.splitext(file.filename)[0]}_output.json"
        with open(output_path, "w") as f:
            json.dump(structured_output, f, indent=2)

        return JSONResponse(content=structured_output)

    except Exception as e:
        return {"error": str(e)}

    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

import nest_asyncio
nest_asyncio.apply()

print("Starting FastAPI server...")


In [None]:
# import requests

# url = "http://127.0.0.1:8000/upload-pdf/"
# files = {"file": open("RHOAI _ Prodesp - Diário Oficial.pdf", "rb")}
# response = requests.post(url, files=files)

# print(response.json())

In [None]:
import random
import math
import json

MARGIN_TOP = 17
MARGIN_BOTTOM = 13
MARGIN_LEFT = 13
MARGIN_RIGHT = 13
MIN_SPACING_MM = 5
PAGE_HEIGHT_MM = 420
PAGE_WIDTH_MM = 297
AD_SIZES = [(81, 100), (250, 210)]

def calculate_cost(layout):
    """
    Calcula o custo baseado em:
    - Espaços vazios: Quanto menos espaço desperdiçado, menor o custo.
    - Páginas usadas: Menos páginas, menor custo.
    """
    unused_space = 0
    total_pages = max(block['pagina'] for block in layout) + 1
    for page in range(total_pages):
        page_blocks = [b for b in layout if b['pagina'] == page]
        used_space = sum(b['altura'] + MIN_SPACING_MM for b in page_blocks)
        unused_space += PAGE_HEIGHT_MM - used_space
    return total_pages * 100 + unused_space

def generate_initial_layout(blocks):
    layout = []
    page = 0
    current_y = MARGIN_TOP
    for block in blocks:
        if current_y + block['altura'] + MARGIN_BOTTOM > PAGE_HEIGHT_MM:
            page += 1
            current_y = MARGIN_TOP
        layout.append({
            "id": block['id'],
            "x": MARGIN_LEFT,
            "y": current_y,
            "pagina": page,
            "tipo": block.get('tipo', 'matéria'),
            "altura": block['altura']
        })
        current_y += block['altura'] + MIN_SPACING_MM
    return layout

def perturb_layout(layout):
    new_layout = layout[:]
    idx1, idx2 = random.sample(range(len(new_layout)), 2)
    new_layout[idx1], new_layout[idx2] = new_layout[idx2], new_layout[idx1]
    return new_layout

def simulated_annealing(blocks, initial_temp, cooling_rate, max_iterations):
    current_layout = generate_initial_layout(blocks)
    current_cost = calculate_cost(current_layout)
    best_layout = current_layout[:]
    best_cost = current_cost

    temperature = initial_temp

    for iteration in range(max_iterations):
        new_layout = perturb_layout(current_layout)
        new_cost = calculate_cost(new_layout)
        delta = new_cost - current_cost

        if delta < 0 or random.random() < math.exp(-delta / temperature):
            current_layout = new_layout
            current_cost = new_cost
            if current_cost < best_cost:
                best_layout = current_layout[:]
                best_cost = current_cost

        temperature *= cooling_rate

        if iteration % 100 == 0:
            print(f"Iteração {iteration}, Custo Atual: {current_cost}, Melhor Custo: {best_cost}")

    return best_layout, best_cost

blocks = [
    {"id": f"block-{i}", "altura": random.randint(50, 150), "tipo": "matéria"} for i in range(20)
]

initial_temp = 1000
cooling_rate = 0.95
max_iterations = 1000

print("Executando Simulated Annealing para diagramação...")
optimized_layout, optimized_cost = simulated_annealing(blocks, initial_temp, cooling_rate, max_iterations)

output = {"pdf_data": optimized_layout}
output_path = "optimized_layout.json"
with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"Layout otimizado salvo em {output_path}")


## Implementation

In [None]:
import pdfplumber
import json
import re
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse

MARGINS = {"top": 17, "bottom": 13, "left": 13, "right": 13}  # mm
PAGE_HEIGHT_MM = 420
MIN_SPACING_MM = 5
AD_SIZES = [(81, 100), (250, 210)]

app = FastAPI()

def extract_pdf_blocks(pdf_path):
    """Extract content blocks from the PDF."""
    extracted_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_blocks.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,
                    "pagina": page_idx + 1
                })
    return extracted_blocks

def auto_layout(blocks):
    """Generate layout JSON with alignment and ads."""
    layout = []
    current_y = MARGINS["top"]
    page = 1

    for block in blocks:
        layout.append({
            "id": block["id"],
            "x": MARGINS["left"],
            "y": current_y,
            "pagina": page,
            "tipo": "matéria"
        })
        current_y += block["altura"] + MIN_SPACING_MM
        if current_y + MARGINS["bottom"] > PAGE_HEIGHT_MM:
            page += 1
            current_y = MARGINS["top"]

        for ad_width, ad_height in AD_SIZES:
            if current_y + ad_height + MARGINS["bottom"] <= PAGE_HEIGHT_MM:
                layout.append({
                    "id": "calhau-placeholder",
                    "x": MARGINS["left"],
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + MIN_SPACING_MM
                break

    return layout

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Upload PDF and return structured JSON."""
    pdf_path = f"./{file.filename}"
    with open(pdf_path, "wb") as f:
        f.write(await file.read())

    try:
        blocks = extract_pdf_blocks(pdf_path)
        structured_layout = auto_layout(blocks)
        return JSONResponse(content={"pdf_data": structured_layout})
    except Exception as e:
        return {"error": str(e)}
    finally:
        os.remove(pdf_path)

In [None]:
from elasticsearch import Elasticsearch
import pdfplumber
import json

ES_HOST = "https://7d29aea5a554429db47e92ddc177f4d1.us-central1.gcp.cloud.es.io:443"
API_KEY = "UURhZ1RwTUJIX19CdjAzZVJOUFY6X293dWg0RWtSbXVjSTVKYlVJb2pxQQ=="

es = Elasticsearch(
    ES_HOST,
    api_key=API_KEY
)

if es.ping():
    print("Successfully connected to Elasticsearch")
else:
    print("Failed to connect to Elasticsearch")
    raise RuntimeError("Elasticsearch connection failed.")

INDEX_NAME = "pdf_index"

def create_index(index_name):
    if not es.indices.exists(index=index_name):
        es.indices.create(
            index=index_name,
            body={
                "mappings": {
                    "properties": {
                        "id": {"type": "keyword"},
                        "content": {"type": "text"},
                        "altura": {"type": "float"},
                        "pagina": {"type": "integer"},
                        "tipo": {"type": "keyword"}
                    }
                }
            }
        )
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Index '{index_name}' already exists.")

create_index(INDEX_NAME)

def extract_pdf_blocks(pdf_path):
    extracted_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_blocks.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,
                    "pagina": page_idx + 1,
                    "tipo": "matéria" if idx % 2 == 0 else "titulo"
                })
    return extracted_blocks

def index_data(data, index_name):
    for item in data:
        es.index(index=index_name, id=item["id"], body=item)
    print(f"{len(data)} documents indexed in Elasticsearch.")

pdf_path = "/content/DOM-1947.pdf"

print("Extracting data from PDF...")
blocks = extract_pdf_blocks(pdf_path)

print("Indexing data into Elasticsearch...")
index_data(blocks, INDEX_NAME)

def search_elasticsearch(query, index_name, field="content", size=5):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "match": {
                    field: query
                }
            },
            "size": size
        }
    )
    return response["hits"]["hits"]

print("Performing search in Elasticsearch...")
query = "sua consulta aqui"
results = search_elasticsearch(query, INDEX_NAME)

for result in results:
    print(json.dumps(result["_source"], indent=2, ensure_ascii=False))


In [None]:
def search_elasticsearch(query, index_name, field="content", size=5):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "match": {
                    field: query
                }
            },
            "size": size
        }
    )
    return response["hits"]["hits"]

print("Buscando no Elasticsearch...")
query = "oficial"
results = search_elasticsearch(query, INDEX_NAME)

for result in results:
    print(json.dumps(result["_source"], indent=2, ensure_ascii=False))


In [None]:
def advanced_search(query, index_name, page_filter=None, type_filter=None, size=5):
    filters = []
    if page_filter:
        filters.append({"term": {"pagina": page_filter}})
    if type_filter:
        filters.append({"term": {"tipo": type_filter}})

    body = {
        "query": {
            "bool": {
                "must": [{"match": {"content": query}}],
                "filter": filters
            }
        },
        "size": size
    }

    response = es.search(index=index_name, body=body)
    return response["hits"]["hits"]

results = advanced_search("oficial", INDEX_NAME, page_filter=1, type_filter="matéria")
for result in results:
    print(json.dumps(result["_source"], indent=2, ensure_ascii=False))

In [None]:
import requests
import json

def access_doe_api(endpoint, params=None):
    url = f"https://do-api-web-search.doe.sp.gov.br{endpoint}"
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Erro {response.status_code}: {response.text}")
        return None

journals = access_doe_api("/v2/journals")
if journals:
    print(json.dumps(journals, indent=2, ensure_ascii=False))

journal_id = "ca96256b-6ca1-407f-866e-567ef9430123"
sections = access_doe_api(f"/v2/sections?JournalId={journal_id}")
if sections:
    print(json.dumps(sections, indent=2, ensure_ascii=False))

In [None]:
params = {
    "FromDate": "2024-11-21",
    "ToDate": "2024-11-22",
    "Terms[0]": "NOMEADO",
    "Terms[1]": "EXTRADITADO",
    "JournalId": journal_id,
    "PageNumber": 1,
    "PageSize": 10
}
advanced_search = access_doe_api("/v2/advanced-search/publications", params=params)
if advanced_search:
    print(json.dumps(advanced_search, indent=2, ensure_ascii=False))

In [None]:
import requests
import json

BASE_URL = "https://do-api-web-search.doe.sp.gov.br"
PDF_URL = "https://www.imprensaoficial.com.br/downloads/pdf/edicao"

def make_request(endpoint, params=None):
    response = requests.get(f"{BASE_URL}{endpoint}", params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Erro {response.status_code}: {response.text}")
        return None

def listar_cadernos():
    return make_request("/v2/journals")

def buscar_secoes_raiz(journal_id):
    return make_request(f"/v2/sections?JournalId={journal_id}")

def obter_hierarquia_dia(date, journal_id, section_id):
    params = {
        "Date": date,
        "JournalId": journal_id,
        "SectionId": section_id
    }
    return make_request("/v2/summary/structured", params=params)

def recuperar_materia(slug):
    return make_request(f"/v2/publications/{slug}")

def busca_avancada(from_date, to_date, terms, journal_id=None, section_id=None, page_number=1, page_size=20, sort_field="Date"):
    params = {
        "FromDate": from_date,
        "ToDate": to_date,
        "PageNumber": page_number,
        "PageSize": page_size,
        "SortField": sort_field
    }
    for i, term in enumerate(terms):
        params[f"Terms[{i}]"] = term
    if journal_id:
        params["JournalId"] = journal_id
    if section_id:
        params["SectionId"] = section_id
    return make_request("/v2/advanced-search/publications", params=params)

def verificar_status_edicao(date):
    response = requests.get(f"https://do-api-publication-pdf.doe.sp.gov.br/v1/editions/status/{date}")
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Erro {response.status_code}: {response.text}")
        return None

def abrir_edicao_pdf(date, sigla):
    url = f"{PDF_URL}/{date}{sigla}.pdf"
    response = requests.get(url)
    if response.status_code == 200:
        file_path = f"{date}_{sigla}.pdf"
        with open(file_path, "wb") as file:
            file.write(response.content)
        print(f"PDF salvo em {file_path}")
    else:
        print(f"Erro ao baixar PDF: {response.status_code}")

cadernos = listar_cadernos()
print("Cadernos Disponíveis:", json.dumps(cadernos, indent=2))

if cadernos:
    journal_id = cadernos["items"][0]["id"]
    secoes = buscar_secoes_raiz(journal_id)
    print("Seções Raiz:", json.dumps(secoes, indent=2))

if secoes:
    section_id = secoes["items"][0]["id"]
    hierarquia = obter_hierarquia_dia("2024-11-22", journal_id, section_id)
    print("Hierarquia do Dia:", json.dumps(hierarquia, indent=2))

if hierarquia:
    if "items" in hierarquia and hierarquia["items"]:
        for item in hierarquia["items"]:
            if "children" in item and item["children"]:
                for child in item["children"]:
                    if "publications" in child and child["publications"]:
                        slug = child["publications"][0]["slug"]
                        materia = recuperar_materia(slug)
                        print("Matéria Específica:", json.dumps(materia, indent=2))
                        break
                else:
                    print("Nenhuma publicação encontrada no nível 'children'.")
            else:
                print("Nenhuma 'children' encontrada no item da hierarquia.")
    else:
        print("Nenhum item encontrado na hierarquia.")


busca = busca_avancada("2024-11-21", "2024-11-22", ["NOMEADO", "EXTRADITADO"], journal_id)
print("Busca Avançada:", json.dumps(busca, indent=2))

status_pdf = verificar_status_edicao("2024-11-22")
print("Status da Edição em PDF:", json.dumps(status_pdf, indent=2))

abrir_edicao_pdf("20241125", "EXEC1")


In [None]:
import requests
import json
import pdfplumber

BASE_URL = "https://do-api-web-search.doe.sp.gov.br"
PDF_URL = "https://www.imprensaoficial.com.br/downloads/pdf/edicao"

MARGINS = {"top": 17, "bottom": 13, "left": 13, "right": 13}  # mm
PAGE_HEIGHT_MM = 420
MIN_SPACING_MM = 5
AD_SIZES = [(81, 100), (250, 210)]

def make_request(endpoint, params=None):
    response = requests.get(f"{BASE_URL}{endpoint}", params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Erro {response.status_code}: {response.text}")
        return None

def listar_cadernos():
    return make_request("/v2/journals")

def buscar_secoes_raiz(journal_id):
    return make_request(f"/v2/sections?JournalId={journal_id}")

def obter_hierarquia_dia(date, journal_id, section_id):
    params = {
        "Date": date,
        "JournalId": journal_id,
        "SectionId": section_id
    }
    return make_request("/v2/summary/structured", params=params)

def recuperar_materia(slug):
    return make_request(f"/v2/publications/{slug}")

def busca_avancada(from_date, to_date, terms, journal_id=None, section_id=None, page_number=1, page_size=20, sort_field="Date"):
    params = {
        "FromDate": from_date,
        "ToDate": to_date,
        "PageNumber": page_number,
        "PageSize": page_size,
        "SortField": sort_field
    }
    for i, term in enumerate(terms):
        params[f"Terms[{i}]"] = term
    if journal_id:
        params["JournalId"] = journal_id
    if section_id:
        params["SectionId"] = section_id
    return make_request("/v2/advanced-search/publications", params=params)

def prepare_diagram_data(api_results):
    blocks = []
    for result in api_results.get("publications", []):
        blocks.append({
            "id": result["id"],
            "content": result["content"],
            "altura": len(result["content"]) * 0.1,
            "tipo": "matéria" if "matéria" in result.get("type", "").lower() else "título"
        })
    return blocks

def auto_layout(blocks):
    layout = []
    current_y = MARGINS["top"]
    page = 1

    for block in blocks:
        layout.append({
            "id": block["id"],
            "x": MARGINS["left"],
            "y": current_y,
            "pagina": page,
            "tipo": block["tipo"]
        })
        current_y += block["altura"] + MIN_SPACING_MM

        if current_y + MARGINS["bottom"] > PAGE_HEIGHT_MM:
            page += 1
            current_y = MARGINS["top"]

        for ad_width, ad_height in AD_SIZES:
            if current_y + ad_height + MARGINS["bottom"] <= PAGE_HEIGHT_MM:
                layout.append({
                    "id": "calhau-placeholder",
                    "x": MARGINS["left"],
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + MIN_SPACING_MM
                break

    return layout

def main():
    busca_results = busca_avancada("2024-11-21", "2024-11-22", ["NOMEADO", "EXTRADITADO"])
    if not busca_results:
        print("Nenhum resultado encontrado pela API.")
        return

    print("Preparando dados para diagramação...")
    blocks = prepare_diagram_data(busca_results)

    print("Gerando layout...")
    layout = auto_layout(blocks)

    output_path = "structured_layout.json"
    with open(output_path, "w") as f:
        json.dump({"pdf_data": layout}, f, indent=2)
    print(f"Layout salvo em {output_path}")

if __name__ == "__main__":
    main()


In [None]:
def main():
    pdf_path = "/content/DOM-1947.pdf"

    print("Extraindo dados do PDF...")
    try:
        pdf_data = extract_pdf_data(pdf_path)
    except Exception as e:
        print(f"Erro durante a extração: {e}")
        return

    print("Gerando layout estruturado...")
    try:
        structured_layout = generate_output(pdf_data)
        output_path = "structured_layout.json"
        with open(output_path, "w") as f:
            json.dump(structured_layout, f, indent=2)
        print(f"Layout salvo em {output_path}")
    except Exception as e:
        print(f"Erro durante a geração do layout: {e}")

if __name__ == "__main__":
    main()