<a href="https://colab.research.google.com/github/Chediak/common-master-ai/blob/main/prodesan_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdfplumber sentence-transformers faiss-cpu spacy requests elasticsearch
!python -m spacy download pt_core_news_sm
!pip install fastapi uvicorn
!pip install python-multipart

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting elasticsearch
  Downloading elasticsearch-8.16.0-py3-none-any.whl.metadata (8.8 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting elastic-transport<9,>=8.15.1 (from elasticse

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import spacy
import json
import re
from datetime import datetime

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

def preprocess_and_split_text(pages):
    news_items = [item.strip() for page in pages for item in page.split("\n\n") if item.strip()]
    return news_items

def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(news_items)
    return embeddings, model

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

def add_metadata(news_items):
    metadata = [{"id": i, "content": news_items[i], "length": len(news_items[i])} for i in range(len(news_items))]
    return metadata

def extract_entities(news_items):
    nlp = spacy.load("pt_core_news_sm")
    extracted_data = []
    for idx, news in enumerate(news_items):
        doc = nlp(news)
        entities = {"id": idx, "content": news, "dates": [], "names": [], "organizations": []}

        for ent in doc.ents:
            if ent.label_ == "DATE":
                entities["dates"].append(ent.text)
            elif ent.label_ == "PERSON":
                entities["names"].append(ent.text)
            elif ent.label_ == "ORG":
                entities["organizations"].append(ent.text)

        cnpj_matches = re.findall(r"\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}", news)
        entities["cnpjs"] = cnpj_matches
        extracted_data.append(entities)
    return extracted_data

def create_structured_index(entities):
    index = {"dates": {}, "names": {}, "organizations": {}, "cnpjs": {}}
    for item in entities:
        for date in item["dates"]:
            index["dates"].setdefault(date, []).append(item)
        for name in item["names"]:
            index["names"].setdefault(name, []).append(item)
        for org in item["organizations"]:
            index["organizations"].setdefault(org, []).append(item)
        for cnpj in item["cnpjs"]:
            index["cnpjs"].setdefault(cnpj, []).append(item)
    return index

def query_index(index, query_type, query_value):
    if query_type in index:
        return index[query_type].get(query_value, [])
    return []

def optimize_layout(metadata, top_k=5):
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)
    layout = []
    for i, item in enumerate(sorted_news):
        layout.append({
            "type": "news",
            "content": item["content"],
            "page": i // top_k + 1,
            "position": i % top_k
        })
        if (i + 1) % 3 == 0:
            layout.append({
                "type": "advertisement",
                "content": "Ad Placeholder",
                "page": (i + 1) // top_k + 1,
                "position": "bottom"
            })
    return layout

def main():
    pdf_path = "/content/RHOAI _ Prodesp - Diário Oficial.pdf"

    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)

    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    print("Adding metadata...")
    metadata = add_metadata(news_items)

    print("Extracting entities...")
    entities = extract_entities(news_items)

    print("Creating structured index...")
    structured_index = create_structured_index(entities)

    print("Querying by date '13/11/2024'...")
    date_results = query_index(structured_index, "dates", "13/11/2024")
    print(json.dumps(date_results, indent=2))

    print("Querying by organization 'ARAUJO E REPLANDE LTDA'...")
    org_results = query_index(structured_index, "organizations", "ARAUJO E REPLANDE LTDA")
    print(json.dumps(org_results, indent=2))

    print("Optimizing layout...")
    layout = optimize_layout(metadata)
    print(json.dumps(layout, indent=2))

if __name__ == "__main__":
    main()


  from tqdm.autonotebook import tqdm, trange


Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating FAISS index...
Adding metadata...
Extracting entities...
Creating structured index...
Querying by date '13/11/2024'...
[]
Querying by organization 'ARAUJO E REPLANDE LTDA'...
[]
Optimizing layout...
[
  {
    "type": "news",
    "content": "EstruturadoJSONderetornoesperadoqueser\u00e1geradopelaRedHat\n{ \"pdf_data\":\n[\n{\"id\": \u201cf97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc\u201d ,\"x\":50,\"y\":300,\"pagina\":1,\"tipo\":\"mat\u00e9ria\"},\n{\u201cid\u201d:\u201c\",\"x\":100,\"y\":550,\"pagina\":1,\"tipo\":\"titulo\"},\n{\u201cid\u201d:\u201cd2a7e6bd-5d6d-4e1c-a747-a0ae5f1499da\",\"x\":100,\"y\":550,\"pagina\":1,\"tipo\":\"mat\u00e9ria\"}\n{\u201cid\u201d:\u201c\",\"x\":100,\"y\":550,\"pagina\":1,\"tipo\":\"calhau-1\"}\n]\n}\n\u2014--------\nDi\u00e1riooficial:tamanhoA3\n-Alinhamentodasmat\u00e9riascomamargemsuperiorquandonaprimeiralinhainterpretando\n\"Alinhamento\"comodist\u00e2nciadamargemsuperior:17mm.\n-Alinhamentodasmat\u00e9riascomamargeminferiorquandonaultimalinha,interp

In [4]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

def extract_text_from_pdf(pdf_path):
    with pdfplumber.open(pdf_path) as pdf:
        pages = [page.extract_text() for page in pdf.pages]
    return pages

def preprocess_and_split_text(pages):
    news_items = [item.strip() for page in pages for item in page.split("\n\n") if item.strip()]
    return news_items

def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    model = SentenceTransformer(model_name)
    embeddings = model.encode(news_items)
    return embeddings, model

def create_faiss_index(embeddings):
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings)
    return index

def add_metadata(news_items):
    metadata = [{"id": i, "content": news_items[i], "length": len(news_items[i])} for i in range(len(news_items))]
    return metadata

def search_similar_news(query, model, index, metadata, top_k=5):
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = [
        {"content": metadata[idx]["content"], "distance": float(dist), "length": metadata[idx]["length"]}
        for dist, idx in zip(distances[0], indices[0])
    ]
    return results

def optimize_layout(metadata, top_k=5):
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)

    layout = []
    for i, item in enumerate(sorted_news):
        layout.append({"type": "news", "content": item["content"], "page": i // top_k + 1, "position": i % top_k})
        if (i + 1) % 3 == 0:
            layout.append({"type": "advertisement", "content": "Ad Placeholder", "page": (i + 1) // top_k + 1, "position": "bottom"})

    return layout

def main():
    pdf_path = "/content/RHOAI _ Prodesp - Diário Oficial.pdf"

    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)

    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    print("Adding metadata...")
    metadata = add_metadata(news_items)

    query = "example query about a topic"
    print("Searching for similar news...")
    results = search_similar_news(query, model, index, metadata, top_k=5)

    print("Search Results:")
    print(json.dumps(results, indent=2))

    print("Optimizing layout...")
    layout = optimize_layout(metadata)

    print("Layout:")
    print(json.dumps(layout, indent=2))

if __name__ == "__main__":
    main()


Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...
Creating FAISS index...
Adding metadata...
Searching for similar news...
Search Results:
[
  {
    "content": "-Justificarasmat\u00e9riasqueest\u00e3odentrodamesmacolunaautomaticamente\n10.Entreasse\u00e7\u00f5esdemat\u00e9ria,incluirumtextoautomaticamentedaquebradese\u00e7\u00e3o\n4.Extra\u00e7\u00e3o\na. Arquitetura\n#TODO\n7",
    "distance": 1.64579439163208,
    "length": 168
  },
  {
    "content": "EstruturadoJSONderetornoesperadoqueser\u00e1geradopelaRedHat\n{ \"pdf_data\":\n[\n{\"id\": \u201cf97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc\u201d ,\"x\":50,\"y\":300,\"pagina\":1,\"tipo\":\"mat\u00e9ria\"},\n{\u201cid\u201d:\u201c\",\"x\":100,\"y\":550,\"pagina\":1,\"tipo\":\"titulo\"},\n{\u201cid\u201d:\u201cd2a7e6bd-5d6d-4e1c-a747-a0ae5f1499da\",\"x\":100,\"y\":550,\"pagina\":1,\"tipo\":\"mat\u00e9ria\"}\n{\u201cid\u201d:\u201c\",\"x\":100,\"y\":550,\"pagina\":1,\"tipo\":\"calhau-1\"}\n]\n}\n\u2014---

In [5]:
import json

input_data = {
    "pdf_data": [
        {"id": "f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc", "altura": 125, "largura": 35, "agrupamento": 1},
        {"id": "f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc", "altura": 70, "largura": 40, "agrupamento": 2}
    ]
}

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420
ad_sizes = [(81, 100), (250, 210)]

def generate_output(input_data):
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in input_data["pdf_data"]:
        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

structured_output = generate_output(input_data)

output_path = "structured_output.json"
with open(output_path, "w") as f:
    json.dump(structured_output, f, indent=2)

print(f"Structured JSON output saved to {output_path}")
structured_output


Structured JSON output saved to structured_output.json


{'pdf_data': [{'id': 'f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc',
   'x': 13,
   'y': 17,
   'pagina': 1,
   'tipo': 'matéria'},
  {'id': 'calhau-placeholder',
   'x': 13,
   'y': 147,
   'pagina': 1,
   'tipo': 'calhau-81x100'},
  {'id': 'f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc',
   'x': 13,
   'y': 252,
   'pagina': 1,
   'tipo': 'titulo'}]}

In [6]:
import pdfplumber
import json

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420
ad_sizes = [(81, 100), (250, 210)]

def extract_pdf_data(pdf_path):
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_data.append({
                    "id": f"block-{page_number}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,
                    "largura": 200,
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in pdf_data:
        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

pdf_path = "/content/RHOAI _ Prodesp - Diário Oficial.pdf"

print("Extracting data from the PDF...")
pdf_data = extract_pdf_data(pdf_path)

print("Generating structured JSON output...")
structured_output = generate_output(pdf_data)

output_path = "structured_output.json"
with open(output_path, "w") as f:
    json.dump(structured_output, f, indent=2)

print(f"Structured JSON output saved to {output_path}")
structured_output


Extracting data from the PDF...
Generating structured JSON output...
Structured JSON output saved to structured_output.json


{'pdf_data': [{'id': 'block-1-0',
   'x': 13,
   'y': 17,
   'pagina': 1,
   'tipo': 'matéria'},
  {'id': 'calhau-placeholder',
   'x': 13,
   'y': 32.1,
   'pagina': 1,
   'tipo': 'calhau-81x100'},
  {'id': 'block-2-0', 'x': 13, 'y': 137.1, 'pagina': 1, 'tipo': 'matéria'},
  {'id': 'calhau-placeholder',
   'x': 13,
   'y': 159.4,
   'pagina': 1,
   'tipo': 'calhau-81x100'},
  {'id': 'block-3-0', 'x': 13, 'y': 264.4, 'pagina': 1, 'tipo': 'matéria'},
  {'id': 'block-4-0',
   'x': 13,
   'y': 318.79999999999995,
   'pagina': 1,
   'tipo': 'matéria'},
  {'id': 'block-5-0',
   'x': 13,
   'y': 355.59999999999997,
   'pagina': 1,
   'tipo': 'matéria'},
  {'id': 'calhau-placeholder',
   'x': 13,
   'y': 17,
   'pagina': 2,
   'tipo': 'calhau-81x100'},
  {'id': 'block-6-0', 'x': 13, 'y': 122, 'pagina': 2, 'tipo': 'matéria'},
  {'id': 'calhau-placeholder',
   'x': 13,
   'y': 170.9,
   'pagina': 2,
   'tipo': 'calhau-81x100'},
  {'id': 'block-7-0', 'x': 13, 'y': 275.9, 'pagina': 2, 'tipo': 'ma

In [7]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import pdfplumber
import json
import os

app = FastAPI()

margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420
ad_sizes = [(81, 100), (250, 210)]

def extract_pdf_data(pdf_path):
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_data.append({
                    "id": f"block-{page_number}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,
                    "largura": 200,
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in pdf_data:
        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    pdf_path = f"./{file.filename}"
    with open(pdf_path, "wb") as f:
        f.write(await file.read())

    try:
        pdf_data = extract_pdf_data(pdf_path)

        structured_output = generate_output(pdf_data)

        output_path = f"./{os.path.splitext(file.filename)[0]}_output.json"
        with open(output_path, "w") as f:
            json.dump(structured_output, f, indent=2)

        return JSONResponse(content=structured_output)

    except Exception as e:
        return {"error": str(e)}

    finally:
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

# To run the API server:
# Use the command: uvicorn <filename>:app --reload


In [8]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import pdfplumber
import json
import os
import nest_asyncio
import uvicorn

# Initialize FastAPI app
app = FastAPI()

# Constants for layout rules
margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm
min_spacing_mm = 5  # mm
page_height_mm = 420  # Approx height of A3 in mm
ad_sizes = [(81, 100), (250, 210)]  # Example sizes in mm

# Extract text and preprocess data from PDF
def extract_pdf_data(pdf_path):
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            # Split text into blocks for processing
            for idx, block in enumerate(text.split("\n\n")):
                extracted_data.append({
                    "id": f"block-{page_number}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,  # Mock height based on content length
                    "largura": 200,  # Arbitrary width
                    "agrupamento": 1 if idx % 2 == 0 else 2  # Alternate grouping
                })
    return extracted_data

# Generate the expected JSON output with alignment and spacing rules
def generate_output(pdf_data):
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top  # Start at the top margin

    for item in pdf_data:
        # Add an entry for the main content
        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,  # Always align to left margin
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        # Update Y position considering the height and spacing
        current_y += item["altura"] + min_spacing_mm

        # Check if a new page is needed
        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        # Add calhau (ad placeholder) if space permits
        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    # Save the uploaded file
    pdf_path = f"./{file.filename}"
    with open(pdf_path, "wb") as f:
        f.write(await file.read())

    try:
        # Extract data from the PDF
        pdf_data = extract_pdf_data(pdf_path)

        # Generate the structured output
        structured_output = generate_output(pdf_data)

        # Save the structured JSON output
        output_path = f"./{os.path.splitext(file.filename)[0]}_output.json"
        with open(output_path, "w") as f:
            json.dump(structured_output, f, indent=2)

        # Return the structured output as response
        return JSONResponse(content=structured_output)

    except Exception as e:
        return {"error": str(e)}

    finally:
        # Clean up: Remove the uploaded file
        if os.path.exists(pdf_path):
            os.remove(pdf_path)

# Run the server in Jupyter Notebook
import nest_asyncio
nest_asyncio.apply()  # Allow nested event loops in Jupyter

# Start the FastAPI server
print("Starting FastAPI server...")
# uvicorn.run(app, host="127.0.0.1", port=8000)

Starting FastAPI server...


In [9]:
# import requests

# url = "http://127.0.0.1:8000/upload-pdf/"
# files = {"file": open("RHOAI _ Prodesp - Diário Oficial.pdf", "rb")}
# response = requests.post(url, files=files)

# print(response.json())

In [10]:
import random
import math
import json

# Constantes de layout
MARGIN_TOP = 17  # mm
MARGIN_BOTTOM = 13  # mm
MARGIN_LEFT = 13  # mm
MARGIN_RIGHT = 13  # mm
MIN_SPACING_MM = 5  # mm
PAGE_HEIGHT_MM = 420  # mm (altura de uma página A3)
PAGE_WIDTH_MM = 297  # mm (largura de uma página A3)
AD_SIZES = [(81, 100), (250, 210)]  # Exemplos de anúncios

# Função de custo
def calculate_cost(layout):
    """
    Calcula o custo baseado em:
    - Espaços vazios: Quanto menos espaço desperdiçado, menor o custo.
    - Páginas usadas: Menos páginas, menor custo.
    """
    unused_space = 0
    total_pages = max(block['pagina'] for block in layout) + 1
    for page in range(total_pages):
        page_blocks = [b for b in layout if b['pagina'] == page]
        used_space = sum(b['altura'] + MIN_SPACING_MM for b in page_blocks)
        unused_space += PAGE_HEIGHT_MM - used_space
    return total_pages * 100 + unused_space

# Geração inicial do layout
def generate_initial_layout(blocks):
    layout = []
    page = 0
    current_y = MARGIN_TOP
    for block in blocks:
        if current_y + block['altura'] + MARGIN_BOTTOM > PAGE_HEIGHT_MM:
            page += 1
            current_y = MARGIN_TOP
        layout.append({
            "id": block['id'],
            "x": MARGIN_LEFT,
            "y": current_y,
            "pagina": page,
            "tipo": block.get('tipo', 'matéria'),
            "altura": block['altura']
        })
        current_y += block['altura'] + MIN_SPACING_MM
    return layout

# Perturbação (vizinho)
def perturb_layout(layout):
    new_layout = layout[:]
    idx1, idx2 = random.sample(range(len(new_layout)), 2)
    new_layout[idx1], new_layout[idx2] = new_layout[idx2], new_layout[idx1]
    return new_layout

# Simulated Annealing
def simulated_annealing(blocks, initial_temp, cooling_rate, max_iterations):
    current_layout = generate_initial_layout(blocks)
    current_cost = calculate_cost(current_layout)
    best_layout = current_layout[:]
    best_cost = current_cost

    temperature = initial_temp

    for iteration in range(max_iterations):
        new_layout = perturb_layout(current_layout)
        new_cost = calculate_cost(new_layout)
        delta = new_cost - current_cost

        # Aceitação baseada na temperatura
        if delta < 0 or random.random() < math.exp(-delta / temperature):
            current_layout = new_layout
            current_cost = new_cost
            if current_cost < best_cost:
                best_layout = current_layout[:]
                best_cost = current_cost

        # Resfriamento
        temperature *= cooling_rate

        # Print de progresso
        if iteration % 100 == 0:
            print(f"Iteração {iteration}, Custo Atual: {current_cost}, Melhor Custo: {best_cost}")

    return best_layout, best_cost

# Dados de entrada fictícios
blocks = [
    {"id": f"block-{i}", "altura": random.randint(50, 150), "tipo": "matéria"} for i in range(20)
]

# Execução do Simulated Annealing
initial_temp = 1000
cooling_rate = 0.95
max_iterations = 1000

print("Executando Simulated Annealing para diagramação...")
optimized_layout, optimized_cost = simulated_annealing(blocks, initial_temp, cooling_rate, max_iterations)

# Saída em JSON
output = {"pdf_data": optimized_layout}
output_path = "optimized_layout.json"
with open(output_path, "w") as f:
    json.dump(output, f, indent=2)

print(f"Layout otimizado salvo em {output_path}")


Executando Simulated Annealing para diagramação...
Iteração 0, Custo Atual: 1564, Melhor Custo: 1564
Iteração 100, Custo Atual: 1564, Melhor Custo: 1564
Iteração 200, Custo Atual: 1564, Melhor Custo: 1564
Iteração 300, Custo Atual: 1564, Melhor Custo: 1564
Iteração 400, Custo Atual: 1564, Melhor Custo: 1564
Iteração 500, Custo Atual: 1564, Melhor Custo: 1564
Iteração 600, Custo Atual: 1564, Melhor Custo: 1564
Iteração 700, Custo Atual: 1564, Melhor Custo: 1564
Iteração 800, Custo Atual: 1564, Melhor Custo: 1564
Iteração 900, Custo Atual: 1564, Melhor Custo: 1564
Layout otimizado salvo em optimized_layout.json


## Implementation

In [11]:
import pdfplumber
import json
import re
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse

# Layout Constants
MARGINS = {"top": 17, "bottom": 13, "left": 13, "right": 13}  # mm
PAGE_HEIGHT_MM = 420  # A3 Height
MIN_SPACING_MM = 5  # Minimum spacing between blocks
AD_SIZES = [(81, 100), (250, 210)]  # Ad dimensions (width, height)

app = FastAPI()

def extract_pdf_blocks(pdf_path):
    """Extract content blocks from the PDF."""
    extracted_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_blocks.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,  # Estimate height
                    "pagina": page_idx + 1
                })
    return extracted_blocks

def auto_layout(blocks):
    """Generate layout JSON with alignment and ads."""
    layout = []
    current_y = MARGINS["top"]
    page = 1

    for block in blocks:
        layout.append({
            "id": block["id"],
            "x": MARGINS["left"],
            "y": current_y,
            "pagina": page,
            "tipo": "matéria"
        })
        current_y += block["altura"] + MIN_SPACING_MM
        if current_y + MARGINS["bottom"] > PAGE_HEIGHT_MM:
            page += 1
            current_y = MARGINS["top"]

        # Add ads if space allows
        for ad_width, ad_height in AD_SIZES:
            if current_y + ad_height + MARGINS["bottom"] <= PAGE_HEIGHT_MM:
                layout.append({
                    "id": "calhau-placeholder",
                    "x": MARGINS["left"],
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + MIN_SPACING_MM
                break

    return layout

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Upload PDF and return structured JSON."""
    pdf_path = f"./{file.filename}"
    with open(pdf_path, "wb") as f:
        f.write(await file.read())

    try:
        blocks = extract_pdf_blocks(pdf_path)
        structured_layout = auto_layout(blocks)
        return JSONResponse(content={"pdf_data": structured_layout})
    except Exception as e:
        return {"error": str(e)}
    finally:
        os.remove(pdf_path)

# Start API server


In [12]:
from elasticsearch import Elasticsearch
import pdfplumber
import json

# Elasticsearch Configuration
ES_HOST = "https://7d29aea5a554429db47e92ddc177f4d1.us-central1.gcp.cloud.es.io:443"
API_KEY = "UURhZ1RwTUJIX19CdjAzZVJOUFY6X293dWg0RWtSbXVjSTVKYlVJb2pxQQ=="

# Initialize Elasticsearch client
es = Elasticsearch(
    ES_HOST,
    api_key=API_KEY
)

# Test Elasticsearch connection
if es.ping():
    print("Successfully connected to Elasticsearch")
else:
    print("Failed to connect to Elasticsearch")
    raise RuntimeError("Elasticsearch connection failed.")

# Define the index name
INDEX_NAME = "pdf_index"

# Function to create an index in Elasticsearch
def create_index(index_name):
    if not es.indices.exists(index=index_name):
        es.indices.create(
            index=index_name,
            body={
                "mappings": {
                    "properties": {
                        "id": {"type": "keyword"},
                        "content": {"type": "text"},
                        "altura": {"type": "float"},
                        "pagina": {"type": "integer"},
                        "tipo": {"type": "keyword"}
                    }
                }
            }
        )
        print(f"Index '{index_name}' created successfully.")
    else:
        print(f"Index '{index_name}' already exists.")

create_index(INDEX_NAME)

# Function to extract text blocks from a PDF
def extract_pdf_blocks(pdf_path):
    extracted_blocks = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_idx, page in enumerate(pdf.pages):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                extracted_blocks.append({
                    "id": f"block-{page_idx}-{idx}",
                    "content": block.strip(),
                    "altura": len(block) * 0.1,  # Estimated height
                    "pagina": page_idx + 1,
                    "tipo": "matéria" if idx % 2 == 0 else "titulo"
                })
    return extracted_blocks

# Function to index extracted data into Elasticsearch
def index_data(data, index_name):
    for item in data:
        es.index(index=index_name, id=item["id"], body=item)
    print(f"{len(data)} documents indexed in Elasticsearch.")

# Path to your PDF
pdf_path = "/content/RHOAI _ Prodesp - Diário Oficial.pdf"  # Replace with your PDF file path

# Extract data from the PDF
print("Extracting data from PDF...")
blocks = extract_pdf_blocks(pdf_path)

# Index data in Elasticsearch
print("Indexing data into Elasticsearch...")
index_data(blocks, INDEX_NAME)

# Function to search data in Elasticsearch
def search_elasticsearch(query, index_name, field="content", size=5):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "match": {
                    field: query
                }
            },
            "size": size
        }
    )
    return response["hits"]["hits"]

# Example search
print("Performing search in Elasticsearch...")
query = "sua consulta aqui"
results = search_elasticsearch(query, INDEX_NAME)

# Display results
for result in results:
    print(json.dumps(result["_source"], indent=2, ensure_ascii=False))


Successfully connected to Elasticsearch
Index 'pdf_index' already exists.
Extracting data from PDF...
Indexing data into Elasticsearch...
9 documents indexed in Elasticsearch.
Performing search in Elasticsearch...


In [13]:
# Função para buscar dados no Elasticsearch
def search_elasticsearch(query, index_name, field="content", size=5):
    response = es.search(
        index=index_name,
        body={
            "query": {
                "match": {
                    field: query
                }
            },
            "size": size
        }
    )
    return response["hits"]["hits"]

# Exemplo de busca
print("Buscando no Elasticsearch...")
query = "oficial"
results = search_elasticsearch(query, INDEX_NAME)

# Exibir resultados
for result in results:
    print(json.dumps(result["_source"], indent=2, ensure_ascii=False))


Buscando no Elasticsearch...
{
  "id": "block-0-0",
  "content": "Red Hat OpenShift AI\nDocumento de Arquitetura\nDiário Oficial: Diagramação e Extração\nPreparadopara:\n1",
  "altura": 10.100000000000001,
  "pagina": 1,
  "tipo": "matéria"
}


In [14]:
def advanced_search(query, index_name, page_filter=None, type_filter=None, size=5):
    filters = []
    if page_filter:
        filters.append({"term": {"pagina": page_filter}})
    if type_filter:
        filters.append({"term": {"tipo": type_filter}})

    body = {
        "query": {
            "bool": {
                "must": [{"match": {"content": query}}],
                "filter": filters
            }
        },
        "size": size
    }

    response = es.search(index=index_name, body=body)
    return response["hits"]["hits"]

# Exemplo de busca avançada
results = advanced_search("sua consulta aqui", INDEX_NAME, page_filter=1, type_filter="matéria")
for result in results:
    print(json.dumps(result["_source"], indent=2, ensure_ascii=False))


In [17]:
# Função para acessar a API do Diário Oficial
def access_doe_api(endpoint, params=None):
    url = f"https://do-api-web-search.doe.sp.gov.br{endpoint}"
    response = requests.get(url, params=params)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Erro {response.status_code}: {response.text}")
        return None

# Exemplo: Listar cadernos disponíveis
journals = access_doe_api("/v2/journals")
if journals:
    print(json.dumps(journals, indent=2, ensure_ascii=False))

# Exemplo: Obter seções de um caderno específico
journal_id = "ca96256b-6ca1-407f-866e-567ef9430123"  # Substituir pelo ID correto
sections = access_doe_api(f"/v2/sections?JournalId={journal_id}")
if sections:
    print(json.dumps(sections, indent=2, ensure_ascii=False))


{
  "count": 5,
  "items": [
    {
      "id": "ca96256b-6ca1-407f-866e-567ef9430123",
      "name": "Executivo",
      "sequence": 1
    },
    {
      "id": "0953858b-7195-4020-ec15-08db6b8e0e4c",
      "name": "Legislativo",
      "sequence": 2
    },
    {
      "id": "d65936d7-1ca8-4267-934e-1dea132fa237",
      "name": "Municípios",
      "sequence": 3
    },
    {
      "id": "8e1f4298-811b-4e36-0df1-08db6b8c9f7b",
      "name": "Empresarial",
      "sequence": 4
    },
    {
      "id": "fe4c14dc-e256-4834-b19f-8fd84d664171",
      "name": "Jucesp",
      "sequence": 5
    }
  ]
}
{
  "count": 3,
  "items": [
    {
      "id": "257b103f-1eb2-4f24-a170-4e553c7e4aac",
      "journalId": "ca96256b-6ca1-407f-866e-567ef9430123",
      "name": "Atos Normativos",
      "sequence": 1
    },
    {
      "id": "e6c70876-31ee-4b9a-8840-b8d5ea12352d",
      "journalId": "ca96256b-6ca1-407f-866e-567ef9430123",
      "name": "Atos de Pessoal",
      "sequence": 2
    },
    {
      "id": "43

In [18]:
# Busca avançada de matérias
params = {
    "FromDate": "2024-11-21",
    "ToDate": "2024-11-22",
    "Terms[0]": "NOMEADO",
    "Terms[1]": "EXTRADITADO",
    "JournalId": journal_id,
    "PageNumber": 1,
    "PageSize": 10
}
advanced_search = access_doe_api("/v2/advanced-search/publications", params=params)
if advanced_search:
    print(json.dumps(advanced_search, indent=2, ensure_ascii=False))


{
  "items": [
    {
      "isLegacy": false,
      "id": "47781553-a228-4812-92c0-08dcf9fe32bd",
      "publicationTypeId": "f6ac099f-77a7-4366-a64a-fa20b90e6528",
      "secondLevelSectionId": "d6f11cbc-adff-46cd-7d5e-08db6b94d2bf",
      "thirdLevelSectionId": "cf2f551e-bb7a-4d1a-b645-08db6b943832",
      "date": "2024-11-21T05:00:48.0509223",
      "title": "Decisão do Diretor-Geral, de 19/11/2024",
      "slug": "executivo/ministerio-publico/decisao-do-diretor-geral-de-19-11-2024-2024111911416199726484",
      "excerpt": "§ 1º do artigo 2º, prevê a obrigatoriedade do repasse dos emolumentos à Instituição, considerando que há equiparação de direitos e deveres entre o Interino nomeado e o Titular da Delegação, nos termos item 12 da Subseção III, da Seção II do Capítulo XXI Tomo II, do PROVIMENTO Nº 58/89 (NORMAS DE SERVIÇO CARTÓRIO EXTRAJ",
      "hierarchy": "Executivo > Atos Normativos > Ministério Público > Diretoria Geral",
      "totalTermsFound": 1,
      "termsFound": [
     