<a href="https://colab.research.google.com/github/Chediak/common-master-ai/blob/main/prodesan_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pdfplumber sentence-transformers faiss-cpu spacy
!python -m spacy download pt_core_news_sm
!pip install fastapi uvicorn
!pip install python-multipart

In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import spacy
import json
import re
from datetime import datetime

def extract_text_from_pdf(pdf_path):
    """Return the extracted text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list with one string per page, in page order. A page whose
        ``extract_text()`` result is falsy (``None`` or an empty string) is
        represented by ``""`` so callers can use string methods on every
        element.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Normalise falsy results to "" so a later ``page.split(...)`` cannot
        # fail with AttributeError on None.
        return [page.extract_text() or "" for page in pdf.pages]

def preprocess_and_split_text(pages):
    """Split page texts into individual, whitespace-trimmed news items.

    Each page is split on blank lines (``"\\n\\n"``); fragments that are empty
    after stripping are dropped.

    Args:
        pages: Iterable of page texts. Entries that are ``None`` or empty
            (pages without extractable text) are skipped instead of raising.

    Returns:
        A flat list of non-empty news-item strings, in document order.
    """
    news_items = []
    for page in pages:
        if not page:
            # Nothing was extracted for this page.
            continue
        for fragment in page.split("\n\n"):
            fragment = fragment.strip()
            if fragment:
                news_items.append(fragment)
    return news_items

def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    """Encode the news items with a SentenceTransformer model.

    Args:
        news_items: The texts handed to ``model.encode``.
        model_name: Name of the SentenceTransformer model to instantiate.

    Returns:
        A ``(embeddings, model)`` tuple: the result of ``encode`` and the
        model instance that produced it, so callers can reuse the model.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(news_items), encoder

def create_faiss_index(embeddings):
    """Build an exact (flat) L2 FAISS index over the embedding vectors.

    Args:
        embeddings: Array-like of shape ``(n_items, dimension)``. It is
            converted to a C-contiguous float32 array, the layout FAISS
            expects, so lists or float64 arrays are accepted too.

    Returns:
        A ``faiss.IndexFlatL2`` containing all vectors, in input order.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example
            when no news items were embedded).
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_items, dimension), got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

def add_metadata(news_items):
    """Describe each news item with its position, text and character count.

    Args:
        news_items: Sequence of news-item strings.

    Returns:
        A list of dicts with the keys ``"id"`` (0-based position),
        ``"content"`` (the text) and ``"length"`` (``len`` of the text).
    """
    metadata = []
    for position, text in enumerate(news_items):
        metadata.append({"id": position, "content": text, "length": len(text)})
    return metadata

def extract_entities(news_items):
    """Extract dates, person names, organizations and CNPJs from news items.

    Named entities come from the spaCy pipeline ``pt_core_news_sm``; dates in
    ``dd/mm/yyyy`` form and CNPJ numbers are found with regular expressions.

    Args:
        news_items: Iterable of news-item strings.

    Returns:
        A list with one dict per item holding the keys ``"id"`` (position),
        ``"content"``, ``"dates"``, ``"names"``, ``"organizations"`` and
        ``"cnpjs"`` (each of the last four is a list of matched strings).
    """
    nlp = spacy.load("pt_core_news_sm")

    # Compiled once instead of once per news item.
    date_pattern = re.compile(r"\b\d{2}/\d{2}/\d{4}\b")
    cnpj_pattern = re.compile(r"\d{2}\.\d{3}\.\d{3}/\d{4}-\d{2}")

    # The Portuguese news models label people as "PER" (their label set is
    # LOC/MISC/ORG/PER); "PERSON" is kept for pipelines with other schemes.
    person_labels = {"PER", "PERSON"}

    extracted_data = []
    for idx, news in enumerate(news_items):
        doc = nlp(news)
        entities = {"id": idx, "content": news, "dates": [], "names": [], "organizations": []}

        for ent in doc.ents:
            if ent.label_ == "DATE":
                entities["dates"].append(ent.text)
            elif ent.label_ in person_labels:
                entities["names"].append(ent.text)
            elif ent.label_ == "ORG":
                entities["organizations"].append(ent.text)

        # The model has no DATE label, so numeric dates are matched by
        # pattern; skip strings an NER pass may already have contributed.
        for date_text in date_pattern.findall(news):
            if date_text not in entities["dates"]:
                entities["dates"].append(date_text)

        entities["cnpjs"] = cnpj_pattern.findall(news)
        extracted_data.append(entities)
    return extracted_data

def create_structured_index(entities):
    """Build an inverted index from entity values to the items containing them.

    Args:
        entities: Iterable of dicts, each with the list-valued keys
            ``"dates"``, ``"names"``, ``"organizations"`` and ``"cnpjs"``.

    Returns:
        A dict with those four keys; each maps an entity string to the list
        of item dicts in which it occurs (one entry per occurrence, in input
        order).
    """
    fields = ("dates", "names", "organizations", "cnpjs")
    index = {field: {} for field in fields}
    for record in entities:
        for field in fields:
            bucket = index[field]
            for value in record[field]:
                bucket.setdefault(value, []).append(record)
    return index

def query_index(index, query_type, query_value):
    """Look up the items stored under one value of one index category.

    Args:
        index: Mapping of category name to a ``{value: [items]}`` mapping.
        query_type: Category to search, e.g. ``"dates"``.
        query_value: Exact value to look up inside that category.

    Returns:
        The stored list of items, or an empty list when either the category
        or the value is unknown.
    """
    if query_type not in index:
        return []
    return index[query_type].get(query_value, [])

def optimize_layout(metadata, top_k=5, ad_interval=3):
    """Arrange news items on pages, longest first, with periodic ad slots.

    Args:
        metadata: Iterable of dicts with at least ``"content"`` and
            ``"length"`` keys.
        top_k: Number of news items per page.
        ad_interval: An advertisement placeholder is inserted after every
            ``ad_interval``-th news item.

    Returns:
        A list of layout entries. News entries have ``"type": "news"``, the
        item content, a 1-based ``"page"`` and a 0-based ``"position"`` on
        that page. Advertisement entries have ``"type": "advertisement"``,
        ``"position": "bottom"`` and the page of the news item they follow.

    Raises:
        ValueError: If ``top_k`` or ``ad_interval`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if ad_interval < 1:
        raise ValueError("ad_interval must be at least 1")

    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)
    layout = []
    for i, item in enumerate(sorted_news):
        page = i // top_k + 1
        layout.append({
            "type": "news",
            "content": item["content"],
            "page": page,
            "position": i % top_k
        })
        if (i + 1) % ad_interval == 0:
            # The ad belongs to the page of the item it follows. Deriving the
            # page from (i + 1) pushed it onto the next page whenever the item
            # was the last one on its page.
            layout.append({
                "type": "advertisement",
                "content": "Ad Placeholder",
                "page": page,
                "position": "bottom"
            })
    return layout

def main(pdf_path="RHOAI _ Prodesp - Diário Oficial.pdf"):
    """Run the entity-indexing demo pipeline on one PDF and print the results.

    Steps: extract page text, split it into news items, embed them, build a
    FAISS index, extract entities, build the structured index, run two sample
    queries and print a page layout.

    Args:
        pdf_path: PDF to process; defaults to the sample file used by this
            notebook.
    """
    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)
    if not news_items:
        # Without items there is nothing to embed, and the index cannot be
        # built from an empty embedding array.
        print("No text could be extracted from the PDF; nothing to process.")
        return

    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    print("Adding metadata...")
    metadata = add_metadata(news_items)

    print("Extracting entities...")
    entities = extract_entities(news_items)

    print("Creating structured index...")
    structured_index = create_structured_index(entities)

    # ensure_ascii=False keeps accented Portuguese text readable instead of
    # printing \uXXXX escapes.
    print("Querying by date '13/11/2024'...")
    date_results = query_index(structured_index, "dates", "13/11/2024")
    print(json.dumps(date_results, indent=2, ensure_ascii=False))

    print("Querying by organization 'ARAUJO E REPLANDE LTDA'...")
    org_results = query_index(structured_index, "organizations", "ARAUJO E REPLANDE LTDA")
    print(json.dumps(org_results, indent=2, ensure_ascii=False))

    print("Optimizing layout...")
    layout = optimize_layout(metadata)
    print(json.dumps(layout, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    main()


In [None]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

def extract_text_from_pdf(pdf_path):
    """Open a PDF with pdfplumber and collect the text of each page.

    Args:
        pdf_path: Location of the PDF file.

    Returns:
        A list of strings, one per page in page order. Pages for which
        ``extract_text()`` gives a falsy result (``None`` or ``""``) are
        returned as ``""``.
    """
    pages = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Guard against None so callers can always call str methods.
            pages.append(page.extract_text() or "")
    return pages

def preprocess_and_split_text(pages):
    """Break page texts into trimmed news items separated by blank lines.

    Args:
        pages: Iterable of page texts; ``None`` or empty entries (pages
            without extractable text) are ignored rather than raising.

    Returns:
        A flat list of the non-empty, stripped fragments obtained by
        splitting every page on ``"\\n\\n"``, in document order.
    """
    return [
        item.strip()
        for page in pages
        if page  # skip pages with no extracted text
        for item in page.split("\n\n")
        if item.strip()
    ]

def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    """Load a SentenceTransformer model and embed the news items with it.

    Args:
        news_items: Texts passed to the model's ``encode`` method.
        model_name: SentenceTransformer model identifier.

    Returns:
        Tuple ``(embeddings, model)``; the model is returned so the same
        instance can later encode search queries.
    """
    st_model = SentenceTransformer(model_name)
    vectors = st_model.encode(news_items)
    return vectors, st_model

def create_faiss_index(embeddings):
    """Create a flat L2 FAISS index holding the given embedding vectors.

    Args:
        embeddings: Array-like shaped ``(n_items, dimension)``; coerced to a
            C-contiguous float32 array because that is the input layout FAISS
            works with.

    Returns:
        A populated ``faiss.IndexFlatL2``; vector ``i`` of the input is
        stored under label ``i``.

    Raises:
        ValueError: If the input is not two-dimensional, e.g. when there was
            nothing to embed.
    """
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n_items, dimension), got shape {vectors.shape}"
        )
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

def add_metadata(news_items):
    """Attach an id and a length to every news item.

    Args:
        news_items: Sequence of news-item strings.

    Returns:
        One dict per item: ``"id"`` is the item's 0-based position,
        ``"content"`` the string itself and ``"length"`` its ``len``.
    """
    return [
        {"id": item_id, "content": content, "length": len(content)}
        for item_id, content in enumerate(news_items)
    ]

def search_similar_news(query, model, index, metadata, top_k=5):
    """Return the news items most similar to a free-text query.

    Args:
        query: Query text.
        model: Object with an ``encode`` method (the embedding model).
        index: Object with a FAISS-style ``search(vectors, k)`` method.
        metadata: Sequence of dicts with ``"content"`` and ``"length"``,
            positioned so that entry ``i`` describes indexed vector ``i``.
        top_k: Maximum number of results to return.

    Returns:
        A list of dicts with ``"content"``, ``"distance"`` (float) and
        ``"length"``, in the order returned by the index. Fewer than
        ``top_k`` entries are returned when the index holds fewer vectors.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for dist, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS fills missing neighbours with label -1; indexing metadata
            # with it would silently return the *last* item.
            continue
        entry = metadata[idx]
        results.append({"content": entry["content"], "distance": float(dist), "length": entry["length"]})
    return results

def optimize_layout(metadata, top_k=5, ad_interval=3):
    """Lay news items out page by page, longest first, with ad placeholders.

    Args:
        metadata: Iterable of dicts providing ``"content"`` and ``"length"``.
        top_k: How many news items go on each page.
        ad_interval: Insert one advertisement placeholder after every
            ``ad_interval`` news items.

    Returns:
        A list of layout dicts. News entries carry ``"type": "news"``, the
        content, a 1-based ``"page"`` and a 0-based ``"position"``;
        advertisement entries carry ``"type": "advertisement"``,
        ``"position": "bottom"`` and the page of the preceding news item.

    Raises:
        ValueError: If ``top_k`` or ``ad_interval`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if ad_interval < 1:
        raise ValueError("ad_interval must be at least 1")

    # Sort news items by length (longer news first)
    sorted_news = sorted(metadata, key=lambda x: x["length"], reverse=True)

    layout = []
    for i, item in enumerate(sorted_news):
        page = i // top_k + 1
        layout.append({"type": "news", "content": item["content"], "page": page, "position": i % top_k})
        if (i + 1) % ad_interval == 0:
            # Keep the ad on the page of the item it follows; computing the
            # page from (i + 1) moved it to the next page at page boundaries.
            layout.append({"type": "advertisement", "content": "Ad Placeholder", "page": page, "position": "bottom"})

    return layout

def main(pdf_path="RHOAI _ Prodesp - Diário Oficial.pdf", query="example query about a topic"):
    """Run the similarity-search demo pipeline on one PDF and print results.

    Steps: extract page text, split it into news items, embed them, build a
    FAISS index, search it with ``query`` and print a page layout.

    Args:
        pdf_path: PDF to process; defaults to the notebook's sample file.
        query: Free-text query used for the similarity search.
    """
    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)
    if not news_items:
        # Nothing to embed or index; building the index would fail.
        print("No text could be extracted from the PDF; nothing to process.")
        return

    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    print("Adding metadata...")
    metadata = add_metadata(news_items)

    print("Searching for similar news...")
    results = search_similar_news(query, model, index, metadata, top_k=5)

    # ensure_ascii=False prints accented Portuguese text as-is rather than as
    # \uXXXX escapes.
    print("Search Results:")
    print(json.dumps(results, indent=2, ensure_ascii=False))

    print("Optimizing layout...")
    layout = optimize_layout(metadata)

    print("Layout:")
    print(json.dumps(layout, indent=2, ensure_ascii=False))

if __name__ == "__main__":
    main()


In [None]:
import json

# Hand-written sample input for generate_output(): a list of content blocks
# under "pdf_data". generate_output() reads "id", "altura" (height) and
# "agrupamento" (1 -> "matéria", anything else -> "titulo"); "largura" (width)
# is carried along but not read by it.
# NOTE(review): both sample entries share the same "id" — confirm whether that
# is intentional.
input_data = {
    "pdf_data": [
        {"id": "f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc", "altura": 125, "largura": 35, "agrupamento": 1},
        {"id": "f97fb8c9-60d6-4fed-a82d-1cdf4e8be0bc", "altura": 70, "largura": 40, "agrupamento": 2}
    ]
}

# Page-layout settings, read as module-level globals by generate_output().
margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm; not read by generate_output() in this cell
min_spacing_mm = 5  # mm; vertical gap added after every placed element
page_height_mm = 420  # presumably the A3 sheet height in mm — TODO confirm
ad_sizes = [(81, 100), (250, 210)]  # (width, height) pairs, tried in this order

def generate_output(input_data):
    """Stack content blocks vertically on pages and add filler-ad slots.

    Every block is placed at ``x = margin_left`` and the current ``y``; the
    cursor then advances by the block height plus ``min_spacing_mm``. After
    each block, the first size in ``ad_sizes`` that still fits above the
    bottom margin is added as a ``"calhau-placeholder"`` entry.

    Layout settings (``margin_top``, ``margin_bottom``, ``margin_left``,
    ``min_spacing_mm``, ``page_height_mm``, ``ad_sizes``) are read from
    module-level globals.

    Args:
        input_data: Dict whose ``"pdf_data"`` list holds block dicts with the
            keys ``"id"``, ``"altura"`` (height) and ``"agrupamento"``.

    Returns:
        ``{"pdf_data": [...]}`` where each entry has ``"id"``, ``"x"``,
        ``"y"``, ``"pagina"`` (1-based page) and ``"tipo"`` (``"matéria"``
        for ``agrupamento == 1``, ``"titulo"`` otherwise, or
        ``"calhau-<width>x<height>"`` for ad slots).
    """
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in input_data["pdf_data"]:
        # Break the page *before* placing a block that would cross the bottom
        # margin. A block taller than a whole page is still placed at the top
        # of its page, since moving it again would not help.
        if current_y > margin_top and current_y + item["altura"] + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        # No usable room left once the cursor itself is past the bottom margin.
        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        # At most one ad slot per block: the first size that fits.
        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

# Build the layout for the sample input and persist it as JSON.
structured_output = generate_output(input_data)

output_path = "structured_output.json"
# Explicit UTF-8 plus ensure_ascii=False writes accented values such as
# "matéria" as-is and makes the file independent of the platform's default
# encoding.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(structured_output, f, indent=2, ensure_ascii=False)

print(f"Structured JSON output saved to {output_path}")
# Last expression of the cell: shows the result in the notebook output.
structured_output


In [None]:
import pdfplumber
import json

# Page-layout settings, read as module-level globals by generate_output().
margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm; not read by the functions in this cell
min_spacing_mm = 5  # mm; vertical gap added after every placed element
page_height_mm = 420  # presumably the A3 sheet height in mm — TODO confirm
ad_sizes = [(81, 100), (250, 210)]  # (width, height) pairs, tried in this order

def extract_pdf_data(pdf_path):
    """Extract text blocks from a PDF and attach mock layout attributes.

    Each page's text is split on blank lines (``"\\n\\n"``). Pages without
    text and blocks that are empty after stripping are skipped.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts with ``"id"`` (``"block-<page>-<index>"``, where the
        index is the block's position in the page's split result),
        ``"content"`` (stripped text), ``"altura"`` (0.1 per character of the
        raw block), ``"largura"`` (always 200) and ``"agrupamento"`` (1 for
        even block indexes, 2 for odd ones).
    """
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            # Split text into blocks for processing
            for idx, block in enumerate(text.split("\n\n")):
                content = block.strip()
                if not content:
                    # A whitespace-only fragment would otherwise become an
                    # empty layout entry with its own ad slot.
                    continue
                extracted_data.append({
                    "id": f"block-{page_number}-{idx}",
                    "content": content,
                    "altura": len(block) * 0.1,
                    "largura": 200,
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    """Stack content blocks vertically on pages and add filler-ad slots.

    Every block is placed at ``x = margin_left`` and the current ``y``; the
    cursor then advances by the block height plus ``min_spacing_mm``. After
    each block, the first size in ``ad_sizes`` that still fits above the
    bottom margin is added as a ``"calhau-placeholder"`` entry.

    Layout settings (``margin_top``, ``margin_bottom``, ``margin_left``,
    ``min_spacing_mm``, ``page_height_mm``, ``ad_sizes``) are read from
    module-level globals.

    Args:
        pdf_data: Iterable of block dicts with the keys ``"id"``,
            ``"altura"`` (height) and ``"agrupamento"``.

    Returns:
        ``{"pdf_data": [...]}`` where each entry has ``"id"``, ``"x"``,
        ``"y"``, ``"pagina"`` (1-based page) and ``"tipo"`` (``"matéria"``
        for ``agrupamento == 1``, ``"titulo"`` otherwise, or
        ``"calhau-<width>x<height>"`` for ad slots).
    """
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in pdf_data:
        # Break the page *before* placing a block that would cross the bottom
        # margin. A block taller than a whole page is still placed at the top
        # of its page, since moving it again would not help.
        if current_y > margin_top and current_y + item["altura"] + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        # No usable room left once the cursor itself is past the bottom margin.
        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        # At most one ad slot per block: the first size that fits.
        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

# Run the extraction and layout steps on the sample PDF and save the result.
pdf_path = "RHOAI _ Prodesp - Diário Oficial.pdf"

print("Extracting data from the PDF...")
pdf_data = extract_pdf_data(pdf_path)

print("Generating structured JSON output...")
structured_output = generate_output(pdf_data)

output_path = "structured_output.json"
# Explicit UTF-8 plus ensure_ascii=False writes accented values such as
# "matéria" as-is and makes the file independent of the platform's default
# encoding.
with open(output_path, "w", encoding="utf-8") as f:
    json.dump(structured_output, f, indent=2, ensure_ascii=False)

print(f"Structured JSON output saved to {output_path}")
# Last expression of the cell: shows the result in the notebook output.
structured_output


In [None]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import pdfplumber
import json
import os

# ASGI application object; the /upload-pdf/ route below is registered on it.
app = FastAPI()

# Page-layout settings, read as module-level globals by generate_output().
margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm; not read by the functions in this cell
min_spacing_mm = 5  # mm; vertical gap added after every placed element
page_height_mm = 420  # presumably the A3 sheet height in mm — TODO confirm
ad_sizes = [(81, 100), (250, 210)]  # (width, height) pairs, tried in this order

def extract_pdf_data(pdf_path):
    """Extract text blocks from a PDF and attach mock layout attributes.

    Each page's text is split on blank lines (``"\\n\\n"``). Pages without
    text and blocks that are empty after stripping are skipped.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts with ``"id"`` (``"block-<page>-<index>"``, where the
        index is the block's position in the page's split result),
        ``"content"`` (stripped text), ``"altura"`` (0.1 per character of the
        raw block), ``"largura"`` (always 200) and ``"agrupamento"`` (1 for
        even block indexes, 2 for odd ones).
    """
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            for idx, block in enumerate(text.split("\n\n")):
                content = block.strip()
                if not content:
                    # A whitespace-only fragment would otherwise become an
                    # empty layout entry with its own ad slot.
                    continue
                extracted_data.append({
                    "id": f"block-{page_number}-{idx}",
                    "content": content,
                    "altura": len(block) * 0.1,
                    "largura": 200,
                    "agrupamento": 1 if idx % 2 == 0 else 2
                })
    return extracted_data

def generate_output(pdf_data):
    """Stack content blocks vertically on pages and add filler-ad slots.

    Every block is placed at ``x = margin_left`` and the current ``y``; the
    cursor then advances by the block height plus ``min_spacing_mm``. After
    each block, the first size in ``ad_sizes`` that still fits above the
    bottom margin is added as a ``"calhau-placeholder"`` entry.

    Layout settings (``margin_top``, ``margin_bottom``, ``margin_left``,
    ``min_spacing_mm``, ``page_height_mm``, ``ad_sizes``) are read from
    module-level globals.

    Args:
        pdf_data: Iterable of block dicts with the keys ``"id"``,
            ``"altura"`` (height) and ``"agrupamento"``.

    Returns:
        ``{"pdf_data": [...]}`` where each entry has ``"id"``, ``"x"``,
        ``"y"``, ``"pagina"`` (1-based page) and ``"tipo"`` (``"matéria"``
        for ``agrupamento == 1``, ``"titulo"`` otherwise, or
        ``"calhau-<width>x<height>"`` for ad slots).
    """
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top

    for item in pdf_data:
        # Break the page *before* placing a block that would cross the bottom
        # margin. A block taller than a whole page is still placed at the top
        # of its page, since moving it again would not help.
        if current_y > margin_top and current_y + item["altura"] + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        current_y += item["altura"] + min_spacing_mm

        # No usable room left once the cursor itself is past the bottom margin.
        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        # At most one ad slot per block: the first size that fits.
        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept a PDF upload and return its generated layout as JSON.

    The upload is stored in a temporary file, processed with
    ``extract_pdf_data`` and ``generate_output``, and the result is also
    written to ``./<uploaded name without extension>_output.json``.

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        A JSON response with the structured output on success, or
        ``{"error": <message>}`` with HTTP status 500 when processing fails.
    """
    import tempfile  # local import keeps this cell self-contained

    # file.filename is chosen by the client. Keep only its last path component
    # so it can never address a location outside the working directory.
    client_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(client_name) or "upload.pdf"

    pdf_path = None
    try:
        # Store the upload under a server-chosen temporary name instead of a
        # path built from the client's filename.
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            pdf_path = tmp.name
            tmp.write(await file.read())

        pdf_data = extract_pdf_data(pdf_path)

        structured_output = generate_output(pdf_data)

        output_path = f"./{os.path.splitext(safe_name)[0]}_output.json"
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(structured_output, f, indent=2, ensure_ascii=False)

        return JSONResponse(content=structured_output)

    except Exception as e:
        # Request boundary: report the failure with an error status rather
        # than a 200 response that merely contains an "error" key.
        return JSONResponse(status_code=500, content={"error": str(e)})

    finally:
        # Always remove the temporary upload, including after a failure.
        if pdf_path and os.path.exists(pdf_path):
            os.remove(pdf_path)

# To run the API server:
# Use the command: uvicorn <filename>:app --reload


In [None]:
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import pdfplumber
import json
import os
import nest_asyncio
import uvicorn

# Initialize FastAPI app; the /upload-pdf/ route below is registered on it.
app = FastAPI()

# Constants for layout rules, read as module-level globals by generate_output().
margin_top = 17  # mm
margin_bottom = 13  # mm
margin_left = 13  # mm
margin_right = 13  # mm; not read by the functions in this cell
min_spacing_mm = 5  # mm; vertical gap added after every placed element
page_height_mm = 420  # Approx height of A3 in mm
ad_sizes = [(81, 100), (250, 210)]  # Example sizes in mm, as (width, height); tried in this order

# Extract text and preprocess data from PDF
def extract_pdf_data(pdf_path):
    """Extract text blocks from a PDF and attach mock layout attributes.

    Each page's text is split on blank lines (``"\\n\\n"``). Pages without
    text and blocks that are empty after stripping are skipped.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A list of dicts with ``"id"`` (``"block-<page>-<index>"``, where the
        index is the block's position in the page's split result),
        ``"content"`` (stripped text), ``"altura"``, ``"largura"`` and
        ``"agrupamento"``.
    """
    extracted_data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            if not text:
                continue
            # Split text into blocks for processing
            for idx, block in enumerate(text.split("\n\n")):
                content = block.strip()
                if not content:
                    # A whitespace-only fragment would otherwise become an
                    # empty layout entry with its own ad slot.
                    continue
                extracted_data.append({
                    "id": f"block-{page_number}-{idx}",
                    "content": content,
                    "altura": len(block) * 0.1,  # Mock height based on content length
                    "largura": 200,  # Arbitrary width
                    "agrupamento": 1 if idx % 2 == 0 else 2  # Alternate grouping
                })
    return extracted_data

# Generate the expected JSON output with alignment and spacing rules
def generate_output(pdf_data):
    """Stack content blocks vertically on pages and add filler-ad slots.

    Layout settings (``margin_top``, ``margin_bottom``, ``margin_left``,
    ``min_spacing_mm``, ``page_height_mm``, ``ad_sizes``) are read from
    module-level globals.

    Args:
        pdf_data: Iterable of block dicts with the keys ``"id"``,
            ``"altura"`` (height) and ``"agrupamento"``.

    Returns:
        ``{"pdf_data": [...]}`` where each entry has ``"id"``, ``"x"``,
        ``"y"``, ``"pagina"`` (1-based page) and ``"tipo"`` (``"matéria"``
        for ``agrupamento == 1``, ``"titulo"`` otherwise, or
        ``"calhau-<width>x<height>"`` for ad slots).
    """
    output_data = {"pdf_data": []}
    page = 1
    current_y = margin_top  # Start at the top margin

    for item in pdf_data:
        # Break the page *before* placing a block that would cross the bottom
        # margin. A block taller than a whole page is still placed at the top
        # of its page, since moving it again would not help.
        if current_y > margin_top and current_y + item["altura"] + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        # Add an entry for the main content
        content_type = "matéria" if item["agrupamento"] == 1 else "titulo"
        output_data["pdf_data"].append({
            "id": item["id"],
            "x": margin_left,  # Always align to left margin
            "y": current_y,
            "pagina": page,
            "tipo": content_type
        })
        # Update Y position considering the height and spacing
        current_y += item["altura"] + min_spacing_mm

        # No usable room left once the cursor itself is past the bottom margin
        if current_y + margin_bottom > page_height_mm:
            page += 1
            current_y = margin_top

        # Add calhau (ad placeholder) if space permits: the first size that
        # fits is used, at most one per block
        for ad_width, ad_height in ad_sizes:
            if current_y + ad_height + margin_bottom <= page_height_mm:
                output_data["pdf_data"].append({
                    "id": "calhau-placeholder",
                    "x": margin_left,
                    "y": current_y,
                    "pagina": page,
                    "tipo": f"calhau-{ad_width}x{ad_height}"
                })
                current_y += ad_height + min_spacing_mm
                break

    return output_data

@app.post("/upload-pdf/")
async def upload_pdf(file: UploadFile = File(...)):
    """Accept a PDF upload and return its generated layout as JSON.

    The upload is stored in a temporary file, processed with
    ``extract_pdf_data`` and ``generate_output``, and the result is also
    written to ``./<uploaded name without extension>_output.json``.

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        A JSON response with the structured output on success, or
        ``{"error": <message>}`` with HTTP status 500 when processing fails.
    """
    import tempfile  # local import keeps this cell self-contained

    # file.filename is chosen by the client. Keep only its last path component
    # so it can never address a location outside the working directory.
    client_name = (file.filename or "").replace("\\", "/")
    safe_name = os.path.basename(client_name) or "upload.pdf"

    pdf_path = None
    try:
        # Save the uploaded file under a server-chosen temporary name
        with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
            pdf_path = tmp.name
            tmp.write(await file.read())

        # Extract data from the PDF
        pdf_data = extract_pdf_data(pdf_path)

        # Generate the structured output
        structured_output = generate_output(pdf_data)

        # Save the structured JSON output
        output_path = f"./{os.path.splitext(safe_name)[0]}_output.json"
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(structured_output, f, indent=2, ensure_ascii=False)

        # Return the structured output as response
        return JSONResponse(content=structured_output)

    except Exception as e:
        # Request boundary: report the failure with an error status rather
        # than a 200 response that merely contains an "error" key.
        return JSONResponse(status_code=500, content={"error": str(e)})

    finally:
        # Clean up: remove the temporary upload, including after a failure
        if pdf_path and os.path.exists(pdf_path):
            os.remove(pdf_path)

# Run the server in Jupyter Notebook
# (nest_asyncio is already imported at the top of this cell; this second
# import is redundant but harmless.)
import nest_asyncio
nest_asyncio.apply()  # Allow nested event loops in Jupyter

# Start the FastAPI server on the local interface, port 8000.
# NOTE(review): uvicorn.run() presumably blocks until the server stops, so a
# later cell in the same kernel (such as the requests client below) would not
# run while it is serving — confirm the intended workflow.
print("Starting FastAPI server...")
uvicorn.run(app, host="127.0.0.1", port=8000)

In [None]:
import requests

# Send the sample PDF to the locally running /upload-pdf/ endpoint.
url = "http://127.0.0.1:8000/upload-pdf/"
# The context manager closes the file handle once the request has been sent;
# a bare open() inside the dict literal leaked it.
with open("RHOAI _ Prodesp - Diário Oficial.pdf", "rb") as pdf_file:
    files = {"file": pdf_file}
    response = requests.post(url, files=files)

print(response.json())