<a href="https://colab.research.google.com/github/Chediak/common-master-ai/blob/main/redhat_prodesan.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install pdfplumber sentence-transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m57.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.9.0


In [5]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

# Step 1: Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Read a PDF with pdfplumber and return the text of every page.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``pdfplumber.open``.

    Returns:
        list[str]: One entry per page, in page order. A page for which
        ``extract_text()`` yields a falsy value is represented by ``""`` so
        callers can apply string methods to every entry.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may return None for a page without extractable text
        # (e.g. a scanned image); normalise to "" so that a later
        # ``page.split(...)`` cannot raise AttributeError.
        return [page.extract_text() or "" for page in pdf.pages]

# Step 2: Preprocess and split into individual news items
def preprocess_and_split_text(pages):
    """Split page texts into individual, whitespace-trimmed news items.

    Every page is cut at blank-line separators (two consecutive newline
    characters); fragments that are empty after stripping are discarded.

    Args:
        pages: Iterable of page texts. Entries that are ``None`` or empty
            (pages without extractable text) are skipped instead of raising.

    Returns:
        list[str]: The non-empty fragments, in document order.
    """
    news_items = []
    for page in pages:
        if not page:
            # None / "" -> nothing to split on this page.
            continue
        for fragment in page.split("\n\n"):
            fragment = fragment.strip()
            if fragment:
                news_items.append(fragment)
    return news_items

# Step 3: Generate embeddings using SentenceTransformer
def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    """Encode the news items with a SentenceTransformer model.

    Args:
        news_items: The texts to encode; passed as-is to ``encode``.
        model_name: Name given to the ``SentenceTransformer`` constructor.

    Returns:
        tuple: ``(embeddings, model)`` - the result of ``model.encode`` and
        the model instance itself, so callers can reuse it for queries.
    """
    encoder = SentenceTransformer(model_name)
    return encoder.encode(news_items), encoder

# Step 4: Store embeddings in a FAISS vector store
def create_faiss_index(embeddings):
    """Build a flat L2 FAISS index (``faiss.IndexFlatL2``) over the embeddings.

    Args:
        embeddings: 2-D array-like with one row per vector.

    Returns:
        The ``faiss.IndexFlatL2`` instance with every row added.

    Raises:
        ValueError: If ``embeddings`` is not two-dimensional (for example an
            empty result), since the vector dimension cannot be determined.
    """
    # FAISS works on float32 matrices; normalise dtype and memory layout up
    # front so lists or float64 arrays are accepted as well.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
        )
    index = faiss.IndexFlatL2(vectors.shape[1])
    index.add(vectors)
    return index

# Step 5: Add metadata for later retrieval
def add_metadata(news_items):
    """Attach a positional id to every news item.

    Args:
        news_items: Iterable of news texts.

    Returns:
        list[dict]: One ``{"id": <position>, "content": <text>}`` record per
        item; ``id`` is the zero-based position in ``news_items``.
    """
    return [{"id": i, "content": text} for i, text in enumerate(news_items)]

# Step 6: Query the vector store
def search_similar_news(query, model, index, metadata, top_k=5):
    """Return the news items closest to ``query`` in the vector index.

    Args:
        query: Free-text query; encoded with ``model.encode([query])``.
        model: Object exposing ``encode`` (the model used to build the index).
        index: Object exposing ``search(embedding, k)`` that returns
            ``(distances, indices)`` with one row per query.
        metadata: Sequence addressed by the ids in ``indices``; every entry
            must provide a ``"content"`` key.
        top_k: Number of neighbours requested from the index.

    Returns:
        list[dict]: ``{"content", "distance"}`` records in the order returned
        by the index. May hold fewer than ``top_k`` entries.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS pads the id row with -1 when fewer than top_k vectors are
            # available; as a Python index, -1 would wrongly select the LAST
            # metadata entry, so such slots are skipped.
            continue
        results.append({"content": metadata[idx]["content"], "distance": float(dist)})
    return results

# Full Pipeline Test
def main(pdf_path="/content/DOM-1947.pdf", query="example query about a topic", top_k=5):
    """Run the full pipeline: PDF -> news items -> embeddings -> index -> search.

    Progress messages and the search results (as JSON) are printed to stdout.

    Args:
        pdf_path: PDF file to process. Defaults to the sample path used in
            this notebook; replace with your own file.
        query: Text to search for among the extracted news items.
        top_k: Number of results requested from the search.

    Returns:
        None.
    """
    # Extract text
    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    # Preprocess and split
    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)
    if not news_items:
        # Without any text there is nothing to embed or index; stop here
        # rather than failing later while building the index.
        print("No text could be extracted from the PDF; nothing to index.")
        return

    # Generate embeddings
    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    # Create FAISS index
    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    # Add metadata
    print("Adding metadata...")
    metadata = add_metadata(news_items)

    # Test a query
    print("Searching for similar news...")
    results = search_similar_news(query, model, index, metadata, top_k=top_k)

    # Print results; ensure_ascii=False keeps accented characters readable
    # instead of emitting \uXXXX escapes.
    print("Results:")
    print(json.dumps(results, indent=2, ensure_ascii=False))

# Entry point: run the pipeline only when this code is executed as the main
# program (which includes running the notebook cell), not when it is imported.
if __name__ == "__main__":
    main()

Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Creating FAISS index...
Adding metadata...
Searching for similar news...
Results:
[
  {
    "content": "Quarta-feira, 13 de novembro de 2024 03 Ano X \u2022 N\u00ba 1.947 \u2022 Prefeitura Municipal de Guara\u00ed/TO\nSECRETARIA MUNICIPAL DE SA\u00daDE Instala\u00e7\u00e3o, desinstala\u00e7\u00e3o, manuten\u00e7\u00e3o,\nlimpeza, reparo em geladeira,\n100 60,00 6.000,00\nHORA bebedouro, refrigerador, frigobar,\nfreezer, filtro e purificador de \u00e1gua\nEXTRATO DO PRIMEIRO TERMO ADITIVO 01 Valor global estimado para aquisi\u00e7\u00e3o\nde pe\u00e7as e/ou componentes originais\nem geladeira, bebedouro, refrigerador, 6.000,00\nCONTRATO N.\u00ba 073/2023\nfrigobar, freezer, filtro e purificador\nProcesso: 3243/2023 de \u00e1gua\nPreg\u00e3o Eletr\u00f4nico: 028/2023\nDESCONTO NO VALOR DAS PE\u00c7AS - 40 %\n\u00d3rg\u00e3o: Fundo Municipal da Sa\u00fade de Guara\u00ed - TO.\nContratada: ARAUJO E REPLANDE LTDA-ME, inscrita no CNPJ/MF VALOR ESTIMADO 10.000,00\nsob o n.\u00ba 09.026.012/00

In [6]:
import pdfplumber
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import json

# Step 1: Extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """Open a PDF with pdfplumber and collect the text of each page.

    Args:
        pdf_path: Path of the PDF; forwarded unchanged to ``pdfplumber.open``.

    Returns:
        list[str]: Page texts in page order. Whenever ``extract_text()``
        yields a falsy value the page is stored as ``""``, so every entry is
        guaranteed to be a string.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # A page without extractable text can make extract_text() return
        # None; substitute "" so later string operations do not crash.
        return [page.extract_text() or "" for page in pdf.pages]

# Step 2: Preprocess and split into individual news items
def preprocess_and_split_text(pages):
    """Break page texts into stripped news items, one per blank-line block.

    Pages are split wherever two newline characters follow each other;
    fragments consisting only of whitespace are dropped.

    Args:
        pages: Iterable of page texts. ``None`` and empty pages are ignored
            rather than causing an ``AttributeError``.

    Returns:
        list[str]: Non-empty fragments in document order.
    """
    news_items = []
    for page in pages:
        if not page:
            # Page had no extractable text.
            continue
        stripped = (fragment.strip() for fragment in page.split("\n\n"))
        news_items.extend(fragment for fragment in stripped if fragment)
    return news_items

# Step 3: Generate embeddings using SentenceTransformer
def generate_embeddings(news_items, model_name='all-MiniLM-L6-v2'):
    """Instantiate a SentenceTransformer and encode every news item with it.

    Args:
        news_items: Texts handed directly to the model's ``encode`` method.
        model_name: Identifier passed to ``SentenceTransformer``.

    Returns:
        tuple: ``(embeddings, model)`` where ``embeddings`` is whatever
        ``encode`` returned and ``model`` is the loaded encoder.
    """
    sentence_model = SentenceTransformer(model_name)
    vectors = sentence_model.encode(news_items)
    result = (vectors, sentence_model)
    return result

# Step 4: Store embeddings in a FAISS vector store
def create_faiss_index(embeddings):
    """Create a ``faiss.IndexFlatL2`` index and add all embeddings to it.

    Args:
        embeddings: 2-D array-like, one vector per row.

    Returns:
        The populated ``faiss.IndexFlatL2`` index.

    Raises:
        ValueError: If ``embeddings`` does not have exactly two dimensions,
            because the vector size could not be read from it.
    """
    # Convert to the contiguous float32 layout FAISS operates on; this also
    # lets plain lists and float64 arrays be indexed.
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"embeddings must be a 2-D array of shape (n, dim), got shape {vectors.shape}"
        )
    dimension = vectors.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(vectors)
    return index

# Step 5: Add metadata for later retrieval
def add_metadata(news_items):
    """Build one metadata record per news item.

    Args:
        news_items: Iterable of news texts.

    Returns:
        list[dict]: Records of the form
        ``{"id": <position>, "content": <text>, "length": len(<text>)}``,
        where ``id`` is the zero-based position in ``news_items``.
    """
    return [
        {"id": i, "content": text, "length": len(text)}
        for i, text in enumerate(news_items)
    ]

# Step 6: Query the vector store
def search_similar_news(query, model, index, metadata, top_k=5):
    """Look up the news items nearest to ``query`` in the vector index.

    Args:
        query: Free-text query; encoded via ``model.encode([query])``.
        model: Object exposing ``encode`` (the model used to build the index).
        index: Object exposing ``search(embedding, k)`` that returns
            ``(distances, indices)`` with one row per query.
        metadata: Sequence addressed by the ids in ``indices``; entries must
            provide ``"content"`` and ``"length"`` keys.
        top_k: Number of neighbours requested from the index.

    Returns:
        list[dict]: ``{"content", "distance", "length"}`` records in the
        order returned by the index; possibly fewer than ``top_k``.
    """
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for dist, idx in zip(distances[0], indices[0]):
        idx = int(idx)
        if idx < 0:
            # FAISS reports -1 for neighbour slots it could not fill (index
            # smaller than top_k). Indexing metadata with -1 would return
            # the last item as a bogus hit, so those slots are dropped.
            continue
        entry = metadata[idx]
        results.append(
            {"content": entry["content"], "distance": float(dist), "length": entry["length"]}
        )
    return results

# Step 7: Reorganize the news items to optimize page space
def optimize_layout(metadata, top_k=5, ad_every=3):
    """Lay out news items page by page, longest first, with ad placeholders.

    Items are sorted by their ``"length"`` in descending order and assigned
    to pages of ``top_k`` items each. After every ``ad_every`` news items an
    advertisement placeholder is appended for the page of the item it
    follows.

    Args:
        metadata: Iterable of dicts providing ``"content"`` and ``"length"``.
        top_k: Number of news items per page; must be at least 1.
        ad_every: Number of news items between advertisements; must be at
            least 1. The default of 3 matches the previous fixed interval.

    Returns:
        list[dict]: Layout entries in placement order. News entries carry
        ``type="news"``, ``content``, a 1-based ``page`` and a 0-based
        ``position``; ad entries carry ``type="advertisement"``,
        ``content="Ad Placeholder"``, ``page`` and ``position="bottom"``.

    Raises:
        ValueError: If ``top_k`` or ``ad_every`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")
    if ad_every < 1:
        raise ValueError("ad_every must be at least 1")

    # Sort news items by length (longer news first)
    sorted_news = sorted(metadata, key=lambda entry: entry["length"], reverse=True)

    layout = []
    for i, item in enumerate(sorted_news):
        page = i // top_k + 1
        layout.append({"type": "news", "content": item["content"], "page": page, "position": i % top_k})
        if (i + 1) % ad_every == 0:
            # Keep the ad on the page of the item it follows. Deriving the
            # page from i + 1 pushed it onto the next page (possibly one
            # without any news) whenever i + 1 was a multiple of top_k.
            layout.append({"type": "advertisement", "content": "Ad Placeholder", "page": page, "position": "bottom"})

    return layout

# Full Pipeline Test
def main(pdf_path="/content/DOM-1947.pdf", query="example query about a topic", top_k=5):
    """Run the pipeline end to end, then print search results and a layout.

    Steps: extract the PDF text, split it into news items, embed them, index
    the embeddings, search for ``query`` and finally compute the page layout
    with ``optimize_layout``. Progress and results are printed to stdout.

    Args:
        pdf_path: PDF file to process. Defaults to the sample path used in
            this notebook; replace with your own file.
        query: Text to search for among the extracted news items.
        top_k: Number of results requested from the search.

    Returns:
        None.
    """
    # Extract text
    print("Extracting text from PDF...")
    pages = extract_text_from_pdf(pdf_path)

    # Preprocess and split
    print("Preprocessing and splitting text...")
    news_items = preprocess_and_split_text(pages)
    if not news_items:
        # Without any text there is nothing to embed, index or lay out; stop
        # here rather than failing later while building the index.
        print("No text could be extracted from the PDF; nothing to index.")
        return

    # Generate embeddings
    print("Generating embeddings...")
    embeddings, model = generate_embeddings(news_items)

    # Create FAISS index
    print("Creating FAISS index...")
    index = create_faiss_index(np.array(embeddings))

    # Add metadata
    print("Adding metadata...")
    metadata = add_metadata(news_items)

    # Test a query
    print("Searching for similar news...")
    results = search_similar_news(query, model, index, metadata, top_k=top_k)

    # Print search results; ensure_ascii=False keeps accented characters
    # readable instead of emitting \uXXXX escapes.
    print("Search Results:")
    print(json.dumps(results, indent=2, ensure_ascii=False))

    # Optimize layout
    print("Optimizing layout...")
    layout = optimize_layout(metadata)

    # Print optimized layout
    print("Layout:")
    print(json.dumps(layout, indent=2, ensure_ascii=False))

# Entry point: run the pipeline only when this code is executed as the main
# program (which includes running the notebook cell), not when it is imported.
if __name__ == "__main__":
    main()


Extracting text from PDF...
Preprocessing and splitting text...
Generating embeddings...
Creating FAISS index...
Adding metadata...
Searching for similar news...
Search Results:
[
  {
    "content": "Quarta-feira, 13 de novembro de 2024 03 Ano X \u2022 N\u00ba 1.947 \u2022 Prefeitura Municipal de Guara\u00ed/TO\nSECRETARIA MUNICIPAL DE SA\u00daDE Instala\u00e7\u00e3o, desinstala\u00e7\u00e3o, manuten\u00e7\u00e3o,\nlimpeza, reparo em geladeira,\n100 60,00 6.000,00\nHORA bebedouro, refrigerador, frigobar,\nfreezer, filtro e purificador de \u00e1gua\nEXTRATO DO PRIMEIRO TERMO ADITIVO 01 Valor global estimado para aquisi\u00e7\u00e3o\nde pe\u00e7as e/ou componentes originais\nem geladeira, bebedouro, refrigerador, 6.000,00\nCONTRATO N.\u00ba 073/2023\nfrigobar, freezer, filtro e purificador\nProcesso: 3243/2023 de \u00e1gua\nPreg\u00e3o Eletr\u00f4nico: 028/2023\nDESCONTO NO VALOR DAS PE\u00c7AS - 40 %\n\u00d3rg\u00e3o: Fundo Municipal da Sa\u00fade de Guara\u00ed - TO.\nContratada: ARAUJ