#### **RAG QWEN3-8B**

In [None]:
# 0. Cargar librerías
from litellm import completion
import chromadb
from sentence_transformers import SentenceTransformer
import torch
import pdfplumber
import uuid
from typing import List, Tuple

In [None]:
# 1. Load the embedding model
model_name = "Qwen/Qwen3-Embedding-0.6B"
# Use the GPU when one is available, but fall back to CPU so the notebook
# still runs on machines without CUDA instead of failing at model load.
embedder = SentenceTransformer(
    model_name,
    device="cuda" if torch.cuda.is_available() else "cpu",
)

def embed_text(text: str):
    """Return the embedding of ``text`` as a plain Python list.

    The text is encoded with the module-level ``embedder``; the vector is
    requested as a NumPy array with normalization enabled and is then
    converted with ``tolist()``.
    """
    vector = embedder.encode(
        text,
        convert_to_numpy=True,
        normalize_embeddings=True,
    )
    return vector.tolist()

In [None]:
# 2. Initialize ChromaDB
# In-memory client (no storage path is given).
client = chromadb.Client()
# Fetch the "docs" collection, creating it if it does not exist yet.
collection = client.get_or_create_collection(name="docs")
# Persistent alternative (disabled):
# client = chromadb.PersistentClient(path="./chroma_db")  
# collection = client.create_collection("mi_coleccion")

In [None]:
# 3. Ingestar documentos

def extract_text_from_pdf(path: str) -> List[Tuple[int, str]]:
    """Read a PDF with pdfplumber and return its text page by page.

    Returns a list of ``(page_number, page_text)`` tuples. Page numbers
    start at 1, and a page for which ``extract_text()`` yields a falsy
    value is reported with an empty string.
    """
    with pdfplumber.open(path) as document:
        return [
            (number, page.extract_text() or "")
            for number, page in enumerate(document.pages, start=1)
        ]

def chunk_text(text: str, chunk_size: int = 1000, overlap: int = 200) -> List[str]:
    """Split ``text`` into word-aligned chunks with a trailing overlap.

    Words (whitespace-separated) are accumulated until their approximate
    character count (each word plus one separator) reaches ``chunk_size``;
    the chunk is then emitted. Words are never split, so a chunk may exceed
    ``chunk_size`` by up to one word.

    Args:
        text: Text to split. Runs of whitespace are collapsed to one space.
        chunk_size: Approximate size, in characters, at which a chunk is
            closed.
        overlap: Approximate number of trailing characters of a chunk that
            are repeated at the start of the next one. ``0`` (or less)
            disables overlapping. It should be smaller than ``chunk_size``;
            otherwise the carried-over words alone nearly fill a chunk.

    Returns:
        The chunks in document order; an empty list when ``text`` contains
        no words. A final partial chunk is only emitted when it holds at
        least one word that is not already in the previous chunk.
    """
    words = text.split()
    chunks: List[str] = []
    current: List[str] = []
    current_len = 0
    # True while ``current`` holds words that no emitted chunk contains yet.
    has_new_words = False
    for w in words:
        current.append(w)
        current_len += len(w) + 1
        has_new_words = True
        if current_len >= chunk_size:
            chunks.append(" ".join(current))
            has_new_words = False
            if overlap > 0:
                # Walk backwards to find the shortest suffix of the chunk
                # whose approximate length reaches ``overlap``.
                ov_len = 0
                start = len(current)
                while start > 0:
                    start -= 1
                    ov_len += len(current[start]) + 1
                    if ov_len >= overlap:
                        break
                current = current[start:]
                current_len = ov_len
            else:
                current = []
                current_len = 0
    # Skip the leftover buffer when it is only the overlap of the last
    # emitted chunk: it would be a pure duplicate of that chunk's tail.
    if current and has_new_words:
        chunks.append(" ".join(current))
    return chunks

def ingest_pdf_to_chromadb(pdf_path: str, collection, chunk_size: int = 1000, overlap: int = 200):
    """Extract, chunk, embed and store the text of a PDF in a ChromaDB collection.

    Each page is extracted with ``extract_text_from_pdf``, split with
    ``chunk_text`` and embedded chunk by chunk with ``embed_text``. Every
    chunk is stored with a random UUID4 id and the metadata
    ``{"source": pdf_path, "page": page_number}``. Pages whose text is empty
    or whitespace-only are skipped. A one-line summary is printed at the end.

    Args:
        pdf_path: Path of the PDF to ingest; also stored as ``source``.
        collection: An already-instantiated ChromaDB collection.
        chunk_size: Forwarded to ``chunk_text``.
        overlap: Forwarded to ``chunk_text``.

    Returns:
        None.
    """
    pages = extract_text_from_pdf(pdf_path)
    total = 0
    for page_number, page_text in pages:
        if not page_text.strip():
            continue
        chunks = chunk_text(page_text, chunk_size=chunk_size, overlap=overlap)
        if not chunks:
            # Nothing to store for this page; avoid calling add() with empty lists.
            continue
        # One add() per page instead of one per chunk: same stored records,
        # fewer calls into the collection.
        collection.add(
            documents=chunks,
            embeddings=[embed_text(chunk) for chunk in chunks],
            ids=[str(uuid.uuid4()) for _ in chunks],
            metadatas=[{"source": pdf_path, "page": page_number} for _ in chunks],
        )
        total += len(chunks)
    print(f"Ingestados {total} chunks desde {pdf_path} en la colección '{collection.name}'.")

# Ingest the PDF into `collection`. NOTE(review): the path is absolute and
# machine-specific; adjust it before running the notebook elsewhere.
ingest_pdf_to_chromadb("C:/Users/josue/Downloads/Propuesta Políticas de Gobierno de Datos.pdf", collection, chunk_size=1000, overlap=200)


Ingestados 14 chunks desde C:/Users/josue/Downloads/Propuesta Políticas de Gobierno de Datos.pdf en la colección 'docs'.


In [19]:
# 4. Recuperar contexto
def retrieve_context(query: str, n_results: int = 3) -> str:
    """Return the retrieved context for ``query`` as a single string.

    The query is embedded with ``embed_text`` and used to query the
    module-level ``collection``; the documents returned for that query are
    joined with single spaces.

    Args:
        query: Question or search text.
        n_results: Number of documents requested from the collection.

    Returns:
        The retrieved document texts joined by ``" "``.
    """
    query_emb = embed_text(query)
    results = collection.query(query_embeddings=[query_emb], n_results=n_results)
    # A single query embedding was sent, so its documents are at index 0.
    return " ".join(results['documents'][0])

In [20]:
# 5. RAG con Ollama
def rag_query(query: str) -> str:
    """Answer ``query`` using context retrieved for it.

    The context from ``retrieve_context`` is embedded in a prompt together
    with the question and sent as a single user message through
    ``completion`` to the ``ollama/Qwen3:8B`` model at
    ``http://localhost:11434``. The content of the first returned choice is
    returned unchanged.
    """
    context = retrieve_context(query)
    prompt = f"""
    Usa el siguiente contexto para responder la pregunta:

    Contexto: {context}

    Pregunta: {query}
    """
    messages = [{"role": "user", "content": prompt}]
    response = completion(
        model="ollama/Qwen3:8B",
        messages=messages,
        api_base="http://localhost:11434",
        temperature=0.1,
    )
    first_choice = response["choices"][0]
    return first_choice["message"]["content"]

In [None]:
# 6. Try out the RAG pipeline
# Ask one question through rag_query and print the question and the answer.
pregunta = "¿Cuáles son los principios indicados en la política?"
respuesta = rag_query(pregunta)
print("Pregunta:", pregunta)
print("Respuesta:", respuesta)

Pregunta: ¿Cuáles son los principios indicados en la política?
Respuesta: <think>
Okay, the user is asking about the principles mentioned in the data governance policy provided. Let me start by reading through the context carefully. The context is in Spanish, so I need to make sure I understand each point correctly.

First, the context lists several principles under the "Principios" section. Let me go through each one by one. The first one is "Dato como activo estratégico," which translates to "Data as a strategic asset." That makes sense as a principle, emphasizing data as a valuable asset managed like other corporate assets.

Next is "Rendición de cuentas," which is "Accountability." The context mentions that responsibilities are clearly defined with Owners and Custodians for critical data assets. That's a key point for accountability.

Then there's "Cumplimiento y Ética," which is "Compliance and Ethics." The policy states that data use adheres to laws, regulations, and ethical stan

Pregunta: ¿Cuáles son los principios indicados en la política?
Respuesta: <think>
Okay, the user is asking about the principles mentioned in the data governance policy provided. Let me start by reading through the context carefully. The context is in Spanish, so I need to make sure I understand each point correctly.

First, the context lists several principles under the "Principios" section. Let me go through each one by one. The first one is "Dato como activo estratégico," which translates to "Data as a strategic asset." That makes sense as a principle, emphasizing data as a valuable asset managed like other corporate assets.

Next is "Rendición de cuentas," which is "Accountability." The context mentions that responsibilities are clearly defined with Owners and Custodians for critical data assets. That's a key point for accountability.

Then there's "Cumplimiento y Ética," which is "Compliance and Ethics." The policy states that data use adheres to laws, regulations, and ethical standards, prioritizing trust. That's important for legal and ethical compliance.

"Calidad por diseño" translates to "Quality by Design." The context says data quality is integrated into processes and systems from the start. That's about ensuring quality is built into the design phase.

"Seguridad y Privacidad" is "Security and Privacy." Data is classified by sensitivity and protected with appropriate controls to ensure confidentiality, integrity, and availability. That's a standard principle in data governance.

"Transparencia" is "Transparency." The processes are transparent and understandable to stakeholders. This ensures that all parties can see how data is managed.

Lastly, "Generación de valor" is "Value Generation." The policy mentions that data governance should facilitate measurable value, using a legal framework and best practices. This ties governance to business value.

Wait, the user might be looking for a list of the principles as per the context. Let me check if I missed any. The context lists six principles under the "Principios" section. The last part mentions a norma de carácter legal and compliance, but that seems to be part of the framework, not a principle. The user specifically asked for the principles indicated, so the six points I listed are the correct ones. I should present them in a clear, numbered list, translating each into English as the user might need the answer in Spanish or English. Since the question is in Spanish, maybe the answer should be in Spanish. Let me confirm the user's language. The question is in Spanish, so the answer should be in Spanish. However, the user might prefer the answer in Spanish. Let me check the original query again. The user wrote the question in Spanish, so the answer should be in Spanish. But the user might want the principles translated. Wait, the user's instruction says "Usa el siguiente contexto para responder la pregunta" and the question is in Spanish. The answer should be in Spanish. However, the user might have a preference. Let me check the initial query again. The user's question is in Spanish, so the answer should be in Spanish. Therefore, I need to list the principles in Spanish, translating them accurately. Let me make sure each principle is correctly translated and listed. The six principles are: Dato como activo estratégico, Rendición de cuentas, Cumplimiento y Ética, Calidad por diseño, Seguridad y Privacidad, Transparencia, and Generación de valor. Wait, the context mentions "Generación de valor" as the last principle. Let me count again. The context lists six principles under "Principios" and then mentions a norma de carácter legal and compliance as part of the framework. So the six principles are the ones listed. The user might have a typo, but according to the context, the six principles are the ones I listed. 
I should present them in Spanish, each with a brief explanation if needed, but the user just asks for the principles, so a list of the six points in Spanish. Let me structure the answer accordingly.
</think>

Los principios indicados en la política son los siguientes:  

1. **Dato como activo estratégico**: Los datos son considerados un activo valioso y se gestionan con la misma disciplina que otros activos corporativos.  
2. **Rendición de cuentas**: Las responsabilidades sobre los datos están claramente definidas, con Dueños y Custodios asignados para los activos de datos críticos.  
3. **Cumplimiento y Ética**: El uso de los datos se alinea con leyes, regulaciones y estándares éticos, priorizando la confianza del afiliado.  
4. **Calidad por diseño**: La calidad de los datos se integra en el diseño de procesos y sistemas desde su origen.  
5. **Seguridad y Privacidad**: Los datos se clasifican según su sensibilidad y se protegen con controles adecuados para garantizar su confidencialidad, integridad y disponibilidad.  
6. **Transparencia**: Los procesos de gobierno de datos son transparentes y comprensibles para las partes interesadas.  
7. **Generación de valor**: El gobierno de datos debe facilitar la obtención de valor medible, basado en un marco de referencia de mejores prácticas y cumplimiento legal.  

Estos principios forman la base para la gestión y gobernanza efectiva de los datos en la organización.