In [44]:
from dotenv import load_dotenv
import os

# Load variables from a .env file (when present) before reading them.
load_dotenv()

# Azure AI Search (vector store) settings.
azure_search_endpoint = os.getenv('AZURE_SEARCH_ENDPOINT')
azure_search_key = os.getenv('AZURE_SEARCH_KEY')
# Azure OpenAI (chat model) settings.
azure_openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
openai_api_key = os.getenv('AZURE_OPENAI_API_KEY')
openai_deployment = os.getenv('AZURE_OPENAI_DEPLOYMENT')
openai_api_version = os.getenv('OPENAI_API_VERSION')
azure_search_index = os.getenv('AZURE_SEARCH_INDEX')

# os.getenv returns None for unset variables; report them here so a missing
# setting is visible immediately instead of failing later inside a client call.
# This only warns (no exception) so cells that do not need every setting still run.
_missing_env = [
    name
    for name, value in (
        ('AZURE_SEARCH_ENDPOINT', azure_search_endpoint),
        ('AZURE_SEARCH_KEY', azure_search_key),
        ('AZURE_OPENAI_ENDPOINT', azure_openai_endpoint),
        ('AZURE_OPENAI_API_KEY', openai_api_key),
        ('AZURE_OPENAI_DEPLOYMENT', openai_deployment),
        ('OPENAI_API_VERSION', openai_api_version),
        ('AZURE_SEARCH_INDEX', azure_search_index),
    )
    if not value
]
if _missing_env:
    print(f"Warning: environment variables not set: {', '.join(_missing_env)}")

In [7]:
# import spacy
# Carga modelo NLP para analizar coherencia
# nlp = spacy.load("xx_ent_wiki_sm")

# Read the whole source text into memory as one string.
with open('anatomy.txt', encoding='utf-8') as source_handle:
    file = source_handle.read()

In [8]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema import Document

# Character splitter configured with chunk_size=600, no overlap, length measured by len().
text_splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=0, length_function=len)

chunks = text_splitter.split_text(file)

# Wrap every chunk in a Document
documentos = list(map(lambda fragment: Document(page_content=fragment), chunks))

In [None]:
from langchain_community.vectorstores import AzureSearch
from langchain_ollama import OllamaEmbeddings


# Create the vector index in Azure AI Search and upload the embedded documents.
# index_name is passed explicitly so the index written here is the one named by
# AZURE_SEARCH_INDEX (the index the notebook reopens with AzureSearch(index_name=...)).
# When the variable is unset, fall back to "langchain-index", the library default
# that was used implicitly before.
chunk_store = AzureSearch.from_documents(
    documents=documentos,
    embedding=OllamaEmbeddings(model='nomic-embed-text:latest'),
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=azure_search_index or "langchain-index",
)

chunks_vector = chunk_store.as_retriever()


In [None]:
# Connect to the existing index without re-uploading documents.
# The instance must be bound to `chunk_store`: it was previously assigned to
# `chunks_vector` and immediately overwritten, so this cell only worked when
# `chunk_store` happened to be left over from another cell.
chunk_store = AzureSearch(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=azure_search_index,
    embedding_function=OllamaEmbeddings(model='nomic-embed-text:latest'),
)

# Retriever that returns only the single best match.
chunks_vector = chunk_store.as_retriever(k=1)

In [60]:
# NOTE(review): the commented-out lines below reference `a`, which is not defined
# anywhere in the visible notebook — presumably a vector store from a deleted cell.
# a_vector = a.as_retriever(k=1)
# docs = a_vector.invoke('qué contiene el libro proyecto gutenberg?')
# for doc in docs:
#           print(doc.page_content)
# Sample query passed to buscar_documentos further down in this cell.
b = 'qué contiene el libro proyecto gutenberg?'
def buscar_documentos(query, store=None, k=1):
    """Return the text of the documents retrieved for ``query``.

    Args:
        query: Natural-language search string.
        store: Object exposing ``as_retriever(k=...)``. Defaults to the
            notebook-level ``chunk_store``. The previous implementation read a
            global named ``a`` that is never defined in the notebook, so the
            function raised NameError on a fresh kernel.
        k: Number of documents requested from the retriever (default 1, as before).

    Returns:
        list[str]: ``page_content`` of each retrieved document, in retrieval order.
    """
    if store is None:
        store = chunk_store
    retriever = store.as_retriever(k=k)
    return [doc.page_content for doc in retriever.invoke(query)]

# Run the sample query and print each returned text as a numbered list starting at 1.
info = buscar_documentos(b)
for i, doc in enumerate(info, start=1):
    print(f"{i}. {doc}")

1. Section 2. Information about the Mission of Project Gutenberg™

Project Gutenberg™ is synonymous with the free distribution of
electronic works in formats readable by the widest variety of
computers including obsolete, old, middle-aged and new computers. It
exists because of the efforts of hundreds of volunteers and donations
from people in all walks of life.


In [34]:
# Build a retriever from the vector store, passing k=1.
chunks_vector = chunk_store.as_retriever(k=1)
# `question` and `docs` are reused by the generation and grading cells below.
question = "qué contiene el libro proyecto gutenberg?"
docs = chunks_vector.invoke(question)

In [35]:
# Print the raw text of every retrieved document.
for doc in docs:
          print(doc.page_content)

Section 2. Information about the Mission of Project Gutenberg™

Project Gutenberg™ is synonymous with the free distribution of
electronic works in formats readable by the widest variety of
computers including obsolete, old, middle-aged and new computers. It
exists because of the efforts of hundreds of volunteers and donations
from people in all walks of life.


Generate

In [36]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import AzureChatOpenAI

# Fetch the "rlm/rag-prompt" prompt template through langchain.hub.
prompt = hub.pull("rlm/rag-prompt")

# Azure OpenAI chat model, configured from the settings read from the environment.
llm = AzureChatOpenAI(
    azure_deployment=openai_deployment,
    azure_endpoint=azure_openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

def format_docs(docs):
    """Concatenate the ``page_content`` of every document, separated by a blank line."""
    contents = [item.page_content for item in docs]
    separator = "\n\n"
    return separator.join(contents)

# Chain: prompt -> chat model -> string output parser.
rag_chain = prompt | llm | StrOutputParser()

# Answer `question` using the retrieved documents (joined by format_docs) as context.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)



El libro Proyecto Gutenberg contiene obras electrónicas de distribución gratuita en formatos compatibles con una amplia variedad de computadoras.


Hallucination Grader

In [37]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate
from typing import Literal

# Data model
class GradeHallucinations(BaseModel):
    """Binary score for hallucination present in generation answer."""

    # NOTE(review): the docstring above and the description below are presumably
    # forwarded to the model as part of the structured-output schema, so they are
    # treated as runtime text — confirm before rewording.
    # The field is a bool and the grader prompt asks for True/False, so the
    # description says the same instead of the stale "'yes' or 'no'".
    binary_score: bool = Field(
        description="Answer is grounded in the facts, True or False"
    )


# LLM with function call
llm = AzureChatOpenAI(
    azure_deployment=openai_deployment,
    azure_endpoint=azure_openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)


# Constrain the model's answer to the GradeHallucinations schema.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

# Prompt
system = """You are a grader assessing whether an LLM generation is grounded in / supported by a set of retrieved facts. \n 
     Give a binary score True or False. True means that the answer is grounded in / supported by the set of facts."""
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
)

hallucination_grader = hallucination_prompt | structured_llm_grader
# Grade against the same joined text the generator received as context, rather
# than the string form of the raw Document list.
hallucination_grader.invoke({"documents": format_docs(docs), "generation": generation})


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


GradeHallucinations(binary_score='yes')

In [None]:
import streamlit as st
import pandas as pd
import plotly.express as px
from datetime import datetime
from typing import List, Dict
from langchain_community.vectorstores import AzureSearch
from langchain_ollama import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser
from langchain import hub
import numpy as np

class RAGHallucinationMetrics:
    """Store per-query RAG evaluation results and derive aggregate metrics.

    Results are kept in ``results_db`` (a pandas DataFrame) and written to a CSV
    file after every insertion so they survive across runs.
    """

    # Columns of the results table, in the order they are written to the CSV.
    _COLUMNS = [
        'timestamp',
        'question',
        'generation',
        'is_hallucination',
        'retrieval_score',
        'context_relevance',
        'num_retrieved_docs',
        'category',
    ]
    # Score used for documents that carry no ``score`` attribute.
    _DEFAULT_DOC_SCORE = 0.5

    def __init__(self, azure_search_endpoint: str, azure_search_key: str,
                 results_path: str = 'rag_hallucination_results.csv'):
        """Initialise the metrics store.

        Args:
            azure_search_endpoint: Azure Search endpoint (stored on the instance).
            azure_search_key: Azure Search key (stored on the instance).
            results_path: CSV file used to persist results. Defaults to the
                file name that was previously hard-coded.
        """
        self.azure_search_endpoint = azure_search_endpoint
        self.azure_search_key = azure_search_key
        # Must be set before loading, since load_or_create_db reads it.
        self.results_path = results_path
        self.results_db = self.load_or_create_db()

    def load_or_create_db(self) -> pd.DataFrame:
        """Load the persisted results, or return an empty table.

        Timestamps are parsed back into datetimes so reloaded rows have the same
        type as rows appended in this session. A missing file or a zero-byte
        file both yield an empty table.
        """
        try:
            return pd.read_csv(self.results_path, parse_dates=['timestamp'])
        except (FileNotFoundError, pd.errors.EmptyDataError):
            return pd.DataFrame(columns=self._COLUMNS)

    def calculate_retrieval_score(self, docs: List) -> float:
        """Return the mean ``score`` of ``docs`` (0.0 when ``docs`` is empty).

        Documents without a ``score`` attribute count as ``_DEFAULT_DOC_SCORE``.
        """
        if not docs:
            return 0.0

        scores = [getattr(doc, 'score', self._DEFAULT_DOC_SCORE) for doc in docs]
        # Convert the numpy scalar to a plain float, matching the annotation.
        return float(np.mean(scores))

    def evaluate_context_relevance(self, question: str, docs: List) -> float:
        """Return a relevance score for the retrieved context (0.0 when empty).

        Currently a placeholder: it averages the documents' ``score`` attribute
        and does not use ``question``.
        """
        if not docs:
            return 0.0

        total = sum(getattr(doc, 'score', self._DEFAULT_DOC_SCORE) for doc in docs)
        return float(total / len(docs))

    def add_result(self, question: str, generation: str, docs: List,
                   is_hallucination: bool, category: str = None):
        """Append one evaluated query to the table and rewrite the CSV.

        Args:
            question: User question.
            generation: Generated answer.
            docs: Retrieved documents used as context.
            is_hallucination: Whether the answer was judged ungrounded.
            category: Optional label used for the per-category breakdown.
        """
        new_row = pd.DataFrame([{
            'timestamp': datetime.now(),
            'question': question,
            'generation': generation,
            'is_hallucination': bool(is_hallucination),
            'retrieval_score': self.calculate_retrieval_score(docs),
            'context_relevance': self.evaluate_context_relevance(question, docs),
            'num_retrieved_docs': len(docs),
            'category': category
        }])

        if self.results_db.empty:
            # Concatenating onto an empty frame would let its placeholder dtypes
            # leak into the result; start from the typed row instead.
            self.results_db = new_row
        else:
            self.results_db = pd.concat([self.results_db, new_row], ignore_index=True)
        self.results_db.to_csv(self.results_path, index=False)

    def calculate_metrics(self) -> Dict:
        """Return aggregate metrics, or ``{}`` when no results are stored.

        Keys: ``total_queries``, ``hallucination_rate`` (percent),
        ``avg_retrieval_score``, ``avg_context_relevance``,
        ``avg_docs_retrieved``, ``correlation_retrieval_hallucination`` and,
        when a ``category`` column exists, ``category_breakdown``.
        """
        if len(self.results_db) == 0:
            return {}

        # Work on a numeric copy of the flag so mean/corr do not depend on
        # whether the column is stored as bool or object.
        hallucinated = self.results_db['is_hallucination'].astype(float)

        metrics = {
            'total_queries': len(self.results_db),
            'hallucination_rate': (hallucinated.mean() * 100),
            'avg_retrieval_score': self.results_db['retrieval_score'].mean(),
            'avg_context_relevance': self.results_db['context_relevance'].mean(),
            'avg_docs_retrieved': self.results_db['num_retrieved_docs'].mean(),
            'correlation_retrieval_hallucination': self.results_db['retrieval_score'].corr(hallucinated)
        }

        if 'category' in self.results_db.columns:
            numeric_view = self.results_db.assign(is_hallucination=hallucinated)
            metrics['category_breakdown'] = numeric_view.groupby('category')[
                ['is_hallucination', 'retrieval_score']].mean().to_dict()

        return metrics

def create_rag_dashboard():
    """Render the Streamlit dashboard that visualises the stored RAG metrics.

    Nothing beyond the sidebar warning is rendered until both Azure Search
    settings are entered; charts and the table appear only when results exist.
    """
    st.title("Dashboard de RAG y Detección de Alucinaciones")

    # Sidebar: connection settings
    with st.sidebar:
        st.header("Configuración")
        endpoint = st.text_input("Azure Search Endpoint")
        key = st.text_input("Azure Search Key", type="password")

        if not (endpoint and key):
            st.warning("Por favor, completa la configuración de Azure Search")
            return

    tracker = RAGHallucinationMetrics(endpoint, key)

    # Headline figures
    st.header("Métricas RAG")
    rate_col, score_col, docs_col = st.columns(3)

    summary = tracker.calculate_metrics()
    if not summary:
        return

    history = tracker.results_db
    rate_col.metric("Tasa de Alucinaciones", f"{summary['hallucination_rate']:.2f}%")
    score_col.metric("Score Promedio de Recuperación", f"{summary['avg_retrieval_score']:.2f}")
    docs_col.metric("Documentos Promedio", f"{summary['avg_docs_retrieved']:.1f}")

    # Scatter plot: retrieval score against the hallucination flag
    st.subheader("Correlación entre Recuperación y Alucinaciones")
    scatter = px.scatter(
        history[['retrieval_score', 'is_hallucination']],
        x='retrieval_score',
        y='is_hallucination',
        title="Relación entre Score de Recuperación y Alucinaciones",
    )
    st.plotly_chart(scatter)

    # Line chart: scores over time
    st.subheader("Tendencia Temporal")
    timeline = px.line(
        history,
        x='timestamp',
        y=['retrieval_score', 'context_relevance'],
        title="Evolución de Scores en el Tiempo",
    )
    st.plotly_chart(timeline)

    # Table with the ten most recent queries
    st.subheader("Últimas Consultas")
    shown_columns = ['timestamp', 'question', 'is_hallucination', 'retrieval_score', 'num_retrieved_docs']
    st.dataframe(history.tail(10)[shown_columns])

def evaluate_rag_response(
    question: str,
    chunk_store: "AzureSearch",
    rag_chain,
    hallucination_grader,
    metrics: "RAGHallucinationMetrics",
    category: str = None
):
    """Answer ``question`` with the RAG chain, grade the answer and record it.

    Args:
        question: User question.
        chunk_store: Vector store; its default retriever supplies the context.
        rag_chain: Runnable invoked with ``{"context", "question"}``.
        hallucination_grader: Runnable invoked with ``{"documents", "generation"}``
            whose result exposes ``binary_score`` (truthy / affirmative = grounded).
        metrics: Store whose ``add_result`` receives the outcome.
        category: Optional label forwarded to ``metrics.add_result``.

    Returns:
        tuple: ``(generation, binary_score, docs)`` where ``binary_score`` is the
        grader's value unchanged.
    """
    # `invoke` is the retriever entry point used elsewhere in this notebook;
    # `get_relevant_documents` is deprecated in recent LangChain releases.
    docs = chunk_store.as_retriever().invoke(question)
    context = "\n\n".join(doc.page_content for doc in docs)

    generation = rag_chain.invoke({
        "context": context,
        "question": question
    })

    # Grade against the same text the generator saw.
    hallucination_result = hallucination_grader.invoke({
        "documents": context,
        "generation": generation
    })

    # The grader schema may declare binary_score as bool or as a 'yes'/'no'
    # ('sí'/'no') string. A plain `not score` treats the string 'no' as grounded,
    # so normalise string answers explicitly.
    score = hallucination_result.binary_score
    if isinstance(score, str):
        grounded = score.strip().lower() in {"yes", "y", "true", "sí", "si"}
    else:
        grounded = bool(score)

    metrics.add_result(
        question=question,
        generation=generation,
        docs=docs,
        is_hallucination=not grounded,
        category=category
    )

    return generation, hallucination_result.binary_score, docs

# Render the dashboard when this code runs as the main module.
if __name__ == "__main__":
    create_rag_dashboard()

In [67]:
from langchain_openai import AzureChatOpenAI
from langchain.schema import AIMessage, HumanMessage

def chat_with_azure_gpt(
    deployment_name,
    api_key,
    azure_endpoint,
    api_version
):
    """Run an interactive console chat against an Azure OpenAI deployment.

    Each user line is sent to the model on its own (no conversation history is
    kept). Typing 'salir' ends the loop. Errors raised while calling the model
    are printed and the loop continues.

    Args:
        deployment_name: Azure OpenAI deployment to use.
        api_key: Azure OpenAI API key.
        azure_endpoint: Azure OpenAI endpoint URL.
        api_version: Azure OpenAI API version string.
    """
    # Build the client from the arguments. The previous code ignored all four
    # parameters and silently read the notebook globals instead.
    chat = AzureChatOpenAI(
        azure_deployment=deployment_name,
        azure_endpoint=azure_endpoint,
        api_key=api_key,
        api_version=api_version,
        temperature=0,
        max_tokens=None,
        timeout=None,
        max_retries=2,
    )

    print("Chat iniciado. Escribe 'salir' para terminar.")

    while True:
        user_input = input("\nTú: ")

        # Exit keyword; surrounding whitespace is ignored.
        if user_input.strip().lower() == 'salir':
            print("¡Hasta luego!")
            break

        try:
            response = chat.invoke(user_input)
            print("\nAssistant:", response.content)
        except Exception as e:
            # Keep the session alive on API errors; show the message to the user.
            print(f"\nError: {str(e)}")

# Example usage:
if __name__ == "__main__":
    DEPLOYMENT_NAME = openai_deployment
    # Use the Azure OpenAI credentials. The Azure Search key/endpoint were passed
    # here before, which only went unnoticed because the function ignored them.
    API_KEY = openai_api_key
    AZURE_ENDPOINT = azure_openai_endpoint
    OPENAI_API_VERSION = openai_api_version

    chat_with_azure_gpt(
        deployment_name=DEPLOYMENT_NAME,
        api_key=API_KEY,
        azure_endpoint=AZURE_ENDPOINT,
        api_version=OPENAI_API_VERSION
    )

Chat iniciado. Escribe 'salir' para terminar.

Assistant: ¡Hola! ¿Cómo puedo ayudarte hoy?
¡Hasta luego!


In [25]:
# Show the prompt template object, including the input variables it expects.
print(prompt)

ChatPromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, metadata={'lc_hub_owner': 'rlm', 'lc_hub_repo': 'rag-prompt', 'lc_hub_commit_hash': '50442af133e61576e74536c6556cefe1fac147cad032f4377b60c436e6cdcb6e'}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context', 'question'], input_types={}, partial_variables={}, template="You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\nQuestion: {question} \nContext: {context} \nAnswer:"), additional_kwargs={})])

Retrieval Grader

In [None]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import AzureChatOpenAI

# Data model
class GradeDocuments(BaseModel):
    """Binary score for relevance check on retrieved documents."""

    # Declared as a free-form string; the description asks for 'yes' or 'no'.
    # NOTE(review): the docstring and description are presumably forwarded to the
    # model as part of the structured-output schema — confirm before rewording.
    binary_score: str = Field(
        description="Documents are relevant to the question, 'yes' or 'no'"
    )

# LLM with function call
llm = AzureChatOpenAI(
    azure_deployment=openai_deployment,
    azure_endpoint=azure_openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)
# Constrain the model's answer to the GradeDocuments schema.
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt
system = """You are a grader assessing relevance of a retrieved document to a user question. \n 
    It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
    If the document contains keyword(s) or semantic meaning related to the user question, grade it as relevant. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)
retrieval_grader = grade_prompt | structured_llm_grader
question = "agent memory"
# `invoke` replaces the deprecated `get_relevant_documents`.
docs = chunks_vector.invoke(question)
if docs:
    # Grade the top hit. Index 1 raised IndexError because the retriever in this
    # notebook is built with k=1 and returns a single document.
    doc_txt = docs[0].page_content
    print(retrieval_grader.invoke({"question": question, "document": doc_txt}))
else:
    print("No documents were retrieved for the question.")

In [None]:
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import AzureChatOpenAI

# Fetch the "rlm/rag-prompt" prompt template through langchain.hub.
prompt = hub.pull("rlm/rag-prompt")

llm = AzureChatOpenAI(
    azure_deployment=openai_deployment,
    azure_endpoint=azure_openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Chain: prompt -> chat model -> string output parser.
rag_chain = prompt | llm | StrOutputParser()

question = "agent memory"
docs = chunks_vector.invoke(question)

# Run
# Pass the documents' text as context. The raw Document list was passed before,
# which put the objects' string form (metadata included) into the prompt, unlike
# the earlier generation cell that joins page_content.
context = "\n\n".join(doc.page_content for doc in docs)
generation = rag_chain.invoke({"context": context, "question": question})
print(generation)

In [None]:
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.prompts import ChatPromptTemplate


# Structured-output schema for the hallucination grader. This redefines the
# earlier GradeHallucinations class, here with a string score instead of a bool.
class GradeHallucinations(BaseModel):
    """Puntuación binaria para la alucinación presente en la respuesta de la generación."""
    # NOTE(review): the docstring and description are kept in Spanish because they
    # are presumably forwarded to the model as part of the schema — confirm.
    # NOTE(review): the description ends with an unbalanced «no (missing »).
    binary_score: str = Field(
        description="La respuesta se basa en los hechos, «sí» o «no"
    )
 
llm = AzureChatOpenAI(
    azure_deployment=openai_deployment,
    azure_endpoint=azure_openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Constrain the model's answer to the GradeHallucinations schema.
structured_llm_grader = llm.with_structured_output(GradeHallucinations)

system = """Usted es un calificador que evalúa si una generación de LLM está fundamentada / apoyada por un conjunto de hechos recuperados. \n 
     Da una puntuación binaria 'sí' o 'no'. Sí' significa que la respuesta está basada / apoyada por el conjunto de hechos."""
# The placeholders must match the keys passed to invoke() below. The template
# previously used {document}/{question}, which matched neither call and raised a
# missing-variable error.
hallucination_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Set of facts: \n\n {documents} \n\n LLM generation: {generation}"),
    ]
)

hallucination_grader = hallucination_prompt | structured_llm_grader
# Grade the answer produced by the generation cell against the retrieved text.
# The earlier extra call that reassigned `generation` to a grader result was
# removed: it replaced the answer under evaluation with the grader's own output.
hallucination_grader.invoke({
    "documents": "\n\n".join(doc.page_content for doc in docs),
    "generation": generation,
})

In [None]:
import spacy
from langchain_core.prompts import ChatPromptTemplate
from langchain.schema import Document
from langchain_community.vectorstores import AzureSearch
from langchain_ollama import OllamaEmbeddings
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import AzureChatOpenAI

# Load the spaCy pipeline used by the coherence check.
nlp = spacy.load("xx_ent_wiki_sm")
# Iterating `doc.sents` needs sentence boundaries. Add the rule-based sentencizer
# when the loaded pipeline has no component that sets them.
if not {"parser", "senter", "sentencizer"}.intersection(nlp.pipe_names):
    nlp.add_pipe("sentencizer")

# Read the source text.
with open('anatomy.txt', 'r', encoding='utf-8') as f:
    file = f.read()

from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=600,
    chunk_overlap=0,
    length_function=len
)

chunks = text_splitter.split_text(file)

# Wrap every chunk in a Document.
documentos = [Document(page_content=chunk) for chunk in chunks]

# Create the vector index in Azure AI Search and upload the documents.
# index_name is passed explicitly so the index written here is the one named by
# AZURE_SEARCH_INDEX; when unset, fall back to "langchain-index", the library
# default that was used implicitly before.
chunk_store = AzureSearch.from_documents(
    azure_search_endpoint=azure_search_endpoint,
    azure_search_key=azure_search_key,
    index_name=azure_search_index or "langchain-index",
    documents=documentos,
    embedding=OllamaEmbeddings(model='nomic-embed-text:latest')
)

chunks_vector = chunk_store.as_retriever()

# Structured-output schema for the relevance grader.
class GradeDocuments(BaseModel):
    # Free-form string; the (Spanish) description asks for 'sí' or 'no'.
    # NOTE(review): the description is presumably forwarded to the model as part
    # of the schema, so it is left untranslated — confirm.
    binary_score: str = Field(
        description="Los documentos son relevantes para la pregunta 'sí' o 'no'."
    )
 
# Azure OpenAI chat model, configured from the settings read from the environment.
llm = AzureChatOpenAI(
    azure_deployment=openai_deployment,
    azure_endpoint=azure_openai_endpoint,
    api_key=openai_api_key,
    api_version=openai_api_version,
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
)

# Constrain the model's answer to the GradeDocuments schema.
structured_llm_grader = llm.with_structured_output(GradeDocuments)

# Prompt for the relevance check. The system text is in Spanish and asks for
# «sí» or «no», so callers comparing the score must expect those values.
system = """Eres un calificador que evalúa la relevancia de un documento recuperado con respecto a una pregunta del usuario.
Si el documento contiene palabras clave o significados semánticos relacionados con la pregunta del usuario, califíquelo como pertinente.
Dar una puntuación binaria «sí» o «no» para indicar si el documento es pertinente para la pregunta."""
grade_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
    ]
)

# Chain: relevance prompt -> structured grader.
retrieval_grader = grade_prompt | structured_llm_grader

# Internal-coherence check
def analizar_coherencia(texto, modelo=None):
    """Return the sentences of ``texto`` that contain a negation dependency.

    A sentence is reported when at least one of its tokens has the dependency
    label ``"neg"``.

    Args:
        texto: Text to analyse.
        modelo: Callable that turns text into a spaCy ``Doc``. Defaults to the
            notebook-level ``nlp`` pipeline.

    Returns:
        list[str]: Text of each flagged sentence, in document order.
    """
    pipeline = nlp if modelo is None else modelo
    doc = pipeline(texto)

    # `doc.sents` raises when the pipeline set no sentence boundaries (e.g. a
    # model with neither parser nor sentencizer). In that case treat the whole
    # text as one sentence instead of crashing.
    if doc.has_annotation("SENT_START"):
        oraciones = doc.sents
    else:
        oraciones = [doc[:]]

    # NOTE(review): `dep_` labels come from a dependency parser; with a pipeline
    # that has none, no token is labelled "neg" and the result is always empty.
    return [
        sent.text
        for sent in oraciones
        if any(token.dep_ == "neg" for token in sent)
    ]

# Answers accepted as "relevant". The grader prompt is in Spanish and asks for
# «sí»/«no», while the English variant of the schema asks for 'yes'/'no'.
_RESPUESTAS_AFIRMATIVAS = {"yes", "y", "true", "sí", "si"}


def _es_relevante(resultado):
    """Return True when a grader result marks the document as relevant."""
    # The structured grader returns a model object (attribute access); a plain
    # dict is also accepted.
    if isinstance(resultado, dict):
        score = resultado.get("binary_score")
    else:
        score = getattr(resultado, "binary_score", None)
    if isinstance(score, str):
        return score.strip().strip("'\"«»").lower() in _RESPUESTAS_AFIRMATIVAS
    return bool(score)


# Orchestrator: retrieval relevance plus coherence check
def evaluar_texto(question):
    """Estimate how likely an answer to ``question`` would be unsupported.

    Retrieves documents for the question, grades each one for relevance and runs
    the coherence check on their text.

    Args:
        question: User question.

    Returns:
        dict with keys ``alucinaciones_probabilidad`` (share of retrieved
        documents graded as not relevant; 1.0 when nothing is retrieved),
        ``comentarios`` (str) and ``acciones_recomendadas`` (list of str).
    """
    # `invoke` replaces the deprecated `get_relevant_documents`.
    docs = chunks_vector.invoke(question)
    if not docs:
        return {
            "alucinaciones_probabilidad": 1.0,
            "comentarios": "No se encontraron documentos relevantes para la pregunta.",
            "acciones_recomendadas": ["Agregar documentos relevantes al índice de Azure."]
        }

    # Grade every document. The result was previously subscripted like a dict and
    # compared with "yes", which fails on the model object returned by the
    # structured grader and never matches the Spanish «sí».
    relevancia = [
        _es_relevante(retrieval_grader.invoke({"question": question, "document": doc.page_content}))
        for doc in docs
    ]

    # docs is non-empty here, so the division is safe.
    probabilidad_alucinacion = 1.0 - sum(relevancia) / len(relevancia)

    # Coherence check over the retrieved documents
    inconsistencias = []
    for doc in docs:
        inconsistencias.extend(analizar_coherencia(doc.page_content))

    # Build the report
    comentarios = []
    if probabilidad_alucinacion > 0.5:
        comentarios.append("La información parece ficticia o no respaldada por los documentos.")
    if inconsistencias:
        comentarios.append(f"Inconsistencias detectadas en los documentos: {', '.join(inconsistencias)}")

    return {
        "alucinaciones_probabilidad": probabilidad_alucinacion,
        "comentarios": " ".join(comentarios),
        "acciones_recomendadas": [
            "Verificar el contenido manualmente.",
            "Agregar más datos relevantes al índice de Azure."
        ]
    }

# Example usage: evaluate one question and print the resulting report dict.
question = "agent memory"
reporte = evaluar_texto(question)
print(reporte)
