In [1]:
import sys
# Append a machine-specific directory to the module search path; presumably this is
# what makes the `grafa` imports in the following cells resolve — confirm the path
# for your environment before running.
sys.path.append('/data/libraries/grafa/')

In [2]:
from grafa import get_db_engine 
from langchain_aws import ChatBedrock
from grafa.settings import OPENAI_API_KEY


from grafa.stage_search.utils import extract_chunk_data, concept_extractor

In [3]:
from langchain_aws import ChatBedrockConverse

# Database engine handle; passed as `kg` to the retrieval functions in later cells.
kg = get_db_engine()

# Claude 3.5 Haiku via Bedrock, temperature 0 and a 4096-token output cap.
llm_light = ChatBedrock(
    model_id="us.anthropic.claude-3-5-haiku-20241022-v1:0",
    model_kwargs={"temperature": 0, "max_tokens": 4096},
)

# Amazon Nova Micro via the Bedrock Converse API, left on its default settings.
llm_light_nova = ChatBedrockConverse(model="amazon.nova-micro-v1:0")

from grafa.stage_search.vector_search import two_stage_retrieval_parallel, advanced_search_parallel

  kg = Neo4jGraph(


In [4]:
# Benchmark questions (kept verbatim, in Spanish) used to compare retrieval strategies.
queries = [
    "Cuales fueron mis mejores promociones?",
    "Dame la elasticidad promedio de productos para la region Occidente",
    "Cual fue el ROI de mi producto con mejor incremental porcentual",
    (
        "Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. "
        "Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, "
        "y dime como les fue en canibalizacion"
    ),
    "Como puedo mejorar la canibalizacion de mis productos",
    "Como puedo medir mi desempeño promocional?",
    "Como mide ARCA sus productos",
]

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, field_validator

# Claude 3.5 Sonnet via Bedrock (temperature 0, 4096-token cap); passed as the
# judge model to evaluate_retrieval_results further down in this cell.
llm = ChatBedrock(
    model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    model_kwargs={"temperature": 0, "max_tokens": 4096},
)

class OutputRetrievalEval(BaseModel):
    """
    Pydantic model to validate and structure LLM evaluation output.
    """
    # NOTE(review): the class docstring and the Field descriptions are presumably
    # emitted in the model's JSON schema, which the output parser turns into format
    # instructions for the LLM — rewording them changes the prompt. Confirm before editing.

    # Which of the two compared results won (1 or 2).
    better_response: int = Field(description="Better response can only be 1 or 2")
    # How confident the judge is in its choice, on a 1-100 scale.
    confidence_score: int = Field(description="Confidence score between 1 and 100.")
    # Free-text reasoning behind the choice.
    justification: str = Field(description="Explanation of why the selected result is better.")

    @field_validator("better_response")
    @classmethod
    def validate_better_response(cls, value):
        """Accept only 1 or 2; anything else is rejected with a ValueError."""
        if value in (1, 2):
            return value
        raise ValueError("better_response must be either 1 or 2")

    @field_validator("confidence_score")
    @classmethod
    def validate_confidence_score(cls, value):
        """Accept only scores in the inclusive range 1..100."""
        if value < 1 or value > 100:
            raise ValueError("confidence_score must be between 1 and 100")
        return value


def evaluate_retrieval_results(
    query: str,
    result_1: list, 
    result_2: list,
    llm: "RunnableChain" = None,
) -> OutputRetrievalEval:
    """
    Evaluates two retrieval results using an LLM and determines which one provides a better response to the query.

    Parameters:
    - query (str): The original user query.
    - result_1 (list): The first retrieval result (e.g., from two_stage_retrieval_parallel),
      as a list of text chunks.
    - result_2 (list): The second retrieval result (e.g., from advanced_search_parallel),
      as a list of text chunks.
    - llm: The chat model (or any runnable) placed between the prompt and the output
      parser. Required in practice even though the signature defaults it to None.

    Returns:
    - OutputRetrievalEval: A structured evaluation output containing:
        - The better response (1 or 2)
        - Confidence score (1-100)
        - Justification for the choice

    Raises:
    - ValueError: If no llm is supplied.
    """
    if llm is None:
        raise ValueError("llm is required: pass the chat model that should judge the results")

    # Render each result list as newline-separated text so the judge sees the chunks
    # themselves rather than a Python list repr; empty results get an explicit placeholder.
    result_1_text = "\n".join(str(chunk) for chunk in result_1) if result_1 else "No relevant information found."
    result_2_text = "\n".join(str(chunk) for chunk in result_2) if result_2 else "No relevant information found."

    # Define LLM messages
    PROMPT = """You are an expert in information retrieval evaluation. Your task is to analyze two different search results 
    and determine which one provides a more complete, informative, and contextually relevant answer to the given query.

    Consider the following factors:
    1. **Relevance**: Does the response directly answer the query?
    2. **Completeness**: Does it cover all key aspects of the topic?
    3. **Context**: Does it provide additional helpful background information?
    4. **Conciseness**: Does it avoid unnecessary or redundant information?

    Here is the user query:
    **Query:** {query}

    Here are the two retrieved responses:

    **Result 1:**
    {result_1_text}

    **Result 2:**
    {result_2_text}

    Please evaluate both responses and return a structured JSON object following the given format.

    """

    parser = PydanticOutputParser(pydantic_object=OutputRetrievalEval)

    # The formatted texts (not the raw lists) fill the template placeholders.
    values = {
        "query": query,
        "result_1_text": result_1_text,
        "result_2_text": result_2_text,
    }

    prompt = PromptTemplate(
        template=PROMPT + '\n {format_instructions}',
        input_variables=["query", "result_1_text", "result_2_text"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # prompt -> model -> parser; the parser validates the reply into OutputRetrievalEval.
    chain = prompt | llm | parser

    return chain.invoke(values)


# Smoke-test the judge on a toy English query with two hand-written result lists.
query = "What are the effects of climate change?"
result_1 = [
    "Climate change causes rising sea levels.",
    "It increases global temperatures and causes extreme weather.",
]
result_2 = [
    "Global warming affects ecosystems.",
    "It leads to droughts, hurricanes, and sea ice loss.",
]

evaluation_result = evaluate_retrieval_results(
    query=query,
    result_1=result_1,
    result_2=result_2,
    llm=llm,
)

# Show the validated fields as a plain dict.
print(evaluation_result.model_dump())

In [None]:
import time

def evaluate_methods(query, index_name, top_k, kg, llm, api_key):
    """
    Run both retrieval strategies for one query and record wall-clock time and result size.

    `llm` is forwarded only to advanced_search_parallel; `api_key`, `kg`, `index_name`
    and `top_k` are forwarded to both searches.

    Returns a dict with keys "two_stage" and "advanced", each mapping to
    {"time": seconds, "results": retrieved items, "Chunks": number of items}.
    """
    # Two-stage retrieval, timed on its own.
    tick = time.perf_counter()
    two_stage_hits = two_stage_retrieval_parallel(
        question=query,
        index_name=index_name,
        top_k=top_k,
        kg=kg,
        api_key=api_key,
    )
    two_stage_entry = {
        "time": time.perf_counter() - tick,
        "results": two_stage_hits,
        "Chunks": len(two_stage_hits),
    }

    # Advanced search, timed on its own.
    tick = time.perf_counter()
    advanced_hits = advanced_search_parallel(
        query=query,
        index_name=index_name,
        top_k=top_k,
        llm=llm,
        kg=kg,
        api_key=api_key,
        verbose=False,
    )
    advanced_entry = {
        "time": time.perf_counter() - tick,
        "results": advanced_hits,
        "Chunks": len(advanced_hits),
    }

    return {"two_stage": two_stage_entry, "advanced": advanced_entry}


In [14]:
# Benchmark configuration: the Nova model drives the advanced search in this run.
index_name = "vector_concepts"
top_k = 10
api_key = OPENAI_API_KEY
llm = llm_light_nova

# Per-query results, keyed by the query text.
results_dict = {}

for query in queries:
    print(f"Evaluating query: {query}")
    results_dict[query] = evaluate_methods(
        query=query,
        index_name=index_name,
        top_k=top_k,
        kg=kg,
        llm=llm,
        api_key=api_key,
    )

Evaluating query: Cuales fueron mis mejores promociones?
Evaluating query: Dame la elasticidad promedio de productos para la region Occidente
Evaluating query: Cual fue el ROI de mi producto con mejor incremental porcentual
Evaluating query: Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, y dime como les fue en canibalizacion
Evaluating query: Como puedo mejorar la canibalizacion de mis productos
Evaluating query: Como puedo medir mi desempeño promocional?
Evaluating query: Como mide ARCA sus productos


In [15]:
def compute_overlap(results1, results2):
    """
    Return the Jaccard similarity of two result collections as a percentage.

    Duplicates are ignored (inputs are compared as sets). Two empty inputs count as
    a full match (100); exactly one empty input counts as no match (0).
    """
    first, second = set(results1), set(results2)

    if not (first or second):
        # Nothing on either side: treat as identical.
        return 100
    if not (first and second):
        # Only one side has items: nothing can overlap.
        return 0

    shared = len(first & second)
    combined = len(first | second)
    return shared / combined * 100

# Report, per query, how much the two strategies' result sets coincide.
for query in queries:
    per_method = results_dict[query]
    overlap = compute_overlap(
        results1=per_method["two_stage"]["results"],
        results2=per_method["advanced"]["results"],
    )
    print(f"Query: {query}")
    print(f"Overlap: {overlap:.2f}%")

Query: Cuales fueron mis mejores promociones?
Overlap: 25.00%
Query: Dame la elasticidad promedio de productos para la region Occidente
Overlap: 38.46%
Query: Cual fue el ROI de mi producto con mejor incremental porcentual
Overlap: 19.05%
Query: Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, y dime como les fue en canibalizacion
Overlap: 31.58%
Query: Como puedo mejorar la canibalizacion de mis productos
Overlap: 20.00%
Query: Como puedo medir mi desempeño promocional?
Overlap: 33.33%
Query: Como mide ARCA sus productos
Overlap: 36.36%


In [17]:
import pandas as pd

# One summary row per query: timings, overlap and chunk counts for both strategies.
comparison_data = []

for query in queries:
    two_stage = results_dict[query]["two_stage"]
    advanced = results_dict[query]["advanced"]

    row = {
        "Query": query,
        "Two-Stage Time (s)": two_stage["time"],
        "Advanced Time (s)": advanced["time"],
        "Overlap (%)": compute_overlap(two_stage["results"], advanced["results"]),
        "Two-Stage Chunks": two_stage["Chunks"],
        "Advanced Chunks": advanced["Chunks"],
    }
    comparison_data.append(row)

df = pd.DataFrame(comparison_data)
display(df)

Unnamed: 0,Query,Two-Stage Time (s),Advanced Time (s),Overlap (%),Two-Stage Chunks,Advanced Chunks
0,Cuales fueron mis mejores promociones?,2.888705,1.687309,25.0,8,2
1,Dame la elasticidad promedio de productos para...,1.073278,1.639234,38.461538,9,9
2,Cual fue el ROI de mi producto con mejor incre...,1.455424,1.668549,19.047619,11,14
3,Encuentra los 10 clientes con mejor desempeño ...,1.956123,1.964452,31.578947,10,15
4,Como puedo mejorar la canibalizacion de mis pr...,1.136357,1.267598,20.0,11,7
5,Como puedo medir mi desempeño promocional?,1.601835,1.538107,33.333333,10,6
6,Como mide ARCA sus productos,1.075828,1.312858,36.363636,10,5


In [27]:
# Benchmark questions (kept verbatim, in Spanish) used to compare retrieval strategies.
queries = [
    "Cuales fueron mis mejores promociones?",
    "Dame la elasticidad promedio de productos para la region Occidente",
    "Cual fue el ROI de mi producto con mejor incremental porcentual",
    (
        "Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. "
        "Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, "
        "y dime como les fue en canibalizacion"
    ),
    "Como puedo mejorar la canibalizacion de mis productos",
    "Como puedo medir mi desempeño promocional?",
    "Como mide ARCA sus productos",
]

In [None]:
import time
import pandas as pd
from langchain_core.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser
from pydantic import BaseModel, Field, field_validator
from langchain_aws import ChatBedrock

# Judge model: Claude 3.5 Sonnet via Bedrock, temperature 0 and a 4096-token cap.
llm = ChatBedrock(
    model_id="us.anthropic.claude-3-5-sonnet-20241022-v2:0",
    model_kwargs={"temperature": 0, "max_tokens": 4096},
)


# ---- Pydantic Model for LLM Evaluation ----
class OutputRetrievalEval(BaseModel):
    """
    Pydantic model to validate and structure LLM evaluation output.
    """
    # NOTE(review): the class docstring and the Field descriptions are presumably
    # emitted in the model's JSON schema, which the output parser turns into format
    # instructions for the LLM — rewording them changes the prompt. Confirm before editing.

    # Which of the two compared results won (1 or 2).
    better_response: int = Field(description="Better response can only be 1 or 2")
    # How confident the judge is in its choice, on a 1-100 scale.
    confidence_score: int = Field(description="Confidence score between 1 and 100.")
    # Free-text reasoning behind the choice.
    justification: str = Field(description="Explanation of why the selected result is better.")

    @field_validator("better_response")
    @classmethod
    def validate_better_response(cls, value):
        """Accept only 1 or 2; anything else is rejected with a ValueError."""
        if value in (1, 2):
            return value
        raise ValueError("better_response must be either 1 or 2")

    @field_validator("confidence_score")
    @classmethod
    def validate_confidence_score(cls, value):
        """Accept only scores in the inclusive range 1..100."""
        if value < 1 or value > 100:
            raise ValueError("confidence_score must be between 1 and 100")
        return value


# ---- Function to Evaluate Retrieval Results ----
def evaluate_retrieval_results(
    query: str,
    result_1: list, 
    result_2: list,
    llm: "RunnableChain",
) -> OutputRetrievalEval:
    """
    Evaluates two retrieval results using an LLM and determines which one provides a better response to the query.

    Parameters:
    - query (str): The original user query.
    - result_1 (list): The first retrieval result, as a list of text chunks.
    - result_2 (list): The second retrieval result, as a list of text chunks.
    - llm: The chat model (or any runnable) placed between the prompt and the output parser.

    Returns:
    - OutputRetrievalEval: The judge's choice (1 or 2), its confidence score (1-100)
      and its justification, as validated by the output parser.
    """

    # Render each result list as newline-separated text so the judge sees the chunks
    # themselves rather than a Python list repr; empty results get an explicit placeholder.
    result_1_text = "\n".join(str(chunk) for chunk in result_1) if result_1 else "No relevant information found."
    result_2_text = "\n".join(str(chunk) for chunk in result_2) if result_2 else "No relevant information found."

    # Define LLM messages
    PROMPT = """You are an expert in information retrieval evaluation. Your task is to analyze two different search results 
    and determine which one provides a more complete, informative, and contextually relevant answer to the given query. The best result 
    should give contextual informaiton for example what is certain metric, how to calculate it, formula, etc. Or what a concept means, it must
    give more CONTEXT for a further SQL query.


    Consider the following factors:
    1. **Relevance**: Does the response directly answer the query?
    2. **Completeness**: Does it cover all key aspects of the topic?
    3. **Context**: Does it provide additional helpful background information?
    4. **Conciseness**: Does it avoid unnecessary or redundant information?

    Here is the user query:
    **Query:** {query}

    Here are the two retrieved responses:

    **Result 1:**
    {result_1_text}

    **Result 2:**
    {result_2_text}

    Please evaluate both responses and return a structured JSON object following the given format.
    """

    parser = PydanticOutputParser(pydantic_object=OutputRetrievalEval)

    # The formatted texts (not the raw lists) fill the template placeholders.
    values = {
        "query": query,
        "result_1_text": result_1_text,
        "result_2_text": result_2_text,
    }

    prompt = PromptTemplate(
        template=PROMPT + '\n {format_instructions}',
        input_variables=["query", "result_1_text", "result_2_text"],
        partial_variables={"format_instructions": parser.get_format_instructions()},
    )

    # prompt -> model -> parser; the parser validates the reply into OutputRetrievalEval.
    chain = prompt | llm | parser

    return chain.invoke(values)


# ---- Function to Retrieve and Evaluate Search Results ----
def evaluate_methods(query, index_name, top_k, kg, llm, api_key, search_llm=None):
    """
    Run both retrieval strategies for one query, time them, and have an LLM judge the results.

    Parameters:
    - query: The user question to retrieve for.
    - index_name, top_k, kg, api_key: Forwarded unchanged to both retrieval functions.
    - llm: The judge model passed to evaluate_retrieval_results.
    - search_llm: Model handed to advanced_search_parallel. Defaults to the
      notebook-level `llm_light_nova`, which is what this function always used before
      the parameter existed.

    Returns:
    - dict with keys "two_stage" and "advanced" (each {"time", "results", "Chunks"})
      plus "evaluation" holding the judge's OutputRetrievalEval, where result 1 is the
      two-stage output and result 2 is the advanced-search output.
    """
    if search_llm is None:
        # Resolved at call time so the notebook global can be redefined between runs.
        search_llm = llm_light_nova

    results = {}

    # Evaluate Two-Stage Retrieval
    start_time = time.perf_counter()
    two_stage_results = two_stage_retrieval_parallel(
        question=query, index_name=index_name, top_k=top_k, kg=kg, api_key=api_key
    )
    two_stage_time = time.perf_counter() - start_time
    results["two_stage"] = {"time": two_stage_time, "results": two_stage_results, "Chunks": len(two_stage_results)}

    # Evaluate Advanced Search
    start_time = time.perf_counter()
    advanced_results = advanced_search_parallel(
        query=query, index_name=index_name, top_k=top_k, llm=search_llm, kg=kg, api_key=api_key, verbose=False
    )
    advanced_time = time.perf_counter() - start_time
    results["advanced"] = {"time": advanced_time, "results": advanced_results, "Chunks": len(advanced_results)}

    # Evaluate with LLM (judge time is deliberately not included in either timing above)
    results["evaluation"] = evaluate_retrieval_results(query, two_stage_results, advanced_results, llm)

    return results

In [42]:
# ---- Run every benchmark query through both strategies and the LLM judge ----
index_name = "vector_concepts"
top_k = 10
api_key = OPENAI_API_KEY

# Per-query results, keyed by the query text.
results_dict = {}

for query in queries:
    print(f"Evaluating query: {query}")
    results_dict[query] = evaluate_methods(
        query=query,
        index_name=index_name,
        top_k=top_k,
        kg=kg,
        llm=llm,
        api_key=api_key,
    )


# ---- Build one summary row per query and display the table ----
comparison_data = []

for query in queries:
    two_stage = results_dict[query]["two_stage"]
    advanced = results_dict[query]["advanced"]
    eval_result = results_dict[query]["evaluation"]

    row = {
        "Query": query,
        "Two-Stage Time (s)": two_stage["time"],
        "Advanced Time (s)": advanced["time"],
        "Overlap (%)": compute_overlap(two_stage["results"], advanced["results"]),
        "Two-Stage Chunks": two_stage["Chunks"],
        "Advanced Chunks": advanced["Chunks"],
        "Better Response": eval_result.better_response,
        "Confidence Score": eval_result.confidence_score,
        "Justification": eval_result.justification,
    }
    comparison_data.append(row)

df = pd.DataFrame(comparison_data)

display(df)

Evaluating query: Cuales fueron mis mejores promociones?
Evaluating query: Dame la elasticidad promedio de productos para la region Occidente
Evaluating query: Cual fue el ROI de mi producto con mejor incremental porcentual
Evaluating query: Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, y dime como les fue en canibalizacion
Evaluating query: Como puedo mejorar la canibalizacion de mis productos
Evaluating query: Como puedo medir mi desempeño promocional?
Evaluating query: Como mide ARCA sus productos


Unnamed: 0,Query,Two-Stage Time (s),Advanced Time (s),Overlap (%),Two-Stage Chunks,Advanced Chunks,Better Response,Confidence Score,Justification
0,Cuales fueron mis mejores promociones?,1.562294,1.492275,25.0,8,2,1,85,Result 1 provides a more comprehensive and det...
1,Dame la elasticidad promedio de productos para...,1.487575,1.794277,38.461538,9,9,2,85,Result 2 provides a more complete and contextu...
2,Cual fue el ROI de mi producto con mejor incre...,1.306277,1.755498,19.047619,11,14,1,85,Result 1 is better for several reasons: 1) It ...
3,Encuentra los 10 clientes con mejor desempeño ...,1.528816,2.080351,31.578947,10,15,1,85,Result 1 provides a more comprehensive and rel...
4,Como puedo mejorar la canibalizacion de mis pr...,1.15537,1.514676,20.0,11,7,1,85,Result 1 provides a more comprehensive and wel...
5,Como puedo medir mi desempeño promocional?,1.227443,1.600392,33.333333,10,6,1,85,Result 1 provides a more comprehensive and str...
6,Como mide ARCA sus productos,1.694415,1.438592,36.363636,10,5,1,85,Result 1 provides a more complete and relevant...


In [206]:
import time

# Time concept extraction for each benchmark query, plus the whole loop.
overall_start_time = time.perf_counter()

for query in queries:
    start_time = time.perf_counter()
    result = concept_extractor(query, llm_light_nova)
    # Elapsed time is taken immediately after the call so printing is not measured.
    elapsed_time = time.perf_counter() - start_time
    print(f"Query: {query} -> Result: {result} (Time taken: {elapsed_time:.6f} sec)")

# The total spans the entire loop, including the per-query printing.
total_time = time.perf_counter() - overall_start_time

print(f"\nTotal execution time: {total_time:.6f} seconds")


Query: Cuales fueron mis mejores promociones? -> Result: ['promociones'] (Time taken: 0.539630 sec)
Query: Dame la elasticidad promedio de productos para la region Occidente -> Result: ['elasticidad', 'productos', 'region', 'Occidente'] (Time taken: 0.346204 sec)
Query: Cual fue el ROI de mi producto con mejor incremental porcentual -> Result: ['ROI', 'producto', 'incremental', 'porcentual'] (Time taken: 0.336035 sec)
Query: Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, y dime como les fue en canibalizacion -> Result: ['clientes', 'desempeño', 'promocional', 'promciones', 'incremental', 'canibalizacion'] (Time taken: 0.442461 sec)
Query: Como puedo mejorar la canibalizacion de mis productos -> Result: ['canibalizacion', 'productos'] (Time taken: 0.301625 sec)
Query: Como puedo medir mi desempeño promocional? -> Result: ['desempeño', 'promocional'] (Time taken: 0.282784 sec)

Total

In [27]:
from grafa.stage_search.vector_search import two_stage_retrieval_parallel

# Run the two-stage retrieval on its own for the multi-part "top 10 clients" question.
query= "Encuentra los 10 clientes con mejor desempeño promocional en Enero de 2023. Para todos esos, encuentra las 10 promciones que tuvieron mejor incremental, y dime como les fue en canibalizacion"
# Arguments are positional here; presumably (question, index_name, top_k, kg, api_key)
# to match the keyword call used elsewhere in this notebook — verify against grafa.
results = two_stage_retrieval_parallel(query, "vector_concepts", 10, kg, OPENAI_API_KEY)

Unique concept names extracted: {'Elasticidad cruzada', 'Cuadrantes de desempeño de promociones', 'ROI', 'Optimización de precios', 'Incremental', 'Price Index', 'Impacto de Overstock', 'Precisión', 'Elasticidad', 'Canibalización'}
Extracting data for chunk: accuracy-chunk0001
Extracting data for chunk: cuadrantes-chunk0000
Extracting data for chunk: canibalizacion-chunk0001
Extracting data for chunk: accuracy-chunk0000
Extracting data for chunk: kuona_test-chunk0000
Extracting data for chunk: incremental-chunk0000
Extracting data for chunk: canibalizacion-chunk0002
Extracting data for chunk: cuadrantes-chunk0001
Extracting data for chunk: elasticidad-chunk0002
Extracting data for chunk: impact-chunk0001
Extracting data for chunk: roi-chunk0000
Extracting data for chunk: kuona_test-chunk0002
Extracting data for chunk: roi-chunk0001
Extracting data for chunk: kuona_test-chunk0001


In [20]:
from grafa.stage_search.vector_search import two_stage_retrieval_parallel

# Run the two-stage retrieval on its own for the regional elasticity question.
query=  "Dame la elasticidad promedio de productos para la region Occidente"
# Arguments are positional here; presumably (question, index_name, top_k, kg, api_key)
# to match the keyword call used elsewhere in this notebook — verify against grafa.
results = two_stage_retrieval_parallel(query, "vector_concepts", 10, kg, OPENAI_API_KEY)

Unique concept names extracted: {'Cuadrantes de desempeño de promociones', 'Unidades Operativas', 'Portafolio de Productos', 'Elasticidad Elástica', 'Elasticidad Unitaria', 'Incremental', 'Impacto de Overstock', 'Elasticidad precio de la demanda', 'Elasticidad', 'Elasticidad Inelástica'}
Extracting data for chunk: cuadrantes-chunk0001
Extracting data for chunk: elasticidad-chunk0000
Extracting data for chunk: company_info-chunk0000
Extracting data for chunk: elasticidad-chunk0002
Extracting data for chunk: elasticidad-chunk0001
Extracting data for chunk: cuadrantes-chunk0000
Extracting data for chunk: impact-chunk0001
Extracting data for chunk: incremental-chunk0000


In [39]:
from grafa.stage_search.vector_search import advanced_search_parallel

# Run the advanced search on its own, with verbose=True so its progress messages are printed.
query=  "Como puedo medir mi desempeño promocional?"
# The first four arguments are positional; presumably (query, index_name, top_k, llm) —
# verify against grafa. `results` and `query` are reused by later cells.
results = advanced_search_parallel(query, "vector_concepts", 10, llm_light_nova, kg, verbose=True, api_key=OPENAI_API_KEY)

Starting advanced search...
Extracting relevant concepts...
Found 3 unique relevant concepts: {'desempeño', 'medir', 'promocional'}
Unique graph concept names extracted: {'Cuadrantes de desempeño de promociones', 'Promocode', 'Catálogo Promocional', 'Propósitos Promocionales'}
Unique chunk names found: {'customer_success_master_file-chunk0023', 'cuadrantes-chunk0001', 'customer_success_master_file-chunk0022', 'customer_success_master_file-chunk0010', 'customer_success_master_file-chunk0045', 'cuadrantes-chunk0000'}
Extracting data for chunk: cuadrantes-chunk0000
Extracting data for chunk: cuadrantes-chunk0001
Extracting data for chunk: customer_success_master_file-chunk0045
Extracting data for chunk: customer_success_master_file-chunk0023
Extracting data for chunk: customer_success_master_file-chunk0010
Extracting data for chunk: customer_success_master_file-chunk0022
Advanced search completed.


In [9]:
import os

import cohere

# SECURITY: the Cohere API key must not be committed in the notebook. It is read from
# the environment instead; the key that was previously hardcoded here should be
# treated as leaked and rotated.
cohere_api_key = os.environ.get("COHERE_API_KEY")
if not cohere_api_key:
    raise RuntimeError("Set the COHERE_API_KEY environment variable before running this cell.")

co = cohere.ClientV2(api_key=cohere_api_key)

# Rerank the most recently retrieved `results` against the current `query` and keep
# the top 3. Both names come from whichever retrieval cell was executed last, so run
# one of those cells first.
response = co.rerank(
    model="rerank-v3.5",
    query=query,
    documents=results,
    top_n=3,
)
print(response)

