# Importando as bibliotecas necessárias 

In [None]:
import getpass
import json
import logging
import os
import time

import pandas as pd
from dotenv import load_dotenv
from langchain.chains import RetrievalQA
from langchain.document_loaders import CSVLoader
from langchain.llms import OpenAI
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness, context_precision, context_recall, context_entity_recall

from functions import load_VectorStore, generate_and_save_validation_examples, validate_results, compare_time_methods,  baseline_similarity_method, contextualize_query_with_categories, hierarchical_retrieval_with_categories

# Make sure the log directory exists before configuring logging:
# basicConfig opens the log file immediately and raises FileNotFoundError
# when the parent directory is missing.
os.makedirs('./logs', exist_ok=True)

# Configure logging so that debug output is saved to a file
logging.basicConfig(
    filename='./logs/debug_output.log',  # File the log records are written to
    level=logging.DEBUG,          # Log level (DEBUG captures everything)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Log record format
)

# Root logger, shared by every cell below
logger = logging.getLogger()

# Silence all warnings for the rest of the notebook
import warnings
warnings.filterwarnings("ignore")

# Carregando o documento e dividindo-o em chunks

In [2]:
# Read the local .env file
load_dotenv()

# Ask for the key interactively when it is missing or empty in the environment
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [20]:
# Product catalog used as the knowledge base for every retrieval method
file = 'inputs/OutdoorClothingCatalog_1000_withCategories.csv'
loader = CSVLoader(
    file_path=file,
    encoding="utf-8",
)
# Load the CSV into a list of documents
docs = loader.load()

In [None]:
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)

# Split every loaded document on its own and collect the chunks in order
split_documents = [
    chunk
    for source_doc in docs
    for chunk in text_splitter.split_documents([source_doc])
]

# Plain-dict form of the chunks so they can be serialized as JSON
documents_dict = []
for chunk in split_documents:
    documents_dict.append(
        {"page_content": chunk.page_content, "metadata": chunk.metadata}
    )

# Persist the chunks so later runs can reload them without splitting again
with open("inputs/documents_split_langchain.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

logger.debug("Documents have been saved to 'documents_split_langchain.json'")

In [None]:
# Read the previously saved chunks back from disk
with open("inputs/documents_split_langchain.json", "r") as file:
    documents_dict = json.load(file)

# Rebuild one Document per saved dictionary
documents = []
for entry in documents_dict:
    documents.append(
        Document(page_content=entry["page_content"], metadata=entry["metadata"])
    )

# Criando os embeddings via OpenAIEmbeddings e criando um Vector Store via Chroma

# Carregando o Vector Store

In [None]:
# Generating embeddings 
# openai_embedding_function(documents, model = "text-embedding-3-large")

In [18]:
# Embeddings provider name; passed to load_VectorStore here and reused by the
# cells below when generating examples and naming result files
embeddings_provider = 'OpenAI'
# Vector store queried by every retrieval method in this notebook
vector_store = load_VectorStore(embeddings_provider)

## Criando as queries para validar os métodos

In [None]:
# Build the validation examples consumed by the retrieval methods below
validation_examples = generate_and_save_validation_examples(
    docs,
    embeddings_provider,
    llm_model="gpt-4o-mini",
    select_n_documents=5,
)



[{'context': Document(metadata={'source': 'inputs/OutdoorClothingCatalog_1000_withCategories.csv', 'row': 338}, page_content='Unnamed: 0: 363\nname: Men\'s Leather Lace-Ups by ® 8"\ndescription: Our legendary Maine-made  Boot – designed by "L.L." himself in 1912 and keeping feet dry and comfortable ever since. Chances are, you’ll only ever need one pair. \r\n\r\nSize & Fit\r\nWith light- or midweight socks: Whole sizes, order 1 size down. Half sizes, order 1½ sizes down.\r\nWith heavyweight socks: Whole sizes, order your normal size. Half sizes, order the next size down.\r\n\r\nWhy We Love Them\r\nWhoever says “they don’t build things like they used to,” doesn\'t own these boots. Today, our signature boots are still sewn right here in Maine – one pair at a time – by expert craftspeople whose technical skills and passion for their work is evident in every pair of boots they make. Warm, dry feet haven’t gone out of style in a hundred years, so we haven’t needed to change L.L.’s innovativ

# Comparando métodos de RAG

## Baseline - Técnica por similaridade entre query e documento

In [None]:
def baseline_similarity_method(validation_examples, vector_store, embeddings_provider):
    """
    Answer the validation queries with a plain similarity-search RAG baseline.

    A ``RetrievalQA`` "stuff" chain is built on top of ``vector_store`` and applied
    to every validation example. The results are also written to
    ``results/query_<embeddings_provider>_baselinemethod_documents_dict_results.json``.

    Args:
        validation_examples (list): Dicts holding at least the keys ``query``,
            ``context`` (an object exposing ``page_content`` and ``metadata``)
            and ``ground_truths``.
        vector_store (object): A vector store instance exposing ``as_retriever``.
        embeddings_provider (str): The name of the embeddings provider (used in result file naming).

    Returns:
        tuple: ``(elapsed_seconds, results)``. ``results`` is a list containing a
        single list of dicts with the keys ``context``, ``metadata``, ``query``,
        ``ground_truths`` and ``answer``. ``context``/``metadata`` come from the
        validation example itself, not from the retrieved chunks.
    """
    logger.debug("Baseline similarity method to answer queries initiated.")

    start_time = time.time()
    model = 'gpt-3.5-turbo-instruct'

    # Initialize the language model
    llm = OpenAI(temperature=0.2, model=model)

    # Set up the retriever with similarity search. The number of chunks has to be
    # passed through search_kwargs: a bare ``k`` keyword given to as_retriever is
    # not forwarded to the underlying search call.
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 5})

    # Initialize the RetrievalQA chain
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        verbose=True,
        chain_type_kwargs={"document_separator": "\n"}
    )

    # Apply the model to the validation examples; each prediction is expected to
    # carry the input keys plus the generated 'result'
    predictions = qa.apply(validation_examples)

    # One row per prediction
    rows = [
        {
            "context": prediction['context'].page_content,
            "metadata": prediction['context'].metadata,
            "query": prediction['query'],
            "ground_truths": prediction['ground_truths'],
            "answer": prediction['result']
        }
        for prediction in predictions
    ]
    # Kept as a list of one list so the saved JSON layout stays unchanged
    baselinemethod_documents_dict = [rows]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_baselinemethod_documents_dict_results.json"
    with open(results_filename, "w") as file:
        json.dump(baselinemethod_documents_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")

    end_time = time.time()
    logger.debug("Baseline similarity method documents have been saved to %s", results_filename)

    elapsed_time = end_time - start_time

    return elapsed_time, baselinemethod_documents_dict

# Run the baseline and keep both its elapsed time and its answers
baselinetime, baselinemethod_documents_dict = baseline_similarity_method(
    validation_examples,
    vector_store,
    embeddings_provider,
)

## Técnica por similaridade com adição de contextualização, incluindo na query categorias referentes à pergunta para a busca textual


In [None]:
def contextualize_query_with_categories(vector_store, validation_examples, embeddings_provider):
    """
    Answer each query after enriching it with LLM-suggested product categories.

    For every validation example the function:
    1. asks a language model for the categories relevant to the query,
    2. runs a similarity search with the query plus those categories,
    3. asks a language model to answer the query from the retrieved catalog text.

    The results are also written to
    ``results/query_<embeddings_provider>_contextualize_query_with_categories_dict_results.json``.
    The input examples are not modified.

    Args:
        vector_store (object): The vector store used for similarity-based retrieval of documents.
        validation_examples (list): Dicts holding at least the keys ``query``,
            ``context`` (an object exposing ``page_content`` and ``metadata``)
            and ``ground_truths``.
        embeddings_provider (str): The name of the embeddings provider, used in naming the results file.

    Returns:
        tuple: ``(elapsed_seconds, results)``. ``results`` is a list containing a
        single list of dicts with the keys ``context``, ``metadata``, ``query``,
        ``ground_truths`` and ``answer``. ``context``/``metadata`` come from the
        validation example itself, not from the retrieved chunks.
    """
    logger.debug("Contextualized query method to answer queries initiated.")

    start_time = time.time()

    # Define the high-level query template for category search
    category_prompt = """
    You are an expert in outdoor clothing products. Based on the following query, 
    return a list of relevant categories (e.g., jackets, pants) from the catalog.
    Query: {query}
    """
    category_template = PromptTemplate(input_variables=["query"], template=category_prompt)
    category_chain = LLMChain(prompt=category_template, llm=OpenAI())

    # Define the refined search for documents within each category
    document_prompt = """
    You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
    Your function is to use the catalog of retrieved context to answer the client's questions based on the following query and the category {category}.
    Query: {query}
    Category: {category}
    Catalog: {catalog}
    """
    document_template = PromptTemplate(input_variables=["query", "category", "catalog"], template=document_prompt)
    document_chain = LLMChain(prompt=document_template, llm=OpenAI())

    # Rows are collected separately instead of writing 'result' back into the
    # caller's examples, so answers from this method cannot leak into another one.
    rows = []
    for item in validation_examples:
        query = item['query']

        # Obtain the relevant category for the query
        category_result = category_chain.run({"query": query})
        logger.debug("Category identified for query '%s': %s", query, category_result)

        # Combine query and category for refined search
        modified_query = f"{query} Category: {category_result}"
        retrieved_docs = vector_store.similarity_search(modified_query, k=5)  # Retrieve relevant catalog entries

        # Hand the model the chunk text itself; interpolating the Document list
        # would insert its Python repr (metadata, escaped newlines) into the prompt.
        # "\n" matches the document separator used by the baseline chain.
        catalog = "\n".join(doc.page_content for doc in retrieved_docs)

        # Generate refined answer using retrieved catalog
        refined_answer = document_chain.run({
            "query": query,
            "category": category_result,
            "catalog": catalog
        })

        rows.append({
            "context": item['context'].page_content,
            "metadata": item['context'].metadata,
            "query": query,
            "ground_truths": item['ground_truths'],
            "answer": refined_answer,
        })

    # Kept as a list of one list so the saved JSON layout stays unchanged
    contextualize_query_with_categories_dict = [rows]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_contextualize_query_with_categories_dict_results.json"
    with open(results_filename, "w") as file:
        json.dump(contextualize_query_with_categories_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")
    logger.debug("Contextualized query method documents have been saved to %s", results_filename)

    end_time = time.time()
    elapsed_time = end_time - start_time

    return elapsed_time, contextualize_query_with_categories_dict

# Uses the vector_store, validation_examples and embeddings_provider defined in earlier cells
contextualizetime, contextualize_query_with_categories_dict = contextualize_query_with_categories(
    vector_store,
    validation_examples,
    embeddings_provider,
)

## Técnica por similaridade incluindo filtro de documentos, buscando similaridade entre categorias e documentos para a busca textual

In [None]:
def hierarchical_retrieval_with_categories(vector_store, validation_examples, embeddings_provider):
    """
    Answer each query by retrieving documents through LLM-suggested categories.

    This method uses a two-step approach:
    1. A language model lists the categories relevant to the input query.
    2. A similarity search is run on that category text (not on the query), and a
       second language model call answers the query from the retrieved catalog text.

    The results are also written to
    ``results/query_<embeddings_provider>_hierarchicalmethod_documents_dict_results.json``.
    The input examples are not modified.

    Parameters:
        vector_store (VectorStore): The vector store for similarity-based document retrieval.
        validation_examples (list): Dicts holding at least the keys ``query``,
            ``context`` (an object exposing ``page_content`` and ``metadata``)
            and ``ground_truths``.
        embeddings_provider (str): The name of the embeddings provider, used in naming the results file.

    Returns:
        tuple: ``(elapsed_seconds, results)``. ``results`` is a list containing a
        single list of dicts with the keys ``context``, ``metadata``, ``query``,
        ``ground_truths`` and ``answer``. ``answer`` is an empty string when the
        search returned no documents.
    """
    logger.debug("Hierarchical retrieval method to answer queries initiated")

    start_time = time.time()

    # Define the high-level query template for category search
    category_prompt = """
    You are an expert in outdoor clothing products. Based on the following query, 
    return a list of relevant categories (e.g., jackets, pants) from the catalog.
    Query: {query}
    """
    category_template = PromptTemplate(input_variables=["query"], template=category_prompt)
    category_chain = LLMChain(prompt=category_template, llm=OpenAI())

    # Define the refined search for documents within each category
    document_prompt = """
    You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
    Your function is to use the catalog of retrieved context to answer the client's questions based on the following query.
    Query: {query}
    Catalog: {catalog}
    """
    document_template = PromptTemplate(input_variables=["query", "catalog"], template=document_prompt)
    document_chain = LLMChain(prompt=document_template, llm=OpenAI())

    # Rows are collected separately instead of writing 'result' back into the
    # caller's examples, so answers from another method cannot be picked up here.
    rows = []
    for item in validation_examples:
        query = item['query']
        category_result = category_chain.run({"query": query})

        # Search the vector store with the category text
        relevant_documents = vector_store.similarity_search(category_result, k=5)

        if relevant_documents:
            # Hand the model the chunk text itself; interpolating the Document list
            # would insert its Python repr (metadata, escaped newlines) into the prompt.
            # "\n" matches the document separator used by the baseline chain.
            catalog = "\n".join(doc.page_content for doc in relevant_documents)

            # Refine the answer using the filtered documents
            refined_answer = document_chain.run({"query": query, "catalog": catalog})
        else:
            # Keep a row with an empty answer so every method yields one row per
            # example; skipping the example used to leave it without a 'result'
            # and broke (or reused a stale answer) when the rows were built.
            logger.debug("No relevant documents found for category '%s'", category_result)
            refined_answer = ""

        rows.append({
            "context": item['context'].page_content,
            "metadata": item['context'].metadata,
            "query": query,
            "ground_truths": item['ground_truths'],
            "answer": refined_answer,
        })

    # Kept as a list of one list so the saved JSON layout stays unchanged
    hierarchicalmethod_documents_dict = [rows]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the list of dictionaries to a JSON file
    results_filename = f"results/query_{embeddings_provider}_hierarchicalmethod_documents_dict_results.json"
    with open(results_filename, "w") as file:
        json.dump(hierarchicalmethod_documents_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")
    logger.debug("Hierarchical method Documents have been saved to %s", results_filename)

    end_time = time.time()
    elapsed_time = end_time - start_time

    return elapsed_time, hierarchicalmethod_documents_dict

# Uses the vector_store, validation_examples and embeddings_provider defined in earlier cells
hierarchicaltime, hierarchicalmethod_documents_dict = hierarchical_retrieval_with_categories(
    vector_store,
    validation_examples,
    embeddings_provider,
)

## Técnica de Recuperação-Para-Geração (RAG) com dois estágios

### Classificação de relevância de documentos e resposta gerada com base nos documentos relevantes usando o ChatGroq


In [None]:
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.pydantic_v1 import BaseModel, Field
# from langchain_groq import ChatGroq
# from langchain_core.output_parsers import StrOutputParser
# from concurrent.futures import ThreadPoolExecutor, as_completed

# # Data model
# class GradeDocuments(BaseModel):
#     """Ordinal score for relevance check on retrieved documents."""
#     OrdinalScore: int = Field(
#         description="Score between 1 (low) and 5 (high)"
#     )

# # Function to grade a document
# def grade_document(doc, question, llm, grade_prompt):
#     res = grade_prompt.invoke({"question": question, "document": doc.page_content})
#     return doc, res

# # Function to grade and retrieve relevant documents in parallel
# def grade_and_retrieve_documents(docs, question):
#     # LLM with function call
#     llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)
#     structured_llm_grader = llm.with_structured_output(GradeDocuments)

#     # Prompt for grading documents
#     system = """You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
#     Your function is to use the catalog of retrieved context to answer the client's questions based on the question.
#     Give a OrdinalScore score '1 (low) and 5 (high)' to indicate whether the document is relevant to the question."""

#     grade_prompt = ChatPromptTemplate.from_messages(
#         [
#             ("system", system),
#             ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
#         ]
#     )

#     retrieval_grader = grade_prompt | structured_llm_grader

#     # Use ThreadPoolExecutor to parallelize document grading
#     docs_to_use = []
#     with ThreadPoolExecutor() as executor:
#         futures = [executor.submit(grade_document, doc, question, llm, retrieval_grader) for doc in docs]
#         for future in as_completed(futures):
#             doc, res = future.result()
#             if res.OrdinalScore > 3:  # Keep documents with score greater than 3
#                 docs_to_use.append(doc)

#     return docs_to_use

# # Function to format documents for the next step
# def format_docs(docs):
#     return "\n".join(f"<doc{i+1}>:\nTitle:{doc.metadata['title']}\nSource:{doc.metadata['source']}\nContent:{doc.page_content}\n</doc{i+1}>\n" for i, doc in enumerate(docs))

# # Function to generate the final response using the relevant documents
# def generate_answer(docs_to_use, question):
#     # Prompt for answering the question based on relevant documents
#     system = """You are an assistant for question-answering tasks. Answer the question based upon your knowledge. 
#     Use three-to-five sentences maximum and keep the answer concise."""

#     prompt = ChatPromptTemplate.from_messages(
#         [
#             ("system", system),
#             ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n User question: <question>{question}</question>"),
#         ]
#     )

#     # LLM for answering the question
#     llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

#     # Post-processing to parse the output
#     rag_chain = prompt | llm | StrOutputParser()

#     # Format documents for input
#     formatted_docs = format_docs(docs_to_use)

#     # Generate the answer
#     generation = rag_chain.invoke({"documents": formatted_docs, "question": question})
#     print(generation)

# # Main function to run the entire flow
# def main(docs, question):
#     # Step 1: Grade and retrieve relevant documents
#     docs_to_use = grade_and_retrieve_documents(docs, question)
    
#     # Step 2: Generate the final answer using the relevant documents
#     generate_answer(docs_to_use, question)

# # Example usage
# question = 'Product with sun protection'

# # Run the main function
# main(docs, question)


# Comparing the results 

In [None]:

# Load the vector store once (the original cell repeated these two lines,
# loading the store twice for no effect)
embeddings_provider = 'OpenAI'
vector_store = load_VectorStore(embeddings_provider)

# Reload the saved validation examples from disk
with open("results/query_OpenAI_results.json", "r") as file:
    content = file.read()  # Read the entire content as a string
    database_result = json.loads(content)

# The file holds a list of lists; flatten it into a single list of examples
validation_examples = [item for sublist in database_result for item in sublist]



[1m> Entering new RetrievalQA chain...[0m


TypeError: argument 'text': 'list' object cannot be converted to 'PyString'

In [4]:

# Example usage:
# Load the saved answers of each method for evaluation
file_path_baseline = "results/query_OpenAI_baselinemethod_documents_dict_results.json"
RAG_baseline_results = validate_results(file_path_baseline)

# Fixed: this used to pass file_path_baseline, so the "contextualized" results
# were a second copy of the baseline results
file_path_contextualized = "results/query_OpenAI_contextualize_query_with_categories_dict_results.json"
RAG_contextualized_results = validate_results(file_path_contextualized)

file_path_hierarchical = "results/query_OpenAI_hierarchicalmethod_documents_dict_results.json"
RAG_hierarchical_results = validate_results(file_path_hierarchical)


In [5]:

# Metrics computed for every method. Defined once so the three evaluations
# cannot drift apart; answer_correctness used to be listed twice in each call,
# which only repeated the same evaluation work.
ragas_metrics = [
    faithfulness,
    answer_correctness,
    answer_relevancy,
    context_precision,
    context_recall,
    context_entity_recall,
    answer_similarity,
]

# Baseline method
RAG_baseline_results_metrics = evaluate(RAG_baseline_results, metrics=ragas_metrics)
RAG_baseline_results_df = RAG_baseline_results_metrics.to_pandas()
RAG_baseline_results_df['method'] = 'baseline'

# Contextualized-query method
RAG_contextualized_results_metrics = evaluate(RAG_contextualized_results, metrics=ragas_metrics)
RAG_contextualized_results_df = RAG_contextualized_results_metrics.to_pandas()
RAG_contextualized_results_df['method'] = 'Contextualized'

# Hierarchical method
RAG_hierarchical_results_metrics = evaluate(RAG_hierarchical_results, metrics=ragas_metrics)
RAG_hierarchical_results_df = RAG_hierarchical_results_metrics.to_pandas()
RAG_hierarchical_results_df['method'] = 'hierarchical'


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

In [6]:
# Stack the per-method metric tables; ignore_index gives every row a unique label
comparing_results = pd.concat(
    [RAG_baseline_results_df, RAG_contextualized_results_df, RAG_hierarchical_results_df],
    axis=0,
    ignore_index=True,
)

# Select only numeric columns, plus the method label used for grouping
numeric_comparing_results = pd.concat(
    [comparing_results.select_dtypes(include='number'), comparing_results['method']],
    axis=1,
)

# Mean of every metric per method
mean_result_by_method = numeric_comparing_results.groupby(['method']).agg('mean').reset_index()

# Save the DataFrame to a CSV file
results_filename = "results/comparing_methods_meanresults.csv"
mean_result_by_method.to_csv(results_filename, index=False)  # `index=False` prevents writing row numbers

# Fixed: the message lacked the f prefix and printed the literal "{results_filename}"
print(f"Documents have been saved to {results_filename}")


Documents have been saved to {results_filename}


In [7]:
# Select only numeric columns, plus the user input used for grouping
numeric_comparing_results = pd.concat(
    [comparing_results.select_dtypes(include='number'), comparing_results['user_input']],
    axis=1,
)

# Mean of every metric per user input (averaged over all methods)
mean_result_by_method_userinput = numeric_comparing_results.groupby(['user_input']).agg('mean').reset_index()

# Save the DataFrame to a CSV file
results_filename = "results/comparing_methods_meanresults_byinputs.csv"
mean_result_by_method_userinput.to_csv(results_filename, index=False)  # `index=False` prevents writing row numbers

# Fixed: the message lacked the f prefix and printed the literal "{results_filename}"
print(f"Documents have been saved to {results_filename}")

Documents have been saved to {results_filename}
