In [1]:
import time
import random
import numpy as np
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import CSVLoader
from IPython.display import display, Markdown

import langchain
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.llms import OpenAI
from langchain.evaluation.qa import QAGenerateChain
from langchain.evaluation.qa import QAEvalChain
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from uuid import uuid4

import openai
import os
from dotenv import load_dotenv
import getpass
import logging
import pandas as pd
import json
from datasets import Dataset

# Send debug output to a log file instead of the notebook output area.
# The target directory is created first: the file handler opened by
# basicConfig does not create missing parent directories, so a fresh checkout
# without ./logs would otherwise fail with FileNotFoundError.
os.makedirs('./logs', exist_ok=True)
logging.basicConfig(
    filename='./logs/debug_output.log',  # file that receives the log records
    level=logging.DEBUG,          # lowest level, so every record is captured
    format='%(asctime)s - %(levelname)s - %(message)s'  # layout of each record
)

# getLogger() without a name returns the root logger configured above.
logger = logging.getLogger()

import warnings
warnings.filterwarnings("ignore")

In [2]:
load_dotenv() # load environment variables from a local .env file

# Prompt for the key only when the environment does not already provide a
# non-empty OPENAI_API_KEY, so the key is never written into the notebook.
if not os.getenv("OPENAI_API_KEY"):
   os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [3]:
# Path of the product catalog CSV, relative to the notebook's working directory.
file = 'inputs/OutdoorClothingCatalog_1000_withCategories.csv'
# Read the CSV (decoded as UTF-8) into LangChain documents.
loader = CSVLoader(file_path=file, encoding="utf-8")
docs = loader.load()

In [4]:
# Split the catalog documents into token-bounded chunks
# (chunk_size=200 tokens, chunk_overlap=10 tokens).
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)

# split_documents processes the input list in order, so one call yields the
# same chunks as splitting each document separately and concatenating them.
split_documents = text_splitter.split_documents(docs)

# Plain dictionaries so the chunks can be serialised as JSON.
documents_dict = [
    {"page_content": doc.page_content, "metadata": doc.metadata} for doc in split_documents
]
# Save the list of dictionaries to a JSON file. The handle gets its own name
# so it does not rebind the notebook-level `file` variable.
with open("inputs/documents_split_langchain.json", "w", encoding="utf-8") as json_file:
    json.dump(documents_dict, json_file, indent=4)

logger.debug("Documents have been saved to 'documents_split_langchain.json'")

In [5]:
# Reload the chunks saved by the splitting step. The handle gets its own name
# so it does not rebind the notebook-level `file` variable.
with open("inputs/documents_split_langchain.json", "r", encoding="utf-8") as json_file:
    documents_dict = json.load(json_file)

# Convert the list of dictionaries back to a list of Document objects
documents = [
    Document(page_content=entry["page_content"], metadata=entry["metadata"])
    for entry in documents_dict
]

In [7]:
from functions import openai_embedding_function, hf_embeddings_function, google_embedding_function 
# Generate embeddings for the reloaded chunks with the "text-embedding-3-large" model.
# NOTE(review): the return value is discarded, so this helper (from the local
# `functions` module) presumably persists the embeddings itself — confirm.
openai_embedding_function(documents, model = "text-embedding-3-large")

In [7]:
from functions import load_VectorStore

In [8]:
# Provider label: passed to load_VectorStore (local `functions` module) and
# reused further down to name the result files.
embeddings_provider = 'OpenAI'
vector_store = load_VectorStore(embeddings_provider)

OpenAI db loaded


## Criando as queries para validar os métodos

In [9]:
def generate_and_save_validationexamples(docs, embeddings_provider, llm_model="gpt-4o-mini", select_n_documents=5):
    """Generate question/answer validation examples from randomly sampled documents.

    Args:
        docs: Sequence of documents exposing ``page_content`` and ``metadata``.
        embeddings_provider: Label used to name the output JSON file.
        llm_model: Chat model name handed to ``ChatOpenAI``.
        select_n_documents: Number of documents to sample.

    Returns:
        A list with one dict per sampled document, holding ``'context'`` (the
        sampled document itself), ``'query'`` and ``'ground_truths'``.

    Raises:
        ValueError: Raised by ``random.sample`` when ``select_n_documents``
            exceeds ``len(docs)``.

    Side effects:
        Writes ``results/query_<embeddings_provider>_results.json``.
    """
    # Lazy %-style arguments: the original literal contained unformatted
    # "{...}" placeholders because the f prefix was missing.
    logger.debug(
        "Creating %s validation examples with %s %s",
        select_n_documents, llm_model, embeddings_provider,
    )

    # Initialize the example generation chain
    example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

    # Select `select_n_documents` random documents from `docs`
    random_documents = random.sample(docs, select_n_documents)

    # One {"doc": ...} input per sampled document
    formatted_documents = [{"doc": doc} for doc in random_documents]

    # Apply the example generation chain and parse the results
    gen_examples = example_gen_chain.apply_and_parse(formatted_documents)

    # Pair every generated example with the document it was generated from
    validation_examples = []
    for doc, example in zip(random_documents, gen_examples):
        qa_pair = example.get('qa_pairs', {})
        validation_examples.append({
            'context': doc,  # the original document serves as context
            'query': qa_pair.get('query', ''),
            'ground_truths': qa_pair.get('answer', ''),
        })

    # JSON-friendly copy. The single-element outer list is kept on purpose:
    # it is the layout the other result files in this notebook use.
    documents_dict = [[
        {"context": example['context'].page_content,
         "metadata": example['context'].metadata,
         "ground_truths": example['ground_truths'],
         "query": example['query']}
        for example in validation_examples
    ]]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_results.json"
    with open(results_filename, "w") as file:
        json.dump(documents_dict, file, indent=4)

    logger.debug("Documents have been saved to %s", results_filename)

    return validation_examples

# Build the validation set from 5 randomly sampled catalog documents (the unsplit `docs`).
validation_examples = generate_and_save_validationexamples(docs, embeddings_provider, llm_model="gpt-4o-mini", select_n_documents=5)

# Comparando métodos de RAG

In [10]:
def baseline_similarity_method(validation_examples, vector_store, embeddings_provider):
    """Answer the validation queries with a plain similarity-search RetrievalQA chain.

    Args:
        validation_examples: List of dicts with ``'context'`` (a document
            exposing ``page_content`` and ``metadata``), ``'query'`` and
            ``'ground_truths'``.
        vector_store: Vector store providing ``as_retriever``.
        embeddings_provider: Label used to name the output JSON file.

    Returns:
        A single-element list wrapping the list of result dicts (``context``,
        ``metadata``, ``query``, ``ground_truths``, ``answer``).

    Side effects:
        Writes ``results/query_<embeddings_provider>_baselinemethod_documents_dict_results.json``.
    """
    logger.debug("baseline_similarity_method to answer queries iniciated")

    model = 'gpt-3.5-turbo-instruct'

    # Initialize the language model
    llm = OpenAI(temperature=0.2, model=model)

    # Retrieve only the single most relevant chunk. The number of results is
    # configured through `search_kwargs`; the previous bare `k=1` keyword to
    # as_retriever did not reach the similarity search.
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 1})

    # Initialize the RetrievalQA chain
    qa = RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
        verbose=True,
        chain_type_kwargs={"document_separator": "\n"}
    )

    # Apply the model to the validation examples
    predictions = qa.apply(validation_examples)

    # JSON-friendly results; the single-element outer list matches the layout
    # of the other result files in this notebook.
    baselinemethod_documents_dict = [[
        {"context": prediction['context'].page_content,
         "metadata": prediction['context'].metadata,
         "query": prediction['query'],
         "ground_truths": prediction['ground_truths'],
         "answer": prediction['result']}
        for prediction in predictions
    ]]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_baselinemethod_documents_dict_results.json"
    with open(results_filename, "w") as file:
        json.dump(baselinemethod_documents_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")
    # Lazy %-style argument: the original literal lacked the f prefix and
    # logged the placeholder text verbatim.
    logger.debug("baseline_similarity_method Documents have been saved to %s", results_filename)

    return baselinemethod_documents_dict

# Run the baseline method over the validation set and keep its saved answers.
baselinemethod_documents_dict = baseline_similarity_method(validation_examples, vector_store, embeddings_provider)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Documents have been saved to 'results/query_OpenAI_baselinemethod_documents_dict_results.json'


# Técnica por similaridade incluindo na query categorias referentes à pergunta para a busca textual


In [None]:
def contextualize_query_with_categories(vector_store, validation_examples,embeddings_provider):
    """Answer each query after enriching the search text with LLM-suggested categories.

    For every example, an LLM proposes categories for the query, the query plus
    those categories is used for a similarity search (k=5), and a second LLM
    call answers the query from the retrieved documents.

    Args:
        vector_store: Vector store providing ``similarity_search``.
        validation_examples: List of dicts with ``'context'`` (a document
            exposing ``page_content`` and ``metadata``), ``'query'`` and
            ``'ground_truths'``. The list and its dicts are left unmodified.
        embeddings_provider: Label used to name the output JSON file.

    Returns:
        A single-element list wrapping the list of result dicts (``context``,
        ``metadata``, ``query``, ``ground_truths``, ``answer``).

    Side effects:
        Prints each category suggestion and writes
        ``results/query_<embeddings_provider>_contextualize_query_with_categories_dict_results.json``.
    """
    logger.debug("contextualized_query to answer queries iniciated")

    llm = OpenAI()

    # Define the high-level query template for category search
    category_prompt = """
    You are an expert in outdoor clothing products. Based on the following query, 
    return a list of relevant categories (e.g., jackets, pants) from the catalog.
    Query: {query}
    """
    category_template = PromptTemplate(input_variables=["query"], template=category_prompt)
    category_chain = LLMChain(prompt=category_template, llm=llm)

    # Define the refined search for documents within each category
    document_prompt = """
    You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
    Your function is to use the catalog of retrieved context to answer the client's questions based on the following query and the category {category}.
    Query: {query}
    Category: {category}
    Catalog: {catalog}
    """

    document_template = PromptTemplate(input_variables=["query", "category",'catalog'], template=document_prompt)
    document_chain = LLMChain(prompt=document_template, llm=llm)

    # Answers are collected in fresh dicts. Writing 'result' into the caller's
    # examples (as before) leaked this method's answers into whatever ran next
    # on the same validation set.
    answered_examples = []
    for item in validation_examples:
        query = item['query']
        category_result = category_chain.run({"query": query})

        modified_query = f"{query} Category: {category_result}"  # Combine query and category for search
        catalog = vector_store.similarity_search(modified_query, k=5)

        print(category_result)
        refined_answer = document_chain.run({"query": query, "category": category_result, "catalog": catalog})
        answered_examples.append({**item, 'result': refined_answer})

    # JSON-friendly results; the single-element outer list matches the layout
    # of the other result files in this notebook.
    contextualize_query_with_categories_dict = [[
        {"context": example['context'].page_content,
         "metadata": example['context'].metadata,
         "query": example['query'],
         "ground_truths": example['ground_truths'],
         "answer": example['result'],
         }
        for example in answered_examples
    ]]

    os.makedirs('results', exist_ok=True)

    # Save the list of dictionaries to a JSON file
    results_filename = f"results/query_{embeddings_provider}_contextualize_query_with_categories_dict_results.json"

    with open(results_filename, "w") as file:
        json.dump(contextualize_query_with_categories_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")
    # Lazy %-style argument: the original literal lacked the f prefix and
    # logged the placeholder text verbatim.
    logger.debug("Contextualized query method Documents have been saved to %s", results_filename)

    return contextualize_query_with_categories_dict

# Run the category-augmented query method with the `vector_store` and
# `validation_examples` created in earlier cells.
contextualize_query_with_categories_dict = contextualize_query_with_categories(vector_store, validation_examples,embeddings_provider)



The Timeless Textured Stripe Cashmere Cardigan with Pocket would fall under the following categories:
- Sweaters
- Cardigans
- Cashmere Clothing
- Textured Clothing
- Striped Clothing
- Clothing with Pockets

1. Outdoor furniture
2. Garden benches
3. Patio furniture
4. Wood furniture
5. Weather-resistant furniture
6. Durable furniture
7. Rust-resistant furniture
8. Sturdy furniture
9. Outdoor seating
10. Wooden benches
11. Garden decor
12. Backyard furniture
13. Park benches
14. Commercial outdoor furniture
15. Natural wood furniture

1. Fishing gear
2. Fly rods
3. Outdoor equipment
4. Sportswear
5. Fishing rods
6. Fly fishing accessories
7. Fly fishing outfits
8. Fishing outfits
9. Fishing equipment
10. Fishing accessories

1. Hiking boots
2. Women's footwear
3. Outdoor gear
4. Adventure gear
5. All-terrain shoes
6. Hiking equipment
7. Keen Targhee II
8. Women's shoes
9. Outdoor footwear
10. Hiking essentials

1. Bedding 
2. Quilts 
3. Thermal Quilts 
4. Bedding Dimensions 
5. Bed S

Define a Retrieval Strategy (Hierarchical Retrieval)
Hierarchical retrieval means you will first retrieve high-level categories or subtopics and then refine the search to get more specific documents.

Step 1: Retrieve relevant high-level categories (e.g., product types like "jackets", "pants", etc.)
Step 2: Within each category, retrieve specific documents (e.g., details of specific jackets or pants).
To achieve this, you may use a two-step retrieval process. First, search for broad categories in the vector store, then use the results to refine the search.
    

# Técnica por similaridade incluindo filtro de documentos por categorias similares para a busca textual 

In [None]:
def hierarchical_retrieval_with_categories(vector_store, validation_examples,embeddings_provider):
    """Answer each query from documents retrieved via LLM-suggested categories.

    For every example, an LLM proposes categories for the query, the category
    text alone is used for a similarity search (k=5), and a second LLM call
    answers the query from the retrieved documents.

    Args:
        vector_store: Vector store providing ``similarity_search``.
        validation_examples: List of dicts with ``'context'`` (a document
            exposing ``page_content`` and ``metadata``), ``'query'`` and
            ``'ground_truths'``. The list and its dicts are left unmodified.
        embeddings_provider: Label used to name the output JSON file.

    Returns:
        A single-element list wrapping the list of result dicts (``context``,
        ``metadata``, ``query``, ``ground_truths``, ``answer``). An example for
        which nothing was retrieved gets an empty-string answer.

    Side effects:
        Writes ``results/query_<embeddings_provider>_hierarchicalmethod_documents_dict_results.json``.
    """
    logger.debug("Hierarchical retrieval method to answer queries initiated")

    llm = OpenAI()

    # Define the high-level query template for category search
    category_prompt = """
    You are an expert in outdoor clothing products. Based on the following query, 
    return a list of relevant categories (e.g., jackets, pants) from the catalog.
    Query: {query}
    """
    category_template = PromptTemplate(input_variables=["query"], template=category_prompt)
    category_chain = LLMChain(prompt=category_template, llm=llm)

    # Define the refined search for documents within each category
    document_prompt = """
    You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
    Your function is to use the catalog of retrieved context to answer the client's questions based on the following query.
    Query: {query}
    Catalog: {catalog}
    """

    document_template = PromptTemplate(input_variables=["query", 'catalog'], template=document_prompt)
    document_chain = LLMChain(prompt=document_template, llm=llm)

    # Answers are collected in fresh dicts so every example always carries the
    # answer produced by THIS method. Previously 'result' was written into the
    # caller's dicts and skipped examples kept a stale answer from an earlier
    # method (or raised KeyError when none existed).
    answered_examples = []
    for item in validation_examples:
        query = item['query']
        category_result = category_chain.run({"query": query})

        # Search the vector store with the category text only
        relevant_documents = vector_store.similarity_search(category_result, k=5)

        if relevant_documents:
            # Answer the query from the retrieved documents
            refined_answer = document_chain.run({"query": query, "catalog": relevant_documents})
        else:
            # Nothing retrieved: record an explicit empty answer for this example
            print(f"No relevant documents found for category '{category_result}'")
            refined_answer = ""

        answered_examples.append({**item, 'result': refined_answer})

    # JSON-friendly results; the single-element outer list matches the layout
    # of the other result files in this notebook.
    hierarchicalmethod_documents_dict = [[
        {"context": example['context'].page_content,
         "metadata": example['context'].metadata,
         "query": example['query'],
         "ground_truths": example['ground_truths'],
         "answer": example['result'],
         }
        for example in answered_examples
    ]]

    os.makedirs('results', exist_ok=True)

    # Save the list of dictionaries to a JSON file
    results_filename = f"results/query_{embeddings_provider}_hierarchicalmethod_documents_dict_results.json"

    with open(results_filename, "w") as file:
        json.dump(hierarchicalmethod_documents_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")
    logger.debug("Hierarchical method Documents have been saved to %s", results_filename)

    return hierarchicalmethod_documents_dict


# Run the hierarchical (category-first) retrieval method with the `vector_store`
# and `validation_examples` created in earlier cells.
hierarchicalmethod_documents_dict = hierarchical_retrieval_with_categories(vector_store, validation_examples,embeddings_provider)

Documents have been saved to 'results/query_OpenAI_hierarchicalmethod_documents_dict_results.json'


# Técnica de Recuperação-Para-Geração (RAG) com dois estágios

### Classificação de relevância dos documentos e geração de resposta baseada nos documentos relevantes usando o ChatGroq


In [None]:
# from langchain_core.prompts import ChatPromptTemplate
# from langchain_core.pydantic_v1 import BaseModel, Field
# from langchain_groq import ChatGroq
# from langchain_core.output_parsers import StrOutputParser
# from concurrent.futures import ThreadPoolExecutor, as_completed

# # Data model
# class GradeDocuments(BaseModel):
#     """Ordinal score for relevance check on retrieved documents."""
#     OrdinalScore: int = Field(
#         description="Score between 1 (low) and 5 (high)"
#     )

# # Function to grade a document
# def grade_document(doc, question, llm, grade_prompt):
#     res = grade_prompt.invoke({"question": question, "document": doc.page_content})
#     return doc, res

# # Function to grade and retrieve relevant documents in parallel
# def grade_and_retrieve_documents(docs, question):
#     # LLM with function call
#     llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)
#     structured_llm_grader = llm.with_structured_output(GradeDocuments)

#     # Prompt for grading documents
#     system = """You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
#     Your function is to use the catalog of retrieved context to answer the client's questions based on the question.
#     Give a OrdinalScore score '1 (low) and 5 (high)' to indicate whether the document is relevant to the question."""

#     grade_prompt = ChatPromptTemplate.from_messages(
#         [
#             ("system", system),
#             ("human", "Retrieved document: \n\n {document} \n\n User question: {question}"),
#         ]
#     )

#     retrieval_grader = grade_prompt | structured_llm_grader

#     # Use ThreadPoolExecutor to parallelize document grading
#     docs_to_use = []
#     with ThreadPoolExecutor() as executor:
#         futures = [executor.submit(grade_document, doc, question, llm, retrieval_grader) for doc in docs]
#         for future in as_completed(futures):
#             doc, res = future.result()
#             if res.OrdinalScore > 3:  # Keep documents with score greater than 3
#                 docs_to_use.append(doc)

#     return docs_to_use

# # Function to format documents for the next step
# def format_docs(docs):
#     return "\n".join(f"<doc{i+1}>:\nTitle:{doc.metadata['title']}\nSource:{doc.metadata['source']}\nContent:{doc.page_content}\n</doc{i+1}>\n" for i, doc in enumerate(docs))

# # Function to generate the final response using the relevant documents
# def generate_answer(docs_to_use, question):
#     # Prompt for answering the question based on relevant documents
#     system = """You are an assistant for question-answering tasks. Answer the question based upon your knowledge. 
#     Use three-to-five sentences maximum and keep the answer concise."""

#     prompt = ChatPromptTemplate.from_messages(
#         [
#             ("system", system),
#             ("human", "Retrieved documents: \n\n <docs>{documents}</docs> \n\n User question: <question>{question}</question>"),
#         ]
#     )

#     # LLM for answering the question
#     llm = ChatGroq(model="llama-3.1-8b-instant", temperature=0)

#     # Post-processing to parse the output
#     rag_chain = prompt | llm | StrOutputParser()

#     # Format documents for input
#     formatted_docs = format_docs(docs_to_use)

#     # Generate the answer
#     generation = rag_chain.invoke({"documents": formatted_docs, "question": question})
#     print(generation)

# # Main function to run the entire flow
# def main(docs, question):
#     # Step 1: Grade and retrieve relevant documents
#     docs_to_use = grade_and_retrieve_documents(docs, question)
    
#     # Step 2: Generate the final answer using the relevant documents
#     generate_answer(docs_to_use, question)

# # Example usage
# question = 'Product with sun protection'

# # Run the main function
# main(docs, question)


KeyboardInterrupt: 

In [None]:
# NOTE(review): this cell repeats the definition from the earlier
# "validation examples" cell; whichever cell ran last is the one in effect.
# Consider keeping a single definition.
def generate_and_save_validationexamples(docs, embeddings_provider, llm_model="gpt-4o-mini", select_n_documents=5):
    """Generate question/answer validation examples from randomly sampled documents.

    Args:
        docs: Sequence of documents exposing ``page_content`` and ``metadata``.
        embeddings_provider: Label used to name the output JSON file.
        llm_model: Chat model name handed to ``ChatOpenAI``.
        select_n_documents: Number of documents to sample.

    Returns:
        A list with one dict per sampled document, holding ``'context'`` (the
        sampled document itself), ``'query'`` and ``'ground_truths'``.

    Raises:
        ValueError: Raised by ``random.sample`` when ``select_n_documents``
            exceeds ``len(docs)``.

    Side effects:
        Writes ``results/query_<embeddings_provider>_results.json``.
    """
    # Lazy %-style arguments: the original literal contained unformatted
    # "{...}" placeholders because the f prefix was missing.
    logger.debug(
        "Creating %s validation examples with %s %s",
        select_n_documents, llm_model, embeddings_provider,
    )

    # Initialize the example generation chain
    example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

    # Select `select_n_documents` random documents from `docs`
    random_documents = random.sample(docs, select_n_documents)

    # One {"doc": ...} input per sampled document
    formatted_documents = [{"doc": doc} for doc in random_documents]

    # Apply the example generation chain and parse the results
    gen_examples = example_gen_chain.apply_and_parse(formatted_documents)

    # Pair every generated example with the document it was generated from
    validation_examples = []
    for doc, example in zip(random_documents, gen_examples):
        qa_pair = example.get('qa_pairs', {})
        validation_examples.append({
            'context': doc,  # the original document serves as context
            'query': qa_pair.get('query', ''),
            'ground_truths': qa_pair.get('answer', ''),
        })

    # JSON-friendly copy. The single-element outer list is kept on purpose:
    # it is the layout the other result files in this notebook use.
    documents_dict = [[
        {"context": example['context'].page_content,
         "metadata": example['context'].metadata,
         "ground_truths": example['ground_truths'],
         "query": example['query']}
        for example in validation_examples
    ]]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_results.json"
    with open(results_filename, "w") as file:
        json.dump(documents_dict, file, indent=4)

    logger.debug("Documents have been saved to %s", results_filename)

    return validation_examples

# NOTE(review): this call samples documents again, replacing `validation_examples`
# and overwriting the results/query_<provider>_results.json file written by the
# earlier identical call — confirm that this re-run is intended.
validation_examples = generate_and_save_validationexamples(docs, embeddings_provider, llm_model="gpt-4o-mini", select_n_documents=5)

# Comparing the results 

In [None]:

def validate_results(file_path):
    """Load one method's saved results and reshape them into an evaluation Dataset.

    Args:
        file_path: Path of a JSON file holding a list of lists of result dicts,
            each with ``'query'``, ``'answer'``, ``'context'`` and
            ``'ground_truths'`` keys.

    Returns:
        A ``Dataset`` with the columns ``question``, ``answer``, ``contexts``
        (each entry a one-element list holding the example's context) and
        ``ground_truth``.
    """
    # Read the JSON file
    with open(file_path, "r") as file:
        saved_batches = json.load(file)

    # Flatten the list of lists into a single list of result records
    records = [record for batch in saved_batches for record in batch]

    final_dict = {
        "question": [record['query'] for record in records],
        "answer": [record['answer'] for record in records],
        "contexts": [[record['context']] for record in records],
        # The reference answer comes from 'ground_truths'. It was previously
        # copied from 'answer', so every prediction was compared with itself.
        "ground_truth": [record['ground_truths'] for record in records],
    }

    # Convert the dictionary to a Dataset
    return Dataset.from_dict(final_dict)

# Build one evaluation dataset per retrieval method from its saved results.
file_path_baseline = "results/query_OpenAI_baselinemethod_documents_dict_results.json"
RAG_baseline_results = validate_results(file_path_baseline)

# Load the contextualized method's own file; this call previously re-read the
# baseline file, so the "contextualized" scores were a copy of the baseline.
file_path_contextualized = "results/query_OpenAI_contextualize_query_with_categories_dict_results.json"
RAG_contextualized_results = validate_results(file_path_contextualized)

file_path_hierarchical = "results/query_OpenAI_hierarchicalmethod_documents_dict_results.json"
RAG_hierarchical_results = validate_results(file_path_hierarchical)


In [None]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness, context_precision, context_recall, context_entity_recall

# Metrics shared by the three evaluations. Each metric is listed once;
# answer_correctness used to appear twice in every list and was therefore
# computed twice per sample.
RAGAS_METRICS = [faithfulness, answer_correctness, answer_relevancy,
                 context_precision, context_recall, context_entity_recall,
                 answer_similarity]


def _evaluate_method(dataset, method_label):
    """Score one method's dataset; return the raw result and a DataFrame tagged with `method_label`."""
    metrics_result = evaluate(dataset, metrics=RAGAS_METRICS)
    scores_df = metrics_result.to_pandas()
    scores_df['method'] = method_label
    return metrics_result, scores_df


RAG_baseline_results_metrics, RAG_baseline_results_df = _evaluate_method(
    RAG_baseline_results, 'baseline')

RAG_contextualized_results_metrics, RAG_contextualized_results_df = _evaluate_method(
    RAG_contextualized_results, 'Contextualized')

RAG_hierarchical_results_metrics, RAG_hierarchical_results_df = _evaluate_method(
    RAG_hierarchical_results, 'hierarchical')


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
# Stack the per-sample scores of the three methods into one frame.
comparing_results = pd.concat([RAG_baseline_results_df, RAG_contextualized_results_df, RAG_hierarchical_results_df],axis=0)

# Keep the numeric metric columns plus the method label used for grouping
numeric_comparing_results = pd.concat([comparing_results.select_dtypes(include='number'), comparing_results['method']], axis=1)

# Average every metric per method
mean_result_by_method = numeric_comparing_results.groupby(['method'])\
                                                .agg('mean').reset_index()

# Save the per-method means to a CSV file
results_filename = "results/comparing_methods_meanresults.csv"
mean_result_by_method.to_csv(results_filename, index=False)  # `index=False` prevents writing row numbers

# f-string so the actual path is printed; the literal "{results_filename}"
# was printed before because the f prefix was missing.
print(f"Documents have been saved to {results_filename}")


Documents have been saved to {results_filename}


In [138]:
# Keep the numeric metric columns plus the question text used for grouping
numeric_comparing_results = pd.concat([comparing_results.select_dtypes(include='number'), comparing_results['user_input']], axis=1)

# Average every metric per question, across the compared methods
mean_result_by_method_userinput = numeric_comparing_results.groupby(['user_input'])\
                                                           .agg('mean').reset_index()

# Save the per-question means to a CSV file
results_filename = "results/comparing_methods_meanresults_byinputs.csv"
mean_result_by_method_userinput.to_csv(results_filename, index=False)  # `index=False` prevents writing row numbers

# f-string so the actual path is printed; the literal "{results_filename}"
# was printed before because the f prefix was missing.
print(f"Documents have been saved to {results_filename}")

Documents have been saved to {results_filename}
