In [None]:
import time
import random
import numpy as np
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import CSVLoader
from IPython.display import display, Markdown

import langchain
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.llms import OpenAI
from langchain.evaluation.qa import QAGenerateChain
from langchain.evaluation.qa import QAEvalChain
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from uuid import uuid4

import openai
import os
from dotenv import load_dotenv
import getpass
import logging
import pandas as pd
import json
from datasets import Dataset

# Configurar o logging para salvar a saída de debug em um arquivo
logging.basicConfig(
    filename='./logs/debug_output.log',  # O arquivo onde os logs serão salvos
    level=logging.DEBUG,          # O nível de log (DEBUG para capturar tudo)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Formato do log
)

logger = logging.getLogger()

import warnings
warnings.filterwarnings("ignore")

In [2]:
load_dotenv() # read local .env file

if not os.getenv("OPENAI_API_KEY"):
   os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [3]:
file = 'inputs/OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding="utf-8")
docs = loader.load()

In [None]:
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)

split_documents = []
for doc in docs:
    split_docs = text_splitter.split_documents([doc])
    split_documents.extend(split_docs)
    
documents_dict = [
    {"page_content": doc.page_content, "metadata": doc.metadata} for doc in split_documents
]
# Save the list of dictionaries to a JSON file
with open("inputs/documents_split_langchain.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

logger.debug("Documents have been saved to 'documents_split_langchain.json'")

In [5]:
# Load documents split
with open("inputs/documents_split_langchain.json", "r") as file:
    documents_dict = json.load(file)

# Convert the list of dictionaries back to a list of Document objects
documents = [
    Document(page_content=doc["page_content"], metadata=doc["metadata"])
    for doc in documents_dict
]

In [6]:
from functions import openai_embedding_function, hf_embeddings_function, google_embedding_function 
# Generating embeddings 
openai_embedding_function(documents, model = "text-embedding-3-large")

In [7]:
from functions import load_VectorStore

In [8]:
embeddings_provider = 'OpenAI'
vector_store = load_VectorStore(embeddings_provider)

OpenAI db loaded


## Criando as querys para validar os metodos

In [None]:
def generate_and_save_validationexamples(docs, embeddings_provider, llm_model="gpt-4o-mini", select_n_documents=5):
    
    logger.debug("Creating {select_n_documents} validation examples with {llm_model} {embeddings_provider}")

    # Initialize the example generation chain
    example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

    # Select 5 random documents from the list `docs`
    random_documents = random.sample(docs, select_n_documents)

    # Format documents for input
    formatted_documents = [{"doc": doc} for doc in random_documents]

    # Apply the example generation chain and parse the results
    gen_examples = example_gen_chain.apply_and_parse(formatted_documents)

    # Adjust the output to include context
    gen_examples_adjusted_examples = []
    for doc, example in zip(random_documents, gen_examples):
        qa_pair = example.get('qa_pairs', {})
        query = qa_pair.get('query', '')
        answer = qa_pair.get('answer', '')

        gen_examples_adjusted_examples.append({
            'context': doc,  # Add the original document as context
            'query': query,
            'ground_truths': answer
        })

    validation_examples = gen_examples_adjusted_examples

    # Prepare the results in the expected format
    documents_dict = [
        [{"context": doc['context'].page_content, 
          "metadata": doc['context'].metadata,
          "ground_truths": doc['ground_truths'], 
          "query": doc['query']}   
         for doc in data]
        for data in [validation_examples]
    ]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_results.json"
    with open(results_filename, "w") as file:
        json.dump(documents_dict, file, indent=4)

    logger.debug("Documents have been saved to {results_filename}")

    return validation_examples

validation_examples = generate_and_save_validationexamples(docs, embeddings_provider, llm_model="gpt-4o-mini", select_n_documents=5)

# Comparando métodos de RAG

In [None]:
def baseline_similarity_method(validation_examples, vector_store, embeddings_provider):

    logger.debug("baseline_similarity_method to answer queries iniciated")

    model = 'gpt-3.5-turbo-instruct'

    # Initialize the language model
    llm = OpenAI(temperature=0.2, model=model)

    # Set up the retriever with similarity search
    retriever = vector_store.as_retriever(search_type="similarity", k=1)  # Retrieve the most relevant chunk

    # Initialize the RetrievalQA chain
    qa = RetrievalQA.from_chain_type(
        llm=llm, 
        chain_type="stuff",
        retriever=retriever, 
        verbose=True,
        chain_type_kwargs={"document_separator": "\n"}
    )

    # Apply the model to the validation examples
    predictions = qa.apply(validation_examples)

    # Prepare the results
    baselinemethod_documents_dict = [
        [{"context": doc['context'].page_content, 
          "metadata": doc['context'].metadata,
          "query": doc['query'],
          "ground_truths": doc['ground_truths'],  
          "answer": doc['result']}
         for doc in data]
        for data in [predictions]
    ]

    # Ensure the results directory exists
    os.makedirs('results', exist_ok=True)

    # Save the results to a JSON file
    results_filename = f"results/query_{embeddings_provider}_baselinemethod_documents_dict_results.json"
    with open(results_filename, "w") as file:
        json.dump(baselinemethod_documents_dict, file, indent=4)

    print(f"Documents have been saved to '{results_filename}'")
    logger.debug("baseline_similarity_method Documents have been saved to {results_filename}")


    return baselinemethod_documents_dict

baselinemethod_documents_dict = baseline_similarity_method(validation_examples, vector_store, embeddings_provider)



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
Documents have been saved to 'results/query_OpenAI_baselinemethod_documents_dict_results.json'


# Técnica por similaridade apenas para os documentos mais relevantes 


In [23]:
def hierarchical_retrieval_for_multiple_queries(vector_store, validation_examples):

    logger.debug("hierarchical_retrievalmethod to answer queries iniciated")

    # Define the high-level query template for category search
    category_prompt = """
    You are an expert in outdoor clothing products. Based on the following query, 
    return a list of relevant categories (e.g., jackets, pants) from the catalog.
    Query: {query}
    """
    category_template = PromptTemplate(input_variables=["query"], template=category_prompt)
    category_chain = LLMChain(prompt=category_template, llm=OpenAI())

    # Define the refined search for documents within each category
    document_prompt = """
    You are an assistant for question-answering tasks about the products from a store that sells outdoor clothing. 
    Your function is to use the catalog of retrieved context to answer the client's questions based on the following query and the category {category}.
    Query: {query}
    Category: {category}
    Catalog: {catalog}
    """

    document_template = PromptTemplate(input_variables=["query", "category",'catalog'], template=document_prompt)
    document_chain = LLMChain(prompt=document_template, llm=OpenAI())

    for index, item in enumerate(validation_examples):
        query = item['query']
        category_result = category_chain.run({"query": query})

        modified_query = f"{query} Category: {category_result}"  # Combine query and category for search
        catalog = vector_store.similarity_search(modified_query, k=5) 

        print(category_result)
        refined_answer = document_chain.run({"query": query, "category": category_result, "catalog": catalog})
        validation_examples[index]['result'] = refined_answer

    hierarchicalmethod_documents_dict = [
        [{"context": doc['context'].page_content, 
        "metadata": doc['context'].metadata,
        "query": doc['query'],
        "ground_truths": doc['ground_truths'],  
        "answer": doc['result'],
        } 
        for doc in data]
        for data in [validation_examples]
    ]

    os.makedirs('results', exist_ok=True)

    # Save the list of dictionaries to a JSON file
    results_filename = f"results/query_{embeddings_provider}_hierarchicalmethod_documents_dict_results.json"

    with open(results_filename, "w") as file:
        json.dump(hierarchicalmethod_documents_dict, file, indent=4)
        

    print(f"Documents have been saved to '{results_filename}'")
    logger.debug("Hierarchical method Documents have been saved to {results_filename}")

    return hierarchicalmethod_documents_dict


# Assuming vector_store is defined as per the previous setup
hierarchicalmethod_documents_dict = hierarchical_retrieval_for_multiple_queries(vector_store, validation_examples)


Some relevant categories for the Women's Chunky Neck Blanket could include:

- Blankets
- Winter Accessories
- Women's Clothing
- Knitwear
- Wool Products

1. Jackets
2. Down Jackets
3. Outdoor Clothing
4. Winter Clothing
5. Insulated Jackets
6. Cold Weather Gear
7. Performance Clothing
8. Temperature Resistant Clothing
9. Outdoor Gear
10. Winter Sports Equipment

1. Hunting apparel
2. Duck hunting gear
3. Outdoor clothing
4. Waterproof jackets
5. Camouflage clothing
6. Hunting accessories
7. Waterfowl hunting gear
8. Duck calls
9. Hunting jackets
10. Hunting pants

1. Shirts
2. Cotton Shirts
3. Chambray Shirts
4. Easy-Care Shirts
5. Modern Shirts
6. Casual Shirts
7. Button-down Shirts
8. Short-sleeve Shirts
9. Lightweight Shirts
10. Comfortable Shirts

1. Jackets
2. Insulated Clothing
3. Outerwear
4. Pullovers
5. Winter Clothing
6. Cold Weather Gear
7. Windproof Clothing
8. Active Wear
9. Outdoor Gear
10. Performance Clothing
Documents have been saved to 'results/query_OpenAI_hierarc

Define a Retrieval Strategy (Hierarchical Retrieval)
Hierarchical retrieval means you will first retrieve high-level categories or subtopics and then refine the search to get more specific documents.

Step 1: Retrieve relevant high-level categories (e.g., product types like "jackets", "pants", etc.)
Step 2: Within each category, retrieve specific documents (e.g., details of specific jackets or pants).
To achieve this, you may use a two-step retrieval process. First, search for broad categories in the vector store, then use the results to refine the search.
    

In [None]:
# Método Hierarchical Retrieval: Busca Estruturada
def hierarchical_retrieval(query, embedding, top_n=3):
    # Categorias hipotéticas
    categories = {
        "Footwear": ["Women's Campside Oxfords", "Running Shoes"],
        "Dog Mats": ["Recycled Waterhog Dog Mat", "Outdoor Dog Mat"],
        "Swimsuits": ["Infant and Toddler Girls' Coastal Chill Swimsuit"]
    }
    
    # Etapa 1: Identificar a categoria relevante com base na consulta
    category_found = None
    if "dog mat" in query.lower():
        category_found = "Dog Mats"
    elif "shoe" in query.lower():
        category_found = "Footwear"
    elif "swimsuit" in query.lower():
        category_found = "Swimsuits"
    
    if not category_found:
        return "Category not found"

    # Etapa 2: Buscar documentos na categoria encontrada
    category_docs = [doc for doc in documents_dict if category_found in doc["page_content"]]
    
    # Etapa 3: Gerar embeddings para os documentos da categoria
    query_embedding = embedding.embed(query)
    category_embeddings = [embedding.embed(doc["page_content"]) for doc in category_docs]
    
    # Calcular similaridade entre o query e os documentos da categoria
    similarities = [
        (i, embedding.similarity(query_embedding, doc_emb)) for i, doc_emb in enumerate(category_embeddings)
    ]
    
    # Ordenar por similaridade e pegar os top N
    similarities.sort(key=lambda x: x[1], reverse=True)
    top_results = [category_docs[i] for i, _ in similarities[:top_n]]
    
    return top_results

# Exemplo de consulta
query = "Tell me about the dog mat"
results_hierarchical = hierarchical_retrieval(query)

for result in results_hierarchical:
    print(f"Document: {result['page_content']}")


# COMMAND ----------

# Comparing the results 

In [None]:

def validate_results(file_path):
    # Read the JSON file
    with open(file_path, "r") as file:
        content = file.read()  # Read the entire content as a string
        baseline_results = json.loads(content)

    # Flatten the list of results and rename to database_result
    database_result = [item for sublist in baseline_results for item in sublist]

    # Initialize lists to store the values
    queries = []  # List for queries
    answer = []  # List for predicted answers
    context = []  # List for context
    ground_truths = []  # List for ground truth answers

    # Populate the lists
    for i, eg in enumerate(database_result):
        queries.append(database_result[i]['query'])
        answer.append(database_result[i]['answer'])
        context.append([database_result[i]['context']])
        ground_truths.append(database_result[i]['answer'])

    # Create the final dictionary
    final_dict = {
        "question": queries,
        "answer": answer,
        "contexts": context,
        "ground_truth": ground_truths
    }

    # Convert the dictionary to a Dataset
    Dataset_results = Dataset.from_dict(final_dict)

    return Dataset_results

# Example usage:
file_path_baseline = "results/query_OpenAI_baselinemethod_documents_dict_results.json"
RAG_baseline_results = validate_results(file_path_baseline)


file_path_hierarchical = "results/query_OpenAI_hierarchicalmethod_documents_dict_results.json"
RAG_hierarchical_results = validate_results(file_path_hierarchical)


In [119]:
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, answer_similarity, answer_correctness, context_precision, context_recall, context_entity_recall

RAG_baseline_results_metrics = evaluate(RAG_baseline_results, metrics=[faithfulness, answer_correctness, answer_relevancy,
                                       context_precision, context_recall, context_entity_recall, 
                                       answer_similarity, answer_correctness])

RAG_baseline_results_df = RAG_baseline_results_metrics.to_pandas()
RAG_baseline_results_df['method']= 'baseline'

RAG_hierarchical_results_metrics = evaluate(RAG_hierarchical_results, metrics=[faithfulness, answer_correctness, answer_relevancy,
                                       context_precision, context_recall, context_entity_recall, 
                                       answer_similarity, answer_correctness])

RAG_hierarchical_results_df = RAG_hierarchical_results_metrics.to_pandas()
RAG_hierarchical_results_df['method']= 'hierarchical'


Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/40 [00:00<?, ?it/s]

In [None]:
comparing_results = pd.concat([RAG_baseline_results_df, RAG_hierarchical_results_df],axis=0)

# Select only numeric columns
numeric_comparing_results = pd.concat([comparing_results.select_dtypes(include='number'), comparing_results['method']], axis=1)

numeric_comparing_results.groupby(['method'])\
                 .agg('mean').reset_index()

Unnamed: 0,user_input,faithfulness,answer_correctness,answer_relevancy,context_precision,context_recall,context_entity_recall,semantic_similarity
0,What are the key features and benefits of the ...,0.725,0.965909,0.970188,1.0,0.833333,0.261111,1.0
1,What are the key features and specifications o...,0.958333,1.0,0.958467,1.0,1.0,0.583333,0.999998
2,What are the key features of the Burnt Oak Qua...,0.857143,1.0,0.969803,1.0,0.9,0.77381,1.0
3,What are the key features of the Easy-Care Cot...,0.947368,1.0,0.975345,1.0,0.75,0.3125,1.0
4,What are the key features of the Women's Chunk...,1.0,1.0,0.953991,1.0,1.0,0.5625,1.0


In [135]:
numeric_comparing_results = pd.concat([comparing_results.select_dtypes(include='number'), comparing_results['user_input']], axis=1)

numeric_comparing_results.groupby(['user_input'])\
                 .agg('mean').reset_index()   

Unnamed: 0,user_input,faithfulness,answer_correctness,answer_relevancy,context_precision,context_recall,context_entity_recall,semantic_similarity
0,What are the key features and benefits of the ...,0.725,0.965909,0.970188,1.0,0.833333,0.261111,1.0
1,What are the key features and specifications o...,0.958333,1.0,0.958467,1.0,1.0,0.583333,0.999998
2,What are the key features of the Burnt Oak Qua...,0.857143,1.0,0.969803,1.0,0.9,0.77381,1.0
3,What are the key features of the Easy-Care Cot...,0.947368,1.0,0.975345,1.0,0.75,0.3125,1.0
4,What are the key features of the Women's Chunk...,1.0,1.0,0.953991,1.0,1.0,0.5625,1.0
