In [6]:
import time
import random
import numpy as np
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import CSVLoader
from IPython.display import display, Markdown

import langchain
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.llms import OpenAI
from langchain.evaluation.qa import QAGenerateChain
from langchain.evaluation.qa import QAEvalChain
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from uuid import uuid4

import openai
import os
from dotenv import load_dotenv
import getpass
import logging
import pandas as pd
import json

# Send debug output to a log file instead of the notebook.
# logging.basicConfig opens the file immediately and does not create missing
# parent directories, so make sure ./logs exists first (a fresh checkout would
# otherwise fail here with FileNotFoundError).
os.makedirs('./logs', exist_ok=True)

logging.basicConfig(
    filename='./logs/debug_output.log',  # File the log records are written to
    level=logging.DEBUG,                 # DEBUG captures every record
    format='%(asctime)s - %(levelname)s - %(message)s'  # Record layout
)

# Root logger; the cells below log through it.
logger = logging.getLogger()

# Suppress every Python warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings; consider
# narrowing it to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings("ignore")


In [7]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# When the key is still missing (or empty), ask for it interactively so it is
# never written into the notebook itself.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [8]:
# Read the product catalog CSV through LangChain's CSVLoader.
# `docs` is the unsplit result; later cells split it and sample from it.
file = 'inputs/OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding="utf-8")
docs = loader.load()

In [9]:
# Break every loaded document into token-bounded chunks
# (200 tokens per chunk, 10 tokens of overlap between neighbours).
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)

# split_documents takes the whole list at once and returns the chunks in input
# order, so no per-document loop is needed.
split_documents = text_splitter.split_documents(docs)

# Build the JSON-serialisable view once, after splitting has finished.
# (It used to be rebuilt inside the loop on every iteration, which was
# quadratic work and left the name undefined when `docs` was empty.)
documents_dict = [
    {"page_content": chunk.page_content, "metadata": chunk.metadata}
    for chunk in split_documents
]

# Save the list of dictionaries to a JSON file
with open("inputs/documents_split_langchain.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

logger.debug("Documents have been saved to 'documents_split_langchain.json'")

In [11]:
# Load documents split
with open("inputs/documents_split_langchain.json", "r") as file:
    documents_dict = json.load(file)

# Convert the list of dictionaries back to a list of Document objects
documents = [
    Document(page_content=doc["page_content"], metadata=doc["metadata"])
    for doc in documents_dict
]

In [12]:
from functions import openai_embedding_function, hf_embeddings_function, google_embedding_function 
# Embed the reloaded chunks with OpenAI's "text-embedding-3-large" model.
# NOTE(review): the helper lives in functions.py; presumably it also persists
# the vectors for load_VectorStore to pick up - confirm there.
openai_embedding_function(documents, model="text-embedding-3-large")

In [13]:
from functions import load_VectorStore

In [42]:
# Provider label: passed to load_VectorStore and reused in the results file names below.
embeddings_provider = 'OpenAI'
# Helper from functions.py; presumably opens the store built for that provider - TODO confirm.
vector_store = load_VectorStore(embeddings_provider)

OpenAI db loaded


## Criando as queries para validar os métodos

In [None]:
# Flag whose value is written to the debug log by the baseline RAG cell.
langchain_debug = False

# Hand-written question/answer pairs about the catalog.
# The second query previously contained "\850": "\8" is not a valid Python
# escape sequence, so it triggered a warning and left a stray backslash in the
# question text. The product name is written plainly here.
manual_examples = [
    {
        "query": "Do the Cozy Comfort Pullover Set have side pockets?",
        "answer": "Yes",
    },
    {
        "query": "What collection is the Ultra-Lofty 850 Stretch Down Hooded Jacket from?",
        "answer": "The DownTek collection",
    },
]

# Generating validation examples with generative AI
# (the examples below are produced by an LLM)

# Chat model that writes the synthetic question/answer pairs.
llm_model = "gpt-4o-mini"
example_gen_chain = QAGenerateChain.from_llm(ChatOpenAI(model=llm_model))

# How many catalog documents to turn into validation questions.
select_n_documents = 5

# Draw that many documents at random from the unsplit `docs` list.
# The draw is unseeded, so each run picks a different subset.
random_documents = random.sample(docs, select_n_documents)

# Wrap each sampled document as {"doc": ...}, the input shape handed to the chain.
formatted_documents = [{"doc": sampled} for sampled in random_documents]

# Run the generation chain over every wrapped document and parse its output.
gen_examples = example_gen_chain.apply_and_parse(formatted_documents)

# Pair each generated Q/A with the document it came from, so the original
# document travels along as context. Missing fields fall back to ''.
gen_examples_adjusted_examples = []
for source_doc, generated in zip(random_documents, gen_examples):
    qa_pair = generated.get('qa_pairs', {})
    gen_examples_adjusted_examples.append({
        'context': source_doc,
        'query': qa_pair.get('query', ''),
        'answer': qa_pair.get('answer', ''),
    })

validation_examples = gen_examples_adjusted_examples

# Flatten every validation example into plain JSON-friendly fields.
# The result is deliberately a list holding a single inner list, which is the
# layout the original nested comprehension produced.
documents_dict = [[
    {
        "context": example['context'].page_content,
        "metadata": example['context'].metadata,
        "answer": example['answer'],
        "query": example['query'],
    }
    for example in validation_examples
]]

# Make sure the output folder is there before writing.
os.makedirs('results', exist_ok=True)

# Write the flattened examples to the provider-specific results file.
with open(f"results/query_{embeddings_provider}_results.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

print(f"Documents have been saved to 'query_{embeddings_provider}_results.json'")

# Comparando métodos de RAG

In [None]:
# Simple similarity-based technique: the retriever's hits are joined with
# newlines and passed to the LLM in a single "stuff" prompt.

model = 'gpt-3.5-turbo-instruct'
llm = OpenAI(model=model, temperature=0.2)

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=vector_store.as_retriever(),
    chain_type="stuff",
    chain_type_kwargs={"document_separator": "\n"},
    verbose=True,
)

# Record the current value of the `langchain_debug` flag in the log file.
# Only the flag's value is logged here; nothing in this cell sets
# `langchain.debug` itself.
logger.debug(f"qa_baselineMethod predictions debug {langchain_debug}")

# Run every validation example through the chain. Each returned item carries
# a 'result' key, which the save cell further down reads.
predictions = qa.apply(validation_examples)

# Flatten the validation examples (not the predictions) into JSON-friendly
# fields: a list holding a single inner list.
# NOTE(review): this repeats the save performed right after the examples were
# generated and targets the same file.
documents_dict = [[
    {
        "context": example['context'].page_content,
        "metadata": example['context'].metadata,
        "answer": example['answer'],
        "query": example['query'],
    }
    for example in validation_examples
]]

# Make sure the output folder is there before writing.
os.makedirs('results', exist_ok=True)

# Write the flattened examples to the provider-specific results file.
with open(f"results/query_{embeddings_provider}_results.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

print(f"Documents have been saved to 'query_{embeddings_provider}_results.json'")

In [52]:
# Show the raw list returned by qa.apply(validation_examples).
predictions

[{'context': Document(metadata={'source': 'inputs/OutdoorClothingCatalog_1000.csv', 'row': 644}, page_content=': 644\nname: ShirtFab Classic COTU 2750 Sneakers\ndescription: Inspired by a love for the coast, this Superga fabric sneaker features a nautical stripe. With a woven shirt-fabric upper for cool comfort and timeless style. \r\n\r\nSize & Fit: Please order one-half size down from your regular shoe size. \r\n\r\nWhy We Love It: Made in Italy, Superga’s legendary sneakers have been a staple on city streets and sandy beaches since 1925. \r\n\r\nConstruction: Durable upper made from woven shirt-fabric material. Natural rubber outsole with pebble traction tread. Breathable cotton lining and cushioned insole. Lace-up front with metal eyelets. \r\n\r\nAdditional Features: Superga logo at back heel. Imported.'),
  'query': 'What are the key features and construction details of the ShirtFab Classic COTU 2750 Sneakers as described in the document?',
  'answer': 'The ShirtFab Classic COTU 

In [None]:
# Flatten every prediction into JSON-friendly fields: a list holding a single
# inner list.
# NOTE(review): 'answer' is the reference answer generated with the question,
# while 'ground_truths' receives the chain's own output ('result'). The labels
# look inverted - confirm what the consumer of this file expects.
documents_dict = [[
    {
        "context": prediction['context'].page_content,
        "metadata": prediction['context'].metadata,
        "query": prediction['query'],
        "answer": prediction['answer'],
        "ground_truths": prediction['result'],
    }
    for prediction in predictions
]]

# Make sure the output folder is there before writing.
os.makedirs('results', exist_ok=True)

# Overwrite the provider-specific results file with the predictions.
with open(f"results/query_{embeddings_provider}_results.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

print(f"Documents have been saved to 'query_{embeddings_provider}_results.json'")

In [51]:
# Show the raw list returned by qa.apply(validation_examples).
# NOTE(review): this cell's execution count (51) is lower than the earlier
# display cell's (52), so the cells were run out of order.
predictions

[{'context': Document(metadata={'source': 'inputs/OutdoorClothingCatalog_1000.csv', 'row': 644}, page_content=': 644\nname: ShirtFab Classic COTU 2750 Sneakers\ndescription: Inspired by a love for the coast, this Superga fabric sneaker features a nautical stripe. With a woven shirt-fabric upper for cool comfort and timeless style. \r\n\r\nSize & Fit: Please order one-half size down from your regular shoe size. \r\n\r\nWhy We Love It: Made in Italy, Superga’s legendary sneakers have been a staple on city streets and sandy beaches since 1925. \r\n\r\nConstruction: Durable upper made from woven shirt-fabric material. Natural rubber outsole with pebble traction tread. Breathable cotton lining and cushioned insole. Lace-up front with metal eyelets. \r\n\r\nAdditional Features: Superga logo at back heel. Imported.'),
  'query': 'What are the key features and construction details of the ShirtFab Classic COTU 2750 Sneakers as described in the document?',
  'answer': 'The ShirtFab Classic COTU 