In [1]:
import time
import random
import numpy as np
from langchain.chains import RetrievalQA
from langchain.vectorstores import FAISS
from langchain.document_loaders import CSVLoader
from IPython.display import display, Markdown

import langchain
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain.llms import OpenAI
from langchain.evaluation.qa import QAGenerateChain
from langchain.evaluation.qa import QAEvalChain
from langchain.text_splitter import TokenTextSplitter
from langchain.schema import Document
from langchain_chroma import Chroma

from langchain_openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceInferenceAPIEmbeddings
from langchain_google_genai import GoogleGenerativeAIEmbeddings

import openai
import os
from dotenv import load_dotenv
import getpass
import logging
import pandas as pd
import json

# Configurar o logging para salvar a saída de debug em um arquivo
logging.basicConfig(
    filename='./logs/debug_output.log',  # O arquivo onde os logs serão salvos
    level=logging.DEBUG,          # O nível de log (DEBUG para capturar tudo)
    format='%(asctime)s - %(levelname)s - %(message)s'  # Formato do log
)

logger = logging.getLogger()

In [2]:
load_dotenv() # read local .env file

if not os.getenv("OPENAI_API_KEY"):
   os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter your OpenAI API key: ")

In [3]:
file = 'inputs/OutdoorClothingCatalog_1000.csv'
loader = CSVLoader(file_path=file, encoding="utf-8")
docs = loader.load()

In [4]:
text_splitter = TokenTextSplitter(chunk_size=200, chunk_overlap=10)

split_documents = []
for doc in docs:
    split_docs = text_splitter.split_documents([doc])
    split_documents.extend(split_docs)
    
    documents_dict = [
    {"page_content": doc.page_content, "metadata": doc.metadata} for doc in split_documents
]

# Save the list of dictionaries to a JSON file
with open("inputs/documents_split_langchain.json", "w") as file:
    json.dump(documents_dict, file, indent=4)

logger.debug("Documents have been saved to 'documents_split_langchain.json'")

In [5]:
# Load documents split
with open("inputs/documents_split_langchain.json", "r") as file:
    documents_dict = json.load(file)

# Convert the list of dictionaries back to a list of Document objects
documents = [
    Document(page_content=doc["page_content"], metadata=doc["metadata"])
    for doc in documents_dict
]

In [6]:
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suprimir alertas de certificado inseguro
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Fazer requisições HTTPS sem verificar SSL
response = requests.post('https://us.i.posthog.com/batch/', verify=False)


In [None]:
import httpx

httpx.Client(verify=False)


In [14]:
from functions import openai_embedding_function, hf_embeddings_function, google_embedding_function 
# import httpx

# httpx_client = httpx.Client(verify=False) # Sem isso não roda no computador blip 

# Generating embeddings 
openai_embedding_function(documents, model = "text-embedding-3-large")
# hf_embeddings_function(documents, model = "all-MiniLM-l6-v2")

#google_embedding_function(documents, model = "text-embedding-004")


In [16]:

def load_VectorStore(embeddings_provider):
    
    if embeddings_provider == 'HF':
        embeddings = HuggingFaceInferenceAPIEmbeddings(
        api_key=os.getenv('HF_TOKEN'), model_name="sentence-transformers/all-MiniLM-l6-v2"
        )
        directory = "./hf_collection"
        print("HF db loaded")
        
    if embeddings_provider == 'Google':
        embeddings = GoogleGenerativeAIEmbeddings(model="models/text-embedding-004")
        directory = "./google_collection"
        print("Google db loaded")

    if embeddings_provider == 'OpenAI':
        embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
        directory = "./langchain_collection"
        print("OpenAI db loaded")
    
    collection_name = f"langchain_collection_{embeddings_provider}_embeddings"
    
    vector_store = Chroma(
            collection_name=collection_name,
            embedding_function=embeddings,
            persist_directory=directory
        )  
    
    logger.debug("Loading vector store from {collection_name}")   
    
    return vector_store    

In [17]:
vector_store = load_VectorStore('HF')

HF db loaded


In [18]:
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain.indexes import VectorstoreIndexCreator

In [24]:
try:
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",  # Replace with "text-davinci-003" if needed
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello, world!"}
        ],
        max_tokens=5
    )
    print(response["choices"][0]["message"]["content"])
except openai.error.APIConnectionError as e:
    print(f"Connection error: {e}")
except openai.error.OpenAIError as e:
    print(f"OpenAI API error: {e}")

AttributeError: module 'openai' has no attribute 'error'

In [28]:
model='gpt-3.5-turbo'
   
retrieval_results = baseline_retrieval(query, vector_store)
llm = OpenAI(temperature=0, model=model)
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever()
)
response = chain.run(query)
return response

OpenAI db loaded


In [30]:
query = "Do you have shirts with sun protection?"
n_chunks = 5  # Número de documentos mais semelhantes a serem retornados

# Busca por similaridade
retrieval_baseline_results = vector_store.similarity_search(query, k=n_chunks)

# Exibir os resultados
for idx, result in enumerate(retrieval_baseline_results, 1):
    print(f"Resultado {idx}:")
    print(f"Documento: {result.page_content}")
    print(f"Metadados: {result.metadata}")
    print("---")

APIConnectionError: Connection error.

In [23]:
model='gpt-3.5-turbo'
   
query('')
k = 2
retrieval_results = vector_store.similarity_search(query, k=n)
llm = OpenAI(temperature=0, model=model)
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vector_store.as_retriever()
)
response = chain.run(query)
response

NameError: name 'query' is not defined