In [None]:
!pip install -q apify-client langchain langchain-openai python-dotenv langchain-community

In [None]:
from langchain_community.utilities import ApifyWrapper

In [None]:
from langchain_community.document_loaders import ApifyDatasetLoader
from langchain_core.documents import Document

In [None]:
from langchain.indexes import VectorstoreIndexCreator
from langchain_community.utilities import ApifyWrapper
from langchain_core.documents import Document
from langchain_openai import OpenAI
from langchain_openai.embeddings import OpenAIEmbeddings

In [None]:
import os 
from dotenv import find_dotenv, load_dotenv
# Load variables from the nearest .env file into the process environment.
load_dotenv(find_dotenv())
# Print True only when a non-empty APIFY_API_TOKEN is configured. `.get` is used
# so that a missing variable prints False instead of aborting with a KeyError.
print(bool(os.environ.get("APIFY_API_TOKEN")))

In [None]:
from apify_client import ApifyClient

# Create an Apify client authenticated with the token from the environment.
client = ApifyClient(os.environ["APIFY_API_TOKEN"])

# Fetch the dataset listing and print each dataset's name followed by its id.
datasets = client.datasets().list()
for dataset in datasets.items:
    print(f"{dataset['name']} {dataset['id']}")


In [None]:
# Pin the dataset to read from; the loader cells read this id back from the environment.
os.environ["APIFY_DATASET_ID"] = "XcGo2ENbdpb9shDvz"

# Map every dataset item to a Document: the title (or "" when falsy) becomes the
# text and the item's URL is recorded as its source.
loader = ApifyDatasetLoader(
    dataset_id=os.environ["APIFY_DATASET_ID"],
    dataset_mapping_function=lambda item: Document(
        page_content=item["title"] if item["title"] else "",
        metadata={"source": item["url"]},
    ),
)
loader.load()

In [None]:
from langchain.document_loaders import ApifyDatasetLoader
from langchain.schema import Document

def process_item(product):
    """
    Convert one scraped product record into a ``Document``.

    Records without a truthy ``title`` or a truthy ``price["value"]`` are
    rejected. Fields that are missing *or* explicitly ``None`` fall back to the
    defaults used below, so neither the metadata nor the text carries ``None``.

    Args:
        product (dict): A dictionary containing product data.

    Returns:
        Document | None: A document whose ``metadata`` holds the structured
        fields and whose ``page_content`` holds the free-text summary, or
        ``None`` if the record is invalid or processing raised an exception.
    """
    try:
        # `or {}` also covers keys that are present but set to None, which a
        # plain `.get(key, {})` default does not.
        price = product.get("price") or {}
        seller = product.get("seller") or {}
        shipping = product.get("shippingDetails") or {}

        # Ensure essential fields are present
        title = product.get("title")
        if not title or not price.get("value"):
            return None

        # Accept either a list of segments or an already-joined string:
        # str.join over a plain string would put " > " between its characters.
        breadcrumbs = product.get("breadCrumbs") or []
        if isinstance(breadcrumbs, str):
            category = breadcrumbs
        else:
            category = " > ".join(str(part) for part in breadcrumbs)

        # Extract metadata
        metadata = {
            "title": title,
            "price": float(price.get("value")),  # Ensure numeric price
            "currency": price.get("currency") or "USD",  # Default to USD
            "brand": product.get("brand") or "Unknown",  # Default brand
            "stars": product.get("stars") or 0.0,  # Default to 0 stars
            "reviews_count": product.get("reviewsCount") or 0,  # Default to 0 reviews
            "category": category,  # Flattened breadcrumbs
            "in_stock": product.get("inStock") or False,  # Default to False
            # "return_policy": product.get("returnPolicy", "No policy specified"),
            "seller": seller.get("name") or "Unknown seller",
            "shipping_cost": shipping.get("cost") or 0,  # Default to 0
            "source": product.get("url") or "",
            "thumbnail_url": product.get("thumbnailImage") or "",
        }

        # Generate free text for page_content
        page_content = f"""
        Title: {title}
        Description: {product.get("description") or "No description available"}
        Features: {product.get("features") or "No feature descriptions available"}
        Attributes: {product.get("attributes") or "No attribute descriptions available"}
        Reviews: {product.get("reviewsText") or "No reviews available."}
        Seller: {metadata["seller"]}
        """
        # Return Policy: {metadata["return_policy"]}

        # Strip unnecessary whitespace
        page_content = page_content.strip()

        return Document(metadata=metadata, page_content=page_content)

    except Exception as e:
        # Best effort: report the bad record and skip it instead of aborting the load.
        print(f"Error processing product: {e}")
        return None


# Build a loader that runs every dataset item through `process_item`.
loader = ApifyDatasetLoader(
    dataset_id=os.environ["APIFY_DATASET_ID"], dataset_mapping_function=process_item
)

# Fetch the dataset and map it; the mapped results are displayed as the cell output.
documents = loader.load()
documents

In [None]:
!pip install -q chromadb langchain-chroma

In [None]:
from langchain.vectorstores import Chroma 

# Directory passed to Chroma as its persistence location.
persist_directory = "embedding/chroma"
# Embedding model shared by the vector-store cells below.
embedding = OpenAIEmbeddings()

def filter_complex_metadata(document, allowed_types=Document, allowed_value_types=None):
    """
    Keep only valid documents and give each one a fresh metadata dict.

    Args:
        document: Iterable of candidate documents (the singular name is kept
            for backward compatibility). Entries that are not instances of
            ``allowed_types`` (for example ``None``) are skipped.
        allowed_types: Class, or tuple of classes, an entry must be an
            instance of to be kept.
        allowed_value_types: Optional class or tuple of classes. When given,
            metadata entries whose value is not an instance of it are dropped.
            The default ``None`` keeps every metadata entry unchanged.

    Returns:
        list: The kept documents in their original order. Each kept document's
        ``metadata`` attribute is replaced in place by a new dict.
    """
    filtered_documents = []
    # Iterate over the argument itself rather than a notebook-level global, so
    # the function works on whatever collection the caller passes in.
    for candidate in document:
        if not isinstance(candidate, allowed_types):
            continue
        candidate.metadata = {
            key: value
            for key, value in candidate.metadata.items()
            if allowed_value_types is None or isinstance(value, allowed_value_types)
        }
        filtered_documents.append(candidate)
    return filtered_documents
# Keep only the entries of `documents` accepted by `filter_complex_metadata`.
filtered_documents = filter_complex_metadata(documents)

# The Chroma database itself is created in the cells that follow.

In [None]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# Open the Chroma collection persisted under `persist_directory`.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding)
# On a fresh run nothing has been persisted yet, so the query below would run
# against an empty collection; ingest the prepared documents once in that case.
if filtered_documents and not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(filtered_documents)

index = VectorStoreIndexWrapper(vectorstore=vectorstore)
query = "Which keyboards has USB-C charging?"
result = index.query_with_sources(query, llm=OpenAI())
print(result["answer"])
print(result["sources"])

In [None]:
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
question = "Which keyboard has USB-C charging?"
# Reopen the persisted collection and ingest only when it is still empty.
# Calling Chroma.from_documents against the same persist_directory on every run
# re-embeds and appends the same documents again, so retrieval returns duplicates.
# Delete the persist directory to force a rebuild from `filtered_documents`.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding)
if filtered_documents and not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(filtered_documents)
retriever_kwargs = {}  # extra keyword arguments forwarded to as_retriever
kwargs = {}  # extra keyword arguments forwarded to from_chain_type
llm = OpenAI()
chain = RetrievalQAWithSourcesChain.from_chain_type(
    llm, retriever=vectorstore.as_retriever(**retriever_kwargs), **kwargs
)
chain.invoke({chain.question_key: question})

In [None]:
from langchain.indexes.vectorstore import VectorStoreIndexWrapper
# Reopen the persisted collection instead of calling Chroma.from_documents again:
# re-ingesting into the same persist_directory appends the same documents once
# more, so retrieval would return duplicates. Ingest only when it is still empty.
vectorstore = Chroma(persist_directory=persist_directory, embedding_function=embedding)
if filtered_documents and not vectorstore.get(limit=1)["ids"]:
    vectorstore.add_documents(filtered_documents)

index = VectorStoreIndexWrapper(vectorstore=vectorstore)
query = "Which keyboard has USB-C charging?"
result = index.query_with_sources(query, llm=OpenAI())
print(result["answer"])
print(result["sources"])

In [None]:
# Similarity search with an empty query, restricted by a metadata filter to a
# single product URL; the titles of the hits are shown as the cell output.
products = vectorstore.search(
    "",
    search_type="similarity",
    filter={"source": "https://www.amazon.com/dp/B0CNT6GP41"},
    k=5,
)
[match.metadata['title'] for match in products]

In [None]:
# Build an index through VectorstoreIndexCreator, backed by Chroma and using
# the same persist directory, then ask the same question.
index_creator = VectorstoreIndexCreator(
    embedding=OpenAIEmbeddings(),
    vectorstore_cls=Chroma,
    vectorstore_kwargs={"persist_directory": persist_directory},
)
index = index_creator.from_documents(documents=filtered_documents)

query = "Which keyboard has USB-C charging?"
result = index.query_with_sources(query, llm=OpenAI())
print(result["answer"])
print(result["sources"])

In [None]:
!pip install -q streamlit
