In [1]:
import json
import uuid
from datetime import datetime

from langchain_core.documents import Document
from langchain_core.messages import AIMessage
from langchain_community.vectorstores import VectorStore
from unstructured.documents.elements import Image

# from common.utils.minio import minio_put_object
from common.utils.unstructured_io import partition_web_page
from common.schemas.vector_metadata import VectorMetadata
from common.db.vector_store import ChromaVectorStore
from common.config.base_config import BaseConfig

In [2]:
# Path handed to the config loader; it is a relative path, kept in one named place.
CONFIG_PATH = "config.yml"
config = BaseConfig.from_yaml(CONFIG_PATH)

In [3]:
# Build the Chroma wrapper from the loaded config, then keep the store object
# it exposes through its `vector_store` attribute.
chroma_store = ChromaVectorStore.from_config(config)
vector_store = chroma_store.vector_store

In [4]:
def save_tavily_res_to_vector_db(
    tavily_res: AIMessage,
    vector_store: VectorStore,
    content_summary: str = "Overview of spring/summer fashion trends for 2023",
    relevance_score: float = 0.85,
) -> None:
    """
    Store the first search result carried by ``tavily_res`` as a single document.

    Only ``tavily_res.content[0]`` is read. It must be subscriptable with the
    keys ``"content"`` (the text that becomes the document body) and ``"url"``.
    Any further entries in ``tavily_res.content`` are ignored.
    NOTE(review): confirm that dropping the remaining results is intended.

    Args:
        tavily_res: Message whose ``content`` is a non-empty list of result dicts.
        vector_store: Destination store; receives one document via ``add_documents``.
        content_summary: Value written to the ``content_summary`` metadata field.
            The default is the placeholder text that used to be hard-coded here,
            so existing calls behave as before.
        relevance_score: Value written to the ``relevance_score`` metadata field.
            The default is the placeholder score that used to be hard-coded here.

    Raises:
        IndexError: If ``tavily_res.content`` is empty.
        KeyError: If the first result has no ``"content"`` or ``"url"`` key.
    """
    # One fresh UUID serves as both the metadata chunk id and the document id.
    chunk_id = str(uuid.uuid4())

    first_result = tavily_res.content[0]
    # TODO: Find a better way to break the content down into finer chunks
    search_msg = first_result["content"]
    search_url = first_result["url"]

    metadata = VectorMetadata(
        # NOTE(review): the result text is stored under `query`, not the
        # search query that produced it. Confirm this is what is wanted.
        query=search_msg,
        url=search_url,
        image_urls=extract_tavily_res_content(search_url),
        chunk_id=chunk_id,
        timestamp=datetime.now().isoformat(),  # naive local time, no timezone
        source_type="web_page",
        content_summary=content_summary,
        relevance_score=relevance_score,
    ).model_dump(mode="json")
    print(f"Metadata: {metadata}")

    doc = Document(
        page_content=search_msg,
        metadata=metadata,
        id=chunk_id,
    )

    # The id is passed explicitly as well as being set on the document.
    vector_store.add_documents(documents=[doc], ids=[chunk_id])


def extract_tavily_res_content(url: str) -> str:
    """
    Partition the web page at ``url`` and report its image elements.

    Returns a comma-delimited string holding one URL per ``Image`` element
    found, or an empty string when there are none. The Minio upload is not
    wired in yet, so every entry is currently the fixed placeholder
    ``http://test_url.com`` rather than a presigned URL.
    """
    page_parts = partition_web_page(url)
    # TODO: upload each image with
    #       minio_put_object(element.get_content(), element.mime_type)
    #       and collect the real URL instead of the placeholder.
    # TODO: handle other element types
    image_urls = [
        "http://test_url.com" for part in page_parts if isinstance(part, Image)
    ]
    return ",".join(image_urls)

In [5]:
# Hand-built stand-in for a search response: a single result dict carrying
# the two keys ("content" and "url") that the save function reads.
sample_result = {"content": "test", "url": "https://google.com"}
res = AIMessage(content=[sample_result])

In [6]:
# Smoke-test call: prints the metadata dict and adds one document to the store.
save_tavily_res_to_vector_db(tavily_res=res, vector_store=vector_store)

Metadata: {'query': 'test', 'url': 'https://google.com', 'image_urls': '', 'chunk_id': '04006a4d-9b01-455c-b7dc-473fe3495215', 'timestamp': '2024-10-05T12:06:55.047245', 'source_type': 'web_page', 'content_summary': 'Overview of spring/summer fashion trends for 2023', 'relevance_score': 0.85}
