In [130]:
import os
import requests
from dotenv import load_dotenv
import base64
from io import BytesIO

from pydantic import BaseModel
from PIL import Image as PILImage, UnidentifiedImageError

from langchain_openai import ChatOpenAI
from langchain_core.documents import Document
from langchain_postgres import PGVector
from langchain_core.messages import HumanMessage
from langchain_postgres.vectorstores import PGVector
from langchain_openai import OpenAIEmbeddings
from unstructured.documents.elements import Image
from playwright.async_api import async_playwright

from common.utils.unstructured_io import partition_web_page
from common.schemas.image_metadata import ImageMetadata

In [131]:
# Load variables from a .env file into the environment; OPENAI_API_KEY is read
# from the environment when the chat model is configured.
load_dotenv()

True

In [132]:
# Sample article page passed to the image extraction call later in the notebook.
test_url = "https://www.marieclaire.com/fashion/summer-2024-accessory-trends/"

In [133]:
async def scrape_images_from_page(url: str) -> list[str]:
    """
    Collect the absolute ``src`` URLs of the ``<img>`` elements on a web page.

    The page is opened in a headless Chromium instance driven by Playwright.

    Args:
        url: Address of the web page to load.

    Returns:
        Every ``src`` attribute value that starts with ``http``. Images with
        no ``src``, or with a relative or ``data:`` source, are skipped.

    Raises:
        Any Playwright error raised while launching the browser, navigating
        to ``url`` or querying the DOM is propagated to the caller.
    """
    image_urls: list[str] = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            await page.goto(url)

            # Scrape all img tags and get their 'src' attributes
            for img in await page.query_selector_all("img"):
                src = await img.get_attribute("src")
                if src and src.startswith("http"):
                    image_urls.append(src)
        finally:
            # Close the browser even when navigation or scraping fails, so a
            # bad URL does not leave the Chromium instance open.
            await browser.close()

    return image_urls

In [134]:
# Chat model used to summarize images; the API key comes from the environment.
llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    streaming=True,
)
# Embedding model handed to the vector store.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
collection_name = "vector_index"
# Uses psycopg3!
# The local-development connection string is only a default: set the
# PGVECTOR_CONNECTION environment variable to point at another database
# without writing its credentials into the notebook.
# NOTE(review): collection_name is also used as the database name in the
# default URL — confirm that is intentional.
connection = os.getenv(
    "PGVECTOR_CONNECTION",
    f"postgresql+psycopg://postgres:postgres@localhost:5432/{collection_name}",
)

In [135]:
def summarize_image(image: PILImage.Image) -> str:
    """
    Uses a powerful LLM to generate a 1-sentence summary of an image.

    The image is re-encoded in its own format, embedded in a base64 ``data:``
    URL and sent to the module-level ``llm`` together with a text prompt.

    Args:
        image: The PIL image to summarize. Its ``format`` attribute selects
            the encoding; when it is ``None`` the image is encoded as PNG.

    Returns:
        The text of the first generation returned by the LLM.
    """
    # PIL sets `format` to None for images that were not loaded from a file
    # (e.g. created in memory or produced by a transform); calling .upper()
    # on it would raise AttributeError, so fall back to PNG.
    image_format = (image.format or "PNG").upper()

    # Encode the image as base64
    buffered = BytesIO()
    image.save(buffered, format=image_format)

    image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    image_url_prefix = f"data:image/{image_format.lower()};base64,"

    messages = [
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": "Give a concise, 1 sentence summary of this image.",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"{image_url_prefix}{image_base64}"},
                },
            ],
        )
    ]
    # generate() takes a batch of conversations; we send one and read its first candidate
    response = llm.generate([messages])
    return response.generations[0][0].text

In [136]:
async def extract_tavily_res_content(url: str, timeout: float = 10.0) -> list[ImageMetadata]:
    """
    Scrapes the images of a web page and generates a summary for each one.

    Every image URL found on the page is downloaded, decoded with PIL and
    summarized with ``summarize_image``. Images that cannot be downloaded or
    decoded are reported with a warning and skipped.

    Args:
        url: Address of the web page (e.g. a Tavily search result) to process.
        timeout: Seconds to wait for each image download before giving up.

    Returns:
        One ``ImageMetadata`` (image URL plus summary) per image that was
        processed successfully; an empty list when scraping the page fails.

    NOTE(review): the downloads and the LLM call are blocking and run inside
    this coroutine, so they block the event loop while they execute.
    """
    try:
        # TODO: Is there a better way to scrape images from the web page?
        image_urls = await scrape_images_from_page(url)
    except Exception as e:
        print(f"Error scraping images from page: {e}")
        return []
    res: list[ImageMetadata] = []
    for image_url in image_urls:
        print(f"Image URL: {image_url}")
        # TODO: store the downloaded image in Minio
        # TODO: Replace requests with httpx
        try:
            image_response = requests.get(image_url, timeout=timeout)
            # Treat HTTP error statuses as download failures instead of
            # handing an error page to the image decoder.
            image_response.raise_for_status()
            # Decode from the fully downloaded body: `.content` has any
            # Content-Encoding already undone, unlike the raw socket stream.
            image = PILImage.open(BytesIO(image_response.content))
            res.append(ImageMetadata(url=image_url, summary=summarize_image(image)))
        except requests.RequestException as e:
            # One unreachable image must not abort the remaining ones
            print(f"WARNING: Could not download image: {image_url} ({e}), continuing")
        except UnidentifiedImageError:
            print(f"WARNING: Could not process unidentified image: {image_url}, continuing")
        # TODO: handle other element types
    return res

In [137]:
# Top-level await: this relies on the notebook kernel's running event loop.
# Scrapes the test page and summarizes every image found on it.
extracted_images = await extract_tavily_res_content(test_url)
print(extracted_images)

Image URL: https://cdn.mos.cms.futurecdn.net/q3n9aGDnBLGfz9MYqTGL4S-415-80.png
Image URL: https://vanilla.futurecdn.net/cyclingnews/media/img/missing-image.svg
Image URL: https://cdn.mos.cms.futurecdn.net/zpas7TrADNt8gJD7AVTRWg-320-80.png
Image URL: https://cdn.mos.cms.futurecdn.net/REj4A72zyxNPQpvWBQRxrR-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/33FYMVie2NxVtCAnMb9LN6-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/kt3kkTbqkQCuqzF2XbTEJM-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/ySjEwgji5XguYLEs9EJ2Qo-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/r2hSR789c6HeMF9FCGukKG-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/VjK8fGbdRZoRhEG5jyXYZA-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/PaEkjf5KybHVHwq5x7NdAk-320-80.png
Image URL: https://cdn.mos.cms.futurecdn.net/7uHygrVzriAP72QEs2N5mQ-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/UfzaQE4EjNzb8ojnqwbuLL-320-80.jpg
Image URL: https://cdn.mos.cms.futurecdn.net/546Cq

In [138]:
# Postgres-backed vector store built from the embedding model, collection name
# and connection string configured earlier.
# use_jsonb=True presumably stores document metadata as JSONB — TODO confirm.
vector_store = PGVector(
    embeddings=embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [143]:
class ImageMetadata(BaseModel):
    """
    An image URL paired with a text summary of that image.

    NOTE(review): this redefines, and from this cell on shadows, the
    `ImageMetadata` imported from `common.schemas.image_metadata` in the
    imports cell — confirm whether this local copy is still needed.
    """

    # Address the image was found at
    url: str
    # Text summary of the image
    summary: str

In [144]:
# PGVector supports complex metadata!
docs = [
    Document(
        page_content="Hello world!",
        metadata={
            "image_metadata": [
                extracted_image.model_dump() for extracted_image in extracted_images
            ]
        },
    ),
]

In [145]:
# Embed and insert the documents; the cell output is the list of ids returned.
# NOTE(review): no explicit ids are passed, and the similarity search output
# further down shows several identical "Hello world!" documents, so re-running
# this cell appears to insert another copy each time — confirm and consider
# passing stable ids.
vector_store.add_documents(docs)

['5a12dbeb-3c9a-49d9-a92c-72f9a183f7dd']

In [147]:
# Retrieve up to 10 stored documents most similar to the query text.
vector_store.similarity_search("Hello world!", k=10)

[Document(id='888da605-f5c0-4504-8d15-fc23f1412a1f', metadata={'image_metadata': [{'url': 'test', 'summary': 'test2'}]}, page_content='Hello world!'),
 Document(id='ed5dcb6b-57ee-40ad-a048-0a8da62cf6c0', metadata={'image_metadata': [{'url': 'test', 'summary': 'test2'}]}, page_content='Hello world!'),
 Document(id='5a12dbeb-3c9a-49d9-a92c-72f9a183f7dd', metadata={'image_metadata': [{'url': 'https://cdn.mos.cms.futurecdn.net/q3n9aGDnBLGfz9MYqTGL4S-415-80.png', 'summary': 'The image showcases a collage of diverse fashion styles and outfits.'}, {'url': 'https://cdn.mos.cms.futurecdn.net/zpas7TrADNt8gJD7AVTRWg-320-80.png', 'summary': 'The image showcases a collage of various fashion outfits, including tailored blazers, a sheer top, and a colorful, ruffled dress.'}, {'url': 'https://cdn.mos.cms.futurecdn.net/REj4A72zyxNPQpvWBQRxrR-320-80.jpg', 'summary': 'The image shows a black leather belt with a silver buckle.'}, {'url': 'https://cdn.mos.cms.futurecdn.net/33FYMVie2NxVtCAnMb9LN6-320-80.jpg