In [36]:
import requests
import os
import uuid
from dotenv import load_dotenv
from datetime import datetime, timedelta
import base64
from io import BytesIO

from pydantic import BaseModel, ConfigDict
from minio import Minio
from PIL import Image as PILImage
from langchain_core.documents import Document
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage, HumanMessage
from langchain_community.vectorstores import VectorStore
from unstructured.documents.elements import Image
from playwright.async_api import async_playwright

In [37]:
# Load variables from a .env file into the process environment; the helpers
# below read OPENAI_API_KEY, MINIO_ROOT_USER, MINIO_ROOT_PASSWORD and
# MINIO_BUCKET through os.getenv.
load_dotenv()

True

In [38]:
class ImageMetadata(BaseModel):
    """Result record for one processed image: where it is stored and what it shows."""
    # Link to the stored image (extract_images fills this with the presigned
    # URL returned by minio_put_object).
    url: str
    # One-sentence description of the image produced by summarize_image.
    summary: str
    
class MinioResponse(BaseModel):
    """Describes an object that minio_put_object has uploaded."""
    # Bucket the object was written to (taken from the MINIO_BUCKET env var).
    bucket_name: str
    # Object name inside the bucket (a generated UUID string).
    file_name: str
    # Presigned GET URL for the uploaded object.
    url: str

In [47]:
async def summarize_image(image: PILImage.Image) -> str:
    """
    Uses a powerful LLM to generate a 1-sentence summary of an image.

    The image is serialized to bytes, embedded in the prompt as a base64
    data URL and sent to the model together with a short instruction.

    Args:
        image (PILImage.Image): The image to summarize. When the image has no
            recorded source format (``image.format`` is ``None``, which is the
            case for images created in memory or returned by operations such
            as ``convert()`` / ``resize()``), it is encoded as PNG.

    Returns:
        str: A one-sentence summary of the image.
    """
    llm = ChatOpenAI(
        model="gpt-4o",
        api_key=os.getenv("OPENAI_API_KEY"),
        temperature=0.0,
        streaming=True,
    )

    # `format` is only populated for images loaded from a file/stream; fall
    # back to PNG so derived or in-memory images do not crash on `.upper()`.
    image_format = (image.format or "PNG").upper()

    # Encode the image as base64
    buffered = BytesIO()
    image.save(buffered, format=image_format)

    image_base64 = base64.b64encode(buffered.getvalue()).decode("utf-8")
    image_url_prefix = f"data:image/{image_format.lower()};base64,"

    messages = [
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": "Summarize the following image in one sentence:",
                },
                {
                    "type": "image_url",
                    "image_url": {"url": f"{image_url_prefix}{image_base64}"},
                },
            ],
        )
    ]
    return AIMessage.model_validate(await llm.ainvoke(messages)).content

In [40]:
def get_minio_client(endpoint: str = "localhost:9002", secure: bool = False):
    """
    Builds a MinIO client authenticated with credentials from the environment.

    Args:
        endpoint (str): ``host:port`` of the MinIO server. Defaults to the
            local instance this notebook was written against.
        secure (bool): Passed through to ``Minio(secure=...)``; ``False``
            (the default) means plain HTTP rather than TLS.

    Returns:
        Minio: A client that uses the ``MINIO_ROOT_USER`` and
        ``MINIO_ROOT_PASSWORD`` environment variables as access key and
        secret key.
    """
    return Minio(
        endpoint=endpoint,
        access_key=os.getenv("MINIO_ROOT_USER"),
        secret_key=os.getenv("MINIO_ROOT_PASSWORD"),
        secure=secure,
    )


def minio_presigned_get_object(
    bucket_name: str,
    object_name: str,
    expires: timedelta = timedelta(days=7),
) -> str:
    """
    Creates a presigned GET URL for an object stored in MinIO.

    Args:
        bucket_name (str): Bucket that holds the object.
        object_name (str): Name of the object inside the bucket.
        expires (timedelta): How long the URL stays valid. Defaults to
            7 days, the value that was previously hard-coded.

    Returns:
        str: A URL that grants temporary read access to the object.
    """
    return get_minio_client().presigned_get_object(
        bucket_name,
        object_name,
        expires=expires,
    )


def minio_put_object(file_data: BytesIO, content_type: str) -> MinioResponse:
    """
    Uploads a file to Minio and returns a presigned URL to the file.

    The object is stored in the bucket named by the ``MINIO_BUCKET``
    environment variable under a freshly generated UUID.

    Args:
        file_data (BytesIO): In-memory file to upload. The whole buffer is
            uploaded; its read position is rewound to the start first.
        content_type (str): MIME type recorded for the stored object.

    Returns:
        MinioResponse: Bucket name, generated object name and a presigned
        GET URL for the uploaded object.

    Raises:
        ValueError: If the ``MINIO_BUCKET`` environment variable is not set.
    """
    # Read the bucket once and fail early with a clear message instead of
    # passing None on to the client and the response model.
    bucket_name = os.getenv("MINIO_BUCKET")
    if not bucket_name:
        raise ValueError("MINIO_BUCKET environment variable is not set")

    # `length` below is the size of the entire buffer, so reading must start
    # at offset 0; otherwise a buffer that was just written to (position at
    # the end) would supply fewer bytes than announced.
    file_data.seek(0)

    minio_client = get_minio_client()
    file_id = str(uuid.uuid4())
    minio_client.put_object(
        bucket_name=bucket_name,
        object_name=file_id,
        data=file_data,
        length=file_data.getbuffer().nbytes,
        content_type=content_type,
    )
    return MinioResponse(
        bucket_name=bucket_name,
        file_name=file_id,
        url=minio_presigned_get_object(bucket_name, file_id),
    )

In [41]:
async def scrape_images_from_page(url: str) -> list[str]:
    """
    Collects the image URLs referenced by a web page.

    The page is opened in a headless Chromium instance and the ``src``
    attribute of every ``<img>`` element is inspected. Only values that
    start with ``http`` are kept; duplicates are not removed.

    Args:
        url (str): Address of the page to load.

    Returns:
        list[str]: The matching ``src`` values in document order.
    """
    async with async_playwright() as playwright:
        chromium_browser = await playwright.chromium.launch(headless=True)
        tab = await chromium_browser.new_page()
        await tab.goto(url)

        # Read the `src` attribute of every <img> element, one at a time
        img_elements = await tab.query_selector_all("img")
        raw_sources = [
            await element.get_attribute("src") for element in img_elements
        ]
        # Drop missing/empty values and anything that is not an http(s) URL
        found_urls = [
            source for source in raw_sources if source and source.startswith("http")
        ]

        await chromium_browser.close()

    return found_urls

In [49]:
async def extract_images(
    urls: list[str], timeout: float = 30.0
) -> list[ImageMetadata]:
    """
    Downloads each image, stores it in MinIO and summarizes it with the LLM.

    URLs that cannot be downloaded or decoded as an image are reported on
    stdout and skipped; the remaining URLs are still processed.

    Args:
        urls (list[str]): Image URLs to process.
        timeout (float): Seconds to wait for each HTTP download before that
            URL is given up on and skipped.

    Returns:
        list[ImageMetadata]: One entry per successfully opened image, in
        input order, holding the presigned MinIO URL and the summary.
    """
    res: list[ImageMetadata] = []
    for image_url in urls:
        try:
            # A timeout keeps one unresponsive host from hanging the run, and
            # raise_for_status() keeps HTTP error pages away from the decoder.
            # Reading `.content` consumes the body fully, so no streamed
            # connection is left open.
            response = requests.get(image_url, timeout=timeout)
            response.raise_for_status()
            raw_bytes = response.content
            image = PILImage.open(BytesIO(raw_bytes))
            print(f"Opened image: {image_url}")
        except Exception as exc:
            # Best effort: report the failure and move on to the next URL.
            print(f"Error opening image: {image_url}")
            print(f"Reason: {exc!r}")
            print(f"Skipping image: {image_url}")
            continue

        # Use the MIME type Pillow registers for the detected format instead
        # of labelling every non-JPEG image as PNG.
        image_format = (image.format or "PNG").upper()
        content_type = PILImage.MIME.get(
            image_format, f"image/{image_format.lower()}"
        )

        # Upload the bytes exactly as downloaded; re-encoding through Pillow
        # would recompress lossy formats and may drop metadata.
        minio_response = minio_put_object(BytesIO(raw_bytes), content_type)

        summary = await summarize_image(image)
        res.append(ImageMetadata(url=minio_response.url, summary=summary))
    return res

In [43]:
# Scrape the example blog post and show the raw result. The list holds the
# http(s) `src` values in page order and is not de-duplicated.
images = await scrape_images_from_page(
    url="https://www.whooyeah.com/blog/best-sustainable-fashion-brands-in-2024"
)
print(images)

['https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-1.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-2.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-3.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-4.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-1.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-2.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-3.jpg?format=auto&width=720&quality=90', 'https://prod-image-cdn.whooyeah.

In [51]:
# Keep only the first scraped URL so the run below performs a single
# download, MinIO upload and LLM call (presumably to keep the demo quick
# and cheap).
# NOTE(review): this rebinds `images`; re-run the scraping cell to get the
# full list back.
images = images[:1]
image_metadata = await extract_images(images)
print(image_metadata)

Opened image: https://prod-image-cdn.whooyeah.com/media/best-sustainable-fashion-brands-in-2024/patagonia/img-1.jpg?format=auto&width=720&quality=90
[ImageMetadata(url='http://localhost:9002/fashion-images/6398ccee-e297-4981-ae7a-1c1c1dc96565?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=minioadmin%2F20241024%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241024T135109Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=b3bd90569337505cd656f21ed74a593f8ad40715a356720a32e3d211108f88b3', summary='The image features a red jacket with the text "Worn Wear: a film about the stories we wear" and the Patagonia logo.')]
