In [1]:
from langchain_core.documents import Document
from langchain_community.vectorstores import FAISS
from langchain_experimental.open_clip import OpenCLIPEmbeddings
import glob
import base64

paths = glob.glob('../images/*.jpeg', recursive=True)

In [2]:
lc_docs = []
def encode_image(path):
    with open(path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode("utf-8")

for path in paths:
    doc = Document(
        page_content=encode_image(path),
        metadata ={
            'source': path
        }
    )
    lc_docs.append(doc)

In [3]:
vector_store = FAISS.from_documents(lc_docs, embedding=OpenCLIPEmbeddings())

In [4]:
retriever = vector_store.as_retriever()

In [5]:
from io import BytesIO
from PIL import Image

def resize_base64_image(base64_string, size=(128, 128)):
    # Decode the Base64 string
    img_data = base64.b64decode(base64_string)
    img = Image.open(BytesIO(img_data))

    # Resize the image
    resized_img = img.resize(size, Image.LANCZOS)

    # Save the resized image to a bytes buffer
    buffered = BytesIO()
    resized_img.save(buffered, format=img.format)

    # Encode the resized image to Base64
    return base64.b64encode(buffered.getvalue()).decode("utf-8")


def is_base64(s):
    try:
        return base64.b64encode(base64.b64decode(s)) == s.encode()
    except Exception:
        return False


def split_image_text_types(docs):
    images = []
    text = []
    for doc in docs:
        doc = doc.page_content  # Extract Document contents
        if is_base64(doc):
            # Resize image to avoid OAI server error
            images.append(
                resize_base64_image(doc)
            )  # base64 encoded str
        else:
            text.append(doc)
    return {"images": images, "texts": text}

In [7]:
from langchain_core.messages import HumanMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI


def prompt_func(data_dict):
    # Joining the context texts into a single string
    formatted_texts = "\n".join(data_dict["context"]["texts"])
    messages = []

    # Adding image(s) to the messages if present
    if data_dict["context"]["images"]:
        image_message = {
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{data_dict['context']['images'][0]}"
            },
        }
        messages.append(image_message)

    # Adding the text message for analysis
    text_message = {
        "type": "text",
        "text": (
            "As an animal lover, your task is to analyze and interpret images of cute animals, "
            "Please use your extensive knowledge and analytical skills to provide a "
            "summary that includes:\n"
            "- A detailed description of the visual elements in the image.\n"
            f"User-provided keywords: {data_dict['question']}\n\n"
            "Text and / or tables:\n"
            f"{formatted_texts}"
        ),
    }
    messages.append(text_message)

    return [HumanMessage(content=messages)]


foundation = ChatOpenAI(temperature=0, model="gpt-4o-mini", max_tokens=1024)

# RAG pipeline
chain = (
    {
        "context": retriever | RunnableLambda(split_image_text_types),
        "question": RunnablePassthrough(),
    }
    | RunnableLambda(prompt_func)
    | foundation
    | StrOutputParser()
)

In [8]:
chain.invoke("rottweiler")

"The image features a Rottweiler, a breed known for its strong build and confident demeanor. Here’s a detailed description of the visual elements:\n\n- **Coloration**: The Rottweiler has a predominantly black coat with distinct tan markings on its face, chest, and legs. The contrast between the black and tan colors is striking and characteristic of the breed.\n\n- **Facial Expression**: The dog has a friendly and approachable expression, with a slightly open mouth that suggests a relaxed demeanor. Its eyes are dark and round, conveying intelligence and warmth.\n\n- **Posture**: The Rottweiler is standing upright, showcasing its muscular build. The stance is confident, with a straight back and a slight tilt of the head, which adds to its alertness.\n\n- **Background**: The background is blurred, featuring soft greens and yellows, which helps to emphasize the dog in the foreground. This natural setting suggests an outdoor environment, possibly a park or a garden.\n\n- **Lighting**: The l

In [9]:
docs = retriever.invoke("rottweiler", k=4)

for doc in docs:
    print(doc.metadata)

{'source': '../images/dog_5.jpeg'}
{'source': '../images/dog_3.jpeg'}
{'source': '../images/cat_3.jpeg'}
{'source': '../images/dog_2.jpeg'}
