In [198]:
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnableLambda
import base64
from PIL import Image
import io
import os

In [199]:
def encode_image(image_path, max_size=(512, 512), quality=80):
    """Load an image, shrink it, and return it as a base64-encoded JPEG string.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        max_size: ``(width, height)`` bounding box passed to ``Image.thumbnail``.
        quality: JPEG quality setting passed to ``Image.save``.

    Returns:
        The JPEG bytes encoded as a UTF-8 base64 string (no ``data:`` prefix).
    """
    # The context manager releases the underlying file handle even when
    # resizing or saving fails; the original never closed the image.
    with Image.open(image_path) as image:
        # Resize the image in place so it fits inside max_size.
        image.thumbnail(max_size)

        # JPEG cannot store alpha or palette data: saving e.g. an RGBA or P
        # image (typical for PNG/GIF inputs) raises OSError. Convert those to
        # RGB first; modes JPEG already handles are left untouched.
        jpeg_ready = image
        if image.mode not in ("L", "RGB", "CMYK"):
            jpeg_ready = image.convert("RGB")

        # Serialize to an in-memory buffer with JPEG compression.
        buffer = io.BytesIO()
        jpeg_ready.save(buffer, format="JPEG", quality=quality)

    # Encode the compressed bytes as base64 text.
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [200]:
def get_vision_system_message():
    """Return the system prompt used when asking for a free-form image description.

    The text starts and ends with a newline, one instruction per line.
    """
    prompt_lines = (
        "You're an expert image and photo analyzer.",
        "You are very perceptive in analyzing images and photos.",
        "You possess excelent vision.",
        "Do not read any text unless it is the most prominent in the image.",
        "Your description should be neutral in tone.",
    )
    # Wrap with a leading and trailing newline to match the prompt's layout.
    return "\n" + "\n".join(prompt_lines) + "\n"

In [201]:
def get_object_system_message():
    """Return the system prompt used when asking for a JSON list of detected objects.

    The prompt ends with an example of the expected output. That example is
    written as valid JSON (double-quoted keys and values): the model tends to
    imitate the example, and the reply is fed to a JSON output parser, which
    rejects single-quoted pseudo-JSON.

    Returns:
        The prompt text, starting and ending with a newline.
    """
    system_message_text = '''
You're an expert image and photo analyzer.
You are very perceptive in analyzing images and photos.
You possess excelent vision.
Do not read any text unless it is the most prominent in the image.
You should always output your results in json format, for example:

[
 {"name": "a detected object", "description": "the detected object's description"},
 {"name": "another detected object", "description": "the other detected object's description"}
]
'''
    return system_message_text

In [202]:
def prompt_func(data):
    """Build the ``[system, human]`` message pair for one multimodal request.

    Args:
        data: Mapping with the keys ``"text"`` (user instruction),
            ``"image"`` (base64-encoded JPEG payload) and
            ``"system_message_text"`` (system prompt).

    Returns:
        A two-element list: the ``SystemMessage`` followed by a
        ``HumanMessage`` whose content is the image part, then the text part.
    """
    prompt_text, image_b64 = data["text"], data["image"]
    system_message = SystemMessage(content=data["system_message_text"])

    # The image is passed inline as a data URL, ahead of the text instruction.
    human_message = HumanMessage(
        content=[
            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{image_b64}"},
            {"type": "text", "text": prompt_text},
        ]
    )

    return [system_message, human_message]

In [203]:
def analyze_image(image_file, vision_chain, object_chain):
    """Run object detection and free-form description on one image file.

    Progress is reported on stdout. The object chain is invoked first, then
    the vision chain, both with the same base64-encoded image.

    Args:
        image_file: Path of the image, forwarded to ``encode_image``.
        vision_chain: Object with ``invoke``; produces the description.
        object_chain: Object with ``invoke``; produces the detected objects.

    Returns:
        A dict with the keys ``"image_description"`` and ``"detected_objects"``
        holding whatever the respective chains returned.
    """
    object_prompt = "Identify objects in the image. Return a json list of json items of the detected objects. Include only the names of each object and a short description of the object. The field names should be 'name' and 'description' respectively."
    description_prompt = "Describe the image in as much detail as possible. Do not try to read any text."

    print(f"PROCESSING FILE: {image_file}")
    print("CONVERTING TO B64...")
    image_b64 = encode_image(image_file)
    print("OK")

    print("DETECTING OBJECTS...")
    object_request = {
        "text": object_prompt,
        "image": image_b64,
        "system_message_text": get_object_system_message(),
    }
    detected_objects = object_chain.invoke(object_request)
    print("OK")

    print("GENERATING DESCRIPTION...")
    description_request = {
        "text": description_prompt,
        "image": image_b64,
        "system_message_text": get_vision_system_message(),
    }
    image_description = vision_chain.invoke(description_request)
    print("OK")

    return {
        "image_description": image_description,
        "detected_objects": detected_objects,
    }

In [204]:
# Sample image analysed by the cells that follow (relative path).
image_path = r"test_data/20240828_174310.jpg"

# Converts the input dict into the [system, human] message list; shared by both chains.
prompt_chain = RunnableLambda(prompt_func)

# Chat model addressed through Ollama under the model name "llava".
llm = ChatOllama(model="llava")

# Both chains use the same prompt step and model; only the output parsing differs.
object_chain = prompt_chain | llm | JsonOutputParser()
vision_chain = prompt_chain | llm | StrOutputParser()

In [205]:
# Run object detection and description generation on the sample image.
image_details = analyze_image(image_path, vision_chain, object_chain)

PROCESSING FILE: test_data/20240828_174310.jpg
CONVERTING TO B64...
OK
DETECTING OBJECTS...
OK
GENERATING DESCRIPTION...
OK


In [206]:
# Show the combined result dict ("image_description" and "detected_objects").
print(image_details)

{'image_description': " The image depicts an outdoor scene during daylight, with a focus on a mailbox. There are two distinct areas in the photo: the foreground and the background.\n\nIn the foreground, there is a metal mailbox standing alone on grass that appears to be cut short. The mailbox has a traditional design with a curved top and a flat base, painted in a dark color that matches the background environment.\n\nThe background shows a pathway made of paving stones leading towards the camera. The ground surrounding this path is covered with fallen leaves, indicating it might be autumn. On the left side of the image, partially obscured by the mailbox, there's a patch of grass that contrasts with the leaf-covered ground.\n\nThe sun is shining brightly in the background, creating a lens flare effect across the image. This gives the photo an artistic quality with beams of light streaking diagonally from the bottom left to the top right corner.\n\nThe overall tone of the image is neutr