In [233]:
from langchain_ollama import ChatOllama
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnableLambda
import base64
from PIL import Image
import io

In [234]:
def encode_image(image_path, max_size=(512, 512), quality=80):
    """Load an image, shrink it, and return it as a Base64-encoded JPEG string.

    Args:
        image_path: Path (or file object) handed to ``Image.open``.
        max_size: ``(width, height)`` bounding box passed to ``Image.thumbnail``.
        quality: JPEG quality setting passed to ``Image.save``.

    Returns:
        str: The JPEG bytes encoded as Base64 and decoded to a UTF-8 string.
    """
    # Modes Pillow's JPEG writer is expected to accept as-is; anything else
    # (e.g. RGBA or palette images from PNG/GIF files) is converted to RGB
    # first, otherwise ``save(format="JPEG")`` raises ``OSError``.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    # The context manager closes the underlying file handle even if
    # resizing or saving fails.
    with Image.open(image_path) as image:
        if image.mode not in jpeg_modes:
            image = image.convert("RGB")

        # Resize the image in place so it fits within max_size.
        image.thumbnail(max_size)

        # Serialize to compressed JPEG bytes in memory.
        buffer = io.BytesIO()
        image.save(buffer, format="JPEG", quality=quality)

    # Encode the JPEG bytes as Base64 text.
    return base64.b64encode(buffer.getvalue()).decode("utf-8")

In [235]:
def get_vision_system_message():
    """Return the system prompt text for the image-description request.

    The returned string begins and ends with a newline, with one
    instruction per line in between.
    """
    prompt_lines = (
        "You're an expert image and photo analyzer.",
        "You are very perceptive in analyzing images and photos.",
        "You possess excelent vision.",
        "Do not read any text unless it is the most prominent in the image.",
        "Your description should be neutral in tone.",
    )
    # Wrap the joined instructions in a leading and a trailing newline.
    return "\n" + "\n".join(prompt_lines) + "\n"

In [236]:
def get_object_system_message():
    """Return the system prompt text for the object-detection request.

    The prompt asks for JSON output and embeds a sample reply. The sample
    is written as strictly valid JSON (double-quoted keys and values),
    because the reply to this prompt is fed to ``JsonOutputParser`` and a
    single-quoted sample would show the model a format that does not parse.

    Returns:
        str: The prompt text, starting and ending with a newline.
    """
    system_message_text = '''
You're an expert image and photo analyzer.
You are very perceptive in analyzing images and photos.
You possess excelent vision.
Do not read any text unless it is the most prominent in the image.
You should always output your results in json format, for example:

[
 {"name": "a detected object", "description": "the detected object's description"},
 {"name": "another detected object", "description": "the other detected object's description"}
]
'''
    return system_message_text

In [237]:
def prompt_func(data):
    """Build the two-message prompt (system + human) for one request.

    Args:
        data: Mapping with the keys ``"text"`` (the user instruction),
            ``"image"`` (Base64 image data that is embedded in a
            ``data:image/jpeg;base64,`` URL) and ``"system_message_text"``.

    Returns:
        list: ``[SystemMessage, HumanMessage]``; the human message content
        is a list holding the image part followed by the text part.
    """
    prompt_text, image_b64 = data["text"], data["image"]
    system_msg = SystemMessage(content=data["system_message_text"])

    # The image part comes first, then the textual instruction.
    user_msg = HumanMessage(
        content=[
            {
                "type": "image_url",
                "image_url": f"data:image/jpeg;base64,{image_b64}",
            },
            {"type": "text", "text": prompt_text},
        ]
    )

    return [system_msg, user_msg]

In [238]:
def analyze_image(image_file, vision_chain, object_chain):
    """Run object detection and a free-form description on one image file.

    Progress is printed to stdout before and after each step.

    Args:
        image_file: Image path; passed to ``encode_image``.
        vision_chain: Object exposing ``invoke(dict)``; its result is stored
            under ``"image_description"``.
        object_chain: Object exposing ``invoke(dict)``; its result is stored
            under ``"detected_objects"``.

    Returns:
        dict: ``{"image_description": ..., "detected_objects": ...}``.
    """
    print(f'PROCESSING FILE: {image_file}')

    print('CONVERTING TO B64...')
    encoded = encode_image(image_file)
    print('OK')

    # Object detection runs first, then the description.
    print('DETECTING OBJECTS...')
    object_request = {
        "text": (
            "Identify objects in the image. "
            "Return a json list of json items of the detected objects. "
            "Include only the names of each object and a short description of the object. "
            "The field names should be 'name' and 'description' respectively."
        ),
        "image": encoded,
        "system_message_text": get_object_system_message(),
    }
    objects_found = object_chain.invoke(object_request)
    print('OK')

    print('GENERATING DESCRIPTION...')
    description_request = {
        "text": (
            "Describe the image in as much detail as possible. "
            "Do not try to read any text."
        ),
        "image": encoded,
        "system_message_text": get_vision_system_message(),
    }
    description = vision_chain.invoke(description_request)
    print('OK')

    return {"image_description": description, "detected_objects": objects_found}

In [239]:
# Image analysed by the cells below.
image_path = "test_data/20240828_174310.jpg"

# Chat model client; "llava" is the model name handed to ChatOllama.
llm = ChatOllama(model="llava")

# Wrap prompt_func so it can serve as the first step of each chain.
prompt_chain = RunnableLambda(prompt_func)

# Both chains share the prompt builder and the model; they differ only in
# how the reply is parsed (plain string vs. JSON).
vision_chain = prompt_chain.pipe(llm, StrOutputParser())
object_chain = prompt_chain.pipe(llm, JsonOutputParser())

In [240]:
# Run both chains on the sample image; progress is printed by analyze_image.
image_details = analyze_image(
    image_file=image_path,
    vision_chain=vision_chain,
    object_chain=object_chain,
)

PROCESSING FILE: test_data/20240828_174310.jpg
CONVERTING TO B64...
OK
DETECTING OBJECTS...
OK
GENERATING DESCRIPTION...
OK


In [243]:
# Show the value object_chain returned (the JsonOutputParser result).
print(image_details["detected_objects"])

[{'name': 'Mailbox', 'description': 'The mailboxes are on a concrete slab.'}, {'name': 'Sunbeam', 'description': 'A sunbeam is shining on the mailboxes.'}, {'name': 'Concrete Slab', 'description': 'The concrete slab holds the mailboxes and is located in a park-like setting.'}, {'name': 'Mailbox Covering', 'description': 'There are metal covers over the mailboxes for protection.'}, {'name': 'Sunlight', 'description': 'Direct sunlight is shining on the scene from behind the trees in the background.'}]


In [242]:
# Show the text vision_chain returned (the StrOutputParser result).
print(image_details["image_description"])

 The image shows a scene with a focus on mailboxes. There are two prominent mailboxes in the photo; one is located closer to the foreground and features a red top with white lettering, while the other is situated further back and has a green top with blue lettering. Both mailboxes appear to be modern style designs, possibly for an urban or suburban setting.

To the left of the mailbox in the background, there's a wooden bench facing away from the viewer. The bench has a weathered appearance, suggesting it might be located outdoors and exposed to the elements over time.

In front of the mailboxes, there is a small concrete path leading towards them. This path seems to be part of a larger paved area that extends into the distance, hinting at a residential or public area.

The sun shines brightly in the background, casting shadows and creating a contrast with the well-lit foreground. The angle of the photo appears to have been taken from a low perspective, looking up towards the mailboxes