In [170]:
from langchain_ollama.llms import OllamaLLM
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_core.output_parsers import JsonOutputParser, StrOutputParser
from langchain_core.runnables import RunnableLambda
import base64
from PIL import Image
import io
import os

In [171]:
def encode_image(image_path, max_size=(512, 512), quality=80):
    image = Image.open(image_path)

    # Redimensionner l'image
    image.thumbnail(max_size)

    # Convertir en bytes avec compression
    buffer = io.BytesIO()
    image.save(buffer, format="JPEG", quality=quality)

    # Encoder en Base64
    encoded_string = base64.b64encode(buffer.getvalue()).decode("utf-8")

    return encoded_string

In [172]:
def prompt_func(data):
    text = data["text"]
    image = data["image"]
    system_message = SystemMessage(content="You are an AI that helps analyzing images")
    content_parts = []

    image_part = {
        "type": "image_url",
        "image_url": f"data:image/jpeg;base64,{image}",
    }
    text_part = {"type": "text", "text": text}
    content_parts.append(image_part)
    content_parts.append(text_part)

    human_message = HumanMessage(content=content_parts)

    return [system_message, human_message]

In [173]:
def analyze_image(image_file, vision_chain, object_chain):

    print(f'PROCESSING FILE: {image_file}')
    print('CONVERTING TO B64...')
    image_b64 = encode_image(image_file)
    print('OK')

    print('DETECTING OBJECTS...')
    detected_objects = object_chain.invoke({"text":"Identify objects in the image. Return a json list of json items of the detected objects. Include only the names of each object and a short description of the object. The field names should be 'name' and 'description' respectively.", "image": image_b64})
    print('OK')

    print('GENERATING DESCRIPTION...')
    image_description = vision_chain.invoke({"text": "Do you see snow in this image", "image": image_b64})
    print('OK')

    image_details = {"image_description": image_description, "detected_objects": detected_objects}

    return image_details

In [174]:
image_path = r"test_data/20240828_174310.jpg"

llm = OllamaLLM(model="llava:13b")

prompt_chain = RunnableLambda(prompt_func)

vision_chain = prompt_chain | llm
object_chain = prompt_chain | llm

In [175]:
image_details = analyze_image(image_path, vision_chain, object_chain)

PROCESSING FILE: test_data/20240828_174310.jpg
CONVERTING TO B64...
OK
DETECTING OBJECTS...


ResponseError: model 'llava:13b' not found (status code: 404)

In [169]:
print(image_details)

{'image_description': " Na, I don't see any images here. Can you please provide an image for me to describe? ", 'detected_objects': ' {\n  "objects": [\n    {\n      "name": "cat",\n      "description": "A cat is visible in the image."\n    },\n    {\n      "name": "dog",\n      "description": "A dog is visible in the image."\n    }\n  ]\n} '}
