In [1]:
import logging
from hydra import compose, initialize

# Configure the root logger so INFO-and-above records are emitted, and create
# a logger named after the current module for use in this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Compose the Hydra config called "properties" from the ./config directory.
# `compose` is called inside the `initialize` context; the resulting `cfg`
# is what later cells hand to `build_chat_model`.
with initialize(version_base=None, config_path="./config"):
    cfg = compose(config_name="properties")

In [13]:
import base64
import mimetypes
from operator import itemgetter

from langchain_community.chat_models import ChatOllama
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnableLambda, RunnablePassthrough, ConfigurableField
from langchain_openai.chat_models import AzureChatOpenAI
from omegaconf import DictConfig
from openai import RateLimitError

def build_chat_model(cfg: DictConfig):
    """Create the chat model with a backend that can be switched at call time.

    An ``AzureChatOpenAI`` instance built from ``cfg.llm.openai`` is the
    default (key ``"openai"``); a ``ChatOllama`` instance built from
    ``cfg.llm.llama`` is registered as the alternative ``"llama"``. The
    backend is chosen through the configurable field whose id is
    ``"llm_type"``.

    Args:
        cfg: Config object exposing ``llm.openai`` and ``llm.llama`` mappings
            that are unpacked as keyword arguments into the two constructors.

    Returns:
        The configurable model returned by ``configurable_alternatives``.
    """
    azure_model = AzureChatOpenAI(**cfg.llm.openai)
    ollama_model = ChatOllama(**cfg.llm.llama)
    return azure_model.configurable_alternatives(
        ConfigurableField(id="llm_type"),
        default_key="openai",
        llama=ollama_model,
    )

def encode_image(image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text.

    Args:
        image_path: Path of the file to read (opened in binary mode).

    Returns:
        The base64 encoding of the file's contents, decoded to a ``str``
        using UTF-8.
    """
    with open(image_path, mode="rb") as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")
    
def format_llm_messages(inputs: dict):
    """Build the chat prompt (system + human turn) for a text-only request.

    Args:
        inputs: Mapping with
            - ``question`` (required): text of the human turn.
            - ``system_prompt`` (optional): text of the system turn; when
              missing or falsy, ``"You are a helpful AI bot."`` is used.

    Returns:
        A ``ChatPromptTemplate`` holding one system and one human message.

    Raises:
        KeyError: if ``question`` is missing from ``inputs`` or is ``None``.
    """
    if (human_message := inputs.get("question")) is None:
        raise KeyError("`question` parameter not found in during `invoke`.")
    system_message = inputs.get("system_prompt") or "You are a helpful AI bot."
    # Pass concrete message objects rather than ("role", text) tuples: tuples
    # are treated as f-string templates, so a question or system prompt that
    # contains `{` / `}` (code, JSON, ...) would be parsed as template
    # variables and fail when the template is invoked. Message objects are
    # kept verbatim.
    return ChatPromptTemplate.from_messages([
        SystemMessage(content=system_message),
        HumanMessage(content=human_message),
    ])

def format_lmm_messages(inputs: dict):
    """Build the chat messages for a multimodal (text + images) request.

    Args:
        inputs: Mapping with
            - ``question`` (required): text part of the human turn.
            - ``images`` (optional): iterable of image URLs (for example
              base64 data URLs); each becomes an ``image_url`` content part
              after the text part. Missing or ``None`` yields a text-only
              human message.
            - ``system_prompt`` (optional): text of the system turn; when
              missing or falsy, ``"You are a helpful AI bot."`` is used.

    Returns:
        A two-element list: ``[SystemMessage, HumanMessage]``, where the
        human message content is a list of typed content parts.

    Raises:
        KeyError: if ``question`` is missing from ``inputs`` or is ``None``
            (same check and message as ``format_llm_messages``).
    """
    if (question := inputs.get("question")) is None:
        raise KeyError("`question` parameter not found in during `invoke`.")

    system_message = inputs.get("system_prompt") or "You are a helpful AI bot."

    # The text part comes first, followed by one image part per URL in the
    # order given.
    content_parts = [{"type": "text", "text": question}]
    content_parts.extend(
        {"type": "image_url", "image_url": {"url": image_url}}
        for image_url in (inputs.get("images") or [])
    )
    return [SystemMessage(content=system_message), HumanMessage(content=content_parts)]

In [None]:
# Upper bound on attempts per model call when the provider rate-limits us.
MAX_ATTEMPT = 3
model = build_chat_model(cfg)

# Text-only chain: format the inputs into a prompt, then call the model.
# Only RateLimitError triggers a retry; waits use exponential backoff with jitter.
llm_chain = RunnableLambda(format_llm_messages) | model.with_retry(
    stop_after_attempt=MAX_ATTEMPT,
    wait_exponential_jitter=True,
    retry_if_exception_type=(RateLimitError,),
)

# Select the "openai" backend through the `llm_type` configurable field.
response = llm_chain.with_config(configurable={"llm_type": "openai"}).invoke(
    {"question": "What's your name?"}
)

In [None]:
# Upper bound on attempts per model call when the provider rate-limits us.
MAX_ATTEMPT = 3
model = build_chat_model(cfg)


def _image_to_data_url(image_path):
    """Return the file at ``image_path`` as a base64 data URL.

    The MIME type is guessed from the file extension so that, for example, a
    ``.png`` file is labelled ``image/png``. When the guess is missing or is
    not an image type, ``image/jpeg`` is used (the label that used to be
    applied to every file).
    """
    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"
    return f"data:{mime_type};base64,{encode_image(image_path)}"


# Multimodal chain: turn image paths into data URLs, format the messages,
# then call the model. Only RateLimitError triggers a retry.
lmm_chain = (
    RunnableParallel(
        {
            # `.get` keeps the system prompt optional: format_lmm_messages
            # falls back to its default when the value is missing/None,
            # whereas itemgetter would raise KeyError before reaching it.
            "system_prompt": RunnableLambda(lambda x: x.get("system_prompt")),
            "question": itemgetter("question"),
            "images": itemgetter("images") | RunnableLambda(
                lambda paths: [_image_to_data_url(path) for path in paths]
            )
        }
    )
    | format_lmm_messages
    | model.with_retry(
        retry_if_exception_type=(RateLimitError,),
        stop_after_attempt=MAX_ATTEMPT,
        wait_exponential_jitter=True
    )
)

system_prompt = """You are an Optical Character Recognition machine.
You will extract all the characters from the image provided by the user, and you will only privide the extracted text in your response.
As an OCR machine, You can only respond with the extracted text according to the following intruction.
* Do not modify any of the content in the given image.
* Skip the preamble in your answer.
* Format your answer with structurized information such as markdown or html.
* Do not translate any of the content in the given image. Return as it is."""

# Select the "openai" backend through the `llm_type` configurable field.
response = lmm_chain.with_config(
    configurable={"llm_type": "openai"}
).invoke(
    {
        "system_prompt": system_prompt,
        "question": "이미지에 있는 텍스트를 원본 그대로 추출해줘.",
        "images": ["./data/food_list.png"]
    }
)

In [None]:
# Print just the text of `response`. Note that `response` is assigned by both
# chain cells above, so this shows the result of whichever one ran last.
print(response.content)

In [12]:
from langchain_community.chat_models import ChatOllama
    
# Rebinds `model` to a plain ChatOllama client for the llama3.2-vision model
# served at localhost:11434. This replaces the configurable model produced by
# build_chat_model under the same name for any cell executed after this one.
model = ChatOllama(
    base_url="http://localhost:11434",
    model="llama3.2-vision",
    temperature=0
)

In [13]:
# (role, text) pairs: a system instruction followed by the sentence to translate.
messages = [
    ("system", "You are a helpful translator. Translate the user sentence to French."),
    ("human", "I love programming."),
]
# Send the conversation to whatever `model` is currently bound to.
ai_message = model.invoke(messages)

In [14]:
# Bare last expression: the notebook renders the full AIMessage repr
# (content plus response metadata) instead of only the text.
ai_message

AIMessage(content='Je suis amoureux de la programmation.', response_metadata={'model': 'llama3.2-vision', 'created_at': '2024-11-12T02:34:08.28585434Z', 'message': {'role': 'assistant', 'content': ''}, 'done_reason': 'stop', 'done': True, 'total_duration': 21542337784, 'load_duration': 4831561776, 'prompt_eval_count': 32, 'prompt_eval_duration': 5160000000, 'eval_count': 11, 'eval_duration': 10822000000}, id='run-043099dc-59d7-4f62-9e60-b940f795d7c8-0')