# Use Llama 3.2 Vision via Ollama with LangChain

In [3]:
# List the models known to the Ollama installation on this machine.
!ollama list
# Other Ollama commands kept for reference (uncomment the one you need):
#!ollama rm llama3.2-vision
#!ollama pull mxbai-embed-large
#!ollama run llama3.2-vision
#!ollama pull llama3.2-vision 
#!ollama --version

NAME                      ID              SIZE      MODIFIED       
llama3.2-vision:latest    085a1fdae525    7.9 GB    28 seconds ago    


In [6]:
# Path of the PDF that is opened and rendered page by page in the next cells.
# NOTE(review): this is a machine-specific absolute path, while the image cells
# use "../data/..." relative paths — consider making this one relative as well.
file_path = "/home/alexv84/Documents/GitHub/genai_playground/pred.pdf"

In [7]:
import fitz
from PIL import Image

# Open the PDF with PyMuPDF (imported as `fitz`); the resulting `doc` is
# iterated by the page-to-image conversion cell that follows.
doc = fitz.open(file_path)

In [None]:
# Convert each page to an image in jpg format
from pathlib import Path

# Create the output folder up front: Image.save() does not create missing
# directories, so on a fresh checkout the loop would fail on the first page.
output_dir = Path("../data")
output_dir.mkdir(parents=True, exist_ok=True)

for page_number in range(len(doc)):
    page = doc.load_page(page_number)
    # Ask for a pixmap without alpha channel explicitly, so the raw samples
    # match the "RGB" layout that Image.frombytes() is told to expect below.
    pix = page.get_pixmap(alpha=False)
    img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

    # Save the image; files are numbered from 1 (page_1.jpg, page_2.jpg, ...)
    img.save(output_dir / f"page_{page_number + 1}.jpg")

print("PDF pages converted to images successfully!")

In [None]:
#import streamlit as st
from PIL import Image
import io

# Open the JPEG of the first PDF page written by the conversion loop.
image = Image.open("../data/page_1.jpg")
#image  (uncomment to show the page inline as the cell output)

In [12]:
import ollama

# Define the image path. It is relative to the notebook, the same way the
# conversion cell writes the page images ("../data/page_<n>.jpg"), so the
# notebook does not depend on one particular machine's home directory.
image_path = '../data/page_1.jpg'

# Query the model
try:
    response = ollama.chat(
        model='llama3.2-vision',  # Use the full model name
        messages=[
            {
                'role': 'user',
                'content': 'What is in this image?',
                'images': [image_path]
            }
        ]
    )
    # print(response)
except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise instead of swallowing the error: otherwise the following cell
    # would either hit a NameError or silently reuse a stale `response` that
    # is still in the kernel from an earlier run.
    raise


In [None]:
# Pull the reply text out of the chat response and trim surrounding whitespace
reply = response["message"]
cleaned_text = reply["content"].strip()

print(f"Model Response: {cleaned_text}")

In [14]:
#from langchain_community.chat_models import ChatOllama
from langchain_ollama import ChatOllama
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate

# LangChain chat model for the llama3.2-vision model served through Ollama.
# Note: `llm` is rebound again in later cells with different settings.
llm = ChatOllama(model="llama3.2-vision", temperature=0.1)

In [None]:
import base64
from io import BytesIO
import pprint
from IPython.display import HTML, display
from PIL import Image

def convert_to_base64(pil_image):
    """
    Encode a PIL image as a Base64 string of its JPEG representation.

    The image is not resized. Images whose mode the JPEG writer does not
    accept (for example RGBA or palette-based "P" images, as typically
    produced from PNG files) are converted to RGB first instead of failing
    with "cannot write mode ... as JPEG".

    :param pil_image: PIL image
    :return: Base64 encoded JPEG data as a str
    """
    # Modes that can be written to JPEG as-is; everything else is converted.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    if pil_image.mode not in jpeg_modes:
        pil_image = pil_image.convert("RGB")

    # Serialize into an in-memory buffer so nothing is written to disk.
    buffered = BytesIO()
    pil_image.save(buffered, format="JPEG")  # You can change the format if needed
    return base64.b64encode(buffered.getvalue()).decode("utf-8")

def plt_img_base64(img_base64):
    """
    Show a Base64 encoded JPEG inline in the notebook output.

    :param img_base64:  Base64 string holding the JPEG data
    """
    # Wrap the payload in an <img> tag that uses a data URI as its source and
    # hand the resulting HTML snippet to IPython for rendering.
    display(HTML(f'<img src="data:image/jpeg;base64,{img_base64}" />'))

page_nr = 1
# Use a dedicated name for the image path: reusing `file_path` would overwrite
# the PDF path defined earlier in the notebook, so re-running the PDF cells
# afterwards would open this JPEG instead of the PDF.
# The path is relative, matching where the conversion cell writes the pages.
page_image_path = f'../data/page_{page_nr}.jpg'

# Convert the image to base64; the context manager closes the file afterwards
with Image.open(page_image_path) as pil_image:
    image_b64 = convert_to_base64(pil_image)

# Display the image
plt_img_base64(image_b64)

In [None]:
from langchain_core.messages import HumanMessage

# Re-create the vision chat model with temperature=0 for the chain built below.
llm = ChatOllama(model="llama3.2-vision", temperature=0)

def prompt_func(data):
    """
    Build the multimodal chat input from a prompt text and an encoded image.

    :param data: dict with the keys "text" (prompt text) and "image"
                 (Base64 string that is embedded as a JPEG data URI)
    :return: list holding one HumanMessage whose content is the image part
             followed by the text part
    """
    prompt_text = data["text"]
    image = data["image"]

    # The image part comes first, the textual instruction second.
    return [
        HumanMessage(
            content=[
                {
                    "type": "image_url",
                    "image_url": f"data:image/jpeg;base64,{image}",
                },
                {"type": "text", "text": prompt_text},
            ]
        )
    ]


# Pipeline: build the multimodal message -> query the model -> plain string
chain = prompt_func | llm | StrOutputParser()

question = "Give an example about the topic of section 3.1"

# Instruction block sent together with the page image
text = f"""
    You will be provided with an image of a document. 
    Your task is to answer the user question. 
    If you are unable to answer the question, please do not come up with a random answer.     
Question: {question}
"""

# Run the chain on the prompt text and the Base64 encoded page image
query_chain = chain.invoke({"text": text, "image": image_b64})

pprint.pprint(query_chain)

In [None]:
#from langchain.tools import Tool
from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
from langchain.agents import Tool, initialize_agent
from langchain_ollama import ChatOllama


# Initialize DuckDuckGo Search
duck_search = DuckDuckGoSearchAPIWrapper()

# Text model that drives the agent (this rebinds `llm`).
# NOTE(review): the `ollama list` output at the top of the notebook shows only
# llama3.2-vision — make sure the llama3 model has been pulled as well.
llm = ChatOllama(model="llama3", temperature=0.2)

# Create the tool
duck_tool = Tool(
    name="DuckDuckGoSearch",
    func=duck_search.run,
    description="Use this tool to search the web for real-time information using DuckDuckGo."
)

# Create the agent with the tool
agent = initialize_agent(
    tools=[duck_tool],
    llm=llm,
    agent="zero-shot-react-description",
    verbose=True
)

# Query the agent
query = "What are the top benefits of using solar energy in 2024?"

# Call invoke() rather than the deprecated run(), in line with how the
# tool-calling agent executor is invoked elsewhere in this notebook. invoke()
# returns a dict, so pick the "output" entry to keep `response` the answer text.
response = agent.invoke({"input": query})["output"]
print(response)

In [2]:
import os
from langchain_openai import AzureChatOpenAI #, AzureOpenAIEmbeddings
from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain.agents import create_tool_calling_agent, AgentExecutor
from langchain.tools import Tool
from langchain.chains import LLMMathChain, LLMChain
from dotenv import load_dotenv
from pprint import PrettyPrinter

# Load variables from a .env file so the os.getenv() lookups below can see them
load_dotenv()

# Use GPT-4 as it supports tool calling:
llm = AzureChatOpenAI(
    openai_api_type="azure",
    openai_api_base=os.getenv("OPENAI_URL"),
    openai_api_version="2024-02-15-preview",
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.2,
)

# Calculator tool for arithmetic
problem_chain = LLMMathChain.from_llm(llm=llm)

math_tool = Tool.from_function(
    name="Calculator",
    func=problem_chain.run,
    description="Useful for when you need to answer numeric questions. This tool is only for math questions and nothing else. Only input math expressions, without text",
)

# Tools handed to the agent (the DuckDuckGo tool is left disabled)
tools = [
    # duck_tool,
    math_tool,
]

# Prompt layout: system message, the user's input, a hint to use tools, and
# the placeholder that receives the agent's intermediate tool-call messages
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are a helpful assistant."),
        ("human", "{input}"),
        ("human", "Use the available tools if necessary."),
        MessagesPlaceholder(variable_name="agent_scratchpad"),
    ]
)

agent = create_tool_calling_agent(llm, tools, prompt)

agent_executor = AgentExecutor(agent=agent, tools=tools)

In [None]:
# A numeric question, presumably meant to exercise the Calculator tool
question = "What is the cosinus of 2*pi and of pi?"

# The dict key "input" fills the "{input}" variable of the agent prompt
response = agent_executor.invoke({"input": question})

# Pretty-print the value returned by the executor
PrettyPrinter().pprint(response)