# Multimodal Use Case

In [None]:
import warnings
# Ignore every warning category so warnings do not clutter the cell outputs below.
warnings.filterwarnings('ignore')

In [None]:
## Load helper functions
from utils import load_env
# NOTE(review): presumably loads environment settings (e.g. API credentials)
# that the helpers imported below rely on — confirm in utils.
load_env()
from utils import llama32
from utils import encode_image
from utils import llama32pi

# OCR (Optical Character Recognition) with Llama 3

In [None]:
from utils import disp_image

# Preview the three sample receipts, receipt-1.jpg through receipt-3.jpg.
for receipt_no in (1, 2, 3):
    disp_image(f"images/receipt-{receipt_no}.jpg")

In [None]:
question = "What's the total charge in the receipt?"

# Ask the same question about each receipt image and keep one answer per line.
answers = []
for receipt_no in range(1, 4):
    base64_image = encode_image(f"images/receipt-{receipt_no}.jpg")
    answer = llama32pi(question, f"data:image/jpeg;base64,{base64_image}")
    answers.append(f"{answer}\n")

# Every answer keeps its trailing newline, so they can be joined with no separator.
results = "".join(answers)
print(results)  # results holds the charge reported for each receipt

In [None]:
# Fold the per-receipt answers collected in `results` into one text-only prompt.
total_prompt = f"""What's the total charge of all the receipts below?
{results}"""

messages = [{"role": "user", "content": total_prompt}]

In [None]:
# Text-only request: `messages` carries the per-receipt answers, not the images.
response = llama32(messages)
print(response)

## Handling multiple images

In [None]:
from utils import merge_images
import matplotlib.pyplot as plt

# Combine the three receipts into one image so a single request can cover them all.
receipt_paths = (
    "images/receipt-1.jpg",
    "images/receipt-2.jpg",
    "images/receipt-3.jpg",
)
merged_image = merge_images(*receipt_paths)

# Draw the merged image without axis ticks or frame.
fig, ax = plt.subplots()
ax.imshow(merged_image)
ax.axis('off')
plt.show()

In [None]:
from utils import resize_image
# NOTE(review): the next cell reads "images/resized_image.jpg" instead of using
# `resized_img`, so resize_image presumably also writes that file — confirm in utils.
resized_img = resize_image(merged_image)

In [None]:
# One request over the merged, resized image instead of one request per receipt.
question = "What's the total charge of all the receipts below?"
base64_image = encode_image("images/resized_image.jpg")
image_url = f"data:image/jpeg;base64,{base64_image}"
result = llama32pi(question, image_url)
print(result)

The model can also interpret nutrition labels and offer recommendations, and it can interpret complex diagrams.

# Interpretation of graphics and code generation

In [None]:
# Preview the diagram image that the next cell encodes and passes to llama32pi.
disp_image("images/llama32mm.png")

In [None]:
# Ask for a prose summary of the diagram plus a Python script implementing it.
question = (
    "I see this diagram in the Llama 3 paper. "
    "Summarize the flow in text and then return a "
    "python script that implements the flow."
)
base64_image = encode_image("images/llama32mm.png")
image_url = f"data:image/png;base64,{base64_image}"
result = llama32pi(question, image_url)
print(result)

## Converting input to another format

In [None]:
# Preview the chart image that the next cell asks the model to convert to HTML.
disp_image("images/llama31speed.png")

In [None]:
# Have the model transcribe the chart image into an HTML table.
question = "Convert the chart to an HTML table."
base64_image = encode_image("images/llama31speed.png")
image_url = f"data:image/png;base64,{base64_image}"
result = llama32pi(question, image_url)
print(result)

In [None]:
from IPython.display import HTML
# Hard-coded copy of a previous model response to the chart question, so this
# cell renders the table without depending on the current value of `result`.
minified_html_table = "<table><thead><tr><th>Model</th><th>Output Tokens per Second</th></tr></thead><tbody><tr><td>Llama 2 1.5B</td><td>217</td></tr><tr><td>Google's PaLM 2 540B</td><td>214</td></tr><tr><td>Google's PaLM 2 540B</td><td>163</td></tr><tr><td>Meta's LLaMA 2 70B</td><td>133</td></tr><tr><td>Meta's LLaMA 2 70B</td><td>129</td></tr><tr><td>Google's T5 3.5B</td><td>123</td></tr><tr><td>OPT-6B</td><td>111</td></tr><tr><td>OPT-6B</td><td>75</td></tr><tr><td>ChatGPT-3.5</td><td>64</td></tr><tr><td>Google's T5 3.5B</td><td>62</td></tr><tr><td>Google's T5 3.5B</td><td>61</td></tr><tr><td>Meta's LLaMA 2 7B</td><td>68</td></tr><tr><td>Meta's LLaMA 2 7B</td><td>38</td></tr><tr><td>Meta's LLaMA 2 7B</td><td>38</td></tr><tr><td>Meta's LLaMA 2 7B</td><td>25</td></tr></tbody></table>"
# Leaving the HTML object as the last expression makes the notebook render it as a table.
HTML(minified_html_table)

# Following up on a question in a multimodal context (images and text)

In [None]:
# Preview the fridge photo that the following cells ask questions about.
disp_image("images/fridge-3.jpg")

In [None]:
# First turn of the fridge conversation: ask what is in the photo and what can
# be cooked from it. `question`, `base64_image` and `result` are reused by the
# follow-up cells below.
question = ("What's in the fridge? What kind of food can be made? Give "
            "me 2 examples, based on only the ingredients in the fridge.")
base64_image = encode_image("images/fridge-3.jpg")
# Use the registered JPEG media type "image/jpeg" in the data URL; "image/jpg"
# is not a registered media type, and the receipt cells already use image/jpeg.
result = llama32pi(question, f"data:image/jpeg;base64,{base64_image}")
print(result)

### Asking a follow-up question

In [None]:
from utils import llama32repi  # helper for the follow-up question (used in the next cell)

new_question = "is there banana in the fridge? where?"

# Replay the whole conversation so the follow-up has its context:
#   1. the original user turn (text question + fridge image),
#   2. the model's earlier answer,
#   3. the new text-only question.
# The data URL uses the registered JPEG media type "image/jpeg"
# ("image/jpg" is not a registered media type).
messages = [
  {"role": "user", "content": [
      {"type": "text", "text": question},
      {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}}
  ]},
  {"role": "assistant", "content": result},
  {"role": "user", "content": new_question}
]
new_result = llama32(messages)
print(new_result)

In [None]:
# Equivalent to the previous cell, via the llama32repi helper: it receives the
# original question, the image data URL, the first answer and the follow-up.
# The data URL uses the registered JPEG media type "image/jpeg"
# ("image/jpg" is not a registered media type).
new_result = llama32repi(question, f"data:image/jpeg;base64,{base64_image}", result, new_question)
print(new_result)

The model can also be used as an interior design assistant, a math grader, and so on. For example:
```python 
disp_image("images/math_hw3.jpg")
prompt = ("Check carefully each answer in a kid's math homework, first "
          "do the calculation, then compare the result with the kid's "
          "answer, mark correct or incorrect for each answer, and finally"
          " return a total score based on all the problems answered.")
base64_image = encode_image("images/math_hw3.jpg")
result = llama32pi(prompt, f"data:image/jpg;base64,{base64_image}")
print(result)
```

# Tool calling with image and follow-up response

In [None]:
# Preview the picture whose location the next cell asks the model to identify.
disp_image("images/golden_gate.png")

In [None]:
# Step 1: have the model name the place shown in the picture.
question = "Where is the location of the place shown in the picture?"
base64_image = encode_image("images/golden_gate.png")
image_url = f"data:image/png;base64,{base64_image}"
result = llama32pi(question, image_url)
print(result)

In [None]:
# Step 2: embed the model's location answer in a text-only weather question.
weather_question = (
    f"What is the current weather in the location mentioned in the text below: \n{result}"
)

In [None]:
from datetime import datetime

# Today's date, formatted like "05 March 2025", goes into the system prompt.
current_date = datetime.now()
formatted_date = current_date.strftime("%d %B %Y")

# System prompt declaring the environment and the tools the model may call.
system_prompt = f"""
Environment: ipython
Tools: brave_search, wolfram_alpha
Cutting Knowledge Date: December 2023
Today Date: {formatted_date}
"""

messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": weather_question},
]

# The model may reply with a tool call rather than a final answer; that call can
# then be executed manually by the user, or the tool's response can be used directly.
tool_response = llama32(messages)
print(tool_response)