In [6]:
!pip install google-ai-haystack

Collecting google-ai-haystack
  Downloading google_ai_haystack-2.0.0-py3-none-any.whl.metadata (1.8 kB)
Downloading google_ai_haystack-2.0.0-py3-none-any.whl (12 kB)
Installing collected packages: google-ai-haystack
Successfully installed google-ai-haystack-2.0.0


In [170]:
import mimetypes
import os

import requests
from google.colab import userdata
from haystack.dataclasses.byte_stream import ByteStream
from haystack_integrations.components.generators.google_ai import GoogleAIGeminiGenerator
from PIL import Image

In [116]:
# Copy the Gemini key from the Colab secret named MY_GEMINI_API_KEY into the
# GOOGLE_API_KEY environment variable, so the key never appears in the notebook.
os.environ["GOOGLE_API_KEY"] = userdata.get("MY_GEMINI_API_KEY")

In [158]:
# Instruction prompt passed to Gemini as a plain text part. It is sent as-is
# (no template engine renders it), so it must not contain Jinja placeholders
# such as "{{ image }}" -- they would reach the model verbatim.
templates = """
Given the following images, for each image list all the objects within that image one by one with their bounding boxes and generate a detailed description of the image.
Image Description:

Please provide:
1. A list of all objects present in the image with their bounding boxes.
2. A basic description of each image, including context or any notable details.

Output format:
- Image Number: [Image Number]
- Object 1: [Object Name] - [Bounding Box Coordinates]
- Object 2: [Object Name] - [Bounding Box Coordinates]
- etc.,
- Description: [Brief description of the image]

"""

In [189]:
# Local image files to send to the model.
file_path = ['/content/robot1.png', '/content/animals.jpg', '/content/download.jfif']


def _load_image(path):
    """Read the image at ``path`` into a ByteStream tagged with its MIME type.

    The MIME type is guessed from the file extension, so a PNG is labelled
    "image/png" rather than "image/jpeg". Extensions that are unknown or do
    not map to an image type (e.g. ".jfif" on many systems) fall back to
    "image/jpeg". The file is opened in a ``with`` block so the handle is
    closed as soon as the bytes are read.
    """
    guessed_type, _ = mimetypes.guess_type(path)
    if guessed_type is None or not guessed_type.startswith("image/"):
        guessed_type = "image/jpeg"
    with open(path, "rb") as image_file:
        return ByteStream(data=image_file.read(), mime_type=guessed_type)


# One ByteStream per entry in file_path, in the same order.
images = [_load_image(path) for path in file_path]

In [190]:
# Sanity check: number of loaded images (one ByteStream per entry in file_path).
print(len(images))

3


In [191]:
# Build the Gemini generator, send the prompt text followed by every image in
# one run() call, then print each reply returned under the "replies" key.
gemini = GoogleAIGeminiGenerator(model="gemini-1.5-flash")
result = gemini.run(parts=[templates] + images)
for answer in result["replies"]:
    print(answer)

1
- Image Number: 1
- Object 1: C-3PO - [108, 178, 904, 707]
- Object 2: R2-D2 - [334, 497, 924, 903]
- Description:  C-3PO, a golden droid, stands with his hand resting on the dome of R2-D2. Both droids are in a desert environment with a faded, washed-out background. 

- Image Number: 2
- Object 1: Elephant - [79, 13, 220, 217]
- Object 2: Duck - [66, 229, 153, 344]
- Object 3: Rooster - [56, 364, 170, 472]
- Object 4: Chicken - [143, 469, 216, 540]
- Object 5: Cat - [55, 561, 147, 680]
- Object 6: Dog - [64, 709, 174, 913]
- Object 7: Lion - [260, 34, 360, 220]
- Object 8: Pig - [241, 269, 342, 410]
- Object 9: Cow - [248, 441, 395, 656]
- Object 10: Giraffe - [261, 690, 522, 839]
- Object 11: Toucan - [269, 864, 344, 959]
- Object 12: Owl - [389, 48, 484, 147]
- Object 13: Monkey - [393, 174, 501, 275]
- Object 14: Flamingo - [419, 289, 532, 386]
- Object 15: Bear - [423, 420, 561, 594]
- Object 16: Goat - [495, 616, 584, 738]
- Object 17: Horse - [594, 758, 725, 956]
- Object 18: D

In [145]:
# Print the raw dictionary returned by gemini.run(); the model's answers are
# stored under its "replies" key.
# NOTE(review): this cell's execution count (145) is lower than that of the
# generation cell (191), so its saved output appears to come from an earlier
# run -- re-run it after the generation cell to refresh.
print(result)

{'replies': ['- Image 1\n- Object 1: Elephant - [33, 26, 294, 317]\n- Object 2: Chicken - [72, 415, 227, 508]\n- Object 3: Chicken - [74, 526, 220, 621]\n- Object 4: Bird - [85, 298, 225, 409]\n- Object 5: Bird - [159, 626, 226, 696]\n- Object 6: Cat - [71, 705, 238, 794]\n- Object 7: Dog - [53, 806, 257, 948]\n- Object 8: Lion - [308, 23, 486, 236]\n- Object 9: Pig - [268, 288, 415, 446]\n- Object 10: Cow - [249, 477, 527, 721]\n- Object 11: Flamingo - [387, 245, 741, 372]\n- Object 12: Bear - [437, 364, 679, 575]\n- Object 13: Goat - [527, 572, 725, 733]\n- Object 14: Giraffe - [253, 747, 710, 962]\n- Object 15: Bird - [317, 853, 432, 952]\n- Object 16: Bird - [444, 899, 515, 971]\n- Object 17: Monkey - [481, 148, 648, 263]\n- Object 18: Dolphin - [642, 95, 862, 358]\n- Object 19: Penguin - [738, 29, 925, 138]\n- Object 20: Tiger - [701, 372, 981, 509]\n- Object 21: Crocodile - [730, 495, 874, 714]\n- Object 22: Horse - [602, 714, 917, 917]\n- Object 23: Bird - [865, 241, 966, 343]\n