<a href="https://colab.research.google.com/github/El20082/LLM-Synthetic-Geometric-Dataset/blob/main/model_evaluation_gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is based on Google's official Colab for the Gemini API: https://colab.research.google.com/github/google/generative-ai-docs/blob/main/site/en/gemini-api/docs/get-started/python.ipynb#scrollTo=9OEoeosRTv-5

# Install Python SDK

In [None]:
!pip install -q -U google-generativeai

# Import packages and mount Drive

In [None]:
import pathlib
import textwrap
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
import PIL.Image
import os
import re
import json
import time
import matplotlib.pyplot as plt

# Used to securely store API key
from google.colab import userdata

In [None]:
# Mount Google Drive at /content/drive so the notebook can read files stored there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Setup API key

Obtain an API key from the link: <a class="button button-primary" href="https://makersuite.google.com/app/apikey" target="_blank" rel="noopener noreferrer">Get an API key</a> and add it to Colab's Secrets under the name `GOOGLE_API_KEY`.

In [None]:
# Read the key from Colab's secret store (never hard-code it in the notebook),
# then hand it to the Gemini client library.
GOOGLE_API_KEY = userdata.get('GOOGLE_API_KEY')
genai.configure(api_key=GOOGLE_API_KEY)

# Get Gemini model

In [None]:
# Print the name of every available model that supports `generateContent`.
content_model_names = (
    candidate.name
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for model_name in content_model_names:
  print(model_name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-2.0-flash-exp
models/gemini-exp-1206
models/gemini-exp-1121
models/gemini-exp-1114
models/gemini-2.0-flash-thinking-exp
models/gemini-2.0-flash-thinking-exp-1219
models/learnlm-1.5-pro-experimental


In [None]:
# Sampling settings: low temperature and a tiny top_k keep answers close to
# deterministic, which suits an accuracy evaluation.
# NOTE(review): max_output_tokens is set extremely high, presumably meaning
# "no practical cap" -- confirm how the API treats values above the model's limit.
generation_config = {
    "temperature": 0.1,
    "top_p": 0.9,
    "top_k": 2,
    "max_output_tokens": 1000000000,
}

# Apply the same blocking threshold to each harm category.
safety_settings = [
    {"category": harm_category, "threshold": "BLOCK_MEDIUM_AND_ABOVE"}
    for harm_category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Model handle used by every evaluation cell below.
model = genai.GenerativeModel(
    'gemini-1.5-flash',
    generation_config=generation_config,
    safety_settings=safety_settings,
)

# Run eval on synthesized dataset

In [None]:
def to_markdown(text):
  """Render `text` as a Markdown block quote.

  Bullet characters ('•') become Markdown list markers, and every line
  (blank lines included) is prefixed with '> '.
  """
  with_list_markers = text.replace('•', '  *')
  quoted = textwrap.indent(with_list_markers, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

## Load data

In [None]:
dataset_name = 'dataset10' #@param
num_file_limit = 100 #@param Maximum number of examples to examine
dataset_dir = f'/content/drive/MyDrive/llm_geometric_project/Datasets/{dataset_name}/'

# Index the directory by base name; files that are neither .jpeg nor .json are ignored.
directory_files = os.listdir(dataset_dir)
image_stems = {name[:-len(".jpeg")] for name in directory_files if name.endswith(".jpeg")}
json_stems = {name[:-len(".json")] for name in directory_files if name.endswith(".json")}

# Keep only complete (image, metadata) pairs, and apply the limit afterwards.
# Truncating the two sorted lists independently lets a single unpaired file
# shift them out of step, so later cells silently skip examples.
unpaired_stems = image_stems ^ json_stems
if unpaired_stems:
  print(f"Ignoring {len(unpaired_stems)} file(s) without a matching image/metadata partner")

image_files = sorted(stem + ".jpeg" for stem in image_stems & json_stems)[:num_file_limit]
json_files = [name[:-len(".jpeg")] + ".json" for name in image_files]

## Prompt context

In [None]:
# Shared preamble placed in front of every question sent to the model.
context_prompt = (
    "Imagine you are a mathematician, and you are given an image "
    "of a 2D polygon with lines connected within it."
)

## Question 1: Shape of the polygon

In [None]:
# Question 1 asks for the polygon's shape as one of a fixed set of labels.
question_prompt = "What is the shape of the polygon in the image? Choose one of the answers in 'triangle, quadrilateral, pentagon, hexagon, none of above'."
prompt = f"{context_prompt} {question_prompt}"
print(prompt)

Imagine you are a mathematician, and you are given an image of a 2D polygon with lines connected within it. What is the shape of the polygon in the image? Choose one of the answers in 'triangle, quadrilateral, pentagon, hexagon, none of above'.


In [None]:
# Disabled single-example check: loads one image and its metadata, shows the
# image, and sends the current prompt to the model. Uncomment to inspect one
# raw response before running the full evaluation loop.
# image_file = '00019.jpeg'
# json_file = '00019.json'
# img = PIL.Image.open(os.path.join(dataset_dir, image_file))
# with open(os.path.join(dataset_dir, json_file), 'r') as f:
#   metadata = json.load(f)
# plt.imshow(img)
# response = model.generate_content([prompt, img], stream=True)
# response.resolve()
# response

In [None]:
# Evaluate Question 1 (shape). An image counts as correct when the model's
# reply contains the ground-truth shape name. Images that never yield a
# response are left out of both counters.
num_correct = 0
num_wrong = 0
max_times_to_try = 20  # API attempts allowed per image

for image_file in image_files:
  print(image_file)
  # The metadata file shares the image's base name.
  json_file = os.path.splitext(image_file)[0] + '.json'
  if json_file not in json_files:
    continue
  with open(os.path.join(dataset_dir, json_file), 'r') as f:
    metadata = json.load(f)
  # Read the label before calling the API so a malformed metadata file fails
  # loudly instead of being retried as though it were an API error.
  groundtruth = str(metadata['shape'])

  # The context manager closes the image file once every attempt is done.
  with PIL.Image.open(os.path.join(dataset_dir, image_file)) as img:
    for num_tried in range(max_times_to_try):
      if num_tried >= 2:
        time.sleep(5)  # the first two attempts are immediate, later ones back off
      try:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()
        if not response.candidates:
          print("NO RESPONSE")
          continue
        answer = response.text
      except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        print("ERROR", repr(err))
        continue

      # Case-insensitive so that e.g. "Pentagon" still matches "pentagon".
      if groundtruth.lower() in answer.lower():
        num_correct += 1
      else:
        num_wrong += 1
      print(image_file, "Successful!!!!!")
      break
    else:
      print(image_file, "skipped: no usable response after", max_times_to_try, "attempts")

00000.jpeg
00000.jpeg Successful!!!!!
00001.jpeg
00001.jpeg Successful!!!!!
00002.jpeg
00002.jpeg Successful!!!!!
00003.jpeg
00003.jpeg Successful!!!!!
00004.jpeg
00004.jpeg Successful!!!!!
00005.jpeg
00005.jpeg Successful!!!!!
00006.jpeg
00006.jpeg Successful!!!!!
00007.jpeg
00007.jpeg Successful!!!!!
00008.jpeg
00008.jpeg Successful!!!!!
00009.jpeg
00009.jpeg Successful!!!!!
00010.jpeg
00010.jpeg Successful!!!!!
00011.jpeg
00011.jpeg Successful!!!!!
00012.jpeg
00012.jpeg Successful!!!!!
00013.jpeg
00013.jpeg Successful!!!!!
00014.jpeg
00014.jpeg Successful!!!!!
00015.jpeg
00015.jpeg Successful!!!!!
00016.jpeg
00016.jpeg Successful!!!!!
00017.jpeg
00017.jpeg Successful!!!!!
00018.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00018.jpeg Successful!!!!!
00019.jpeg
00019.jpeg Successful!!!!!
00020.jpeg
00020.jpeg Successful!!!!!
00021.jpeg
00021.jpeg Successful!!!!!
00022.jpeg
00022.jpeg Successful!!!!!
00023.jpeg
00023.j

In [None]:
# Accuracy over the images that produced an answer; NaN (not a crash) when none did.
num_answered = num_correct + num_wrong
hit_rate = num_correct / num_answered * 100 if num_answered else float('nan')
print("Hit rate:", hit_rate, "%", "\nNumber correct:", num_correct, "\nNumber wrong:", num_wrong)

Hit rate: 55.00000000000001 % 
Number correct: 55 
Number wrong: 45


## Question 2: Number of sides

In [None]:
# Question 2 asks for the number of sides as a single number.
question_prompt = "How many sides does the polygon in the image have? Just give one number."
prompt = ' '.join((context_prompt, question_prompt))
print(prompt)

Imagine you are a mathematician, and you are given an image of a 2D polygon with lines connected within it. How many sides does the polygon in the image have? Just give one number.


In [None]:
# Evaluate Question 2 (number of sides). The first integer in the model's
# reply is compared with metadata['num_sides']. Images that never yield a
# response are left out of both counters.
num_correct = 0
num_wrong = 0
max_times_to_try = 20  # API attempts allowed per image

for image_file in image_files:
  print(image_file)
  # The metadata file shares the image's base name.
  json_file = os.path.splitext(image_file)[0] + '.json'
  if json_file not in json_files:
    continue
  with open(os.path.join(dataset_dir, json_file), 'r') as f:
    metadata = json.load(f)
  # Read the label before calling the API so a malformed metadata file fails
  # loudly instead of being retried as though it were an API error.
  groundtruth = metadata['num_sides']

  # The context manager closes the image file once every attempt is done.
  with PIL.Image.open(os.path.join(dataset_dir, image_file)) as img:
    for num_tried in range(max_times_to_try):
      if num_tried >= 2:
        time.sleep(5)  # the first two attempts are immediate, later ones back off
      try:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()
        if not response.candidates:
          print("NO RESPONSE")
          continue
        answer = response.text
      except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        print("ERROR", repr(err))
        continue

      # A reply with no digits is a wrong answer, not an error worth retrying.
      numbers = re.findall(r'\d+', answer)
      if numbers and int(numbers[0]) == groundtruth:
        num_correct += 1
      else:
        num_wrong += 1
      print(image_file, "Successful!!!!!")
      break
    else:
      print(image_file, "skipped: no usable response after", max_times_to_try, "attempts")

00000.jpeg
00000.jpeg Successful!!!!!
00001.jpeg
00001.jpeg Successful!!!!!
00002.jpeg
00002.jpeg Successful!!!!!
00003.jpeg
00003.jpeg Successful!!!!!
00004.jpeg
00004.jpeg Successful!!!!!
00005.jpeg
00005.jpeg Successful!!!!!
00006.jpeg
00006.jpeg Successful!!!!!
00007.jpeg
00007.jpeg Successful!!!!!
00008.jpeg
00008.jpeg Successful!!!!!
00009.jpeg
00009.jpeg Successful!!!!!
00010.jpeg
00010.jpeg Successful!!!!!
00011.jpeg
00011.jpeg Successful!!!!!
00012.jpeg
00012.jpeg Successful!!!!!
00013.jpeg
00013.jpeg Successful!!!!!
00014.jpeg
00014.jpeg Successful!!!!!
00015.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00015.jpeg Successful!!!!!
00016.jpeg
NO RESPONSE
00016.jpeg Successful!!!!!
00017.jpeg
NO RESPONSE
NO RESPONSE
00017.jpeg Successful!!!!!
00018.jpeg
00018.jpeg Successful!!!!!
00019.jpeg
00019.jpeg Successful!!!!!
00020.jpeg
00020.jpeg Successful!!!!!
00021.jpeg
00021.jpeg Successful!!!!!
00022.jpeg
00022.jpeg Successful!!!!!
00023.jpeg
00023.jpeg Successful!!!!!
0002

In [None]:
# Accuracy over the images that produced an answer; NaN (not a crash) when none did.
num_answered = num_correct + num_wrong
hit_rate = num_correct / num_answered * 100 if num_answered else float('nan')
print("Hit rate:", hit_rate, "%", "\nNumber correct:", num_correct, "\nNumber wrong:", num_wrong)

Hit rate: 21.0 % 
Number correct: 21 
Number wrong: 79


## Question 3: Total lines in the image

In [None]:
# Question 3 asks for the total number of line segments as a single number.
question_prompt = "How many line segments are there in total in the image where each endpoint of the line segment is either a vertex or a point on the sides of the polygon? Just give one number."
prompt = "{} {}".format(context_prompt, question_prompt)
print(prompt)

Imagine you are a mathematician, and you are given an image of a 2D polygon with lines connected within it. How many line segments are there in total in the image where each endpoint of the line segment is either a vertex or a point on the sides of the polygon? Just give one number.


In [None]:
# Evaluate Question 3 (total line segments). The first integer in the model's
# reply is compared with metadata['num_lines']. Images that never yield a
# response are left out of both counters.
num_correct = 0
num_wrong = 0
max_times_to_try = 20  # API attempts allowed per image

for image_file in image_files:
  print(image_file)
  # The metadata file shares the image's base name.
  json_file = os.path.splitext(image_file)[0] + '.json'
  if json_file not in json_files:
    continue
  with open(os.path.join(dataset_dir, json_file), 'r') as f:
    metadata = json.load(f)
  # Read the label before calling the API so a malformed metadata file fails
  # loudly instead of being retried as though it were an API error.
  groundtruth = metadata['num_lines']

  # The context manager closes the image file once every attempt is done.
  with PIL.Image.open(os.path.join(dataset_dir, image_file)) as img:
    for num_tried in range(max_times_to_try):
      if num_tried >= 2:
        time.sleep(5)  # the first two attempts are immediate, later ones back off
      try:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()
        if not response.candidates:
          print("NO RESPONSE")
          continue
        answer = response.text
      except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        print("ERROR", repr(err))
        continue

      # A reply with no digits is a wrong answer, not an error worth retrying.
      numbers = re.findall(r'\d+', answer)
      if numbers and int(numbers[0]) == groundtruth:
        num_correct += 1
      else:
        num_wrong += 1
      print(image_file, "Successful!!!!!")
      break
    else:
      print(image_file, "skipped: no usable response after", max_times_to_try, "attempts")

00000.jpeg
00000.jpeg Successful!!!!!
00001.jpeg
00001.jpeg Successful!!!!!
00002.jpeg
00002.jpeg Successful!!!!!
00003.jpeg
00003.jpeg Successful!!!!!
00004.jpeg
00004.jpeg Successful!!!!!
00005.jpeg
00005.jpeg Successful!!!!!
00006.jpeg
00006.jpeg Successful!!!!!
00007.jpeg
00007.jpeg Successful!!!!!
00008.jpeg
00008.jpeg Successful!!!!!
00009.jpeg
00009.jpeg Successful!!!!!
00010.jpeg
00010.jpeg Successful!!!!!
00011.jpeg
00011.jpeg Successful!!!!!
00012.jpeg
00012.jpeg Successful!!!!!
00013.jpeg
00013.jpeg Successful!!!!!
00014.jpeg
00014.jpeg Successful!!!!!
00015.jpeg
NO RESPONSE
NO RESPONSE
00015.jpeg Successful!!!!!
00016.jpeg
NO RESPONSE
NO RESPONSE
00016.jpeg Successful!!!!!
00017.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00017.jpeg Successful!!!!!
00018.jpeg
00018.jpeg Successful!!!!!
00019.jpeg
00019.jpeg Successful!!!!!
00020.jpeg
00020.jpeg Successful!!!!!
00021.jpeg
00021.jpeg Successful!!!!!
00022.jpeg
00022.jpeg Successful!!!!!
00023.jpeg
00023.j

In [None]:
# Accuracy over the images that produced an answer; NaN (not a crash) when none did.
num_answered = num_correct + num_wrong
hit_rate = num_correct / num_answered * 100 if num_answered else float('nan')
print("Hit rate:", hit_rate, "%", "\nNumber correct:", num_correct, "\nNumber wrong:", num_wrong)

Hit rate: 5.0 % 
Number correct: 5 
Number wrong: 95


## Question 4: Number of points on sides

In [None]:
# Question 4 asks for the number of points on the polygon's sides, vertices included.
question_prompt = "How many points (including vertices) in total are there on the sides of the polygon? Just give one number."
prompt = f"{context_prompt} {question_prompt}"
print(prompt)

Imagine you are a mathematician, and you are given an image of a 2D polygon with lines connected within it. How many points (including vertices) in total are there on the sides of the polygon? Just give one number.


In [None]:
# Evaluate Question 4 (points on the sides). The first integer in the model's
# reply is compared with metadata['num_points']. Images that never yield a
# response are left out of both counters.
num_correct = 0
num_wrong = 0
max_times_to_try = 20  # API attempts allowed per image

for image_file in image_files:
  print(image_file)
  # The metadata file shares the image's base name.
  json_file = os.path.splitext(image_file)[0] + '.json'
  if json_file not in json_files:
    continue
  with open(os.path.join(dataset_dir, json_file), 'r') as f:
    metadata = json.load(f)
  # Read the label before calling the API so a malformed metadata file fails
  # loudly instead of being retried as though it were an API error.
  # NOTE(review): the prompt counts vertices too -- confirm that
  # metadata['num_points'] uses the same definition.
  groundtruth = metadata['num_points']

  # The context manager closes the image file once every attempt is done.
  with PIL.Image.open(os.path.join(dataset_dir, image_file)) as img:
    for num_tried in range(max_times_to_try):
      if num_tried >= 2:
        time.sleep(5)  # the first two attempts are immediate, later ones back off
      try:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()
        if not response.candidates:
          print("NO RESPONSE")
          continue
        answer = response.text
      except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        print("ERROR", repr(err))
        continue

      # A reply with no digits is a wrong answer, not an error worth retrying.
      numbers = re.findall(r'\d+', answer)
      if numbers and int(numbers[0]) == groundtruth:
        num_correct += 1
      else:
        num_wrong += 1
      print(image_file, "Successful!!!!!")
      break
    else:
      print(image_file, "skipped: no usable response after", max_times_to_try, "attempts")

00000.jpeg
00000.jpeg Successful!!!!!
00001.jpeg
00001.jpeg Successful!!!!!
00002.jpeg
00002.jpeg Successful!!!!!
00003.jpeg
00003.jpeg Successful!!!!!
00004.jpeg
00004.jpeg Successful!!!!!
00005.jpeg
00005.jpeg Successful!!!!!
00006.jpeg
00006.jpeg Successful!!!!!
00007.jpeg
00007.jpeg Successful!!!!!
00008.jpeg
00008.jpeg Successful!!!!!
00009.jpeg
00009.jpeg Successful!!!!!
00010.jpeg
00010.jpeg Successful!!!!!
00011.jpeg
00011.jpeg Successful!!!!!
00012.jpeg
00012.jpeg Successful!!!!!
00013.jpeg
00013.jpeg Successful!!!!!
00014.jpeg
00014.jpeg Successful!!!!!
00015.jpeg
00015.jpeg Successful!!!!!
00016.jpeg
00016.jpeg Successful!!!!!
00017.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
00017.jpeg Successful!!!!!
00018.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00018.jpeg Successful!!!!!
00019.jpeg
00019.jpeg Successful!!!!!
00020.jpeg
00020.jpeg Successful!!!!!
00021.jpeg
00021.jpeg Successful!!!!!
00022.jpeg
00022.jpeg Successful

In [None]:
# Accuracy over the images that produced an answer; NaN (not a crash) when none did.
num_answered = num_correct + num_wrong
hit_rate = num_correct / num_answered * 100 if num_answered else float('nan')
print("Hit rate:", hit_rate, "%", "\nNumber correct:", num_correct, "\nNumber wrong:", num_wrong)

Hit rate: 0.0 % 
Number correct: 0 
Number wrong: 100


## Question 5: Max number of points on any edge


In [None]:
# Question 5 asks for the largest count of non-vertex endpoints on any one edge.
question_prompt = "What is the maximum number of endpoints (not including vertices) on any single edge of the polygon? Just give one number"
prompt = ' '.join((context_prompt, question_prompt))
print(prompt)

Imagine you are a mathematician, and you are given an image of a 2D polygon with lines connected within it. What is the maximum number of endpoints (not including vertices) on any single edge of the polygon? Just give one number


In [None]:
# Evaluate Question 5 (max points on any edge). The first integer in the
# model's reply is compared with the largest entry of
# metadata['num_points_on_sides'], or 0 when that collection is empty.
# Images that never yield a response are left out of both counters.
num_correct = 0
num_wrong = 0
max_times_to_try = 20  # API attempts allowed per image

for image_file in image_files:
  print(image_file)
  # The metadata file shares the image's base name.
  json_file = os.path.splitext(image_file)[0] + '.json'
  if json_file not in json_files:
    continue
  with open(os.path.join(dataset_dir, json_file), 'r') as f:
    metadata = json.load(f)
  # Read the label before calling the API so a malformed metadata file fails
  # loudly instead of being retried as though it were an API error.
  groundtruth = max(metadata['num_points_on_sides'], default=0)

  # The context manager closes the image file once every attempt is done.
  with PIL.Image.open(os.path.join(dataset_dir, image_file)) as img:
    for num_tried in range(max_times_to_try):
      if num_tried >= 2:
        time.sleep(5)  # the first two attempts are immediate, later ones back off
      try:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()
        if not response.candidates:
          print("NO RESPONSE")
          continue
        answer = response.text
      except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        print("ERROR", repr(err))
        continue

      # A reply with no digits is a wrong answer, not an error worth retrying.
      numbers = re.findall(r'\d+', answer)
      if numbers and int(numbers[0]) == groundtruth:
        num_correct += 1
      else:
        num_wrong += 1
      print(image_file, "Successful!!!!!")
      break
    else:
      print(image_file, "skipped: no usable response after", max_times_to_try, "attempts")

00000.jpeg
00000.jpeg Successful!!!!!
00001.jpeg
00001.jpeg Successful!!!!!
00002.jpeg
00002.jpeg Successful!!!!!
00003.jpeg
00003.jpeg Successful!!!!!
00004.jpeg
00004.jpeg Successful!!!!!
00005.jpeg
00005.jpeg Successful!!!!!
00006.jpeg
00006.jpeg Successful!!!!!
00007.jpeg
00007.jpeg Successful!!!!!
00008.jpeg
00008.jpeg Successful!!!!!
00009.jpeg
00009.jpeg Successful!!!!!
00010.jpeg
00010.jpeg Successful!!!!!
00011.jpeg
00011.jpeg Successful!!!!!
00012.jpeg
00012.jpeg Successful!!!!!
00013.jpeg
00013.jpeg Successful!!!!!
00014.jpeg
00014.jpeg Successful!!!!!
00015.jpeg
00015.jpeg Successful!!!!!
00016.jpeg
00016.jpeg Successful!!!!!
00017.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00017.jpeg Successful!!!!!
00018.jpeg
00018.jpeg Successful!!!!!
00019.jpeg
00019.jpeg Successful!!!!!
00020.jpeg
00020.jpeg Successful!!!!!
00021.jpeg
00021.jpeg Successful!!!!!
00022.jpeg
00022.jpeg Successful!!!!!
00023.

In [None]:
# Accuracy over the images that produced an answer; NaN (not a crash) when none did.
num_answered = num_correct + num_wrong
hit_rate = num_correct / num_answered * 100 if num_answered else float('nan')
print("Hit rate:", hit_rate, "%", "\nNumber correct:", num_correct, "\nNumber wrong:", num_wrong)

Hit rate: 22.0 % 
Number correct: 22 
Number wrong: 78


## Question 6: Number of sides containing non-vertex endpoints

In [None]:
# Question 6 asks how many sides carry at least one non-vertex endpoint.
question_prompt = "How many sides of the polygon contain endpoints which are not vertices? Just give one number"
prompt = "{} {}".format(context_prompt, question_prompt)
print(prompt)

Imagine you are a mathematician, and you are given an image of a 2D polygon with lines connected within it. How many sides of the polygon contain endpoints which are not vertices? Just give one number


In [None]:
# Evaluate Question 6 (sides with non-vertex endpoints). The first integer in
# the model's reply is compared with len(metadata['num_points_on_sides']).
# Images that never yield a response are left out of both counters.
num_correct = 0
num_wrong = 0
max_times_to_try = 20  # API attempts allowed per image

for image_file in image_files:
  print(image_file)
  # The metadata file shares the image's base name.
  json_file = os.path.splitext(image_file)[0] + '.json'
  if json_file not in json_files:
    continue
  with open(os.path.join(dataset_dir, json_file), 'r') as f:
    metadata = json.load(f)
  # Read the label before calling the API so a malformed metadata file fails
  # loudly instead of being retried as though it were an API error.
  # NOTE(review): presumably the collection has one entry per side that
  # carries extra points (no zero entries) -- verify against the generator.
  groundtruth = len(metadata['num_points_on_sides'])

  # The context manager closes the image file once every attempt is done.
  with PIL.Image.open(os.path.join(dataset_dir, image_file)) as img:
    for num_tried in range(max_times_to_try):
      if num_tried >= 2:
        time.sleep(5)  # the first two attempts are immediate, later ones back off
      try:
        response = model.generate_content([prompt, img], stream=True)
        response.resolve()
        if not response.candidates:
          print("NO RESPONSE")
          continue
        answer = response.text
      except Exception as err:  # not a bare except: Ctrl-C must still stop the run
        print("ERROR", repr(err))
        continue

      # A reply with no digits is a wrong answer, not an error worth retrying.
      numbers = re.findall(r'\d+', answer)
      if numbers and int(numbers[0]) == groundtruth:
        num_correct += 1
      else:
        num_wrong += 1
      print(image_file, "Successful!!!!!")
      break
    else:
      print(image_file, "skipped: no usable response after", max_times_to_try, "attempts")

00000.jpeg
00000.jpeg Successful!!!!!
00001.jpeg
00001.jpeg Successful!!!!!
00002.jpeg
00002.jpeg Successful!!!!!
00003.jpeg
00003.jpeg Successful!!!!!
00004.jpeg
00004.jpeg Successful!!!!!
00005.jpeg
00005.jpeg Successful!!!!!
00006.jpeg
00006.jpeg Successful!!!!!
00007.jpeg
00007.jpeg Successful!!!!!
00008.jpeg
00008.jpeg Successful!!!!!
00009.jpeg
00009.jpeg Successful!!!!!
00010.jpeg
00010.jpeg Successful!!!!!
00011.jpeg
00011.jpeg Successful!!!!!
00012.jpeg
00012.jpeg Successful!!!!!
00013.jpeg
00013.jpeg Successful!!!!!
00014.jpeg
00014.jpeg Successful!!!!!
00015.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00015.jpeg Successful!!!!!
00016.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
NO RESPONSE
00016.jpeg Successful!!!!!
00017.jpeg
NO RESPONSE
NO RESPONSE
NO RESPONSE
00017.jpeg Successful!!!!!
00018.jpeg
00018.jpeg Successful!!!!!
00019.jpeg
00019.jpeg Successful!!!!!
00020.jpeg
00020.jpeg Successful!!!!!
00021.jpeg
00021.jpeg Successful!!!!!
00022.jpeg
00022.jpe

In [None]:
# Accuracy over the images that produced an answer; NaN (not a crash) when none did.
num_answered = num_correct + num_wrong
hit_rate = num_correct / num_answered * 100 if num_answered else float('nan')
print("Hit rate:", hit_rate, "%", "\nNumber correct:", num_correct, "\nNumber wrong:", num_wrong)

Hit rate: 24.0 % 
Number correct: 24 
Number wrong: 76
