# Getting Locations with Gemini 2.0 Flash

In [12]:
from google import genai
from google.genai import types
import PIL.Image
import os
import base64
from io import BytesIO

# Load the image
image = PIL.Image.open('./assets/contact_form_google.png')

# Convert the image to base64
buffered = BytesIO()
image.save(buffered, format="PNG")
image_bytes = buffered.getvalue()
image_base64 = base64.b64encode(image_bytes).decode('utf-8')

# Define the prompt
prompt_input_forms_locations = 'Point to the locations of the input forms with no more than 10 items. The answer should follow the json format: [{"point": <point>, "label": <label1>}, ...]. The points are in [y, x] format normalized to 0-1000.'

# Construct the contents list
contents = [
    {
        "role": "user",
        "parts": [
            {"text": prompt_input_forms_locations},
            {
                "inlineData": {
                    "mimeType": "image/png",
                    "data": image_base64
                }
            }
        ]
    }
]

# Initialize the client
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
client = genai.Client(api_key=GEMINI_API_KEY)

# Generate content
response = client.models.generate_content(
    model="gemini-2.0-flash",
    contents=contents
)

# Print the response
print(response.text)

```json
[
  {"point": [296, 499], "label": "Lucas form"},
  {"point": [393, 499], "label": "Short answer text"},
  {"point": [506, 499], "label": "Short answer text"},
  {"point": [619, 499], "label": "Long answer text"},
  {"point": [731, 499], "label": "Short answer text"},
  {"point": [844, 499], "label": "Long answer text"}
]
```


In [13]:
def parse_form_locations(json_output):
    """
    Parse the form locations output from the model and return points and labels.
    
    Args:
        json_output (str or list): Either a JSON string (possibly with markdown code blocks) 
                                  or an already parsed list of dictionaries
    
    Returns:
        tuple: Two lists containing:
            - points: List of [y, x] coordinates
            - labels: List of corresponding labels
    """
    import json
    import re
    
    # If input is a string, try to extract and parse JSON
    if isinstance(json_output, str):
        # Remove markdown code blocks if present
        json_str = re.sub(r'```json\n|\n```', '', json_output)
        try:
            data = json.loads(json_str)
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON: {e}")
    else:
        # Assume it's already parsed
        data = json_output
    
    # Extract points and labels
    points = []
    labels = []
    
    for item in data:
        points.append(item["point"])
        labels.append(item["label"])
    
    return points, labels

# Parse the output
points, labels = parse_form_locations(response.text)

# Print the results
print("Points:", points)
print("Labels:", labels)

# You can also access specific points and labels by index
for i, (point, label) in enumerate(zip(points, labels)):
    print(f"Form {i+1}: {label} at position {point}")

Points: [[296, 499], [393, 499], [506, 499], [619, 499], [731, 499], [844, 499]]
Labels: ['Lucas form', 'Short answer text', 'Short answer text', 'Long answer text', 'Short answer text', 'Long answer text']
Form 1: Lucas form at position [296, 499]
Form 2: Short answer text at position [393, 499]
Form 3: Short answer text at position [506, 499]
Form 4: Long answer text at position [619, 499]
Form 5: Short answer text at position [731, 499]
Form 6: Long answer text at position [844, 499]


In [4]:
import pyautogui
import time

def automate_text_input(points, texts, delay=0.5):
    """
    Automatically click on specified coordinates and type text.
    
    Args:
        points (list): List of [x, y] coordinates where to click
        texts (list): List of strings to type at each coordinate
        delay (float): Delay in seconds between actions (default: 0.5)
    """
    # Safety check
    if len(points) != len(texts):
        raise ValueError("Number of points must match number of texts")
    
    # Give user time to switch to the target window
    print("Starting in 3 seconds... Switch to your target window!")
    time.sleep(2)
    
    # Set up pyautogui safety features
    pyautogui.FAILSAFE = True  # Move mouse to upper-left corner to abort
    pyautogui.PAUSE = delay    # Add delay between actions
    
    # Process each point and text pair
    for (x, y), text in zip(points, texts):
        # Move to position and click
        pyautogui.click(x, y)
        time.sleep(delay)  # Wait a bit after clicking
        
        # Type the text
        pyautogui.write(text)
        time.sleep(delay)  # Wait a bit after typing

# Extract Pydantic Object with the Individual Forms as Attributes 

In [15]:
import anthropic
import re
import importlib.util
import sys
from pathlib import Path

MODEL_PYDANTIC_OBJECTS = "claude-3-7-sonnet-latest"
anthropic_client = anthropic.Anthropic()

def encode_image_to_base64(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

def extract_python_code(text):
    """Extract code between <python> tags"""
    pattern = r'<python>(.*?)</python>'
    match = re.search(pattern, text, re.DOTALL)
    if match:
        return match.group(1).strip()
    return None

def save_outputs(raw_output, extracted_code=None):
    """Save both raw output and extracted code to files"""
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    
    # Save raw output
    raw_filename = f"form_analysis_raw_{timestamp}.txt"
    with open(raw_filename, "w") as f:
        f.write(raw_output)
    print(f"Raw output saved to: {raw_filename}")
    
    # Save extracted code if available
    if extracted_code:
        code_filename = f"form_model_{timestamp}.py"
        with open(code_filename, "w") as f:
            f.write("from pydantic import BaseModel\n\n")  # Add import statement
            f.write(extracted_code)
        print(f"Extracted Pydantic model saved to: {code_filename}")
    
    

def call_anthropic_for_pydantic(image_path: str) -> str:
    """
    Sends a screenshot to Claude (Anthropic) with instructions:
      - "Create the appropriate pydantic object with the attributes for the input forms."
    Returns the raw textual response from Claude, which should have <python> ... </python>.
    """
    print("\n=== [1] Sending screenshot to Anthropic (Claude) to produce Pydantic model ===")
    # The user’s custom prompt
    prompt_message = """
                    Create the appropriate pydantic object with the attributes (each attribute should be of strictly type str not EmailStr nor any other special type) from this input forms page you see in front of you.
                    Output your Python code enclosed by XML tags <python> and </python>.
                    """
    img_data = encode_image_to_base64(image_path)

    # Important note: This usage is conceptual. The real Anthropic Chat API
    # might differ in usage/parameters. Adjust to your environment’s requirements.
    # The typical chat calls are more like anthropic_client.completions.create(...)
    # but the user’s snippet references an older "messages.create" approach.
    # 
    # If your version differs, adapt accordingly.
    message = anthropic_client.messages.create(
        model=MODEL_PYDANTIC_OBJECTS,
        max_tokens=1024,
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": "image/png",
                            "data": img_data,
                        },
                    },
                    {
                        "type": "text",
                        "text": prompt_message
                    }
                ],
            }
        ],
    )

    raw_output = message.content[0].text
    return raw_output

def load_dynamic_model(model_code, module_name="dynamic_model"):
    # Write code to a temporary file
    temp_file = Path(f"{module_name}.py")
    temp_file.write_text(model_code)
    
    # Create spec and load module
    spec = importlib.util.spec_from_file_location(module_name, temp_file)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    
    return module

image_path = "./assets/contact_form_google.png"
raw_output = call_anthropic_for_pydantic(image_path)
extract_python_code(raw_output)
pydantic_object = extract_python_code(raw_output)
print(pydantic_object)
# Usage
module = load_dynamic_model(pydantic_object, module_name="form_model_20250413_145451.py")
module


=== [1] Sending screenshot to Anthropic (Claude) to produce Pydantic model ===
from pydantic import BaseModel
from typing import Optional

class ContactInformation(BaseModel):
    question: str
    email: str
    address: str
    phone_number: Optional[str] = None
    comments: Optional[str] = None


<module 'form_model_20250413_145451.py' from '/Users/greatmaster/Desktop/projects/learning/copy-paste-structured-llm/form_model_20250413_145451.py.py'>

In [24]:
from openai import OpenAI
import os

OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

client = OpenAI(api_key=OPENAI_API_KEY)

def parse_text_from_raw_input(raw_input, pydantic_object):
    """
    Leverages OpenAI's API with Structured Outputs using Pydantic
    to parse the raw input text into a Pydantic object.
    """
    
    response = client.beta.chat.completions.parse(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": "Extract the structured data specified from the following text."},
            {"role": "user", "content": raw_input}
        ],
        response_format=pydantic_object
    )
    forms_inputs = list(response.choices[0].message.parsed.__dict__.values())
    return forms_inputs
raw_input = "Hi I'm Lucas my email is lucas@gmail.com and my number is 2229998765, I live in X street 123 and I love to program and use AI to be productive. question is what is the best way to learn AI?"
forms_inputs = parse_text_from_raw_input(raw_input, module.ContactInformation)
forms_inputs

['what is the best way to learn AI?',
 'lucas@gmail.com',
 'X street 123',
 '2229998765',
 'I love to program and use AI to be productive.']

In [27]:
import pandas as pd

df = pd.read_csv("./locs.csv")
df

Unnamed: 0,Index,RelativeX,RelativeY,ScreenX,ScreenY,Color,Timestamp
0,1,826,572,836,607,#ff0000,2025-04-13T14:51:05.910Z
1,2,830,699,840,734,#ff0000,2025-04-13T14:51:07.133Z
2,3,824,823,834,858,#ff0000,2025-04-13T14:51:08.645Z
3,4,836,949,846,984,#ff0000,2025-04-13T14:51:09.946Z
4,5,817,1074,827,1109,#ff0000,2025-04-13T14:51:11.522Z


In [28]:
points = df[["ScreenX", "ScreenY"]].values
points


array([[ 836,  607],
       [ 840,  734],
       [ 834,  858],
       [ 846,  984],
       [ 827, 1109]])

In [29]:
forms_inputs

['what is the best way to learn AI?',
 'lucas@gmail.com',
 'X street 123',
 '2229998765',
 'I love to program and use AI to be productive.']

In [26]:

automate_text_input(points, forms_inputs)

Starting in 3 seconds... Switch to your target window!


![](./window-screenshot.png)