In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


**NOTE:** This notebook has been tested in the following environment:

* Python version = 3.9


# Prerequisites

## Install Vertex AI SDK for Rapid Evaluation

In [None]:
# Sign the Colab user in; presumably this supplies the credentials used by the
# Google Cloud calls later in the notebook — TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
!pip3 install --upgrade --quiet google-cloud-aiplatform[rapid_evaluation]==1.47

!pip install --quiet --upgrade nest_asyncio
!pip install --upgrade -q openai

## Setup

### Import libraries

In [None]:
# General
import inspect
from uuid import uuid4
from google.colab import auth
from IPython.display import display, Markdown, HTML
import json
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import logging
from tqdm import tqdm
import nest_asyncio
import warnings
import random
import string
import os

# Main
import vertexai
from vertexai.preview.evaluation import EvalTask, PromptTemplate, CustomMetric, make_metric
import pandas as pd
from google.cloud import aiplatform
from vertexai.generative_models import GenerativeModel, HarmCategory, HarmBlockThreshold
from openai import OpenAI

### Library settings

In [None]:
# Only show urllib3 connection-pool log records at ERROR level or above.
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
# Patch asyncio; presumably needed so nested event loops work inside the
# notebook kernel — TODO confirm against the nest_asyncio documentation.
nest_asyncio.apply()
# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

### Initialize Vertex AI SDK for Python

In [None]:
# Google Cloud project and region passed to the SDK below.
# NOTE(review): the project ID is hard-coded; replace it with your own project
# before running.
PROJECT_ID = "cloud-llm-preview1" # @param {type:"string"}
REGION = "us-central1"  # @param {type:"string"}

# Initialize the SDK with the project and location chosen above.
aiplatform.init(project=PROJECT_ID, location=REGION)

### Helper functions

In [None]:
def generate_uuid(length: int = 8) -> str:
    """Return a random identifier made of lowercase letters and digits.

    Despite the name, this is not an RFC 4122 UUID: it is ``length`` characters
    drawn (with replacement) from ``[a-z0-9]`` using the ``random`` module.

    Args:
        length: Number of characters in the identifier (default 8).

    Returns:
        The generated identifier.
    """
    alphabet = string.ascii_lowercase + string.digits
    picked = random.choices(alphabet, k=length)
    return "".join(picked)

def print_doc(function):
    """Print a callable's name followed by its docstring.

    The docstring is obtained with ``inspect.getdoc``.

    Args:
        function: Object exposing ``__name__``.
    """
    name = function.__name__
    doc = inspect.getdoc(function)
    print(f"{name}:\n{doc}\n")

def display_eval_report(eval_result, metrics = None):
    """Render an evaluation result as a title, a summary table and a per-row table.

    Args:
        eval_result: A 3-item sequence ``(title, summary_metrics, report_df)``
            where ``summary_metrics`` is a mapping of metric name to value and
            ``report_df`` is a DataFrame of per-row results.
        metrics: Optional iterable of metric-name substrings. When given, both
            tables are restricted to columns whose name contains at least one
            of the substrings.
    """
    title, summary_metrics, report_df = eval_result

    # One row, one column per summary metric.
    metrics_df = pd.DataFrame.from_dict(summary_metrics, orient='index').T

    if metrics:
        def _is_selected(column):
            return any(selected_metric in column for selected_metric in metrics)

        metrics_df = metrics_df.filter([column for column in metrics_df.columns if _is_selected(column)])
        report_df = report_df.filter([column for column in report_df.columns if _is_selected(column)])

    display(Markdown(f"## {title}"))

    display(Markdown("### Summary Metrics"))
    display(metrics_df)

    display(Markdown("### Report Metrics"))
    display(report_df)

def display_explanations(df, metrics=None, n=1):
    """Render a random sample of rows, one HTML section per column.

    Column names and cell values are HTML-escaped before rendering so that
    markup-like characters in the data (for example ``<`` or ``&`` inside a
    model response) are shown literally instead of being interpreted.

    Args:
        df: DataFrame whose rows are displayed.
        metrics: Optional iterable of metric-name substrings. When given, only
            the text columns 'instruction', 'context', 'reference',
            'completed_prompt' and 'response' (those that exist) plus columns
            whose name contains one of the substrings are shown.
        n: Number of rows to sample. Capped at ``len(df)`` so that asking for
            more rows than exist shows every row instead of raising.
    """
    # Local import: `html` is not part of the notebook's import cell.
    from html import escape

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    sample_size = n if n is None else min(n, len(df))
    df = df.sample(n=sample_size)

    if metrics:
        df = df.filter(
            ['instruction', 'context', 'reference', 'completed_prompt', 'response'] +
            [column for column in df.columns if any(selected_metric in column for selected_metric in metrics)]
        )

    for _, row in df.iterrows():
        for col in df.columns:
            heading = escape(str(col))
            body = escape(str(row[col]))
            display(
                HTML(
                    f"<h2>{heading}:</h2> <div style='{style}'>{body}</div>"
                )
            )
        # Horizontal rule between sampled rows.
        display(HTML("<hr>"))


def plot_radar_plot(eval_results, metrics = None):
    """Draw one radar (polar) trace per evaluation result on a shared figure.

    Args:
        eval_results: Iterable of 3-item sequences
            ``(title, summary_metrics, report_df)``; only the title and the
            summary-metric mapping are used.
        metrics: Optional iterable of metric-name substrings. When given, only
            summary metrics whose name contains one of them are plotted.
    """
    radar = go.Figure()

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(selected_metric in name for selected_metric in metrics)
            }

        trace = go.Scatterpolar(
            r=list(summary_metrics.values()),
            theta=list(summary_metrics.keys()),
            fill='toself',
            name=title,
        )
        radar.add_trace(trace)

    # The radial axis is fixed to the 0-5 range.
    radial_axis = dict(visible=True, range=[0, 5])
    radar.update_layout(polar=dict(radialaxis=radial_axis), showlegend=True)

    radar.show()

def plot_bar_plot(eval_results, metrics=None):
    """Draw a grouped bar chart with one bar series per evaluation result.

    Args:
        eval_results: Iterable of 3-item sequences
            ``(title, summary_metrics, report_df)``; only the title and the
            summary-metric mapping are used.
        metrics: Optional iterable of metric-name substrings. When given, only
            summary metrics whose name contains one of them are plotted.
    """
    bars = []

    for title, summary_metrics, _ in eval_results:
        if metrics:
            summary_metrics = {
                name: value
                for name, value in summary_metrics.items()
                if any(selected_metric in name for selected_metric in metrics)
            }

        bars.append(go.Bar(
            x=list(summary_metrics.keys()),
            y=list(summary_metrics.values()),
            name=title,
        ))

    # Build the figure once, from the collected traces.
    fig = go.Figure(data=bars)

    # Place the series side by side instead of stacking them.
    fig.update_layout(barmode='group')
    fig.show()

def print_aggregated_metrics(job):
  """Display a job's ROUGE-LSum score as a percentage headline.

  Args:
    job: Object exposing a numeric ``rougeLSum`` attribute; presumably a
      fraction in [0, 1], since it is multiplied by 100 for display.
  """
  # Scale to a percentage *before* rounding. Rounding first and multiplying
  # afterwards reintroduces float noise (0.567 -> 56.699999999999996).
  rougeLSum = round(job.rougeLSum * 100, 1)
  display(HTML(f"<h3>The {rougeLSum}% of the reference summary is represented by LLM when considering the longest common subsequence (LCS) of words.</h3>"))

def print_autosxs_judgments(df, n=3):
    """Display a random sample of AutoSxS judgments with confidence >= 0.5.

    For each sampled row that passes the confidence threshold, the document,
    both responses, the explanation and the confidence score are rendered as
    HTML sections followed by a horizontal rule. Values are HTML-escaped so
    markup-like characters in the text are shown literally.

    Args:
        df: DataFrame with the columns 'id_columns' (a mapping containing the
            key 'document'), 'response_a', 'response_b', 'explanation' and
            'confidence'.
        n: Number of rows to sample. Capped at ``len(df)`` so a frame with
            fewer rows than requested is shown in full instead of raising.
    """
    # Local import: `html` is not part of the notebook's import cell.
    from html import escape

    style = "white-space: pre-wrap; width: 800px; overflow-x: auto;"

    sample_size = n if n is None else min(n, len(df))
    df = df.sample(n=sample_size)

    for _, row in df.iterrows():
        if row["confidence"] >= 0.5:
            sections = (
                ("Document", row["id_columns"]["document"]),
                ("Response A", row["response_a"]),
                ("Response B", row["response_b"]),
                ("Explanation", row["explanation"]),
                ("Confidence score", row["confidence"]),
            )
            for heading, value in sections:
                display(
                    HTML(
                        f"<h2>{heading}:</h2> <div style='{style}'>{escape(str(value))}</div>"
                    )
                )
            display(HTML("<hr>"))


def print_autosxs_win_metrics(scores):
    """Display how often the AutoSxS autorater preferred Model B over Model A.

    Args:
        scores: Mapping containing the key ``"autosxs_model_b_win_rate"``; the
            value is multiplied by 100 and rounded to a whole percent.
    """
    win_rate_b = scores["autosxs_model_b_win_rate"]
    score_b = round(win_rate_b * 100)
    headline = f"<h3>AutoSxS Autorater prefers {score_b}% of time Model B over Model A </h3>"
    display(HTML(headline))

# Evaluate tool use and function calling quality for Gemini

#### Use Metric Bundle

The `tool_call_quality` metric bundle contains four metrics:
* `tool_call_valid`
* `tool_name_match`
* `tool_parameter_key_match`
* `tool_parameter_kv_match`

In [None]:
# Request the whole bundle by name rather than listing its metrics one by one.
metrics = ["tool_call_quality"]

## 1. Evaluate a Bring-Your-Own-Prediction dataset

A generative model's tool-use quality can be evaluated when the evaluation dataset contains the model's saved tool-call responses together with the expected references.

In [None]:
def _book_tickets_call(**arguments):
    """Serialize a single `book_tickets` tool call with the given arguments."""
    return json.dumps({
        "content": "",
        "tool_calls": [{"name": "book_tickets", "arguments": arguments}],
    })


# The fully specified booking that most rows share.
_FULL_BOOKING = {
    "movie": "Mission Impossible Dead Reckoning Part 1",
    "theater": "Regal Edwards 14",
    "location": "Mountain View CA",
    "showtime": "7:30",
    "date": "2024-03-30",
    "num_tix": "2",
}

# Saved model outputs, one per evaluated example.
response = [
    _book_tickets_call(**_FULL_BOOKING),
    _book_tickets_call(**_FULL_BOOKING),
    # Only two of the six arguments are present.
    _book_tickets_call(
        movie="Mission Impossible Dead Reckoning Part 1",
        theater="Regal Edwards 14",
    ),
    # All keys present, but theater and showtime differ from the reference.
    _book_tickets_call(**{**_FULL_BOOKING, "theater": "Cinemark", "showtime": "5:30"}),
]

# Expected tool calls, aligned by position with `response`.
reference = [
    _book_tickets_call(**_FULL_BOOKING),
    _book_tickets_call(**{**_FULL_BOOKING, "movie": "Godzilla", "showtime": "9:30"}),
    _book_tickets_call(**_FULL_BOOKING),
    _book_tickets_call(**_FULL_BOOKING),
]

eval_dataset = pd.DataFrame({"response": response, "reference": reference})

#### Define EvalTask

In [None]:
# Name of the experiment passed to the evaluation task below.
experiment_name = "eval-saved-llm-tool-use" # @param {type:"string"}

# Bundle the saved responses/references with the metric bundle into one task;
# no model is called here because the dataset already contains the responses.
tool_use_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=metrics,
    experiment=experiment_name,
)

In [None]:
# A fresh random suffix gives every evaluation run its own name.
run_id = generate_uuid()
experiment_run_name = f"eval-{run_id}"

eval_result = tool_use_eval_task.evaluate(experiment_run_name=experiment_run_name)

# Title, aggregate metrics and per-row table, in the order the report helper expects.
display_eval_report(
    (
        "Tool Use Quality Evaluation Metrics",
        eval_result.summary_metrics,
        eval_result.metrics_table,
    )
)

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/eval-saved-llm-tool-use-eval-3qx77913 to Experiment: eval-saved-llm-tool-use


## Tool Use Quality Evaluation Metrics

### Summary Metrics

Unnamed: 0,row_count,tool_call_valid/mean,tool_call_valid/std,tool_name_match/mean,tool_name_match/std,tool_parameter_key_match/mean,tool_parameter_key_match/std,tool_parameter_kv_match/mean,tool_parameter_kv_match/std
0,4.0,1.0,0.0,1.0,0.0,0.8325,0.335,0.6675,0.273542


### Report Metrics

Unnamed: 0,response,reference,tool_call_valid,tool_name_match,tool_parameter_key_match,tool_parameter_kv_match
0,"{""content"": """", ""tool_calls"": [{""name"": ""book_...","{""content"": """", ""tool_calls"": [{""name"": ""book_...",1.0,1.0,1.0,1.0
1,"{""content"": """", ""tool_calls"": [{""name"": ""book_...","{""content"": """", ""tool_calls"": [{""name"": ""book_...",1.0,1.0,1.0,0.67
2,"{""content"": """", ""tool_calls"": [{""name"": ""book_...","{""content"": """", ""tool_calls"": [{""name"": ""book_...",1.0,1.0,0.33,0.33
3,"{""content"": """", ""tool_calls"": [{""name"": ""book_...","{""content"": """", ""tool_calls"": [{""name"": ""book_...",1.0,1.0,1.0,0.67


In [None]:
# Show the runs recorded for this task's experiment (see the table output below).
tool_use_eval_task.display_runs()

Unnamed: 0,experiment_name,run_name,run_type,state,metric.tool_parameter_kv_match/mean,metric.tool_parameter_kv_match/std,metric.tool_name_match/std,metric.tool_call_valid/mean,metric.tool_parameter_key_match/std,metric.tool_parameter_key_match/mean,metric.tool_call_valid/std,metric.tool_name_match/mean,metric.row_count
0,eval-saved-llm-tool-use,eval-3qx77913,system.ExperimentRun,COMPLETE,0.6675,0.273542,0.0,1.0,0.335,0.8325,0.0,1.0,4.0


## 2. Tool Use and Function Calling with Gemini

[Function Calling Documentation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/function-calling)

### Define a function and tool

Define an API specification and register the function in a tool with the latest version of [Vertex AI SDK for Python](https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk).



In [None]:
from vertexai.generative_models import (
    Content,
    FunctionDeclaration,
    GenerativeModel,
    Part,
    Tool,
)

# Argument name -> description. Every argument is declared as a string and is
# required; this mapping's order is used for both "properties" and "required".
_BOOK_TICKETS_PARAM_DESCRIPTIONS = {
    "movie": "The title of the movie.",
    "theater": "The name of the movie theater.",
    "location": "The location of the movie theater.",
    "showtime": "The showtime of the movie in ISO 8601 format.",
    "date": "The date of the movie in ISO 8601 format.",
    "num_tix": "The integer number of tickets to book.",
}

# API specification of the function the model may call.
book_tickets_func = FunctionDeclaration(
    name="book_tickets",
    description="Book movie tickets",
    parameters={
        "type": "object",
        "properties": {
            param_name: {"type": "string", "description": param_description}
            for param_name, param_description in _BOOK_TICKETS_PARAM_DESCRIPTIONS.items()
        },
        "required": list(_BOOK_TICKETS_PARAM_DESCRIPTIONS),
    },
)

# Register the function declaration in a tool that can be passed to the model.
book_tickets_tool = Tool(function_declarations=[book_tickets_func])

### Generate a function call

Prompt the Gemini model and include the tool that you defined.

In [None]:
# Natural-language booking request sent to the model together with the tool.
prompt = """I'd like to book 2 tickets for the movie "Mission Impossible Dead Reckoning Part 1"
at the Regal Edwards 14 theater in Mountain View, CA. The showtime is 7:30 PM on March 30th, 2024.
"""

gemini_model = GenerativeModel('gemini-pro')

# Supplying the tool lets the model reply with a `book_tickets` function call
# (see the output below) rather than plain text.
gemini_response = gemini_model.generate_content(
    prompt,
    tools=[book_tickets_tool],
)

# Last expression of the cell: show the content of the first candidate.
gemini_response.candidates[0].content

role: "model"
parts {
  function_call {
    name: "book_tickets"
    args {
      fields {
        key: "date"
        value {
          string_value: "2024-03-30"
        }
      }
      fields {
        key: "location"
        value {
          string_value: "Mountain View, CA"
        }
      }
      fields {
        key: "movie"
        value {
          string_value: "Mission Impossible Dead Reckoning Part 1"
        }
      }
      fields {
        key: "num_tix"
        value {
          number_value: 2.0
        }
      }
      fields {
        key: "showtime"
        value {
          string_value: "19:30"
        }
      }
      fields {
        key: "theater"
        value {
          string_value: "Regal Edwards 14"
        }
      }
    }
  }
}

### Unpack the Gemini response into a Python dictionary

In [None]:
def unpack_response(response):
  """Convert a Gemini response into the JSON tool-call string used for evaluation.

  The returned JSON has the shape
  ``{"content": <text>, "tool_calls": [{"name": ..., "arguments": {...}}, ...]}``.
  Every part of the first candidate is inspected: function-call parts become
  entries of ``tool_calls`` (with the ``args`` key renamed to ``arguments``)
  and text parts are concatenated into ``content``. A response consisting of a
  single function call therefore yields ``"content": ""`` and one tool call.

  Args:
    response: Object exposing ``candidates[0].content.parts``, where each part
      provides ``to_dict()``.

  Returns:
    The JSON-encoded string described above.
  """
  text_chunks = []
  tool_calls = []
  for part in response.candidates[0].content.parts:
    part_dict = part.to_dict()
    if 'function_call' in part_dict:
      # Copy so the dictionary returned by to_dict() is not mutated.
      call = dict(part_dict['function_call'])
      # A call without arguments may carry no 'args' key at all.
      call['arguments'] = call.pop('args', {})
      tool_calls.append(call)
    elif part_dict.get('text'):
      text_chunks.append(part_dict['text'])
  return json.dumps({'content': ''.join(text_chunks), 'tool_calls': tool_calls})

# NOTE(review): this rebinds `response`, which earlier held the list of saved
# responses, to a single JSON string.
response = unpack_response(gemini_response)
response

'{"content": "", "tool_calls": [{"name": "book_tickets", "arguments": {"movie": "Mission Impossible Dead Reckoning Part 1", "num_tix": 2.0, "showtime": "19:30", "date": "2024-03-30", "theater": "Regal Edwards 14", "location": "Mountain View, CA"}}]}'

### Evaluate Gemini's function call response

In [None]:
# Expected tool call for the prompt, serialized in the same format as `response`.
reference = json.dumps({
    "content": "",
    "tool_calls": [
        {
            "name": "book_tickets",
            "arguments": {
                "movie": "Mission Impossible Dead Reckoning Part 1",
                "theater": "Regal Edwards 14",
                "location": "Mountain View CA",
                "showtime": "7:30",
                "date": "2024-03-30",
                "num_tix": "2",
            },
        }
    ],
})

# Single-row dataset: the model's actual call next to the expected call.
eval_dataset = pd.DataFrame({"response": [response], "reference": [reference]})

In [None]:
# Expected tool call (the reference), parsed from JSON so it displays as a dict
json.loads(eval_dataset.reference[0])

{'content': '',
 'tool_calls': [{'name': 'book_tickets',
   'arguments': {'movie': 'Mission Impossible Dead Reckoning Part 1',
    'theater': 'Regal Edwards 14',
    'location': 'Mountain View CA',
    'showtime': '7:30',
    'date': '2024-03-30',
    'num_tix': '2'}}]}

In [None]:
# Actual tool call produced by Gemini, parsed from JSON so it displays as a dict
json.loads(eval_dataset.response[0])

{'content': '',
 'tool_calls': [{'name': 'book_tickets',
   'arguments': {'movie': 'Mission Impossible Dead Reckoning Part 1',
    'num_tix': 2.0,
    'showtime': '19:30',
    'date': '2024-03-30',
    'theater': 'Regal Edwards 14',
    'location': 'Mountain View, CA'}}]}

In [None]:
# Separate experiment for the live Gemini function-call evaluation.
experiment_name = "eval-gemini-model-function-call" # @param {type:"string"}

# NOTE(review): "functiona" in the variable name looks like a typo for
# "function"; it is kept as-is because the following cell refers to this name.
gemini_functiona_call_eval_task = EvalTask(
    dataset=eval_dataset,
    metrics=['tool_call_quality'],
    experiment=experiment_name,
)

In [None]:
# A fresh random suffix gives every evaluation run its own name.
run_id = generate_uuid()

eval_result = gemini_functiona_call_eval_task.evaluate(experiment_run_name=f"eval-{run_id}")

# Title, aggregate metrics and per-row table, in the order the report helper expects.
display_eval_report(
    (
        "Gemini Tool Use Quality Evaluation Metrics",
        eval_result.summary_metrics,
        eval_result.metrics_table,
    )
)

INFO:google.cloud.aiplatform.metadata.experiment_resources:Associating projects/801452371447/locations/us-central1/metadataStores/default/contexts/eval-gemini-model-function-call-eval-qkfzuqbe to Experiment: eval-gemini-model-function-call


## Gemini Tool Use Quality Evaluation Metrics

### Summary Metrics

Unnamed: 0,row_count,tool_call_valid/mean,tool_call_valid/std,tool_name_match/mean,tool_name_match/std,tool_parameter_key_match/mean,tool_parameter_key_match/std,tool_parameter_kv_match/mean,tool_parameter_kv_match/std
0,1.0,1.0,,1.0,,1.0,,0.5,


### Report Metrics

Unnamed: 0,response,reference,tool_call_valid,tool_name_match,tool_parameter_key_match,tool_parameter_kv_match
0,"{""content"": """", ""tool_calls"": [{""name"": ""book_...","{""content"": """", ""tool_calls"": [{""name"": ""book_...",1.0,1.0,1.0,0.5
