<a href="https://colab.research.google.com/github/Amirosimani/ReWOO-Gemini/blob/main/ReWOO_gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.


|||
|----------|-------------|
| Author(s)   | amirimani@ |
| Last updated | 5/02/2025 |
<br><br>


In [None]:
!pip install --quiet datasets
!pip install --quiet langchain langchain_community
!pip install --quiet langchain_google_genai langchain_google_community
!pip install --quiet tiktoken

In [None]:
import re
import time
import tiktoken
from tqdm import tqdm
from datasets import load_dataset
from typing import List, Dict, Union, Callable, Any

from langchain import hub
from langchain import LLMChain, PromptTemplate
from langchain.agents import AgentExecutor, create_react_agent
from langchain_community.utilities import GoogleSearchAPIWrapper
from langchain.callbacks.base import BaseCallbackHandler
from langchain.schema import AgentAction, AgentFinish, LLMResult, BaseMessage
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.tools import Tool

from google.colab import userdata
from google import genai
from google.genai.types import GenerateContentConfig

In [None]:
# --- Configuration ---
# Default Gemini model for the baseline `generate` call and the ReAct agent.
MODEL = "gemini-2.0-flash"
# Limits handed to the ReAct AgentExecutor (max_iterations / max_execution_time).
MAX_ITERATIONS = 8
MAX_EXECUTION_TIME = 60

# Sampling settings shared by the baseline call, the ReAct agent and the ReWOO LLM nodes.
GENERATION_CONFIG = {
    "temperature": 0.8,
    "top_p": 0.95,
    "top_k": 20,
    "candidate_count": 1,
    "max_output_tokens": 8192,
}

# Secrets are read from the Colab `userdata` store under these exact names.
# GEMINI_API_KEY authenticates the Gemini calls; GOOGLE_API_KEY and CSE_ID
# are passed to GoogleSearchAPIWrapper for the search tool.
GEMINI_API_KEY = userdata.get('GEMINI')
GOOGLE_API_KEY = userdata.get('GOOGLE-API')
CSE_ID = userdata.get("CSE-ID")

## Data

[**StrategyQA**](https://paperswithcode.com/dataset/strategyqa) is a question answering benchmark where the required reasoning steps are implicit in the question, and should be inferred using a strategy. It includes 2,780 examples, each consisting of a strategy question, its decomposition, and evidence paragraphs. Questions in StrategyQA are short, topic-diverse, and cover a wide range of strategies.



In [None]:
ds = load_dataset("ChilleD/StrategyQA")

In [None]:
# Access the training and test splits
train_ds = ds["train"]
test_ds = ds['test']

# LLM

## Baseline

Use Gemini with no tools and no fancy prompts!

In [None]:
client = genai.Client(api_key=GEMINI_API_KEY)

In [None]:
def generate(prompt, model=MODEL, config=GENERATION_CONFIG):
    """Ask Gemini a question directly (no tools) and time the call.

    Args:
        prompt: The question text sent as the request contents.
        model: Name of the Gemini model to call.
        config: Generation settings; a system instruction asking for a
            Yes/No answer is added to a copy of this mapping.

    Returns:
        A dict with the keys 'input', 'output' (stripped response text),
        'total_token' (total token count reported by the API) and
        'wall_time' (seconds spent in the API call).
    """
    # Work on a copy: `config` defaults to the module-level GENERATION_CONFIG,
    # which is also passed to the LangChain models elsewhere in this notebook.
    # Mutating it in place would leak "system_instruction" into those consumers.
    request_config = dict(config)
    request_config["system_instruction"] = "Once you are done finding the answer, only return Yes or No"

    client = genai.Client(api_key=GEMINI_API_KEY)

    start_time = time.time()
    response = client.models.generate_content(
        model=model,
        contents=prompt,
        config=GenerateContentConfig(**request_config)
    )
    wall_time = time.time() - start_time

    return {
        'input': prompt,
        # `response.text` can be empty/None when no text part is returned;
        # fall back to an empty string instead of raising AttributeError.
        'output': (response.text or "").strip(),
        'total_token': response.usage_metadata.total_token_count,
        'wall_time': wall_time
    }

In [None]:
# generate(train_ds[1]['question'])

## ReAct Agent

In [None]:
class ReActAgentExecutor:
    """
    Run a LangChain ReAct agent backed by Gemini and a Google Search tool.

    Constructing an instance builds the chat model, the search tool and the
    AgentExecutor. `run` then answers a single question and reports the token
    count gathered by the callback handler plus the wall time.
    """

    def __init__(
        self,
        model: str = MODEL,
        generation_config: Dict = GENERATION_CONFIG,
        max_iterations: int = MAX_ITERATIONS,
        max_execution_time: int = MAX_EXECUTION_TIME,
        google_api_key: str = GOOGLE_API_KEY,
        cse_id: str = CSE_ID,
    ):
        """Store the settings and build the LLM, the tools and the agent.

        Args:
            model: Gemini model name for the chat model.
            generation_config: Generation settings forwarded to the chat model.
            max_iterations: Iteration cap forwarded to the AgentExecutor.
            max_execution_time: Time cap forwarded to the AgentExecutor.
            google_api_key: API key for the Google Search wrapper.
            cse_id: Custom Search Engine id for the Google Search wrapper.

        Raises:
            ValueError: If GEMINI_API_KEY is unset or still the placeholder.
        """
        self.model = model
        self.generation_config = generation_config
        self.max_iterations = max_iterations
        self.max_execution_time = max_execution_time
        self.google_api_key = google_api_key
        self.cse_id = cse_id
        self.llm = None
        self.tools = None
        self.agent = None
        self.agent_executor = None
        self.token_callback = None

        self._setup_llm()
        self._setup_tools()
        self._setup_agent()

    def _setup_llm(self):
        """Initializes the language model."""
        if not GEMINI_API_KEY or GEMINI_API_KEY == "your_gemini_api_key":
            raise ValueError("GEMINI_API_KEY must be set to a valid API key.")
        self.llm = ChatGoogleGenerativeAI(
            model=self.model,
            google_api_key=GEMINI_API_KEY,
            generation_config=self.generation_config,
        )

    def _setup_tools(self):
        """Sets up the tools for the agent (a single Google Search tool)."""
        search = GoogleSearchAPIWrapper(
            google_api_key=self.google_api_key, google_cse_id=self.cse_id
        )

        self.tools = [
            Tool(
                name="Google Search",
                func=search.run,
                description="Useful for finding information on current events, comparisons, or diverse perspectives.",
            ),
        ]

    def _setup_agent(self):
        """Sets up the ReAct agent and executor."""
        prompt = hub.pull("hwchase17/react")
        # Prepend the same Yes/No instruction the baseline uses so that all
        # executors are asked for the same answer format.
        system_instruction = "Once you are done finding the answer, only return Yes or No"
        prompt.template = system_instruction + "\n" + prompt.template

        self.agent = create_react_agent(self.llm, self.tools, prompt)

        self.token_callback = TokenCountingCallbackHandler(self.model)
        self.agent_executor = AgentExecutor(
            agent=self.agent,
            tools=self.tools,
            verbose=False,
            handle_parsing_errors=True,
            max_iterations=self.max_iterations,
            max_execution_time=self.max_execution_time,
            callbacks=[self.token_callback],
        )

    def run(self, input_data: Union[Dict, str]) -> Dict:
        """
        Runs the agent with the given input data.

        Args:
            input_data: Either a dictionary or a string representing the input for the agent.
                A string is wrapped as {"input": <string>}.

        Returns:
            The agent output dict with "total_token" and "wall_time" added. If
            the agent raises, a dict with "error" and "wall_time" instead.
        """
        if isinstance(input_data, str):
            input_data = {"input": input_data}

        start_time = time.time()  # Start timing
        try:
            result = self.agent_executor.invoke(input_data)
            result["total_token"] = self.token_callback.total_token
        except Exception as e:
            # Best effort: report the failure in the result instead of aborting
            # the surrounding experiment loop.
            print(f"An error occurred: {e}")
            result = {"error": str(e)}
        finally:
            # Always reset, otherwise tokens counted during a failed run would
            # be added to the next question's total.
            self.token_callback.reset()
        end_time = time.time()  # End timing

        # Log wall time
        wall_time = end_time - start_time
        print(f"Wall time for execution: {wall_time:.2f} seconds")
        result["wall_time"] = wall_time

        return result


class TokenCountingCallbackHandler(BaseCallbackHandler):
    """Callback handler for counting tokens used by the language model.

    Three running counters are kept (prompt text, completion text, agent
    action/finish logs) and `total_token` is always their sum.

    NOTE(review): counts come from tiktoken's "cl100k_base" encoding, which is
    presumably only an approximation of Gemini's own tokenizer. Agent logs may
    also overlap with completion text if both kinds of events are delivered;
    confirm against the callbacks actually received.
    """

    def __init__(self, model_name: str):
        self.model_name = model_name
        self.total_token = 0
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.agent_log_tokens = 0
        self.encoding = tiktoken.get_encoding("cl100k_base")

    def _update_total(self) -> None:
        """Recompute `total_token` from the three running counters."""
        self.total_token = (
            self.prompt_tokens + self.completion_tokens + self.agent_log_tokens
        )

    def on_llm_start(
        self, serialized: Dict[str, Any], prompts: List[str], **kwargs
    ) -> None:
        """Collect prompt tokens when LLM starts."""
        for prompt in prompts:
            self.prompt_tokens += len(self.encoding.encode(prompt))
        self._update_total()

    def on_llm_end(self, response: LLMResult, **kwargs) -> None:
        """Collect completion tokens when LLM finishes generating."""
        if response.generations:
            for generation_list in response.generations:
                for generation in generation_list:
                    if generation.text:
                        self.completion_tokens += len(
                            self.encoding.encode(generation.text)
                        )
        self._update_total()

    def on_agent_action(self, action: AgentAction, **kwargs) -> None:
        """Increment token count on agent action."""
        if action.log:
            self.agent_log_tokens += len(self.encoding.encode(action.log))
        self._update_total()

    def on_agent_finish(self, finish: AgentFinish, **kwargs) -> None:
        """Increment token count on agent finish."""
        if finish.log:
            self.agent_log_tokens += len(self.encoding.encode(finish.log))
        self._update_total()

    def on_chain_end(self, outputs, **kwargs) -> None:
        """Print the total tokens used when the chain finishes."""
        # Recompute instead of accumulating: the prompt/completion counters are
        # already cumulative, so adding them again on every chain-end event
        # would double count whenever this hook fires more than once per run.
        self._update_total()
        print(f"Prompt tokens: {self.prompt_tokens}")
        print(f"Completion tokens: {self.completion_tokens}")
        print(f"Total tokens used in this chain: {self.total_token}")

    def reset(self):
        """Reset the counters for the next chain run."""
        self.total_token = 0
        self.prompt_tokens = 0
        self.completion_tokens = 0
        self.agent_log_tokens = 0


In [None]:
# agent_executor = ReActAgentExecutor()
# result = agent_executor.run(train_ds[1]["question"])

## ReWOO

ReWOO: Decoupling Reasoning from Observations
for Efficient Augmented Language Models [paper](https://arxiv.org/pdf/2305.18323)

based on the implementation [here](https://github.com/billxbf/ReWOO/tree/main)

In [None]:
# --- Simplified Worker ---
class GoogleSearchWorker:
    """ReWOO worker that turns a search query into a short evidence string."""

    def __init__(self, name="Google"):
        self.name = name
        self.google_api_key = GOOGLE_API_KEY
        self.cse_id = CSE_ID
        self.description = (
            "Worker that searches results from Google. Useful when you need to find short "
            "and succinct answers about a specific topic. Input should be a search query."
        )

    def run(self, input):
        """Search Google for `input` and return the concatenated evidence text.

        One result is requested. For each returned entry the first available
        field among "snippet", "title" and "body" is used; entries with none
        of them only produce a warning.
        """
        client = GoogleSearchAPIWrapper(
            google_api_key=self.google_api_key, google_cse_id=self.cse_id
        )
        hits = client.results(input, 1)

        # Debug output: show the raw structure returned by the API.
        print("Results Structure:", hits)

        pieces = []
        for hit in hits:
            for field in ("snippet", "title", "body"):
                if field in hit:
                    pieces.append(hit[field])
                    break
            else:
                # None of the preferred fields is present in this entry.
                print("Warning: No relevant information found in result:", hit)

        return "".join(pieces)

# --- LLM Node (Simplified) ---
class LLMNode:
    """Base class for the ReWOO LLM stages (planner and solver).

    Wraps a ChatGoogleGenerativeAI model and reports token counts alongside
    the generated text.

    NOTE(review): token counts use tiktoken's "cl100k_base" encoding, which is
    presumably only an approximation of Gemini's tokenizer.
    """

    def __init__(self, name, model_name, stop=None, input_type=str, output_type=str):
        self.name = name
        self.model_name = model_name
        self.model = model_name
        self.stop = stop
        self.input_type = input_type
        self.output_type = output_type
        self.generation_config = GENERATION_CONFIG
        self.llm = ChatGoogleGenerativeAI(
            model=self.model,
            google_api_key=GEMINI_API_KEY,
            generation_config=self.generation_config,
        )
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def call_llm(self, prompt, stop):
        """Send `prompt` to the model and return the text with token counts.

        Args:
            prompt: Either a plain string, sent verbatim, or a two-item list
                `[template, inputs]` where `template` contains a `{question}`
                placeholder and `inputs` supplies the chain inputs.
            stop: Accepted for interface compatibility; not used here.

        Returns:
            A dict with "output" (stripped model text), "prompt_tokens" and
            "completion_tokens". For list prompts the prompt token count is
            taken from the unformatted template.
        """
        if isinstance(prompt, list):
            prompt_text = prompt[0]
            prompt_template = PromptTemplate(template=prompt_text, input_variables=["question"])
            chain_inputs = prompt[1]
        else:
            prompt_text = prompt
            # A plain prompt is literal text. Questions and search evidence can
            # contain "{" or "}", which the template engine would otherwise
            # read as variables and fail on, so escape them first.
            literal_template = prompt.replace("{", "{{").replace("}", "}}")
            prompt_template = PromptTemplate(template=literal_template, input_variables=[])
            chain_inputs = {}

        llm_chain = LLMChain(prompt=prompt_template, llm=self.llm, verbose=False)
        response = llm_chain(chain_inputs)
        output = response["text"].strip()

        prompt_tokens = len(self.tokenizer.encode(prompt_text))
        completion_tokens = len(self.tokenizer.encode(output))
        return {
            "output": output,
            "prompt_tokens": prompt_tokens,
            "completion_tokens": completion_tokens
        }

# --- Planner ---
class Planner(LLMNode):
    """ReWOO planning stage: asks the model for Plan / #E steps up front."""

    def __init__(self, model_name="gemini-pro", fewshot=""):
        super().__init__("Planner", model_name, stop=None, input_type=str, output_type=str)
        # Tool catalogue shown to the model; only the Google worker is offered.
        self.worker_prompt = "Tools can be one of the following:\nGoogle[input]: Worker that searches results from Google. Useful when you need to find short and succinct answers about a specific topic. Input should be a search query.\n\n"
        self.prefix = (
            "For the following tasks, make plans that can solve the problem step-by-step. For each plan, "
            "indicate which external tool together with tool input to retrieve evidence. You can store the "
            "evidence into a variable #E that can be called by later tools. (Plan, #E1, Plan, #E2, Plan, ...) \n\n"
        )
        self.suffix = "Begin! Describe your plans with rich details. Each Plan should be followed by only one #E.\n\n"
        self.fewshot = fewshot

    def run(self, input, log=False):
        """Build the planning prompt for `input` and query the model.

        Returns the full response dict (output plus token counts) when `log`
        is true, otherwise only the generated plan text.
        """
        segments = [self.prefix, self.worker_prompt, self.fewshot, self.suffix, input, '\n']
        reply = self.call_llm("".join(segments), self.stop)
        return reply if log else reply["output"]

# --- Solver ---
class Solver(LLMNode):
    """ReWOO solving stage: answers the question from the plans and evidence."""

    def __init__(self, model_name="gemini-pro"):
        super().__init__("Solver", model_name, stop=None, input_type=str, output_type=str)
        self.prefix = "Solve the following task or problem. To assist you, we provide some plans and corresponding evidences that might be helpful. Notice that some of these information contain noise so you should trust them with caution.\n\n"
        self.suffix = "\nNow begin to solve the task or problem. Respond with the answer directly with no extra words. Your answer should be either Yes or No\n\n"

    def run(self, input, worker_log, log=False):
        """Build the solving prompt and query the model.

        The question appears twice: once before the plan/evidence log and once
        after the final instruction. Returns the full response dict when `log`
        is true, otherwise only the answer text.
        """
        segments = [self.prefix, input, "\n", worker_log, self.suffix, input, '\n']
        reply = self.call_llm("".join(segments), self.stop)
        return reply if log else reply["output"]

# --- Main PWS Class ---
class PWS:
    """Planner-Worker-Solver (ReWOO) pipeline.

    The planner writes all plans and tool calls up front, the worker executes
    the `Google[...]` calls (substituting earlier `#E<n>` evidence into later
    queries), and the solver answers the question from the collected
    plan/evidence log.
    """

    # Matches an evidence label such as "#E1" or "#E12".
    _EVIDENCE_LABEL = re.compile(r"#E\d+")

    def __init__(self, planner_model="gemini-pro", solver_model="gemini-pro", fewshot=""):
        self.worker = GoogleSearchWorker()
        self.planner = Planner(model_name=planner_model, fewshot=fewshot)
        self.solver = Solver(model_name=solver_model)
        self.plans = []
        self.planner_evidences = {}
        self.worker_evidences = {}
        self.tokenizer = tiktoken.get_encoding("cl100k_base")

    def run(self, input):
        """Answer `input` with the plan -> work -> solve pipeline.

        Returns:
            A dict with "input", "output" (solver answer), "wall_time"
            (seconds for the whole pipeline) and "total_token".
        """
        self._reinitialize()
        st = time.time()

        # Plan
        planner_response = self.planner.run(input, log=True)
        plan = planner_response["output"]
        self.plans = self._parse_plans(plan)
        self.planner_evidences = self._parse_planner_evidences(plan)

        # Work (skipped when the plan does not follow the #E1, #E2, ... scheme)
        if self._validate_plan():
            self._get_worker_evidences()
            worker_log = self._build_worker_log()
        else:
            print("Warning: Invalid plan generated. Skipping worker and passing the question to solver.")
            worker_log = ""

        # Solve
        solver_response = self.solver.run(input, worker_log, log=True)

        prompt_tokens = planner_response["prompt_tokens"] + solver_response["prompt_tokens"]
        completion_tokens = planner_response["completion_tokens"] + solver_response["completion_tokens"]

        return {
            "input": input,
            "output": solver_response["output"],
            "wall_time": time.time() - st,
            # Prompt + completion tokens of both LLM calls, so the figure is
            # comparable with the baseline and ReAct totals. The worker
            # evidence is part of the solver prompt and is therefore already
            # included; adding it separately would count it twice.
            "total_token": prompt_tokens + completion_tokens,
        }

    def _build_worker_log(self):
        """Interleave each plan line with the evidence gathered for it."""
        sections = []
        for i, plan in enumerate(self.plans, start=1):
            e = f"#E{i}"
            if e in self.worker_evidences:
                sections.append(f"{plan}\nEvidence:\n{self.worker_evidences[e]}\n")
            else:
                sections.append(f"{plan}\nEvidence:\nNo evidence found for {e}\n")
                print(f"Warning: No evidence found for {e} in self.worker_evidences")
        return "".join(sections)

    def _validate_plan(self):
        """
        Validates if the generated plan has the correct #E notation in sequence.
        """
        return all(
            f"#E{i}" in self.planner_evidences
            for i in range(1, len(self.plans) + 1)
        )

    def _parse_plans(self, response):
        """Return the lines of `response` that start with "Plan:"."""
        return [line for line in response.splitlines() if line.startswith("Plan:")]

    def _parse_planner_evidences(self, response):
        """Map evidence labels to their tool calls.

        Only lines starting with "#E<digit>" are considered; shorter lines such
        as "#" or "#E" are ignored rather than raising IndexError. Lines
        without "=" are recorded as "No evidence found" under the stripped
        line text.
        """
        evidences = {}
        for line in response.splitlines():
            if not self._EVIDENCE_LABEL.match(line):
                continue
            label, separator, tool_call = line.partition("=")
            if separator:
                evidences[label.strip()] = tool_call.strip()
            else:
                # Handle cases where there's no '=' after #E
                evidences[label.strip()] = "No evidence found"
                print(f"Warning: Invalid planner evidence format: {line}")
        return evidences

    def _substitute_evidence(self, match):
        """Replace a "#E<n>" reference with its bracketed evidence, if known."""
        var = match.group(0)
        if var in self.worker_evidences:
            return "[" + self.worker_evidences[var] + "]"
        return var

    def _get_worker_evidences(self):
        """Run every planned `Google[...]` call and store its evidence."""
        for e, tool_call in self.planner_evidences.items():
            if not tool_call.startswith("Google["):
                self.worker_evidences[e] = "No evidence found"
                continue
            tool_input = tool_call[len("Google["):]
            if tool_input.endswith("]"):
                tool_input = tool_input[:-1]
            # Substitute whole labels in one regex pass: a plain str.replace of
            # "#E1" would also rewrite the prefix of "#E10", "#E11", ...
            tool_input = self._EVIDENCE_LABEL.sub(self._substitute_evidence, tool_input)

            self.worker_evidences[e] = self.worker.run(tool_input)

    def _reinitialize(self):
        """Clear per-question state before a new run."""
        self.plans = []
        self.planner_evidences = {}
        self.worker_evidences = {}

    def _count_tokens(self, text):
        """Return the number of tokens in `text` under the configured encoding."""
        return len(self.tokenizer.encode(text))

In [None]:
# rewoo_executor = PWS()
# result = rewoo_executor.run(train_ds[1]["question"])
# result

# Run evaluation

In [None]:
def run_experiment(data: Dict[str, Any]) -> Dict[str, Any]:
    """Run every executor on one dataset example and collect the results.

    Args:
        data: A single example providing at least the keys "question" and
            "answer".

    Returns:
        A dict with one entry per executor ("gemini_2_flash", "gemini_2_lite",
        "agent_executor", "rewoo_executor") plus the ground truth under "gt".
    """
    question = data["question"]

    generate_result = generate(question)
    agent_result = ReActAgentExecutor().run(question)
    rewoo_result = PWS().run(question)
    # Send the actual question to the lite model, not the literal string
    # "question".
    lite_result = generate(prompt=question,
                           model="gemini-2.0-flash-lite-preview-02-05")

    return {
        "gemini_2_flash": generate_result,
        "gemini_2_lite": lite_result,
        "agent_executor": agent_result,
        "rewoo_executor": rewoo_result,
        "gt": data['answer']
    }

In [None]:
import json
# Opened in append mode, so records already in the file from earlier runs are kept.
with open('./results_train_20250205__2.jsonl', "a") as file:
    # Only the training examples from index 150 onward are processed here.
    for data in train_ds.select(range(150, len(train_ds))):
        result = run_experiment(data)
        # One JSON object per line (JSONL).
        json.dump(result, file)
        file.write("\n")

# Analyse results

In [None]:
import os
import json
import glob
import statistics
import numpy as np
import pandas as pd
import seaborn as sns
from pprint import pprint
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

In [None]:
directory_path = "./"  # Change this to your directory containing JSONL files
jsonl_files = glob.glob(os.path.join(directory_path, "*.jsonl"))

# Gather every parseable record from every JSONL file into one list.
all_data = []
for file_path in jsonl_files:
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file:
            try:
                record = json.loads(line.strip())
            except json.JSONDecodeError as e:
                # Malformed lines are reported and skipped, not fatal.
                print(f"Skipping invalid line in {file_path}: {e}")
            else:
                all_data.append(record)

print(f"Loaded {len(all_data)} records from {len(jsonl_files)} files.")

In [None]:
df = pd.DataFrame(all_data)
df.shape

In [None]:

def expand_dict_columns(df):
    """Flatten dict-valued columns into one column per key.

    Each column that holds at least one dict is replaced by columns named
    "<column>_<key>". Cells that are not dicts (for example NaN from a failed
    run) become all-missing rows in the expanded columns. The input frame is
    not modified and the row index is preserved.

    Args:
        df: DataFrame whose columns may contain dicts.

    Returns:
        A new DataFrame with the dict columns expanded.
    """
    new_df = df.copy()  # Create a copy to avoid modifying the original DataFrame

    # Look at every cell, not just the first row, so a leading NaN does not
    # hide a dict column and an empty frame does not raise IndexError.
    cols_to_expand = [
        col
        for col in new_df.columns
        if new_df[col].map(lambda value: isinstance(value, dict)).any()
    ]

    for col in cols_to_expand:
        try:
            # Normalise the dicts directly. The values are already Python
            # objects, so a str()/eval() round trip is unnecessary, unsafe,
            # and fails with NameError on NaN cells.
            records = [value if isinstance(value, dict) else {} for value in new_df[col]]
            expanded_data = pd.json_normalize(records)
            # json_normalize returns a fresh RangeIndex; realign with the
            # source rows so the concat below cannot shift or duplicate rows.
            expanded_data.index = new_df.index
            expanded_data = expanded_data.add_prefix(col + "_")
            new_df = pd.concat([new_df.drop(columns=[col]), expanded_data], axis=1)
        except (TypeError, ValueError) as e:
            # Handle exceptions gracefully, skip the column, and continue.
            print(f"Warning: Could not expand column '{col}'.  Likely inconsistent data types.  Skipping. Error: {e}")
            continue

    return new_df

In [None]:
df = expand_dict_columns(df)
# Keep only rows where both agent-style executors produced an answer. The
# subset previously listed 'agent_executor_output' twice.
# NOTE(review): 'rewoo_executor_output' is the presumed intended second column - confirm.
df = df.dropna(subset=['agent_executor_output', 'rewoo_executor_output'])
df.shape

In [None]:
df.columns



In [None]:
output_cols = [col for col in df.columns if "output" in col]
output_cols

In [None]:
for col in output_cols:
    # Lowercase string answers only; NaN and other non-string values pass through.
    lowered = df[col].map(lambda x: x.lower() if isinstance(x, str) else x)
    # Exact "yes"/"no" become booleans, then every value is stringified, so
    # the column ends up holding "True", "False" or the untouched answer text.
    df[col] = lowered.replace({'yes': True, 'no': False}).astype(str)

In [None]:
for col in output_cols:
    unique_count = df[col].nunique()
    unique_values = df[col].unique()
    print(f"Column '{col}' has {unique_count} unique values.", unique_values)

In [None]:
for col in output_cols:
    unique_count = df[col].value_counts()
    print(f"Column '{col}' has {unique_count}")

In [None]:
df['gt'] = df['gt'].astype(str)

In [None]:
def generate_report(executor, data):
    """Build a classification report for one executor's answers.

    Any prediction or ground-truth value other than the strings "True" and
    "False" is bucketed as "Wrong" before scoring.

    Args:
        executor: Name of the column in `data` holding the predictions.
        data: Table with that column and a 'gt' column.

    Returns:
        The `classification_report` output as a dict.
    """
    valid_labels = ("True", "False")

    def _bucket(value):
        return value if value in valid_labels else "Wrong"

    y_pred = list(map(_bucket, data[executor]))
    y_true = list(map(_bucket, data['gt']))

    return classification_report(
        y_true,
        y_pred,
        labels=["True", "False", "Wrong"],
        output_dict=True,
    )

In [None]:
reports = {}
for col in output_cols:

  reports[col] = generate_report(col, df)
  # print(f"result for {col} is {r}")

In [None]:
sel_cols = [col for col in df.columns if "token" in col]
# sel_cols = [col for col in df.columns if "time" in col]

df[sel_cols].describe()


mean_values = df[sel_cols].mean()
std_values = df[sel_cols].std()

# Create a summary table
summary = pd.DataFrame({'Mean': mean_values, 'Std Dev': std_values})
summary = summary.round(2)

# Display the summary table
print(summary.to_markdown(numalign="left", stralign="left"))

In [None]:
df_reports = pd.DataFrame({model: {metric: report["weighted avg"][metric] for metric in ["precision", "recall", "f1-score"]}
                           for model, report in reports.items()}).T

df_reports

In [None]:
# Look the wall-time means up by executor name. `mean_values` holds the token
# means (see `sel_cols`), and positional assignment would also depend on the
# column order matching the report index.
wall_time_means = df.filter(like="wall_time").mean()
df_reports['mean_wall_time'] = [
    wall_time_means.get(name.rsplit("_output", 1)[0] + "_wall_time", float("nan"))
    for name in df_reports.index
]
df_reports

In [None]:
# Look the token means up by executor name instead of relying on the positional
# order of `mean_values` (and on which `sel_cols` line was last active).
token_means = df.filter(like="total_token").mean()
df_reports['mean_token__count'] = [
    token_means.get(name.rsplit("_output", 1)[0] + "_total_token", float("nan"))
    for name in df_reports.index
]
df_reports

In [None]:
import numpy as np
from math import pi
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler


def create_spider_chart(df, title="Spider Chart"):
    """Draw a spider (radar) chart with one polygon per row of `df`.

    Args:
        df: Table whose columns are the chart axes and whose index labels
            name the series shown in the legend.
        title: Title placed above the chart.
    """
    categories = list(df.columns)  # The columns become the categories
    num_categories = len(categories)

    # One angle per category; repeat the first to "close" the polygon.
    angles = [n / float(num_categories) * 2 * pi for n in range(num_categories)]
    angles += angles[:1]

    # Color mapping based on index names. `plt.get_cmap` is used because
    # `matplotlib.cm.get_cmap` was removed in Matplotlib 3.9.
    unique_indices = df.index.unique()
    color_map = plt.get_cmap("viridis", len(unique_indices))

    plt.figure(figsize=(8, 8))  # Adjust figure size as needed
    ax = plt.subplot(111, polar=True)

    for i, index_name in enumerate(unique_indices):
        values = df.loc[index_name].values.flatten().tolist()  # Get values for current index
        values += values[:1]  # Close the plot

        color = color_map(i)  # Get color from colormap
        ax.plot(angles, values, marker='o', linestyle='-', color=color, label=index_name)
        ax.fill(angles, values, alpha=0.25, color=color)  # Fill area under the line

    ax.set_xticks(angles[:-1])  # Set ticks for each category
    ax.set_xticklabels(categories)  # Set category labels
    ax.set_yticklabels([])  # Remove y ticks

    ax.grid(True)
    plt.title(title, y=1.1)  # Adjust title position
    plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))  # Place legend outside the plot

    plt.tight_layout()  # Adjust layout to prevent labels from overlapping
    plt.show()

def create_standardized_spider_chart(df, title="Standardized Spider Chart"):
    """Z-score every column of `df` independently, then plot a spider chart.

    The input frame is left untouched; scaling is applied to a copy.
    """
    standardized = df.copy()
    for name in standardized.columns:
        # A fresh scaler per column keeps each axis on its own scale.
        single_column = standardized[[name]]
        standardized[name] = StandardScaler().fit_transform(single_column)

    create_spider_chart(standardized, title=title)

In [None]:

create_standardized_spider_chart(df_reports)

In [None]:
df_sel = df.filter(like="token")

# Calculate the average value for each column
column_means = df_sel.mean()

# Sort the columns based on their average values (ascending)
sorted_columns = column_means.sort_values().index

# Melt the DataFrame using the sorted column order
df_melted = pd.melt(df_sel[sorted_columns], var_name='column', value_name='value')

# Set figure size
plt.figure(figsize=(10, 6))

# Create violin plot, ensuring order follows sorted means
sns.violinplot(x='value', y='column', data=df_melted, order=sorted_columns, orient='h', palette='Set3', linewidth=1.2, cut=0, scale="width")

# Enhancements for better visualization
plt.xlabel('Total Token Count', fontsize=12)
plt.ylabel('Executor', fontsize=12)
plt.title('Token Count Violin Plots', fontsize=14, fontweight='bold')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

sns.despine(left=True)

plt.tight_layout()
plt.show()

In [None]:
df_sel = df.filter(like="wall")

# Calculate the average value for each column
column_means = df_sel.mean()

# Sort the columns based on their average values (ascending)
sorted_columns = column_means.sort_values().index

# Melt the DataFrame using the sorted column order
df_melted = pd.melt(df_sel[sorted_columns], var_name='column', value_name='value')

# Set figure size
plt.figure(figsize=(10, 6))

# Create violin plot, ensuring order follows sorted means
sns.violinplot(x='value', y='column', data=df_melted, order=sorted_columns, orient='h', palette='Set3', linewidth=1.2, cut=0, scale="width")

# Enhancements for better visualization
plt.xlabel('Total Wall Time', fontsize=12)
plt.ylabel('Executor', fontsize=12)
plt.title('Latency Plot', fontsize=14, fontweight='bold')
plt.xticks(fontsize=10)
plt.yticks(fontsize=10)

sns.despine(left=True)

plt.tight_layout()
plt.show()

In [None]:
df_sel.describe()

# TODO:

[x] add baseline

[] native google search retrieval

[x] callback/count tokens for react

[x] ReWOO

[x] add walltime to all other functions -> # token, time, accuracy

[] spider charts

[x] deepseek

# Deploy DeepSeek

In [None]:
# %pip install --upgrade --user --quiet "google-cloud-aiplatform[reasoningengine, evaluation]" "openai" "smolagents" \
#     "cloudpickle==3.0.0" \
#     "pydantic>=2.10" \
#     "requests"

In [None]:
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

In [None]:
# import sys

# if "google.colab" in sys.modules:
#     from google.colab import auth

#     auth.authenticate_user()

In [None]:
# PROJECT_ID = "amir-genai-bb"
# LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# vertexai.init(project=PROJECT_ID, location=LOCATION, staging_bucket=BUCKET_URI)

In [None]:
# BUCKET_NAME = "deepseek-amir"
# BUCKET_URI = f"gs://{BUCKET_NAME}"

# ! gsutil mb -p $PROJECT_ID -l $LOCATION $BUCKET_URI
# MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

In [None]:
# import os

# import vertexai
# from huggingface_hub import get_token
# from google.cloud import aiplatform

In [None]:
# deepseek_model = aiplatform.Model.upload(
#     display_name=MODEL_ID.replace("/", "--").lower(),
#     serving_container_image_uri="us-docker.pkg.dev/deeplearning-platform-release/vertex-model-garden/vllm-inference.cu121.0-6.ubuntu2204.py310",
#     serving_container_args=[
#         "python",
#         "-m",
#         "vllm.entrypoints.api_server",
#         "--host=0.0.0.0",
#         "--port=8080",
#         f"--model={MODEL_ID}",
#         "--tensor-parallel-size=1",
#         "--max-model-len=16384",
#         "--enforce-eager",
#     ],
#     serving_container_ports=[8080],
#     serving_container_predict_route="/generate",
#     serving_container_health_route="/ping",
#     serving_container_environment_variables={
#         "HF_TOKEN": get_token(),
#         "DEPLOY_SOURCE": "notebook",
#     },
# )
# deepseek_model.wait()

In [None]:
# deepseek_endpoint = aiplatform.Endpoint.create(
#     display_name=MODEL_ID.replace("/", "--").lower() + "-endpoint"
# )

# deployed_deepseek_model = deepseek_model.deploy(
#     endpoint=deepseek_endpoint,
#     machine_type="g2-standard-12",
#     accelerator_type="NVIDIA_L4",
#     accelerator_count=1,
#     sync=False,
# )

## Use DeepSeek

In [None]:
import os
from google.cloud import aiplatform

PROJECT_ID = "amir-genai-bb"
LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

# Initialize the Vertex AI client
aiplatform.init(project=PROJECT_ID, location=LOCATION)


In [None]:
# Get a list of endpoints
endpoints = aiplatform.Endpoint.list()
endpoints

In [None]:
endpoints[0].name

In [None]:
endpoint = aiplatform.Endpoint(endpoint_name="5120549895366770688")

In [None]:
prediction_request = {
    "instances": [
        {
            "@requestFormat": "textGeneration",
            "prompt":"Count the number of 'r' in the word Strawberry. only return the final answer.",
            "max_tokens": 2048,
            "temperature": 0.7,
        }
    ]
}

In [None]:

output.predictions[0]

In [None]:

output = endpoint.predict(instances=prediction_request["instances"])
for prediction in output.predictions[0]:
    print("------- DeepSeek prediction -------")
    print(prediction["message"]["content"])
    print("---------------------------------\n")

In [None]:
prediction = endpoint.predict(instances=prediction_request["instances"])

In [None]:
prediction