# Set up Project

In [1]:
import os, re, time, torch
from dotenv import load_dotenv
from agents import Agent, function_tool, ModelSettings
from agents.extensions.models.litellm_model import LitellmModel
from transformers import AutoModelForCausalLM, AutoTokenizer
from utils import extracted_box, len_extract_boxed
from grader import math_equal
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [None]:
from agents import Agent, InputGuardrail, GuardrailFunctionOutput, Runner
from agents.exceptions import InputGuardrailTripwireTriggered
from pydantic import BaseModel
import asyncio

In [23]:
from openai.types.shared.reasoning import Reasoning

from agents import Agent, ModelSettings, Runner, trace
from agents.items import ReasoningItem

In [19]:
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
MODEL = "gemini/gemini-2.5-flash"
GPUS="0 5"
SAMPLES=10
SEED=0
DATASET="math_500"
MAX_TOKENS=8000

# Load Data

In [16]:
import sys
current_dir = os.getcwd()
from pathlib import Path
sys.path.append(str(Path(current_dir).parent))

In [17]:
from tools.data_loader import read_data
sys.path

['/data/long/miniconda3/envs/delegate/lib/python310.zip',
 '/data/long/miniconda3/envs/delegate/lib/python3.10',
 '/data/long/miniconda3/envs/delegate/lib/python3.10/lib-dynload',
 '',
 '/data/long/miniconda3/envs/delegate/lib/python3.10/site-packages',
 '/data/long/hai/Individual_Project/Delegate_SLM_Focus/experiments',
 '/tmp/tmpwswdq2no',
 '/data/long/hai/Individual_Project/Delegate_SLM_Focus']

In [None]:
test_df = read_data(DATASET, n_samples=SAMPLES, random_seed=SEED)

Loaded 10 problems from GSM8K (test split)


# Host and Request Model

In [None]:
# !CUDA_VISIBLE_DEVICES=7 vllm serve Qwen/Qwen2.5-1.5B-Instruct \
#   --tensor-parallel-size 1 \
#   --gpu-memory-utilization 0.9 \
#   --enable-auto-tool-choice \
#   --tool-call-parser hermes \
#   --reasoning-parser deepseek_r1


In [None]:
# CUDA_VISIBLE_DEVICES=6,7 vllm serve Qwen/Qwen2.5-Math-1.5B-Instruct --tensor-parallel-size 2 --gpu-memory-utilization 0.9

In [None]:
# CUDA_VISIBLE_DEVICES=7 vllm serve Qwen/Qwen2.5-1.5B-Instruct --tensor-parallel-size 1 --gpu-memory-utilization 0.9

## Request Model (normal Framework)

In [40]:
import requests
import json

# === Configuration ===
BASE_URL = "http://localhost:8000/v1"
MODEL_NAME = "Qwen/Qwen3-4B-Instruct-2507"  # or "Qwen3-4B" if you used --served-model-name

# === Send request ===
payload = {
    "model": MODEL_NAME,
    "messages": [
        {"role": "system", "content": "You are a helpful AI assistant."},
        {"role": "user", "content": "Write a haiku for me about loving."}
    ],
}

response = requests.post(
    f"{BASE_URL}/chat/completions",
    headers={"Content-Type": "application/json"},
    data=json.dumps(payload)
)

# === Display result ===
if response.status_code == 200:
    data = response.json()
    print("✅ Response:\n")
    print(data["choices"][0]["message"]["content"])
else:
    print(f"❌ Error {response.status_code}: {response.text}")

✅ Response:

Here's a haiku about loving:

**Warmth spreads through the quiet,  
A steady hand, a gentle voice—  
Heart finds its true home.**

*   **Line 1 (5 syllables):** Establishes the feeling of deep, quiet love ("Warmth spreads through the quiet").
*   **Line 2 (7 syllables):** Shows the actions and presence of love ("A steady hand, a gentle voice").
*   **Line 3 (5 syllables):** Conveys the ultimate, fulfilling destination of love ("Heart finds its true home").


In [43]:
data

{'id': 'chatcmpl-b187d3b2ebcd4b1cb6d9e17bddc06f95',
 'object': 'chat.completion',
 'created': 1762947653,
 'model': 'Qwen/Qwen3-4B-Instruct-2507',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': 'Here\'s a haiku about loving:\n\n**Warmth spreads through the quiet,  \nA steady hand, a gentle voice—  \nHeart finds its true home.**\n\n*   **Line 1 (5 syllables):** Establishes the feeling of deep, quiet love ("Warmth spreads through the quiet").\n*   **Line 2 (7 syllables):** Shows the actions and presence of love ("A steady hand, a gentle voice").\n*   **Line 3 (5 syllables):** Conveys the ultimate, fulfilling destination of love ("Heart finds its true home").',
    'refusal': None,
    'annotations': None,
    'audio': None,
    'function_call': None,
    'tool_calls': [],
    'reasoning_content': None},
   'logprobs': None,
   'finish_reason': 'stop',
   'stop_reason': None,
   'token_ids': None}],
 'service_tier': None,
 'system_fingerprint': None,
 'usage'

## Agents SDK Model

In [44]:
INSTRUCTIONS = """
You are a math calculator. Put the final answer in \\boxed{} at the end.
"""

In [45]:
agent = Agent(
    name="Math Router",
    instructions=INSTRUCTIONS,
    model=LitellmModel(model="hosted_vllm/Qwen/Qwen3-4B-Instruct-2507", base_url="http://localhost:8000/v1"),
    model_settings=ModelSettings(
        max_tokens=1024,
        parallel_tool_calls=False,
        reasoning={"effort": "low"},
        include_usage=True,
    ),
    # tools=[slm_help],
)

In [46]:
result = await Runner.run(agent, test_df.iloc[2]['problem'])

In [47]:
result

RunResult(input='What is $\\left(4\\dfrac{5}{8}\\right)^{55} \\cdot \\left(\\dfrac{8}{37}\\right)^{55}$?', new_items=[MessageOutputItem(agent=Agent(name='Math Router', handoff_description=None, tools=[], mcp_servers=[], mcp_config={}, instructions='\nYou are a math calculator. Put the final answer in \\boxed{} at the end.\n', prompt=None, handoffs=[], model=<agents.extensions.models.litellm_model.LitellmModel object at 0x7f549f88f4c0>, model_settings=ModelSettings(temperature=None, top_p=None, frequency_penalty=None, presence_penalty=None, tool_choice=None, parallel_tool_calls=False, truncation=None, max_tokens=1024, reasoning=Reasoning(effort='low', generate_summary=None, summary=None), verbosity=None, metadata=None, store=None, include_usage=True, response_include=None, top_logprobs=None, extra_query=None, extra_body=None, extra_headers=None, extra_args=None), input_guardrails=[], output_guardrails=[], output_type=None, hooks=None, tool_use_behavior='run_llm_again', reset_tool_choice

In [51]:
print(result.final_output)

We are given the expression:

$$
\left(4\dfrac{5}{8}\right)^{55} \cdot \left(\dfrac{8}{37}\right)^{55}
$$

---

### Step 1: Convert the mixed number to an improper fraction

$$
4\dfrac{5}{8} = \dfrac{4 \times 8 + 5}{8} = \dfrac{32 + 5}{8} = \dfrac{37}{8}
$$

So the expression becomes:

$$
\left(\dfrac{37}{8}\right)^{55} \cdot \left(\dfrac{8}{37}\right)^{55}
$$

---

### Step 2: Use the property of exponents: $ a^n \cdot b^n = (a \cdot b)^n $

$$
\left(\dfrac{37}{8} \cdot \dfrac{8}{37}\right)^{55}
$$

Now simplify the product inside the parentheses:

$$
\dfrac{37}{8} \cdot \dfrac{8}{37} = \dfrac{37 \cdot 8}{8 \cdot 37} = 1
$$

So the expression becomes:

$$
1^{55} = 1
$$

---

### ✅ Final Answer:

$$
\boxed{1}
$$


In [49]:
response = result.raw_responses[0]
print("Input tokens:", response.usage.input_tokens)
print("Output tokens:", response.usage.output_tokens)
print("Total tokens:", response.usage.total_tokens)
# Access detailed breakdown
print("Reasoning tokens:", response.usage.output_tokens_details.reasoning_tokens)
print("Cached tokens:", response.usage.input_tokens_details.cached_tokens)


# Output Results
print("-"*80)
print("The final result is:")

Input tokens: 70
Output tokens: 306
Total tokens: 376
Reasoning tokens: 0
Cached tokens: 0
--------------------------------------------------------------------------------
The final result is:
