From 6d489a351085f6fedacdc046e746fd99bc351479 Mon Sep 17 00:00:00 2001 From: lqc Date: Tue, 31 Mar 2026 11:32:58 +0800 Subject: [PATCH] update docs --- index.html | 2 +- search/search_index.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/index.html b/index.html index 54eab3a..c330c57 100644 --- a/index.html +++ b/index.html @@ -1509,7 +1509,7 @@

Team

Citation

@misc{clawr1-2026,
   title={Claw-R1: The Data Foundation for Agentic Reinforcement Learning},
-  author={Wang, Daoyu and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi},
+  author={Wang, Daoyu and Li, Qingchuan and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi},
   year={2025},
   howpublished={\url{https://github.com/AgentR1/Claw-R1}},
   note={GitHub repository}
diff --git a/search/search_index.json b/search/search_index.json
index b657128..6eba891 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"","title":"Claw-R1","text":"

The Data Foundation for Agentic Reinforcement Learning

Claw-R1 \u662f Agentic RL \u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd \u2014 \u4e13\u6ce8\u4e8e\u4ece\u4efb\u610f Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff0c\u5e76\u652f\u6301\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002

  • Universal Data Collection

    \u4ece\u767d\u76d2\u3001\u9ed1\u76d2\u5230\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u901a\u8fc7 base_url \u673a\u5236\u96f6\u4ee3\u7801\u63a5\u5165\uff0c\u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 LangChain\u3001AutoGen\u3001CrewAI \u7b49\u4efb\u610f OpenAI \u517c\u5bb9 Agent\u3002

    Base URL Integration

  • Data Middleware Layer

    Gateway + DataPool \u6570\u636e\u4e2d\u95f4\u4ef6\uff1aGateway \u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0cDataPool \u7ba1\u7406\u6570\u636e\u8d28\u91cf\u3001\u5206\u533a\u7f13\u51b2\u3001\u6309\u9700\u4f9b\u7ed9\u8bad\u7ec3\u5f15\u64ce\u3002

    Middleware Layer

  • Data Evaluation & Curation

    \u591a\u7ef4 Reward \u7cfb\u7edf\uff08\u89c4\u5219/\u5224\u522b\u5f0f RM/\u751f\u6210\u5f0f RM\uff09+ \u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u6574\u5408 + \u7b56\u7565\u7248\u672c\u8ffd\u8e2a\uff0c\u7cfb\u7edf\u6027\u8bc4\u4f30\u548c\u7b5b\u9009\u6570\u636e\u8d28\u91cf\u3002

    Reward System

  • Production Agent Scenario

    \"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent \u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\uff08\u91c7\u7eb3\u3001\u4fee\u6539\u3001\u8ffd\u95ee\uff09\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\u3002

    Production Scenario

"},{"location":"#why-claw-r1","title":"Why Claw-R1?","text":"

Agentic RL \u751f\u6001\u6b63\u84ec\u52c3\u53d1\u5c55 \u2014 verl\u3001Agent-R1\u3001Forge \u7b49\u4f18\u79c0\u6846\u67b6\u5728 Runtime \u548c\u8bad\u7ec3\u7b97\u6cd5\u65b9\u9762\u6301\u7eed\u63a8\u8fdb\u3002\u7136\u800c\uff0c\u968f\u7740 Agent \u4ece\u7b80\u5355 ReAct \u6f14\u8fdb\u5230 Claude Code\u3001OpenClaw \u7b49\u901a\u7528\u67b6\u6784\uff0c\u4e00\u4e2a\u76f8\u5bf9\u6b20\u7f3a\u3001\u503c\u5f97\u6df1\u8015\u7684\u65b9\u5411\u9010\u6e10\u6d6e\u73b0\uff1a\u5982\u4f55\u4ece\u591a\u6837\u7684 Agent \u4ea4\u4e92\u4e2d\u7cfb\u7edf\u6027\u5730\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff1f

Claw-R1 \u805a\u7126\u4e8e\u8fd9\u4e00\u65b9\u5411\uff0c\u63d0\u4f9b Agent \u4e0e Trainer \u4e4b\u95f4\u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002

\u7ef4\u5ea6 \u4f20\u7edf Agentic RL \u6846\u67b6 Claw-R1 \u6838\u5fc3\u5173\u6ce8 \u8bad\u7ec3\u7b97\u6cd5\u4e0e Runtime \u6570\u636e\u7684\u91c7\u96c6\u3001\u8bc4\u4f30\u4e0e\u7b5b\u9009 Agent \u63a5\u5165 \u9700\u8981\u7528\u6846\u67b6 API \u91cd\u5199 \u53ea\u6539 base_url\uff0c\u96f6\u4ee3\u7801\u4fb5\u5165 \u6570\u636e\u6765\u6e90 \u9884\u6536\u96c6\u7684\u79bb\u7ebf\u6570\u636e \u5b9e\u65f6\u4ea4\u4e92\u81ea\u52a8\u91c7\u96c6 + \u79bb\u7ebf\u6570\u636e\u96c6 \u6570\u636e\u8d28\u91cf\u7ba1\u63a7 \u8f83\u5c11\u5173\u6ce8 \u591a\u7ef4 Reward + \u4eba\u7c7b\u53cd\u9988 + \u65b0\u9c9c\u5ea6\u68c0\u6d4b \u8bad\u7ec3\u5f15\u64ce \u5185\u7f6e\u7ed1\u5b9a \u53ef\u63d2\u62d4 TrainingBackend\uff0c\u5bf9\u63a5\u4efb\u610f\u5f15\u64ce"},{"location":"#_1","title":"\u5feb\u901f\u5f00\u59cb","text":"
# \u514b\u9686\u4ed3\u5e93\ngit clone https://github.com/AgentR1/Claw-R1 && cd Claw-R1\n\n# \u8fd0\u884c\u9ed1\u76d2 GSM8K \u8bad\u7ec3\nexport CUDA_VISIBLE_DEVICES=0,1,2\nsh example/test_async_blackbox.sh\n

\u5b8c\u6574\u5b89\u88c5\u6307\u5357 \u00b7 Quick Start

"},{"location":"#_2","title":"\u9879\u76ee\u72b6\u6001","text":"\u80fd\u529b \u72b6\u6001 \u767d\u76d2 Agent \u6570\u636e\u91c7\u96c6 \u5df2\u5b9e\u73b0 \u9ed1\u76d2 Agent \u6570\u636e\u91c7\u96c6 \u5df2\u5b9e\u73b0 \u5728\u7ebf\u670d\u52a1\u6570\u636e\u91c7\u96c6 \u5f00\u53d1\u4e2d \u5f02\u6b65\u8bad\u7ec3\u4f9b\u7ed9 \u5df2\u5b9e\u73b0 \u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf \u89c4\u5212\u4e2d \u6570\u636e\u8d28\u91cf Dashboard \u89c4\u5212\u4e2d"},{"location":"#team","title":"Team","text":"

State Key Laboratory of Cognitive Intelligence, USTC

"},{"location":"#citation","title":"Citation","text":"
@misc{clawr1-2026,\n  title={Claw-R1: The Data Foundation for Agentic Reinforcement Learning},\n  author={Wang, Daoyu and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi},\n  year={2025},\n  howpublished={\\url{https://github.com/AgentR1/Claw-R1}},\n  note={GitHub repository}\n}\n
"},{"location":"contributing/","title":"Contributing","text":"

\u611f\u8c22\u4f60\u5bf9 Claw-R1 \u7684\u5173\u6ce8\uff01\u6b22\u8fce\u8d21\u732e\u4ee3\u7801\u3001\u6587\u6863\u548c\u60f3\u6cd5\u3002

"},{"location":"contributing/#_1","title":"\u9879\u76ee\u7ed3\u6784","text":"
claw_r1/\n\u251c\u2500\u2500 agent_flow/           # Agent \u6267\u884c\u6846\u67b6\uff08\u767d\u76d2 + \u7ba1\u7406\u5668\uff09\n\u251c\u2500\u2500 blackbox_agent/       # \u9ed1\u76d2 Agent \u7cfb\u7edf\uff08Flow + Agent \u5b9e\u73b0\uff09\n\u251c\u2500\u2500 config/               # Hydra \u914d\u7f6e\u6587\u4ef6\n\u251c\u2500\u2500 data_pool/            # DataPool\uff08Ray Actor + Training Backend\uff09\n\u251c\u2500\u2500 gateway/              # Gateway Server\uff08FastAPI\uff09\n\u251c\u2500\u2500 async_main.py         # \u5f02\u6b65\u8bad\u7ec3\u5165\u53e3\n\u251c\u2500\u2500 async_rollouter.py    # AsyncRollouter\uff08Rollout GPU Pool\uff09\n\u251c\u2500\u2500 async_trainer.py      # AsyncTrainer\uff08Training GPU Pool\uff09\n\u251c\u2500\u2500 param_sync.py         # ParameterSynchronizer\n\u251c\u2500\u2500 detach_workers.py     # \u5206\u79bb\u5f0f Actor/Rollout Worker\n\u251c\u2500\u2500 core_algos.py         # PPO/GAE/GRPO \u6838\u5fc3\u7b97\u6cd5\n\u251c\u2500\u2500 reward_loop.py        # RewardLoopWorker\n\u251c\u2500\u2500 metric_utils.py       # \u6307\u6807\u805a\u5408\n\u251c\u2500\u2500 ray_agent_trainer.py  # \u540c\u6b65 Ray PPO Trainer\n\u2514\u2500\u2500 main_agent_ppo.py     # \u540c\u6b65\u8bad\u7ec3\u5165\u53e3\n
"},{"location":"contributing/#_2","title":"\u4ee3\u7801\u98ce\u683c","text":"
  • \u4f7f\u7528 Ruff \u8fdb\u884c lint \u548c\u683c\u5f0f\u5316
  • \u9075\u5faa PEP 8
  • \u7c7b\u578b\u6ce8\u89e3\uff08Python 3.10+ \u8bed\u6cd5\uff09
# \u5b89\u88c5 pre-commit hooks\npip install pre-commit\npre-commit install\n\n# \u624b\u52a8\u68c0\u67e5\nruff check .\nruff format .\n
"},{"location":"contributing/#_3","title":"\u8d21\u732e\u65b9\u5411","text":""},{"location":"contributing/#_4","title":"\u9ad8\u4f18\u5148\u7ea7","text":"
  • \u65b0\u7684\u9ed1\u76d2 Agent \u5b9e\u73b0\uff08\u53c2\u8003 blackbox_agent/gsm8k_agent.py\uff09
  • \u65b0\u7684 Reward \u51fd\u6570
  • \u6027\u80fd\u4f18\u5316\uff08DataPool \u541e\u5410\u3001Gateway \u5ef6\u8fdf\uff09
"},{"location":"contributing/#_5","title":"\u6587\u6863","text":"
  • \u6559\u7a0b\u548c\u793a\u4f8b
  • API \u6587\u6863\u8865\u5145
  • \u4e2d\u82f1\u6587\u7ffb\u8bd1
"},{"location":"contributing/#_6","title":"\u7814\u7a76","text":"
  • \u65b0\u7684 advantage \u8ba1\u7b97\u7b97\u6cd5
  • \u5728\u7ebf\u5b66\u4e60\u7b56\u7565
  • \u591a Agent \u534f\u4f5c\u8bad\u7ec3
"},{"location":"contributing/#pr","title":"PR \u6d41\u7a0b","text":"
  1. Fork \u4ed3\u5e93
  2. \u521b\u5efa feature branch\uff1agit checkout -b feature/my-feature
  3. \u7f16\u5199\u4ee3\u7801\u548c\u6d4b\u8bd5
  4. \u786e\u4fdd ruff check . \u901a\u8fc7
  5. \u63d0\u4ea4 PR\uff0c\u63cf\u8ff0\u6539\u52a8\u5185\u5bb9\u548c\u52a8\u673a
"},{"location":"contributing/#_7","title":"\u672c\u5730\u6784\u5efa\u6587\u6863","text":"
pip install mkdocs-material\nmkdocs serve\n# \u8bbf\u95ee http://localhost:8000\n
"},{"location":"contributing/#_8","title":"\u8054\u7cfb","text":"
  • GitHub Issues: AgentR1/Claw-R1
"},{"location":"api/","title":"API Reference","text":"

\u672c\u8282\u6587\u6863\u5316 Claw-R1 \u5404\u7ec4\u4ef6\u66b4\u9732\u7684 HTTP \u548c Python API\u3002

  • Gateway HTTP API

    REST \u7aef\u70b9\uff0c\u7528\u4e8e Agent \u96c6\u6210\u548c Step \u63d0\u4ea4\u3002\u5305\u62ec\u767d\u76d2\u7aef\u70b9\uff08/generate\u3001/submit_steps\uff09\u548c\u9ed1\u76d2\u7aef\u70b9\uff08{base_url}/v1/chat/completions\uff09\u3002

    Gateway API

"},{"location":"api/#python","title":"Python \u63a5\u53e3","text":""},{"location":"api/#datapool-ray-actor","title":"DataPool (Ray Actor)","text":"
import ray\nfrom claw_r1.data_pool import DataPool\n\ndata_pool = ray.get_actor(\"data_pool\")\n\n# Producer\uff08\u7531 Gateway \u5185\u90e8\u8c03\u7528\uff09\nray.get(data_pool.submit_step.remote(step, channel=\"train\"))\nray.get(data_pool.submit_steps.remote(steps, channel=\"train\"))\nray.get(data_pool.complete_trajectory.remote(trajectory_uid, channel=\"train\"))\n\n# Consumer\uff08\u7531 Trainer \u8c03\u7528\uff09\nbatch = ray.get(data_pool.fetch_batch.remote(n_rollouts=5, channel=\"train\"))\n
"},{"location":"api/#rewardloopworker-ray-actor","title":"RewardLoopWorker (Ray Actor)","text":"
from claw_r1.reward_loop import RewardLoopWorker\n\nreward_worker = ray.get_actor(\"reward_loop_worker\")\nrewards = ray.get(reward_worker.compute_score_batch.remote(steps))\n
"},{"location":"api/#agentflowbase-python-class","title":"AgentFlowBase (Python class)","text":"
from claw_r1.agent_flow import SingleStepSingleTurnAgentFlow\n\nclass MyFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -> int:\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=[{\"role\": \"user\", \"content\": kwargs[\"question\"]}],\n        )\n        # \u6784\u5efa Step \u5e76\u63d0\u4ea4 ...\n        return 1\n
"},{"location":"api/#blackboxagentflowbase-python-class","title":"BlackBoxAgentFlowBase (Python class)","text":"
from claw_r1.agent_flow.agent_flow import register\nfrom claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"my_blackbox_agent\")\nclass MyBlackBoxFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url: str, kwargs: dict) -> int:\n        # \u521b\u5efa Agent\uff0c\u4f7f\u7528 base_url \u4f5c\u4e3a OpenAI API endpoint\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=kwargs[\"raw_prompt\"])\n
"},{"location":"api/gateway/","title":"Gateway API","text":"

Gateway \u9ed8\u8ba4\u76d1\u542c\u7aef\u53e3 8100\uff08\u901a\u8fc7 --port \u914d\u7f6e\uff09\u3002\u6240\u6709\u7aef\u70b9\u5747\u63a5\u53d7\u548c\u8fd4\u56de JSON\u3002

"},{"location":"api/gateway/#base-url","title":"Base URL","text":"
http://<gateway-host>:8100\n
"},{"location":"api/gateway/#white-box","title":"White-box \u7aef\u70b9","text":"

\u8fd9\u4e9b\u7aef\u70b9\u7531 AgentFlowBase \u7684\u767d\u76d2 Agent \u8c03\u7528\u3002

"},{"location":"api/gateway/#post-generate","title":"POST /generate","text":"

\u5c06\u751f\u6210\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u5e76\u8fd4\u56de\u5e26 token ID \u7684\u54cd\u5e94\u3002

\u8c03\u7528\u65b9: AgentFlowBase.gateway_generate()

"},{"location":"api/gateway/#request","title":"Request","text":"
{\n  \"trajectory_uid\": \"string\",\n  \"prompt_uid\": \"string\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"string\" }\n  ],\n  \"max_tokens\": 1024,\n  \"temperature\": 1.0,\n  \"top_p\": 1.0\n}\n
\u5b57\u6bb5 \u7c7b\u578b \u5fc5\u586b \u8bf4\u660e trajectory_uid string \u662f \u5f53\u524d\u5bf9\u8bdd\u7684\u552f\u4e00 ID prompt_uid string \u662f Prompt \u7ec4 ID\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09 messages array \u662f OpenAI \u683c\u5f0f\u7684\u804a\u5929\u6d88\u606f max_tokens int \u5426 \u6700\u5927\u54cd\u5e94\u957f\u5ea6\uff08\u9ed8\u8ba4\u53d6 --response-length\uff09 temperature float \u5426 \u91c7\u6837\u6e29\u5ea6\uff08\u9ed8\u8ba4 1.0\uff09 top_p float \u5426 Top-p \u91c7\u6837\uff08\u9ed8\u8ba4 1.0\uff09"},{"location":"api/gateway/#response","title":"Response","text":"
{\n  \"response_text\": \"string\",\n  \"response_ids\": [101, 202, 303],\n  \"prompt_ids\": [50, 60, 70, 80]\n}\n
"},{"location":"api/gateway/#post-submit_steps","title":"POST /submit_steps","text":"

\u63d0\u4ea4\u4e00\u4e2a\u6216\u591a\u4e2a Step \u5bf9\u8c61\u5230 DataPool\u3002

\u8c03\u7528\u65b9: AgentFlowBase.gateway_submit_steps()

"},{"location":"api/gateway/#request_1","title":"Request","text":"
{\n  \"steps\": [\n    {\n      \"trajectory_uid\": \"string\",\n      \"prompt_uid\": \"string\",\n      \"prompt_ids\": [50, 60, 70],\n      \"response_ids\": [101, 202],\n      \"reward\": 0.0,\n      \"step_index\": 0,\n      \"policy_version\": 42,\n      \"is_last\": true,\n      \"metadata\": {}\n    }\n  ]\n}\n
"},{"location":"api/gateway/#response_1","title":"Response","text":"
{\n  \"accepted\": 1\n}\n
"},{"location":"api/gateway/#post-compute_reward","title":"POST /compute_reward","text":"

\u4e3a\u4e00\u4e2a step \u8ba1\u7b97 reward\uff08\u7531 Trainer \u8c03\u7528\uff0c\u4e0d\u7531 Agent \u8c03\u7528\uff09\u3002

"},{"location":"api/gateway/#request_2","title":"Request","text":"
{\n  \"trajectory_uid\": \"string\",\n  \"messages\": [...],\n  \"dataset_fields\": {\n    \"ground_truth\": \"string\",\n    \"task_type\": \"string\"\n  }\n}\n
"},{"location":"api/gateway/#response_2","title":"Response","text":"
{\n  \"reward\": 0.85\n}\n
"},{"location":"api/gateway/#black-box","title":"Black-box \u7aef\u70b9","text":"

\u8fd9\u4e9b\u7aef\u70b9\u4f9b\u9ed1\u76d2 Agent \u4f7f\u7528\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u77e5\u9053\u4e00\u4e2a base_url\uff0c\u6240\u6709\u4ea4\u4e92\u90fd\u901a\u8fc7\u8be5 URL \u5b8c\u6210\u3002

base_url \u7684\u683c\u5f0f\u4e3a http://<host>:<port>/<trajectory_uid>/<prompt_uid>\uff0c\u7531 POST /init_trajectory \u8fd4\u56de\u3002

"},{"location":"api/gateway/#post-init_trajectory","title":"POST /init_trajectory","text":"

\u5206\u914d\u4e00\u6761\u65b0\u7684 trajectory \u5e76\u8fd4\u56de base_url\u3002

"},{"location":"api/gateway/#request_3","title":"Request","text":"

\u65e0\u8bf7\u6c42\u4f53\u3002

"},{"location":"api/gateway/#response_3","title":"Response","text":"
{\n  \"trajectory_uid\": \"a1b2c3d4e5f6...\",\n  \"base_url\": \"http://0.0.0.0:8100/a1b2c3d4e5f6.../1\"\n}\n
"},{"location":"api/gateway/#post-base_urlv1register_trajectory","title":"POST {base_url}/v1/register_trajectory","text":"

\u6ce8\u518c trajectory \u7684 channel \u548c metadata\u3002\u5728 Agent \u5f00\u59cb\u4ea4\u4e92\u4e4b\u524d\u8c03\u7528\u3002

trajectory_uid \u4ece URL path \u4e2d\u63d0\u53d6\uff0c\u65e0\u9700\u5728 body \u4e2d\u4f20\u9012\u3002

"},{"location":"api/gateway/#request_4","title":"Request","text":"
{\n  \"channel\": \"train\",\n  \"metadata\": {\n    \"data_source\": \"gsm8k\",\n    \"ground_truth\": \"42\"\n  }\n}\n

\u6240\u6709\u5b57\u6bb5\u5747\u4e3a\u53ef\u9009\u3002channel \u9ed8\u8ba4\u4e3a \"train\"\u3002

"},{"location":"api/gateway/#response_4","title":"Response","text":"
{ \"status\": \"ok\" }\n
"},{"location":"api/gateway/#post-base_urlv1chatcompletions","title":"POST {base_url}/v1/chat/completions","text":"

OpenAI \u517c\u5bb9\u7684\u804a\u5929\u8865\u5168\u7aef\u70b9\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u5c06 base_url \u8bbe\u4e3a OpenAI SDK \u7684 base_url\uff0c\u5373\u53ef\u900f\u660e\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002

Gateway \u4f1a\uff1a

  1. \u5c06\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u670d\u52a1\u5668
  2. \u5bf9 prompt \u548c response \u8fdb\u884c tokenize
  3. \u81ea\u52a8\u6784\u5efa Step \u5e76\u63d0\u4ea4\u5230 DataPool
  4. \u8fd4\u56de\u6807\u51c6 OpenAI \u683c\u5f0f\u7684\u54cd\u5e94
"},{"location":"api/gateway/#request_5","title":"Request","text":"

\u6807\u51c6 OpenAI chat/completions \u8bf7\u6c42\u4f53\u3002

{\n  \"model\": \"qwen\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"What is 2+2?\" }\n  ],\n  \"temperature\": 0.7\n}\n
"},{"location":"api/gateway/#response_5","title":"Response","text":"

\u6807\u51c6 OpenAI chat/completions \u54cd\u5e94\u4f53\u3002

{\n  \"id\": \"chatcmpl-...\",\n  \"object\": \"chat.completion\",\n  \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\": \"assistant\",\n        \"content\": \"4\"\n      },\n      \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 10,\n    \"completion_tokens\": 1,\n    \"total_tokens\": 11\n  }\n}\n
"},{"location":"api/gateway/#post-base_urlv1complete_trajectory","title":"POST {base_url}/v1/complete_trajectory","text":"

\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002Agent \u5b8c\u6210\u6240\u6709\u4ea4\u4e92\u540e\u8c03\u7528\u3002

"},{"location":"api/gateway/#request_6","title":"Request","text":"

\u65e0\u8bf7\u6c42\u4f53\u3002

"},{"location":"api/gateway/#response_6","title":"Response","text":"
{ \"status\": \"ok\" }\n
"},{"location":"api/gateway/#post-complete_trajectorytrajectory_uid","title":"POST /complete_trajectory/{trajectory_uid}","text":"

\u5185\u90e8\u7aef\u70b9\uff0c\u901a\u8fc7 trajectory_uid \u76f4\u63a5\u6807\u8bb0\u5b8c\u6210\u3002\u53ef\u9009\u4f20\u5165 reward \u548c channel\u3002

"},{"location":"api/gateway/#request_7","title":"Request","text":"
{\n  \"channel\": \"train\",\n  \"reward\": 0.9\n}\n
"},{"location":"api/gateway/#response_7","title":"Response","text":"
{ \"status\": \"ok\" }\n
"},{"location":"api/gateway/#_1","title":"\u5c31\u7eea\u68c0\u67e5","text":""},{"location":"api/gateway/#get-ready","title":"GET /ready","text":"

\u5f53 Gateway \u5b8c\u5168\u521d\u59cb\u5316\uff08\u5305\u62ec tokenizer \u52a0\u8f7d\u5b8c\u6210\uff09\u540e\u8fd4\u56de 200\u3002\u7528\u4e8e Rollouter \u542f\u52a8\u65f6\u7684\u5065\u5eb7\u68c0\u67e5\u3002

"},{"location":"api/gateway/#response-200","title":"Response (200)","text":"
{ \"status\": \"ready\" }\n
"},{"location":"api/gateway/#response-503","title":"Response (503)","text":"
{ \"detail\": \"Gateway not ready (tokenizer still loading)\" }\n
"},{"location":"api/gateway/#get-docs","title":"GET /docs","text":"

FastAPI \u81ea\u52a8\u751f\u6210\u7684 Swagger UI \u6587\u6863\u9875\u9762\u3002

"},{"location":"components/","title":"Components","text":"

Claw-R1 \u7684\u7ec4\u4ef6\u56f4\u7ed5\u6570\u636e\u6d41\u7ec4\u7ec7\uff1a\u4ece Agent \u4ea4\u4e92\u7684\u91c7\u96c6\uff0c\u5230\u6570\u636e\u7684\u7ba1\u7406\u4e0e\u8d28\u91cf\u8bc4\u4f30\uff0c\u518d\u5230\u5411\u8bad\u7ec3\u5f15\u64ce\u7684\u4f9b\u7ed9\u3002\u5404\u7ec4\u4ef6\u901a\u8fc7 HTTP \u548c Ray RPC \u901a\u4fe1\u3002

  • Gateway Server \u00b7 \u6570\u636e\u91c7\u96c6\u5165\u53e3

    FastAPI HTTP \u670d\u52a1\u3002\u6240\u6709 Agent LLM \u8c03\u7528\u7684\u7edf\u4e00\u5165\u53e3\uff0c\u81ea\u52a8\u4ece\u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff08Step\uff09\u5e76\u63d0\u4ea4\u5230 DataPool\u3002\u652f\u6301\u767d\u76d2\u663e\u5f0f\u63d0\u4ea4\u548c\u9ed1\u76d2\u81ea\u52a8\u91c7\u96c6\u4e24\u79cd\u6a21\u5f0f\u3002

    Gateway Server

  • DataPool \u00b7 \u6570\u636e\u7ba1\u7406\u6838\u5fc3

    Ray Actor\u3002Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2 \u2014 \u5b58\u50a8\u3001\u7d22\u5f15\u3001\u5206\u533a\u548c\u4f9b\u7ed9\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 Channel \u9694\u79bb\u3001GRPO \u5206\u7ec4\u3001\u5bb9\u91cf\u80cc\u538b\u63a7\u5236\u548c\u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7\u3002

    DataPool

  • Reward System \u00b7 \u6570\u636e\u8d28\u91cf\u8bc4\u4f30

    RewardLoopWorker Ray Actor\u3002\u591a\u7ef4\u5ea6\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\uff1arule-based\u3001discriminative RM\u3001generative RM\uff0c\u4ee5\u53ca\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002

    Reward System

  • Agent Flow \u00b7 \u767d\u76d2\u6570\u636e\u91c7\u96c6

    Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7ba1\u7406\u3002\u767d\u76d2 Agent \u901a\u8fc7 Python API \u663e\u5f0f\u63d0\u4ea4 Step\uff0c\u5b8c\u6574\u63a7\u5236\u6570\u636e\u91c7\u96c6\u8fc7\u7a0b\u3002

    Agent Flow

  • Black-box Agent \u00b7 \u9ed1\u76d2\u6570\u636e\u91c7\u96c6

    \u96f6\u4ee3\u7801\u4fb5\u5165\u7684\u9ed1\u76d2 Agent \u63a5\u5165\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u901a\u8fc7 base_url \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002

    Black-box Agent

  • Async Training \u00b7 \u6570\u636e\u6d88\u8d39\u4e0e\u8bad\u7ec3

    AsyncTrainer \u548c AsyncRollouter Ray Actor\u3002\u6301\u7eed\u4ece DataPool \u6d88\u8d39\u9ad8\u8d28\u91cf\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\uff0c\u5e26\u53c2\u6570\u540c\u6b65\u3002

    Async Training

"},{"location":"components/#_1","title":"\u6570\u636e\u6d41\u5168\u666f","text":"
                        \u6570\u636e\u91c7\u96c6\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n  \u9ed1\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502                                         \u2502\n  (base_url)          \u2502         GATEWAY SERVER                  \u2502\n                      \u2502         (FastAPI, \u7aef\u53e3 8100)             \u2502\n  \u767d\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502         \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92 Step                 \u2502\n  (AgentFlow)         \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                   \u2502 Ray RPC (submit_steps)\n                                   \u25bc\n                        \u6570\u636e\u7ba1\u7406\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         DATAPOOL                         \u2502\n                      \u2502         (Ray Actor)                      \u2502\n                      \u2502                                          \u2502\n                      \u2502  \u2022 \u5b58\u50a8\u4e0e\u7d22\u5f15    \u2022 Channel \u5206\u533a            \u2502\n                      \u2502  \u2022 GRPO \u5206\u7ec4     \u2022 \u5bb9\u91cf\u80cc\u538b\u63a7\u5236            \u2502\n          
            \u2502  \u2022 \u8d28\u91cf\u8bc4\u4f30      \u2022 \u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7            \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                         \u2502 fetch_batch()\n                                         \u25bc\n                        \u6570\u636e\u6d88\u8d39\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC TRAINER                    \u2502\n                      \u2502         (Ray Actor, Training GPU Pool)   \u2502\n                      \u2502   \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n                      \u2502   \u2502  Actor \u2502 Critic \u2502 RefPolicy      \u2502   \u2502\n                      \u2502   \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                       \u2502 NCCL weight sync\n                                       
\u25bc\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC ROLLOUTER                  \u2502\n                      \u2502         (Ray Actor, Rollout GPU Pool)    \u2502\n                      \u2502         vLLM servers                     \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"components/agent-flow/","title":"Agent Flow","text":"

Agent Flow \u662f Claw-R1 \u4e2d\u7ba1\u7406 Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7684\u6846\u67b6\u3002\u5b83\u5206\u4e3a\u4e24\u5927\u7c7b\uff1a

  • \u767d\u76d2 Agent Flow\uff1aAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 /generate\u3001/submit_steps \u7b49\u7aef\u70b9\u4ea4\u4e92\uff0c\u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002
  • \u9ed1\u76d2 Agent Flow\uff1aAgent \u4f7f\u7528\u6807\u51c6 OpenAI API\uff0c\u901a\u8fc7 base_url \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u5904\u7406 tokenize \u548c Step \u63d0\u4ea4\u3002
"},{"location":"components/agent-flow/#_1","title":"\u7c7b\u5c42\u6b21","text":"
AgentFlowBase                              (abstract base)\n    \u2502\n    \u251c\u2500\u2500 SingleStepSingleTurnAgentFlow      (\u767d\u76d2\uff1a\u5355\u8f6e\u95ee\u7b54)\n    \u251c\u2500\u2500 MultiStepAgentFlow                 (\u767d\u76d2\uff1a\u591a\u8f6e\u5de5\u5177\u8c03\u7528)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase              (\u9ed1\u76d2\u57fa\u7c7b)\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow     (\u9ed1\u76d2\uff1aGSM8K \u6570\u5b66\u9898)\n
"},{"location":"components/agent-flow/#agentflowbase","title":"AgentFlowBase","text":"

\u6240\u6709 Agent Flow \u7684\u62bd\u8c61\u57fa\u7c7b\uff0c\u63d0\u4f9b\uff1a

  • Gateway URL \u7ba1\u7406
  • \u914d\u7f6e\u8bbf\u95ee\uff08self.config\uff09
  • \u62bd\u8c61\u65b9\u6cd5 run(sampling_params, **kwargs) -> int
"},{"location":"components/agent-flow/#_2","title":"\u767d\u76d2\u8f85\u52a9\u65b9\u6cd5","text":"

\u767d\u76d2 Agent Flow \u53ef\u4f7f\u7528\u4ee5\u4e0b\u65b9\u6cd5\u4e0e Gateway \u4ea4\u4e92\uff1a

"},{"location":"components/agent-flow/#gateway_generatetrajectory_uid-prompt_uid-messages-kwargs","title":"gateway_generate(trajectory_uid, prompt_uid, messages, **kwargs)","text":"

\u5411 Gateway /generate \u53d1\u9001\u5f02\u6b65 HTTP POST\uff0c\u8fd4\u56de\u751f\u6210\u6587\u672c\u548c token IDs\u3002

text, response_ids, prompt_ids = await self.gateway_generate(\n    trajectory_uid=\"traj-abc\",\n    prompt_uid=\"prompt-xyz\",\n    messages=[{\"role\": \"user\", \"content\": \"Summarize this document.\"}],\n    max_tokens=512,\n    temperature=0.8,\n)\n
"},{"location":"components/agent-flow/#gateway_submit_stepssteps-channeltrain","title":"gateway_submit_steps(steps, channel=\"train\")","text":"

\u5411 Gateway /submit_steps \u63d0\u4ea4 Step \u5217\u8868\u3002

"},{"location":"components/agent-flow/#gateway_compute_rewardtrajectory_uid-messages-dataset_fields","title":"gateway_compute_reward(trajectory_uid, messages, dataset_fields)","text":"

\u5411 Gateway /compute_reward \u8bf7\u6c42 reward \u8ba1\u7b97\u3002

"},{"location":"components/agent-flow/#singlestepsingleturnagentflow","title":"SingleStepSingleTurnAgentFlow","text":"

\u6700\u7b80\u5355\u7684\u767d\u76d2\u5b9e\u73b0\uff1a\u5355\u4e2a prompt \u4ea7\u751f\u5355\u4e2a response\u3002\u9002\u7528\u4e8e\u6bcf\u4e2a\u6837\u672c\u90fd\u662f\u72ec\u7acb\u95ee\u7b54\u5bf9\u7684\u6570\u636e\u96c6\u3002

class MyAgentFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -> int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"raw_prompt\"]}]\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=messages,\n        )\n        step = Step(\n            prompt_ids=prompt_ids,\n            response_ids=response_ids,\n            reward=0.0,\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            step_index=0,\n            is_last=True,\n        )\n        await self.gateway_submit_steps([step])\n        return 1\n
"},{"location":"components/agent-flow/#multistepagentflow","title":"MultiStepAgentFlow","text":"

\u591a\u8f6e Agent Flow\uff0c\u652f\u6301\u5de5\u5177\u8c03\u7528\u3001\u89c4\u5212\u7b49\u573a\u666f\u3002\u6bcf\u8f6e\u4ea7\u751f\u4e00\u4e2a Step\uff0c\u901a\u8fc7 trajectory_uid \u4e32\u8054\u3002

class ToolAgentFlow(MultiStepAgentFlow):\n    async def run(self, sampling_params, **kwargs) -> int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"task\"]}]\n        step_index = 0\n\n        while True:\n            text, response_ids, prompt_ids = await self.gateway_generate(...)\n            is_last = self.is_terminal(text)\n\n            step = Step(\n                prompt_ids=prompt_ids,\n                response_ids=response_ids,\n                step_index=step_index,\n                is_last=is_last,\n                ...\n            )\n            await self.gateway_submit_steps([step])\n\n            if is_last:\n                break\n\n            messages.append({\"role\": \"assistant\", \"content\": text})\n            tool_result = await self.execute_tool(text)\n            messages.append({\"role\": \"tool\", \"content\": tool_result})\n            step_index += 1\n\n        return step_index + 1\n
"},{"location":"components/agent-flow/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"

\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\u3002\u5904\u7406\u4e0e Gateway \u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u5c06 Agent \u6267\u884c\u59d4\u6258\u7ed9\u5b50\u7c7b\u7684 _run_agent \u65b9\u6cd5\u3002

\u8be6\u7ec6\u6587\u6863\u89c1 Black-box Agent\u3002

"},{"location":"components/agent-flow/#_3","title":"\u6ce8\u518c\u673a\u5236","text":"

Agent Flow \u901a\u8fc7 @register(\"name\") \u88c5\u9970\u5668\u6ce8\u518c\u5230\u5168\u5c40\u6ce8\u518c\u8868\uff1a

from claw_r1.agent_flow.agent_flow import register\n\n@register(\"my_agent_flow\")\nclass MyAgentFlow(AgentFlowBase):\n    ...\n

\u4e5f\u53ef\u901a\u8fc7 YAML \u914d\u7f6e\u6587\u4ef6\u6ce8\u518c\uff08\u7528\u4e8e\u9ed1\u76d2 Agent\uff09\uff1a

# agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n
"},{"location":"components/agent-flow/#agentflowmanager-agentflowworker","title":"AgentFlowManager \u548c AgentFlowWorker","text":"
  • AgentFlowManager\uff1a\u7ba1\u7406\u591a\u4e2a AgentFlowWorker\uff0c\u5c06 batch \u4e2d\u7684\u6bcf\u4e2a\u6837\u672c\u5206\u53d1\u7ed9\u5bf9\u5e94\u7684 Agent Flow \u6267\u884c\u3002
  • AgentFlowWorker\uff1aRay Actor\uff0c\u6301\u6709 tokenizer \u548c\u914d\u7f6e\uff0c\u6267\u884c\u5177\u4f53\u7684 Agent Flow\u3002
AsyncRollouter\n    \u2514\u2500\u2500 AgentFlowManager\n            \u2514\u2500\u2500 AgentFlowWorker (Ray Actor, \u53ef\u591a\u4e2a)\n                    \u2514\u2500\u2500 AgentFlowBase \u5b50\u7c7b\u5b9e\u4f8b\n
"},{"location":"components/agent-flow/#_4","title":"\u914d\u7f6e","text":"

\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a Agent Flow\uff1a

python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n
"},{"location":"components/async-training/","title":"Async Training","text":"

Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u67b6\u6784\u5c06 rollout\uff08trajectory \u751f\u6210\uff09\u548c training\uff08\u6743\u91cd\u66f4\u65b0\uff09\u5206\u79bb\u4e3a\u4e24\u4e2a\u72ec\u7acb\u7684 Ray Actor\uff0c\u8fd0\u884c\u5728\u4e0d\u540c\u7684 GPU \u6c60\u4e0a\u3002

"},{"location":"components/async-training/#_1","title":"\u67b6\u6784","text":"
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Rollout GPU Pool                                        \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncRollouter (Ray Actor)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 DataLoader (\u904d\u5386\u6570\u636e\u96c6)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 vLLM replicas (\u63a8\u7406\u5f15\u64ce)                     \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 AgentFlowManager (\u7ba1\u7406 Agent \u6267\u884c)           \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Gateway (FastAPI \u5b50\u8fdb\u7a0b, \u7aef\u53e3 8100)          \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RewardLoopWorker (\u8ba1\u7b97 reward)               \u2502   \u2502\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  submit_step (via Gateway \u2192 DataPool)\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502   DataPool       \u2502   \u2190 \u5171\u4eab Ray Actor\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  fetch_batch()\n                       \u25bc\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Training GPU Pool                                       \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncTrainer (Ray Actor)                        \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Actor worker group (\u7b56\u7565\u6a21\u578b)                \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Critic worker group (\u4ef7\u503c\u6a21\u578b)               \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RefPolicy 
worker group (KL baseline)        \u2502   \u2502\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  NCCL weight broadcast\n                       \u25bc\n              AsyncRollouter.update_weights()\n
"},{"location":"components/async-training/#asynctrainer","title":"AsyncTrainer","text":"

AsyncTrainer \u662f\u8fd0\u884c\u5728 Training GPU Pool \u4e0a\u7684 Ray Actor\uff0c\u6267\u884c\u6301\u7eed\u7684 PPO \u8bad\u7ec3\u5faa\u73af\uff1a

  1. \u4ece DataPool fetch_batch() \u2014 \u963b\u585e\u7b49\u5f85\u5b8c\u6574\u7684 prompt_uid \u7ec4
  2. \u901a\u8fc7 RewardLoopWorker \u8ba1\u7b97 batch \u7684 reward
  3. \u8ba1\u7b97 advantage\uff08GAE \u6216 GRPO\uff09
  4. \u6267\u884c PPO Actor + Critic \u66f4\u65b0
  5. \u6bcf trigger_parameter_sync_step \u6b65\u89e6\u53d1\u6743\u91cd\u540c\u6b65
"},{"location":"components/async-training/#worker","title":"Worker \u521d\u59cb\u5316","text":"

AsyncTrainer \u5728 init_workers() \u4e2d\u521b\u5efa Actor\u3001Critic\u3001RefPolicy \u7684 worker group\uff0c\u5e76\u5c06\u5b83\u4eec\u90e8\u7f72\u5230 Training GPU Pool\uff1a

# \u521b\u5efa\u987a\u5e8f\uff1aCritic \u2192 RefPolicy \u2192 Actor\uff08\u6700\u540e\u521b\u5efa Actor \u4ee5\u514d\u5f71\u54cd vLLM \u5185\u5b58\u4f30\u7b97\uff09\nself.critic_wg.init_model()\nself.ref_policy_wg.init_model()\nself.actor_wg.init_model()\n
"},{"location":"components/async-training/#asyncrollouter","title":"AsyncRollouter","text":"

AsyncRollouter \u8fd0\u884c\u5728 Rollout GPU Pool \u4e0a\uff0c\u6301\u6709\uff1a

  • DataLoader\uff1a\u904d\u5386\u8bad\u7ec3\u6570\u636e\u96c6
  • vLLM replicas\uff1a\u9ad8\u541e\u5410\u63a8\u7406\u670d\u52a1\u5668
  • AgentFlowManager\uff1a\u7ba1\u7406 AgentFlowBase worker
  • Gateway\uff1aFastAPI HTTP \u670d\u52a1\u5668\uff08\u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff09
  • RewardLoopWorker\uff1a\u5728 rollout \u671f\u95f4\u8ba1\u7b97 reward
"},{"location":"components/async-training/#gateway","title":"Gateway \u542f\u52a8\u6d41\u7a0b","text":"

Rollouter \u5c06 Gateway \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff1a

  1. \u5feb\u901f\u521d\u59cb\u5316\uff08Ray \u8fde\u63a5\u3001DataPool\u3001vLLM \u5730\u5740\uff09\u2192 HTTP \u7acb\u5373\u53ef\u7528
  2. Tokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d
  3. Rollouter \u8f6e\u8be2 GET /ready \u7b49\u5f85 Gateway \u5b8c\u5168\u5c31\u7eea
  4. \u8d85\u65f6\u65f6\u95f4\u53ef\u901a\u8fc7 trainer.gateway_startup_timeout \u914d\u7f6e\uff08\u9ed8\u8ba4 300 \u79d2\uff09
"},{"location":"components/async-training/#_2","title":"\u6682\u505c/\u6062\u590d\uff08\u6743\u91cd\u540c\u6b65\uff09","text":"

\u6743\u91cd\u540c\u6b65\u671f\u95f4\uff0cRollouter \u6682\u505c\u751f\u6210\uff1a

rollouter.pause()                          # \u505c\u6b62\u65b0\u751f\u6210\uff0c\u7b49\u5f85\u8fdb\u884c\u4e2d\u7684\u8bf7\u6c42\u5b8c\u6210\n# NCCL broadcast: Actor weights \u2192 vLLM\nrollouter.update_param_version(new_version)\nrollouter.resume()                         # \u4f7f\u7528\u66f4\u65b0\u540e\u7684\u6743\u91cd\u6062\u590d\u751f\u6210\n
"},{"location":"components/async-training/#parametersynchronizer","title":"ParameterSynchronizer","text":"

\u8f7b\u91cf\u7ea7 Ray Actor\uff0c\u534f\u8c03 AsyncTrainer \u548c AsyncRollouter \u4e4b\u95f4\u7684\u6743\u91cd\u540c\u6b65\uff1a

class ParameterSynchronizer:\n    def sync_weights(self, version, validate=False):\n        # 1. \u6682\u505c rollout\n        # 2. NCCL broadcast: trainer Actor \u2192 vLLM\n        # 3. \u66f4\u65b0 rollouter \u7684 param_version\n        # 4. \u53ef\u9009\uff1a\u8fd0\u884c\u9a8c\u8bc1\n        # 5. \u6062\u590d rollout\n
"},{"location":"components/async-training/#advantage","title":"Advantage \u8ba1\u7b97","text":""},{"location":"components/async-training/#gae-generalized-advantage-estimation","title":"GAE (Generalized Advantage Estimation)","text":"

\u7528\u4e8e trajectory \u7ea7\u522b\u7684 value baseline\u3002\u5728 step \u7ea7\u522b \u8ba1\u7b97 advantage\uff0c\u7136\u540e\u5e7f\u64ad\u5230 token \u7ea7\u522b\uff08\u540c\u4e00 step \u5185\u6240\u6709 response token \u5171\u4eab\u76f8\u540c\u7684 advantage\uff09\u3002

"},{"location":"components/async-training/#grpo-group-relative-policy-optimization","title":"GRPO (Group Relative Policy Optimization)","text":"

\u7528\u4e8e prompt \u7ea7\u522b\u7684 baseline\u3002\u5c06\u6765\u81ea\u540c\u4e00 prompt_uid \u7684\u591a\u4e2a rollout \u5206\u7ec4\uff0c\u5728\u7ec4\u5185\u5f52\u4e00\u5316 advantage\u3002\u4e0d\u9700\u8981\u5355\u72ec\u7684 Critic \u6a21\u578b\uff0c\u66f4\u8282\u7701\u5185\u5b58\u3002

"},{"location":"components/async-training/#_3","title":"\u8d44\u6e90\u6c60\u914d\u7f6e","text":"

Trainer \u548c Rollouter \u8fd0\u884c\u5728\u72ec\u7acb\u7684 GPU \u6c60\u4e0a\uff0c\u9632\u6b62\u8d44\u6e90\u7ade\u4e89\uff1a

# async_ppo_trainer.yaml\n\n# Training GPU Pool (Actor, Critic, RefPolicy)\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2\n\n# Rollout GPU Pool (vLLM)\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1\n

\u603b GPU \u6570 = trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node\u3002

GPU \u5206\u914d

\u5fc5\u987b\u540c\u65f6\u4e3a trainer \u548c rollout \u914d\u7f6e GPU\u3002\u5982\u679c trainer \u6ca1\u6709\u5206\u914d GPU\uff0c\u8bad\u7ec3\u53c2\u6570\uff08Actor\u3001Critic\uff09\u5c06\u65e0\u6cd5\u90e8\u7f72\u5230 GPU \u4e0a\u3002

"},{"location":"components/async-training/#_4","title":"\u5173\u952e\u914d\u7f6e","text":"
# async_ppo_trainer.yaml\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\n  require_batches: 1                 # \u6bcf\u6b21\u4ece DataPool \u53d6\u591a\u5c11\u4e2a batch\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad rollout\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n
"},{"location":"components/async-training/#_5","title":"\u5165\u53e3","text":"
python3 -m claw_r1.async_main \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    ...\n

\u5b8c\u6574\u793a\u4f8b\u89c1 example/test_async_blackbox.sh\u3002

"},{"location":"components/blackbox-agent/","title":"Black-box Agent","text":"

Black-box Agent \u7cfb\u7edf\u5141\u8bb8\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u63a5\u5165 Claw-R1 \u7684\u8bad\u7ec3\u5faa\u73af\uff0c\u65e0\u9700\u4fee\u6539 Agent \u5185\u90e8\u903b\u8f91\u3002Agent \u53ea\u9700\u5c06 base_url \u6307\u5411 Gateway\uff0c\u5373\u53ef\u900f\u660e\u5730\u6536\u96c6\u8bad\u7ec3\u6570\u636e\u3002

"},{"location":"components/blackbox-agent/#_1","title":"\u67b6\u6784\u6982\u89c8","text":"
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  BlackBoxAgentFlowBase (\u8bad\u7ec3\u4fa7\u7f16\u6392)                           \u2502\n\u2502                                                               \u2502\n\u2502  1. POST /init_trajectory          \u2192 \u83b7\u53d6 base_url            \u2502\n\u2502  2. POST {base_url}/v1/register_trajectory \u2192 \u6ce8\u518c metadata    \u2502\n\u2502  3. \u8c03\u7528 _run_agent(base_url, kwargs)                         \u2502\n\u2502     \u2502                                                         \u2502\n\u2502     \u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510                    \u2502\n\u2502     \u2514\u2500\u2500\u2502  \u5177\u4f53 Agent (\u5982 GSM8KAgent)      \u2502                    \u2502\n\u2502        \u2502  \u53ea\u77e5\u9053 base_url\uff0c\u4f7f\u7528 OpenAI API \u2502                    \u2502\n\u2502        \u2502  POST {base_url}/v1/chat/completions (\u591a\u8f6e)          \u2502\n\u2502        \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518                    \u2502\n\u2502  4. 
POST {base_url}/v1/complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"components/blackbox-agent/#_2","title":"\u6838\u5fc3\u8bbe\u8ba1","text":""},{"location":"components/blackbox-agent/#_3","title":"\u5173\u6ce8\u70b9\u5206\u79bb","text":"
  • BlackBoxAgentFlowBase\uff1a\u5904\u7406\u4e0e Gateway \u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u662f\u8bad\u7ec3\u4fa7\u7684\u7f16\u6392\u5c42\u3002
  • \u5177\u4f53 Agent\uff08\u5982 GSM8KAgent\uff09\uff1a\u53ea\u63a5\u6536 base_url \u548c\u4efb\u52a1\u53c2\u6570\uff0c\u4f7f\u7528\u6807\u51c6 OpenAI API \u5b8c\u6210\u4efb\u52a1\u3002Agent \u5b8c\u5168\u4e0d\u77e5\u9053\u8bad\u7ec3\u7cfb\u7edf\u7684\u5b58\u5728\u3002

\u8fd9\u79cd\u5206\u79bb\u4f7f\u5f97\uff1a

  • \u540c\u4e00\u4e2a Agent \u53ef\u4ee5\u5728\u8bad\u7ec3\u6a21\u5f0f\u548c\u72ec\u7acb\u670d\u52a1\u6a21\u5f0f\u4e0b\u590d\u7528
  • \u65b0\u589e\u4efb\u52a1\u53ea\u9700\u5b9e\u73b0 Agent + \u5bf9\u5e94\u7684 Flow \u5b50\u7c7b
  • Agent \u53ef\u4ee5\u7528\u4efb\u4f55\u8bed\u8a00/\u6846\u67b6\u5b9e\u73b0\uff0c\u53ea\u8981\u652f\u6301 OpenAI API
"},{"location":"components/blackbox-agent/#_4","title":"\u6ce8\u518c\u673a\u5236","text":"

Agent Flow \u901a\u8fc7 @register(\"name\") \u88c5\u9970\u5668\u6ce8\u518c\uff0c\u5e76\u5728 YAML \u914d\u7f6e\u4e2d\u5f15\u7528\uff1a

# agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n
"},{"location":"components/blackbox-agent/#_5","title":"\u7c7b\u5c42\u6b21","text":"
AgentFlowBase                         (agent_flow/agent_flow.py)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase          (blackbox_agent/blackbox_agent_flow.py)\n            \u2502\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow (blackbox_agent/gsm8k_agent_flow.py)\n
"},{"location":"components/blackbox-agent/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"

\u6240\u6709\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\uff0c\u5b9e\u73b0\u4e86\u5b8c\u6574\u7684 Gateway \u534f\u8bae\uff1a

class BlackBoxAgentFlowBase(AgentFlowBase):\n\n    async def run(self, sampling_params, **kwargs) -> int:\n        # 1. \u63d0\u53d6 channel\u3001prompt_uid\u3001metadata\n        channel, prompt_uid, metadata = self._prepare_params(kwargs)\n\n        # 2. init_trajectory \u2192 \u83b7\u53d6 base_url\n        init_resp = await http.post(f\"{self.gateway_url}/init_trajectory\")\n        base_url = ...\n\n        # 3. register_trajectory \u2192 \u6ce8\u518c channel \u548c metadata\n        await http.post(f\"{base_url}/v1/register_trajectory\", json={...})\n\n        # 4. \u8c03\u7528\u5b50\u7c7b\u5b9e\u73b0\u7684 _run_agent\n        num_turns = await self._run_agent(base_url, kwargs)\n\n        # 5. complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210\n        await http.post(f\"{base_url}/v1/complete_trajectory\")\n\n        return num_turns\n\n    @abstractmethod\n    async def _run_agent(self, base_url: str, kwargs: dict) -> int:\n        \"\"\"\u5b50\u7c7b\u5b9e\u73b0\uff1a\u521b\u5efa\u5e76\u8fd0\u884c\u5177\u4f53 Agent\u3002\"\"\"\n        ...\n

\u5b50\u7c7b\u53ea\u9700\u5b9e\u73b0 _run_agent\uff1a\u4ece kwargs \u4e2d\u63d0\u53d6\u4efb\u52a1\u53c2\u6570\uff0c\u521b\u5efa Agent \u5b9e\u4f8b\uff0c\u8c03\u7528 Agent \u7684\u6267\u884c\u65b9\u6cd5\u3002

"},{"location":"components/blackbox-agent/#blackboxgsm8kagentflow","title":"BlackBoxGSM8KAgentFlow","text":"

GSM8K \u6570\u5b66\u9898\u7684\u5177\u4f53\u5b9e\u73b0\uff1a

@register(\"blackbox_gsm8k_agent\")\nclass BlackBoxGSM8KAgentFlow(BlackBoxAgentFlowBase):\n\n    async def _run_agent(self, base_url: str, kwargs: dict) -> int:\n        from claw_r1.blackbox_agent.gsm8k_agent import GSM8KAgent\n\n        question = ...   # \u4ece kwargs[\"raw_prompt\"] \u63d0\u53d6\n        ground_truth = ...  # \u4ece kwargs[\"reward_model\"] \u63d0\u53d6\n        max_turns = self.config.actor_rollout_ref.rollout.get(\"max_turns\", 3)\n\n        agent = GSM8KAgent(base_url=base_url)\n        return await agent.solve(\n            question=question,\n            ground_truth=ground_truth,\n            max_turns=max_turns,\n        )\n
"},{"location":"components/blackbox-agent/#gsm8kagent","title":"GSM8KAgent","text":"

\u4e00\u4e2a\u8bad\u7ec3\u65e0\u5173\u7684 Agent\uff0c\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u89e3\u51b3 GSM8K \u6570\u5b66\u9898\uff1a

  • \u63a5\u6536 base_url\uff08\u6307\u5411 Gateway\uff09\u548c\u4efb\u52a1\u53c2\u6570
  • \u4f7f\u7528 tool calling\uff08check_answer \u5de5\u5177\uff09\u8fdb\u884c\u591a\u8f6e\u63a8\u7406
  • \u652f\u6301 Qwen \u98ce\u683c\u7684 tool call \u89e3\u6790\uff08\u273fFUNCTION\u273f \u683c\u5f0f\uff09
  • \u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570
agent = GSM8KAgent(base_url=\"http://gateway:8100/traj123/1\")\nnum_turns = await agent.solve(\n    question=\"What is 15 * 23?\",\n    ground_truth=\"345\",\n    max_turns=3,\n)\n
"},{"location":"components/blackbox-agent/#agent","title":"\u6dfb\u52a0\u65b0\u7684\u9ed1\u76d2 Agent","text":"
  1. \u5b9e\u73b0 Agent \u7c7b\uff08\u8bad\u7ec3\u65e0\u5173\uff09\uff1a
# claw_r1/blackbox_agent/my_agent.py\nclass MyAgent:\n    def __init__(self, base_url: str):\n        self.client = AsyncOpenAI(base_url=base_url, api_key=\"x\")\n\n    async def solve(self, task: str, **kwargs) -> int:\n        # \u4f7f\u7528 self.client \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd\n        # \u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570\n        ...\n
  2. \u5b9e\u73b0 Flow \u5b50\u7c7b\uff1a
# claw_r1/blackbox_agent/my_agent_flow.py\nfrom claw_r1.agent_flow.agent_flow import register\nfrom claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"blackbox_my_agent\")\nclass BlackBoxMyAgentFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url, kwargs):\n        from claw_r1.blackbox_agent.my_agent import MyAgent\n        task = kwargs.get(\"raw_prompt\", \"\")\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=task)\n
  3. \u6ce8\u518c\u5230\u914d\u7f6e\uff1a
# agent_flow_config.yaml\n- name: blackbox_my_agent\n  _target_: claw_r1.blackbox_agent.my_agent_flow.BlackBoxMyAgentFlow\n
  4. \u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u4f7f\u7528\uff1a
python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_my_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n
"},{"location":"components/blackbox-agent/#_6","title":"\u6587\u4ef6\u7ed3\u6784","text":"
claw_r1/blackbox_agent/\n\u251c\u2500\u2500 blackbox_agent_flow.py      # BlackBoxAgentFlowBase \u57fa\u7c7b\n\u251c\u2500\u2500 gsm8k_agent_flow.py         # GSM8K Flow \u5b50\u7c7b\n\u251c\u2500\u2500 gsm8k_agent.py              # GSM8K Agent\uff08\u8bad\u7ec3\u65e0\u5173\uff09\n\u2514\u2500\u2500 agent_flow_config.yaml      # Agent Flow \u6ce8\u518c\u914d\u7f6e\n
"},{"location":"components/datapool/","title":"DataPool","text":"

DataPool \u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u6838\u5fc3 \u2014 \u4e00\u4e2a Ray Actor\uff0c\u627f\u62c5\u7740 Agent \u4ea4\u4e92\u6570\u636e\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u3001\u8d28\u91cf\u8ffd\u8e2a\u3001\u5206\u533a\u7ba1\u7406\u548c\u6309\u9700\u4f9b\u7ed9\u3002\u5b83\u4e0d\u4ec5\u662f Agent \u4fa7\u4e0e Training \u4fa7\u4e4b\u95f4\u7684\u7f13\u51b2\u533a\uff0c\u66f4\u662f\u6574\u4e2a\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u7684\u4e2d\u67a2\u3002

"},{"location":"components/datapool/#_1","title":"\u5728\u67b6\u6784\u4e2d\u7684\u89d2\u8272","text":"
Gateway \u2500\u2500\u25ba DataPool.submit_steps()     (\u6570\u636e\u91c7\u96c6\uff1a\u5f02\u6b65\u5199\u5165)\nTrainer \u25c4\u2500\u2500 DataPool.fetch_batch()      (\u6570\u636e\u4f9b\u7ed9\uff1a\u963b\u585e\u62c9\u53d6\u5c31\u7eea\u7ec4)\n            DataPool.get_statistics()   (\u6570\u636e\u76d1\u63a7\uff1a\u5b9e\u65f6\u7edf\u8ba1)\n

DataPool \u5b8c\u5168\u89e3\u8026\u4e86\u6570\u636e\u91c7\u96c6\u901f\u5ea6\uff08\u7531 Agent \u8bf7\u6c42\u9891\u7387\u9a71\u52a8\uff09\u548c\u6570\u636e\u6d88\u8d39\u901f\u5ea6\uff08\u7531\u8bad\u7ec3\u541e\u5410\u91cf\u9a71\u52a8\uff09\u3002\u53cc\u65b9\u4e92\u4e0d\u7b49\u5f85\u3002

"},{"location":"components/datapool/#channel","title":"Channel \u7cfb\u7edf\uff08\u6570\u636e\u5206\u533a\uff09","text":"

DataPool \u901a\u8fc7 channel \u5bf9\u6570\u636e\u8fdb\u884c\u5206\u533a\u7ba1\u7406\u3002\u9ed8\u8ba4 channel \u4e3a \"train\"\uff0c\u9a8c\u8bc1\u6d41\u7a0b\u4f7f\u7528 \"val\" channel \u4ee5\u9694\u79bb\u6570\u636e\u3002

# \u8bad\u7ec3\u6570\u636e\ndata_pool.submit_step(step, channel=\"train\")\n\n# \u9a8c\u8bc1\u6570\u636e\ndata_pool.submit_step(step, channel=\"val\")\n

\u6bcf\u4e2a channel \u62e5\u6709\u72ec\u7acb\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u548c FIFO \u961f\u5217\u3002

"},{"location":"components/datapool/#_2","title":"\u6570\u636e\u6a21\u578b","text":"

DataPool \u4ee5 step \u7c92\u5ea6 \u5b58\u50a8 trajectory\u3002\u6bcf\u4e2a step \u662f\u4e00\u4e2a (s, a, r) \u5143\u7ec4\uff1a

@dataclass\nclass Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n    reward:         float       # \u8be5 step \u7684\u5373\u65f6 reward\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u4e2d\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\uff08\u7528\u4e8e GRPO\uff09\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\uff080-indexed\uff09\n    policy_version: int         # \u751f\u6210\u8be5 step \u65f6\u7684\u7b56\u7565\u7248\u672c\n    is_last:        bool        # \u662f\u5426\u4e3a trajectory \u7684\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6570\u636e\u96c6\u5b57\u6bb5\u3001\u6765\u6e90\u4fe1\u606f\u7b49\uff09\n
"},{"location":"components/datapool/#_3","title":"\u5185\u90e8\u7d22\u5f15","text":"\u7d22\u5f15 \u7c7b\u578b \u7528\u9014 trajectory_index dict[str, list[int]] trajectory_uid \u2192 step \u7d22\u5f15\u5217\u8868 trajectory_complete dict[str, bool] \u8ffd\u8e2a trajectory \u662f\u5426\u5df2\u6536\u5230 is_last step prompt_groups dict[str, PromptGroup] prompt_uid \u2192 trajectory \u5217\u8868\u548c\u5b8c\u6210\u72b6\u6001"},{"location":"components/datapool/#producer-api","title":"Producer API","text":""},{"location":"components/datapool/#submit_stepstep-step-channeltrain","title":"submit_step(step: Step, channel=\"train\")","text":"

\u6dfb\u52a0\u5355\u4e2a step \u5230\u6307\u5b9a channel\u3002\u7531 Gateway \u901a\u8fc7 Ray RPC \u8c03\u7528\u3002

"},{"location":"components/datapool/#submit_stepssteps-liststep-channeltrain","title":"submit_steps(steps: list[Step], channel=\"train\")","text":"

\u6279\u91cf\u63d0\u4ea4\u591a\u4e2a step\u3002\u6bd4\u5faa\u73af\u8c03\u7528 submit_step \u66f4\u9ad8\u6548\u3002

"},{"location":"components/datapool/#complete_trajectorytrajectory_uid-rewardnone-channeltrain","title":"complete_trajectory(trajectory_uid, reward=None, channel=\"train\")","text":"

\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002\u7528\u4e8e\u9ed1\u76d2\u6a21\u5f0f\uff0c\u7531 BlackBoxAgentFlowBase \u5728 Agent \u8fd0\u884c\u7ed3\u675f\u540e\u901a\u8fc7 Gateway \u7684 {base_url}/v1/complete_trajectory \u7aef\u70b9\u89e6\u53d1\u3002

"},{"location":"components/datapool/#consumer-api","title":"Consumer API","text":""},{"location":"components/datapool/#fetch_batchn_rollouts-channeltrain-liststep-none","title":"fetch_batch(n_rollouts, channel=\"train\") \u2192 list[Step] | None","text":"

FIFO \u62c9\u53d6\u4e0b\u4e00\u4e2a\u5c31\u7eea\u7684 prompt_uid \u7ec4\u3002\u4e00\u4e2a\u7ec4\u5728\u6240\u6709 trajectory \u90fd\u6536\u5230 is_last step \u540e\u53d8\u4e3a\"\u5c31\u7eea\"\u3002

\u5f53\u6ca1\u6709\u5b8c\u6574\u7ec4\u53ef\u7528\u65f6\u8fd4\u56de None\u3002

# Trainer \u4fa7\nwhile True:\n    batch = await data_pool.fetch_batch.remote(n_rollouts=5)\n    if batch is not None:\n        train_on_batch(batch)\n
"},{"location":"components/datapool/#_4","title":"\u5bb9\u91cf\u7ba1\u7406\u4e0e\u80cc\u538b\u63a7\u5236","text":"

\u5f53\u8bbe\u7f6e max_queue_size \u65f6\uff0cDataPool \u5728\u961f\u5217\u6ee1\u65f6\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u7684\u5c31\u7eea\u7ec4\uff0c\u9632\u6b62\u6570\u636e\u5806\u79ef\u5bfc\u81f4\u5185\u5b58\u65e0\u9650\u589e\u957f\u3002\u8fd9\u79cd\u80cc\u538b\u673a\u5236\u4e5f\u786e\u4fdd\u4e86\u8bad\u7ec3\u4fa7\u6d88\u8d39\u7684\u6570\u636e\u5c3d\u53ef\u80fd\u65b0\u9c9c\uff1a

async_training:\n  max_queue_size: null   # null = \u65e0\u9650\n
"},{"location":"components/datapool/#training-backend","title":"Training Backend\uff08\u6570\u636e\u4f9b\u7ed9\u9002\u914d\uff09","text":"

DataPool \u901a\u8fc7\u53ef\u63d2\u62d4\u7684 TrainingBackend \u5c06 list[Step] \u8f6c\u6362\u4e3a\u4efb\u610f\u8bad\u7ec3\u5f15\u64ce\u7684\u539f\u751f\u683c\u5f0f\uff0c\u5b9e\u73b0\u6570\u636e\u7ba1\u7406\u4e0e\u8bad\u7ec3\u6846\u67b6\u7684\u89e3\u8026\uff1a

class VerlBackend(TrainingBackend):\n    \"\"\"\u5c06 Step \u5217\u8868\u8f6c\u6362\u4e3a verl DataProto\u3002\"\"\"\n\n    def convert(self, steps: list[Step]) -> DataProto:\n        # prompt_ids: \u5de6\u586b\u5145\u5230 prompt_length\n        # response_ids: \u53f3\u586b\u5145\u5230 response_length\n        # input_ids: [prompt_ids | response_ids]\n        # attention_mask, position_ids, response_mask \u7b49\n        ...\n
"},{"location":"components/datapool/#off-policy","title":"Off-policy \u652f\u6301\uff08\u6570\u636e\u65b0\u9c9c\u5ea6\u7ba1\u63a7\uff09","text":"

\u6bcf\u4e2a Step \u90fd\u8bb0\u5f55\u4e86\u751f\u6210\u65f6\u7684 policy_version\uff0cDataPool \u548c Trainer \u53ef\u4ee5\u636e\u6b64\u5224\u65ad\u6570\u636e\u7684\u65b0\u9c9c\u5ea6\u3002Trainer \u901a\u8fc7 staleness_threshold \u914d\u7f6e\u6765\u5904\u7406\u5386\u53f2\uff08off-policy\uff09\u6570\u636e\uff1a

async_training:\n  staleness_threshold: 0.1   # policy_version \u6ede\u540e > threshold \u7684 step \u4e3a off-policy\n

Off-policy step \u4ecd\u5305\u542b\u5728 batch \u4e2d\uff0c\u4f46\u5728 loss \u8ba1\u7b97\u65f6\u901a\u8fc7 importance sampling \u8fdb\u884c\u964d\u6743\u3002

"},{"location":"components/gateway/","title":"Gateway Server","text":"

Gateway Server \u662f\u4e00\u4e2a FastAPI HTTP \u670d\u52a1\uff0c\u4f5c\u4e3a Agent \u4e0e Claw-R1 \u8bad\u7ec3\u57fa\u7840\u8bbe\u65bd\u4e4b\u95f4\u7684\u7f51\u7edc\u5c42\u4ee3\u7406\u3002

"},{"location":"components/gateway/#_1","title":"\u8bbe\u8ba1\u539f\u5219","text":"
  • \u72ec\u7acb\u8fdb\u7a0b\uff1aGateway \u4f5c\u4e3a\u666e\u901a OS \u8fdb\u7a0b\u8fd0\u884c\uff08\u975e Ray Actor\uff09\uff0c\u53ef\u4ee5\u72ec\u7acb\u4e8e Ray \u96c6\u7fa4\u91cd\u542f\u3002
  • \u7eaf\u4ee3\u7406\uff1aGateway \u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8d1f\u8d23\u8f6c\u53d1\u8bf7\u6c42\u3001\u6536\u96c6 Step\u3001\u63d0\u4ea4\u5230 DataPool\u3002
  • OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2\u7aef\u70b9\u5b9e\u73b0\u4e0e OpenAI chat completions API \u76f8\u540c\u7684\u63a5\u53e3\uff0c\u53ef\u4f5c\u4e3a drop-in \u66ff\u6362\u3002
  • \u5ef6\u8fdf\u521d\u59cb\u5316\uff1a\u542f\u52a8\u65f6\u5148\u5feb\u901f\u521d\u59cb\u5316 Ray \u8fde\u63a5\u548c\u914d\u7f6e\uff0cHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff1btokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d\uff0c\u901a\u8fc7 /ready \u7aef\u70b9\u62a5\u544a\u5c31\u7eea\u72b6\u6001\u3002
"},{"location":"components/gateway/#_2","title":"\u542f\u52a8\u65b9\u5f0f","text":"

Gateway \u901a\u5e38\u7531 AsyncRollouter \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u81ea\u52a8\u542f\u52a8\u3002\u4e5f\u53ef\u624b\u52a8\u542f\u52a8\uff1a

python -m claw_r1.gateway.gateway \\\n    --data-pool-name  data_pool \\\n    --vllm-addresses  http://host1:8001,http://host2:8001 \\\n    --tokenizer-path  /path/to/model \\\n    --prompt-length   4096 \\\n    --response-length 1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address     auto \\\n    --ray-namespace   default \\\n    --host            0.0.0.0 \\\n    --port            8100\n
"},{"location":"components/gateway/#_3","title":"\u53c2\u6570","text":"\u53c2\u6570 \u5fc5\u586b \u8bf4\u660e --data-pool-name \u662f DataPool \u7684 Ray Actor \u540d\u79f0 --vllm-addresses \u662f \u9017\u53f7\u5206\u9694\u7684 vLLM \u670d\u52a1\u5668\u5730\u5740\u5217\u8868\uff08\u8f6e\u8be2\u8d1f\u8f7d\u5747\u8861\uff09 --tokenizer-path \u662f HuggingFace tokenizer \u8def\u5f84 --prompt-length \u662f \u6700\u5927 prompt token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 --response-length \u662f \u6700\u5927 response token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 --reward-worker-name \u5426 RewardLoopWorker \u7684 Ray Actor \u540d\u79f0 --ray-address \u5426 Ray GCS \u5730\u5740\uff08\u9ed8\u8ba4 auto\uff09 --ray-namespace \u5426 Ray namespace --host \u5426 \u76d1\u542c\u5730\u5740\uff08\u9ed8\u8ba4 0.0.0.0\uff09 --port \u5426 \u76d1\u542c\u7aef\u53e3\uff08\u9ed8\u8ba4 8100\uff09"},{"location":"components/gateway/#_4","title":"\u4e24\u79cd\u5de5\u4f5c\u6a21\u5f0f","text":""},{"location":"components/gateway/#white-box","title":"White-box \u6a21\u5f0f","text":"

\u767d\u76d2 Agent\uff08AgentFlowBase \u5b50\u7c7b\uff09\u901a\u8fc7 Gateway \u6839\u8def\u5f84\u7aef\u70b9\u4ea4\u4e92\uff1a

AgentFlow \u2192 POST /generate        \u2192 vLLM \u2192 \u8fd4\u56de token IDs\nAgentFlow \u2192 POST /submit_steps    \u2192 DataPool\nAgentFlow \u2192 POST /compute_reward  \u2192 RewardLoopWorker\n

Agent \u81ea\u5df1\u7ba1\u7406 tokenize\u3001Step \u6784\u5efa\u548c\u63d0\u4ea4\u3002

"},{"location":"components/gateway/#black-box","title":"Black-box \u6a21\u5f0f","text":"

\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u4e00\u4e2a base_url\uff0c\u901a\u8fc7\u6807\u51c6 OpenAI \u63a5\u53e3\u4ea4\u4e92\uff1a

1. BlackBoxAgentFlow \u2192 POST /init_trajectory           \u2192 \u83b7\u53d6 base_url\n2. BlackBoxAgentFlow \u2192 POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent             \u2192 POST {base_url}/v1/chat/completions     \u2192 \u6807\u51c6 OpenAI \u8c03\u7528\uff08\u53ef\u591a\u8f6e\uff09\n4. BlackBoxAgentFlow \u2192 POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n

Gateway \u5728 v1/chat/completions \u5185\u90e8\u81ea\u52a8\u5b8c\u6210 tokenize\u3001Step \u6784\u5efa\u548c DataPool \u63d0\u4ea4\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002

"},{"location":"components/gateway/#base_url","title":"base_url \u673a\u5236","text":"

base_url \u7684\u683c\u5f0f\u4e3a\uff1a

http://<host>:<port>/<trajectory_uid>/<prompt_uid>\n

trajectory_uid \u548c prompt_uid \u7f16\u7801\u5728 URL path \u4e2d\uff0c\u4f7f\u5f97 Gateway \u80fd\u5c06\u8bf7\u6c42\u5173\u8054\u5230\u6b63\u786e\u7684 trajectory\uff0c\u800c Agent \u7aef\u53ea\u9700\u4fee\u6539 base_url \u5373\u53ef\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002

from openai import OpenAI\n\nclient = OpenAI(\n    base_url=\"http://gateway:8100/abc123/1\",  # base_url \u7531 init_trajectory \u8fd4\u56de\n    api_key=\"not-needed\",\n)\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\n)\n
"},{"location":"components/gateway/#_5","title":"\u5185\u90e8\u72b6\u6001\u7ba1\u7406","text":"

Gateway \u4e3a\u6bcf\u6761 trajectory \u7ef4\u62a4\u4ee5\u4e0b\u72b6\u6001\uff1a

\u72b6\u6001 \u8bf4\u660e _trajectory_step_counter \u6bcf\u6761 trajectory \u7684\u4e0b\u4e00\u4e2a step_index _trajectory_channel trajectory \u5bf9\u5e94\u7684 DataPool channel\uff08\u9ed8\u8ba4 \"train\"\uff09 _trajectory_metadata trajectory \u5173\u8054\u7684 metadata\uff08\u5982 reward_model\u3001data_source \u7b49\uff09

\u8fd9\u4e9b\u72b6\u6001\u5728 register_trajectory \u65f6\u8bbe\u7f6e\uff0c\u5728 complete_trajectory \u65f6\u6e05\u7406\u3002

"},{"location":"components/gateway/#_6","title":"\u8d1f\u8f7d\u5747\u8861","text":"

\u5f53\u63d0\u4f9b\u591a\u4e2a --vllm-addresses \u65f6\uff0cGateway \u4f7f\u7528 round-robin \u8f6e\u8be2\u5206\u53d1\u8bf7\u6c42\uff1a

_vllm_cycle = itertools.cycle(vllm_addresses)\nvllm_url = next(_vllm_cycle)\n
"},{"location":"components/gateway/#api","title":"API \u53c2\u8003","text":"

\u5b8c\u6574\u7684\u7aef\u70b9\u6587\u6863\u89c1 Gateway API\u3002

"},{"location":"components/reward-system/","title":"Reward System","text":"

The RewardLoopWorker is a Ray Actor responsible for assigning reward scores to trajectory steps. It bridges the gap between raw agent interactions and trainable reward signals.

"},{"location":"components/reward-system/#three-reward-sources","title":"Three Reward Sources","text":"

Claw-R1 supports three types of reward computation, which can be combined:

Type Description Best For Rule-based Deterministic function of step output Verifiable tasks (math, code execution) Discriminative RM Binary classifier reward model Preference learning, safety evaluation Generative RM LLM-based evaluator via custom scoring function Complex quality assessment, nuanced feedback"},{"location":"components/reward-system/#reward-in-production-vs-research-settings","title":"Reward in Production vs. Research Settings","text":"

In research settings (white-box offline mode), rewards are computed from known ground truth:

Trajectory:   [user msg] \u2192 [agent think] \u2192 [tool call] \u2192 [tool result] \u2192 [final reply]\nReward:            0.0            0.3            0.7            0.9            0.8\n
  • Rule-based: Is the final answer correct? Does the code pass tests?
  • Model-based: Is each step logically sound? Is the tool use appropriate?

In production settings (online mode), rewards come from real user signals:

Signal Type Interpretation User sends follow-up Implicit positive Agent answer was relevant but incomplete User corrects the agent Negative feedback Factual or task error User says \"thanks\" Positive signal Task completed satisfactorily No follow-up after task Neutral / estimated Reward Model estimates step quality

Claw-R1 uses a Reward Model to convert these soft signals into scalar process rewards, filling the gap between verifiable task rewards and open-ended conversational rewards.

"},{"location":"components/reward-system/#rewardloopworker-api","title":"RewardLoopWorker API","text":""},{"location":"components/reward-system/#compute_score_batchsteps-liststep-listfloat","title":"compute_score_batch(steps: list[Step]) \u2192 list[float]","text":"

Computes rewards for a batch of steps. This is the primary interface used by the Trainer.

# In AsyncTrainer\nrewards = await reward_worker.compute_score_batch.remote(batch_steps)\nfor step, reward in zip(batch_steps, rewards):\n    step.reward = reward\n
"},{"location":"components/reward-system/#custom-reward-function","title":"Custom Reward Function","text":"

Register a custom generative reward model by implementing the reward_loop_manager interface:

# custom_reward.py\ndef compute_reward(step: dict, model, tokenizer) -> float:\n    \"\"\"\n    Args:\n        step: dict with keys 'messages', 'response', 'metadata'\n        model: loaded reward model\n        tokenizer: model tokenizer\n    Returns:\n        scalar reward in [0.0, 1.0]\n    \"\"\"\n    prompt = build_evaluation_prompt(step)\n    score = model.score(prompt)\n    return score\n

Then register it in the configuration:

reward:\n  type: genrm\n  reward_loop_manager: path.to.custom_reward.compute_reward\n  model_path: /path/to/reward/model\n
"},{"location":"components/reward-system/#reward-in-the-training-loop","title":"Reward in the Training Loop","text":"

Reward computation is decoupled from the agent service:

  1. The Gateway does not compute rewards before submitting steps to DataPool
  2. DataPool stores steps with reward=0.0 initially
  3. The Trainer calls RewardLoopWorker.compute_score_batch() before the PPO update
  4. Updated rewards are used for advantage computation

This ensures that even slow generative reward models (which may call an external LLM) do not affect agent service latency.

Reward Design

For new tasks, start with simple rule-based rewards (e.g., exact match, code execution pass rate). Generative reward models are more expressive but introduce variance and computational cost. Use discriminative models as a middle ground.

"},{"location":"concepts/","title":"Core Concepts","text":"

Claw-R1 \u7684\u8bbe\u8ba1\u56f4\u7ed5\u4e09\u4e2a\u6838\u5fc3\u6982\u5ff5\u5c55\u5f00\uff1a\u901a\u7528\u6570\u636e\u91c7\u96c6\u3001\u6570\u636e\u4e2d\u95f4\u4ef6\u7ba1\u7406\u548c\u6570\u636e\u9a71\u52a8\u7684\u6301\u7eed\u8fdb\u5316\u3002\u5b83\u4eec\u5171\u540c\u6784\u6210\u4e00\u4e2a\u4ece\u91c7\u96c6\u5230\u8bad\u7ec3\u7684\u6570\u636e\u98de\u8f6e\u3002

  • Base URL Integration \u00b7 \u901a\u7528\u6570\u636e\u91c7\u96c6

    \u96f6\u4ee3\u7801\u4fb5\u5165\u7684 Agent \u6570\u636e\u91c7\u96c6\u673a\u5236\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u53ea\u9700\u4fee\u6539 base_url\uff0cGateway \u5373\u53ef\u81ea\u52a8\u91c7\u96c6\u5176\u4ea4\u4e92\u6570\u636e\u3002

    Base URL Integration

  • Middleware Layer \u00b7 \u6570\u636e\u4e2d\u95f4\u4ef6

    Gateway + DataPool \u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u5165\u53e3\u3001\u8d28\u91cf\u7ba1\u7406\u3001\u5206\u533a\u7f13\u51b2\u548c\u6309\u9700\u4f9b\u7ed9\u3002

    Middleware Layer

  • Production Scenario \u00b7 \u6570\u636e\u9a71\u52a8\u8fdb\u5316

    \"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent \u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316\u3002

    Production Scenario

"},{"location":"concepts/#_1","title":"\u6570\u636e\u98de\u8f6e","text":"
                    base_url\n                 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                 \u2502 \u4efb\u610f Agent  \u2502\n                 \u2502 (\u767d\u76d2/\u9ed1\u76d2) \u2502\n                 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                        \u2502 OpenAI API\n                        \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Gateway       \u2502 \u2190 \u6570\u636e\u91c7\u96c6\u5165\u53e3\n              \u2502  (\u81ea\u52a8\u91c7\u96c6 Step)  \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    DataPool      \u2502 \u2190 \u6570\u636e\u7ba1\u7406\u6838\u5fc3\n              \u2502  (\u8bc4\u4f30\u00b7\u7b5b\u9009\u00b7\u4f9b\u7ed9) \u2502    (\u8d28\u91cf\u8bc4\u4f30 + \u5206\u533a\u7ba1\u7406)\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Trainer       \u2502 \u2190 \u6570\u636e\u6d88\u8d39\n              \u2502  (\u6301\u7eed\u8bad\u7ec3)       \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502 \u6743\u91cd\u540c\u6b65\n                       \u25bc\n              
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    vLLM          \u2502\n              \u2502  (\u66f4\u597d\u7684\u6a21\u578b)     \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n

\u4e09\u4e2a\u6982\u5ff5\u7684\u534f\u540c\uff1a

  1. Base URL \u8ba9\u4efb\u4f55 Agent \u7684\u4ea4\u4e92\u6570\u636e\u96f6\u6210\u672c\u88ab\u91c7\u96c6
  2. Middleware \u7ba1\u7406\u6570\u636e\u7684\u8d28\u91cf\u3001\u5206\u533a\u548c\u4f9b\u7ed9
  3. Production Scenario \u8ba9\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u81ea\u7136\u878d\u5165\u6570\u636e\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316
"},{"location":"concepts/base-url-integration/","title":"Base URL Integration","text":""},{"location":"concepts/base-url-integration/#agent-llm","title":"\u95ee\u9898\uff1a\u5982\u4f55\u62e6\u622a\u9ed1\u76d2 Agent \u7684 LLM \u8c03\u7528\uff1f","text":"

\u5728 Agentic RL \u4e2d\uff0c\u8bad\u7ec3\u7cfb\u7edf\u9700\u8981\u62e6\u622a Agent \u4e0e LLM \u4e4b\u95f4\u7684\u6bcf\u6b21\u4ea4\u4e92\uff0c\u4ee5\u6536\u96c6 (state, action, reward) \u6570\u636e\u3002\u5bf9\u4e8e\u767d\u76d2 Agent\uff08\u6e90\u7801\u53ef\u63a7\uff09\uff0c\u8fd9\u5f88\u7b80\u5355\u3002\u4f46\u5bf9\u4e8e\u9ed1\u76d2 Agent\uff08\u5982\u7b2c\u4e09\u65b9\u670d\u52a1\u3001\u7f16\u8bd1\u540e\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\uff09\uff0c\u5982\u4f55\u5728\u4e0d\u4fee\u6539 Agent \u4ee3\u7801\u7684\u60c5\u51b5\u4e0b\u62e6\u622a\uff1f

"},{"location":"concepts/base-url-integration/#_1","title":"\u65b9\u6848\u5bf9\u6bd4","text":"\u65b9\u6848 \u4fb5\u5165\u6027 \u53ef\u9760\u6027 \u9002\u7528\u8303\u56f4 SDK monkey-patch \u4e2d \u4f4e\uff08\u7248\u672c\u66f4\u65b0\u6613\u5931\u6548\uff09 \u4ec5\u9650\u7279\u5b9a SDK \u4ee3\u7406\u5c42\uff08Proxy\uff09 \u9ad8 \u4e2d\uff08\u9700\u914d\u7f6e\u7f51\u7edc\uff09 \u901a\u7528 base_url \u66ff\u6362 \u6781\u4f4e \u9ad8 \u6240\u6709 OpenAI \u517c\u5bb9 SDK"},{"location":"concepts/base-url-integration/#base_url","title":"base_url \u673a\u5236","text":"

\u51e0\u4e4e\u6240\u6709 OpenAI \u517c\u5bb9\u7684 SDK \u90fd\u652f\u6301\u81ea\u5b9a\u4e49 base_url\u3002Claw-R1 \u5229\u7528\u8fd9\u4e00\u70b9\uff1a

  1. Gateway \u66b4\u9732 POST {base_url}/v1/chat/completions \u7aef\u70b9
  2. Agent \u53ea\u9700\u5c06 base_url \u4ece https://api.openai.com \u6539\u4e3a Gateway \u7684\u5730\u5740
  3. Gateway \u900f\u660e\u5730\u8f6c\u53d1\u8bf7\u6c42\u5230 vLLM\uff0c\u540c\u65f6\u81ea\u52a8\u6536\u96c6\u8bad\u7ec3\u6570\u636e
from openai import OpenAI\n\n# \u539f\u59cb\u4ee3\u7801\nclient = OpenAI(base_url=\"https://api.openai.com/v1\")\n\n# \u63a5\u5165 Claw-R1\uff1a\u53ea\u6539\u4e00\u884c\nclient = OpenAI(\n    base_url=\"http://gateway:8100/traj123/prompt1\",\n    api_key=\"not-needed\",\n)\n\n# \u540e\u7eed\u4ee3\u7801\u5b8c\u5168\u4e0d\u53d8\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\n)\n
"},{"location":"concepts/base-url-integration/#base_url_1","title":"base_url \u7684\u7ed3\u6784","text":"
http://<host>:<port>/<trajectory_uid>/<prompt_uid>\n
  • trajectory_uid\uff1a\u6807\u8bc6\u4e00\u6761\u5b8c\u6574\u7684\u5bf9\u8bdd\u8f68\u8ff9
  • prompt_uid\uff1a\u6807\u8bc6\u540c\u4e00 prompt \u7684\u591a\u6b21 rollout\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09

\u8fd9\u4e24\u4e2a ID \u7f16\u7801\u5728 URL path \u4e2d\uff0cGateway \u4ece path \u4e2d\u63d0\u53d6\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002

"},{"location":"concepts/base-url-integration/#claw-r1","title":"\u5728 Claw-R1 \u4e2d\u7684\u4f7f\u7528","text":""},{"location":"concepts/base-url-integration/#_2","title":"\u9ed1\u76d2\u79bb\u7ebf\u6a21\u5f0f","text":"

BlackBoxAgentFlowBase \u81ea\u52a8\u7ba1\u7406 base_url \u7684\u751f\u547d\u5468\u671f\uff1a

1. POST /init_trajectory              \u2192 \u83b7\u53d6 base_url\n2. POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent \u4f7f\u7528 base_url \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd     \u2192 Gateway \u81ea\u52a8\u6536\u96c6 Step\n4. POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n

Agent \u53ea\u9700\u8981\u63a5\u6536 base_url \u53c2\u6570\uff0c\u5176\u4f59\u7531\u8bad\u7ec3\u6846\u67b6\u5904\u7406\u3002

"},{"location":"concepts/base-url-integration/#_3","title":"\u9ed1\u76d2\u5728\u7ebf\u6a21\u5f0f","text":"

\u5728\u7ebf\u6a21\u5f0f\u4e0b\uff0c\u5916\u90e8\u670d\u52a1\u76f4\u63a5\u8c03\u7528 Gateway \u7684 init_trajectory \u83b7\u53d6 base_url\uff0c\u7136\u540e\u5c06\u5176\u4f20\u9012\u7ed9 Agent\u3002Agent \u7684\u6bcf\u6b21 LLM \u8c03\u7528\u90fd\u81ea\u52a8\u88ab Gateway \u8bb0\u5f55\u3002

"},{"location":"concepts/base-url-integration/#sdk-hook","title":"\u4e3a\u4ec0\u4e48\u4f18\u4e8e SDK Hook","text":"\u7ef4\u5ea6 SDK Hook base_url Agent \u4ee3\u7801\u4fee\u6539 \u9700\u8981\u6ce8\u5165 hook \u4ee3\u7801 \u53ea\u6539\u4e00\u4e2a\u53c2\u6570 \u591a\u8bed\u8a00\u652f\u6301 \u6bcf\u79cd\u8bed\u8a00\u9700\u8981\u5355\u72ec\u5b9e\u73b0 \u6240\u6709\u8bed\u8a00\u901a\u7528 \u7248\u672c\u517c\u5bb9\u6027 SDK \u66f4\u65b0\u53ef\u80fd\u7834\u574f hook HTTP \u534f\u8bae\u7a33\u5b9a \u8c03\u8bd5\u96be\u5ea6 Hook \u5c42\u589e\u52a0\u8c03\u8bd5\u590d\u6742\u5ea6 \u6807\u51c6 HTTP \u8bf7\u6c42\uff0c\u6613\u4e8e\u8c03\u8bd5 \u751f\u4ea7\u53ef\u9760\u6027 \u4e2d\u7b49 \u9ad8"},{"location":"concepts/base-url-integration/#sdk","title":"\u652f\u6301\u7684 SDK \u548c\u6846\u67b6","text":"

\u4efb\u4f55\u652f\u6301\u81ea\u5b9a\u4e49 base_url \u7684 OpenAI \u517c\u5bb9 SDK \u90fd\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\uff1a

  • Python: openai, httpx, requests
  • JavaScript/TypeScript: openai-node
  • Go: go-openai
  • \u6846\u67b6: LangChain, LlamaIndex, AutoGen, CrewAI \u7b49
"},{"location":"concepts/middleware-layer/","title":"Middleware Layer","text":""},{"location":"concepts/middleware-layer/#_1","title":"\u4e3a\u4ec0\u4e48\u9700\u8981\u6570\u636e\u4e2d\u95f4\u4ef6\uff1f","text":"

Agentic RL \u4e2d\uff0cAgent \u4ea7\u751f\u4ea4\u4e92\u6570\u636e\uff0cTrainer \u6d88\u8d39\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\u3002\u7136\u800c\u5728\u5b9e\u9645\u573a\u666f\u4e2d\uff0c\u4e24\u8005\u4e4b\u95f4\u5b58\u5728\u663e\u8457\u7684\u4e0d\u5bf9\u79f0\uff1a

  • \u6570\u636e\u6765\u6e90\u591a\u6837\uff1a\u767d\u76d2 Agent\u3001\u9ed1\u76d2 Agent\u3001\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u4ea7\u51fa\u7684\u6570\u636e\u683c\u5f0f\u548c\u9891\u7387\u5404\u4e0d\u76f8\u540c
  • \u6570\u636e\u8d28\u91cf\u53c2\u5dee\uff1a\u5e76\u975e\u6240\u6709\u4ea4\u4e92\u90fd\u6709\u8bad\u7ec3\u4ef7\u503c\uff0c\u9700\u8981\u8bc4\u4f30\u548c\u7b5b\u9009
  • \u4ea7\u6d88\u901f\u7387\u4e0d\u5339\u914d\uff1aAgent \u4fa7\u7684\u6570\u636e\u4ea7\u751f\u901f\u7387\u4e0e Trainer \u4fa7\u7684\u6d88\u8d39\u901f\u7387\u5f80\u5f80\u4e0d\u540c\u6b65
  • \u6570\u636e\u9700\u8981\u7ba1\u7406\uff1a\u5206\u533a\u3001\u7d22\u5f15\u3001\u80cc\u538b\u63a7\u5236\u3001\u7edf\u8ba1\u76d1\u63a7 \u2014 \u8fd9\u4e9b\u4e0d\u662f\u7b80\u5355\u7684\u961f\u5217\u80fd\u89e3\u51b3\u7684

Claw-R1 \u901a\u8fc7 Middleware Layer\uff08Gateway + DataPool\uff09\u5728 Agent \u4fa7\u548c Training \u4fa7\u4e4b\u95f4\u5efa\u7acb\u4e00\u5c42\u6570\u636e\u57fa\u7840\u8bbe\u65bd\uff0c\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u3001\u7ba1\u7406\u548c\u4f9b\u7ed9\u95ee\u9898\u3002

"},{"location":"concepts/middleware-layer/#gateway-datapool","title":"Gateway + DataPool \u67b6\u6784","text":"
Agent \u4fa7                    Middleware                    Training \u4fa7\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502 Agent    \u2502\u2500\u2500HTTP\u2500\u2500\u25ba  \u2502  Gateway         \u2502\u2500\u2500Ray RPC\u2500\u2500\u25ba\u2502  DataPool    \u2502\n\u2502 (\u4efb\u610f)   \u2502\u25c4\u2500\u2500HTTP\u2500\u2500  \u2502  (FastAPI, 8100) \u2502           \u2502  (Ray Actor) \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                                             \u2502 fetch_batch()\n                                                             \u25bc\n                                                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                      \u2502  Trainer     \u2502\n                                                      \u2502  (Ray Actor) \u2502\n                                                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"concepts/middleware-layer/#gateway","title":"Gateway\uff1a\u6570\u636e\u91c7\u96c6\u5165\u53e3","text":"

Gateway \u662f\u4e00\u4e2a\u72ec\u7acb\u8fdb\u7a0b\uff08FastAPI\uff09\uff0c\u8d1f\u8d23\u4ece Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff1a

  • \u7eaf\u4ee3\u7406\uff1a\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u91c7\u96c6\u6570\u636e
  • OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2 Agent \u901a\u8fc7 base_url \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u4ece\u5bf9\u8bdd\u4e2d\u6784\u5efa Step
  • \u5ef6\u8fdf\u521d\u59cb\u5316\uff1aHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff0ctokenizer \u5728\u540e\u53f0\u52a0\u8f7d

Gateway \u652f\u6301\u4e24\u79cd\u6570\u636e\u91c7\u96c6\u6a21\u5f0f\uff1a

\u6a21\u5f0f \u7aef\u70b9 \u6570\u636e\u91c7\u96c6\u65b9\u5f0f \u767d\u76d2 /generate, /submit_steps Agent \u81ea\u884c\u6784\u5efa Step \u5e76\u63d0\u4ea4 \u9ed1\u76d2 {base_url}/v1/chat/completions Gateway \u81ea\u52a8 tokenize \u5e76\u6784\u5efa Step

\u8be6\u89c1 Gateway Server\u3002

"},{"location":"concepts/middleware-layer/#datapool","title":"DataPool\uff1a\u6570\u636e\u7ba1\u7406\u6838\u5fc3","text":"

DataPool \u662f\u4e00\u4e2a Ray Actor\uff0c\u4e0d\u4ec5\u662f trajectory \u7f13\u51b2\u533a\uff0c\u66f4\u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2\uff1a

\u80fd\u529b \u8bf4\u660e \u6570\u636e\u5b58\u50a8 \u4ee5 Step \u7c92\u5ea6\u5b58\u50a8\u4ea4\u4e92\u6570\u636e\uff0c\u652f\u6301\u591a\u7ef4\u7d22\u5f15 \u8d28\u91cf\u8ffd\u8e2a \u6bcf\u4e2a Step \u8bb0\u5f55 policy_version\uff0c\u652f\u6301\u65b0\u9c9c\u5ea6\u68c0\u6d4b Channel \u5206\u533a \"train\" \u548c \"val\" \u6570\u636e\u9694\u79bb\uff0c\u4e92\u4e0d\u5e72\u6270 GRPO \u5206\u7ec4 \u6309 prompt_uid \u5206\u7ec4\uff0c\u51d1\u9f50\u6240\u6709 rollout \u540e\u624d\u4f9b\u7ed9\u8bad\u7ec3 \u5bb9\u91cf\u7ba1\u7406 \u53ef\u914d\u7f6e max_queue_size\uff0c\u8d85\u9650\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u6570\u636e \u7edf\u8ba1\u76d1\u63a7 \u5b9e\u65f6\u63d0\u4f9b\u961f\u5217\u6df1\u5ea6\u3001produce/consume/drop \u901f\u7387\u7b49\u6307\u6807

\u8be6\u89c1 DataPool\u3002

"},{"location":"concepts/middleware-layer/#step","title":"Step \u6570\u636e\u6a21\u578b","text":"

Step \u662f\u6570\u636e\u7ba1\u7406\u7684\u539f\u5b50\u5355\u4f4d\uff0c\u8bb0\u5f55\u4e86\u4e00\u6b21 Agent \u4ea4\u4e92\u7684\u5b8c\u6574\u4fe1\u606f\uff1a

@dataclass\nclass Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n    reward:         float       # \u5373\u65f6 reward\uff08\u8d28\u91cf\u8bc4\u5206\uff09\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\n    policy_version: int         # \u751f\u6210\u65f6\u7684\u7b56\u7565\u7248\u672c\uff08\u65b0\u9c9c\u5ea6\u8ffd\u8e2a\uff09\n    is_last:        bool        # \u662f\u5426\u4e3a\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6765\u6e90\u3001\u6570\u636e\u96c6\u5b57\u6bb5\u7b49\uff09\n
"},{"location":"concepts/middleware-layer/#reward","title":"Reward \u6807\u6ce8\u4e0e\u6570\u636e\u8d28\u91cf\u8bc4\u4f30","text":"

Reward \u8ba1\u7b97\u4e0e Agent \u670d\u52a1\u89e3\u8026\uff0c\u786e\u4fdd\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\u4e0d\u5f71\u54cd Agent \u670d\u52a1\u5ef6\u8fdf\uff1a

  1. Gateway \u91c7\u96c6 Step \u65f6 reward=0.0\uff08\u539f\u59cb\u6570\u636e\uff09
  2. DataPool \u5b58\u50a8\u539f\u59cb Step
  3. Trainer \u5728\u6d88\u8d39\u6570\u636e\u524d\u901a\u8fc7 RewardLoopWorker \u8bc4\u4f30\u6570\u636e\u8d28\u91cf\uff08\u8ba1\u7b97 reward\uff09
  4. \u8bc4\u4f30\u540e\u7684 reward \u7528\u4e8e advantage \u8ba1\u7b97\u548c\u6570\u636e\u7b5b\u9009

\u8fd9\u79cd\u8bbe\u8ba1\u4f7f\u5f97\u5373\u4f7f\u662f\u6162\u901f\u7684 generative reward model \u6216\u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf\u4e5f\u4e0d\u4f1a\u5f71\u54cd Agent \u7684\u6b63\u5e38\u670d\u52a1\u3002

"},{"location":"concepts/production-scenario/","title":"Production Agent Scenario","text":""},{"location":"concepts/production-scenario/#agentic-rl","title":"Agentic RL \u4e2d\u7684\u9690\u542b\u5047\u8bbe","text":"

\u51e0\u4e4e\u6240\u6709 Agentic RL \u6846\u67b6\u90fd\u5efa\u7acb\u5728\u4e00\u4e2a\u9690\u542b\u5047\u8bbe\u4e0a\uff1a

\u8bad\u7ec3\u9636\u6bb5 \u2260 \u90e8\u7f72\u9636\u6bb5

\u6807\u51c6\u6d41\u7a0b\uff1a\u5728\u79bb\u7ebf/\u6a21\u62df\u6570\u636e\u4e0a\u8bad\u7ec3 \u2192 \u90e8\u7f72\u56fa\u5b9a\u6a21\u578b \u2192 \u5b9a\u671f\u91cd\u8bad\u3002

\u8fd9\u5728\u7814\u7a76\u573a\u666f\u4e0b\u53ef\u884c\uff0c\u4f46\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u9047\u5230\u6839\u672c\u6027\u969c\u788d\uff1a

\u95ee\u9898 \u8868\u73b0 \u5206\u5e03\u504f\u79fb \u8bad\u7ec3\u6570\u636e\u662f\u5408\u6210\u7684\uff1b\u771f\u5b9e\u7528\u6237\u8bf7\u6c42\u5206\u5e03\u4e0d\u540c \u2192 \u90e8\u7f72\u540e\u80fd\u529b\u9000\u5316 \u51b7\u542f\u52a8 \u65b0\u90e8\u7f72\u7684\u6a21\u578b\u5bf9\u7279\u5b9a\u7528\u6237\u7684\u4e60\u60ef\u3001\u5de5\u5177\u3001\u5de5\u4f5c\u6d41\u4e00\u65e0\u6240\u77e5 \u2192 \u6f2b\u957f\u7684\"\u9884\u70ed\"\u671f \u957f\u5c3e\u4efb\u52a1 Benchmark \u8986\u76d6\u5e38\u89c1\u4efb\u52a1\uff1b\u7528\u6237\u7684\u5c0f\u4f17\u9700\u6c42\u65e0\u6cd5\u88ab\u79bb\u7ebf\u8bad\u7ec3\u8986\u76d6 \u73af\u5883\u6f02\u79fb \u5de5\u5177 API \u66f4\u65b0\u3001\u7528\u6237\u884c\u4e3a\u53d8\u5316 \u2192 \u9759\u6001\u6a21\u578b\u65e0\u6cd5\u81ea\u9002\u5e94"},{"location":"concepts/production-scenario/#claw-r1-agent","title":"Claw-R1 \u7684\u6838\u5fc3\u573a\u666f\uff1a\u4e2a\u4eba Agent \u81ea\u6211\u8fdb\u5316","text":"

Claw-R1 \u7684\u9996\u4e2a\u9a8c\u8bc1\u573a\u666f\u662f OpenClaw \u4e2a\u4eba\u52a9\u624b\uff1a

\u8bbe\u7f6e\uff1a\n  \u7528\u6237\u5728 Mac Mini \u4e0a\u90e8\u7f72 OpenClaw\uff0c\u8fde\u63a5 Slack / \u5fae\u4fe1 / \u90ae\u4ef6\u3002\n  \u6bcf\u5929\u901a\u8fc7\u6d88\u606f\u4e0e OpenClaw \u4ea4\u4e92\uff1a\u65e5\u7a0b\u5b89\u6392\u3001\u4fe1\u606f\u68c0\u7d22\u3001\u4ee3\u7801\u8f85\u52a9\u7b49\u3002\n\n\u4f20\u7edf\u65b9\u6848\uff1a\n  OpenClaw \u4f7f\u7528\u56fa\u5b9a\u7684 GPT-4o / Claude 3.5\u3002\n  \u80fd\u529b\u4e0d\u4f1a\u968f\u4f7f\u7528\u800c\u589e\u957f\u3002\n\nClaw-R1 \u65b9\u6848\uff1a\n  1. \u7528\u6237\u6d88\u606f \u2192 OpenClaw \u2192 Gateway\uff08\u62e6\u622a LLM \u8c03\u7528\uff09\n  2. Gateway \u8bb0\u5f55\u6bcf\u6b21\u4ea4\u4e92 \u2192 DataPool\uff08\u672c\u5730\uff09\n  3. Reward Model \u5bf9\u6bcf\u6b21\u4ea4\u4e92\u8bc4\u5206\n  4. \u8fdc\u7a0b\u670d\u52a1\u5668\u4e0a\u7684\u8bad\u7ec3\u5f15\u64ce\u6301\u7eed\u6d88\u8d39 DataPool\uff0c\u66f4\u65b0\u6a21\u578b\u6743\u91cd\n  5. \u66f4\u65b0\u7684\u6743\u91cd\u63a8\u9001\u56de Gateway\uff1b\u4e0b\u6b21\u8c03\u7528\u4f7f\u7528\u6539\u8fdb\u540e\u7684\u6a21\u578b\n\n\u7ed3\u679c\uff1a\n  \u7528\u6237 Mac Mini \u4e0a\u7684 OpenClaw \u4f1a\u968f\u65f6\u95f4\u63a8\u79fb\u8d8a\u6765\u8d8a\u4e86\u89e3\u8be5\u7528\u6237\u3002\n
"},{"location":"concepts/production-scenario/#rl","title":"\u4f20\u7edf RL \u6846\u67b6\u65e0\u6cd5\u6ee1\u8db3\u7684\u4e09\u4e2a\u9700\u6c42","text":""},{"location":"concepts/production-scenario/#1","title":"\u2460 \u670d\u52a1\u8fde\u7eed\u6027","text":"

\u6a21\u578b\u6743\u91cd\u66f4\u65b0\u4e0d\u80fd\u4e2d\u65ad Gateway \u7684\u8bf7\u6c42\u5904\u7406\u3002\u5728 Claw-R1 \u4e2d\uff1a

  • Trainer \u76f4\u63a5\u7ba1\u7406 Rollout Engine \u548c Reward Model \u7684\u751f\u547d\u5468\u671f\uff08wake_up / sleep / \u6743\u91cd\u540c\u6b65\uff09
  • Gateway \u662f\u7eaf HTTP \u4ee3\u7406 \u2014 \u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u63d0\u4ea4 step\uff1b\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f
  • \u8fd9\u4fdd\u8bc1\u4e86\u5373\u4f7f\u5728\u6743\u91cd\u66f4\u65b0\u671f\u95f4\uff0c\u8bf7\u6c42\u8f6c\u53d1\u548c\u6570\u636e\u6536\u96c6\u4e5f\u80fd\u6301\u7eed\u8fdb\u884c
"},{"location":"concepts/production-scenario/#2","title":"\u2461 \u65e0\u9884\u8bbe\u6570\u636e","text":"

\u4f20\u7edf\u6846\u67b6\u9700\u8981\u9884\u5148\u6536\u96c6\u7684\u6570\u636e\u96c6\u3002Claw-R1 \u7684\u8bad\u7ec3\u6570\u636e\u5b8c\u5168\u6765\u81ea\u5b9e\u65f6\u7528\u6237\u4ea4\u4e92\uff1a

  • \u7528\u6237\u95ee\u4e86\u4ec0\u4e48\u3001Agent \u5982\u4f55\u56de\u7b54\u3001\u8c03\u7528\u4e86\u54ea\u4e9b\u5de5\u5177 \u2014 \u8fd9\u4e9b\u81ea\u52a8\u6210\u4e3a\u8bad\u7ec3\u6570\u636e
  • \u96f6\u6570\u636e\u5de5\u7a0b\uff1b\u6570\u636e\u968f\u670d\u52a1\u8fd0\u884c\u81ea\u7136\u79ef\u7d2f
"},{"location":"concepts/production-scenario/#3-reward","title":"\u2462 \u771f\u5b9e\u73af\u5883\u7684 Reward \u4fe1\u53f7","text":"

\u4f20\u7edf RLVR \u7684 reward \u6765\u81ea\u53ef\u9a8c\u8bc1\u7684\u4efb\u52a1\u7ed3\u679c\u3002\u751f\u4ea7\u73af\u5883\u7684 reward \u66f4\u52a0\u5fae\u5999\uff1a

  • \u7528\u6237\u7ee7\u7eed\u8ffd\u95ee \u2192 \u9690\u5f0f\u6b63\u4fe1\u53f7
  • \u7528\u6237\u7ea0\u6b63 Agent \u2192 \u8d1f\u53cd\u9988
  • \u4efb\u52a1\u5b8c\u6210\u540e\u65e0\u540e\u7eed \u2192 Reward Model \u4f30\u8ba1\u4e2d\u95f4\u6b65\u9aa4\u8d28\u91cf

Claw-R1 \u4f7f\u7528 Reward Model \u5c06\u8fd9\u4e9b\u8f6f\u4fe1\u53f7\u8f6c\u6362\u4e3a\u53ef\u8bad\u7ec3\u7684 process reward\u3002

"},{"location":"concepts/production-scenario/#_1","title":"\u4e09\u79cd\u8fd0\u884c\u6a21\u5f0f","text":"\u6a21\u5f0f Agent \u7c7b\u578b \u6570\u636e\u6765\u6e90 \u8bf4\u660e \u767d\u76d2\u79bb\u7ebf AgentFlow (Python) \u5408\u6210\u6570\u636e\u96c6\u6216\u9884\u6536\u96c6\u7684 trajectory \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u63a8\u8350\u7528\u4e8e\u7814\u7a76 \u9ed1\u76d2\u79bb\u7ebf \u4efb\u4f55 HTTP Agent \u9884\u6536\u96c6\u7684\u6570\u636e\u96c6 \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u901a\u8fc7 base_url \u63a5\u5165 \u9ed1\u76d2\u5728\u7ebf \u4efb\u4f55 HTTP Agent \u5b9e\u65f6\u7528\u6237\u4ea4\u4e92 \u76ee\u6807\u751f\u4ea7\u6a21\u5f0f\uff1bGateway \u7aef\u70b9\u5df2\u5b9e\u73b0"},{"location":"concepts/production-scenario/#_2","title":"\u90e8\u7f72 = \u8bad\u7ec3","text":"

Claw-R1 \u5f15\u5165\u4e86\u4e00\u79cd\u65b0\u8303\u5f0f\uff1a

\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         \u4f20\u7edf\uff1a\u8bad\u7ec3 \u2192 \u90e8\u7f72\uff08\u56fa\u5b9a\uff09                      \u2502\n\u2502                                                      \u2502\n\u2502  [\u5408\u6210\u6570\u636e] \u2192 [\u8bad\u7ec3] \u2192 [\u56fa\u5b9a\u6a21\u578b] \u2192 \u7528\u6237               \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         Claw-R1\uff1a\u90e8\u7f72 = \u8bad\u7ec3\uff08\u6301\u7eed\uff09                   \u2502\n\u2502                                                      \u2502\n\u2502  \u7528\u6237 \u2500\u2500\u25ba Agent \u2500\u2500\u25ba [\u5b9e\u65f6\u6570\u636e] \u2500\u2500\u25ba \u8bad\u7ec3 \u2500\u2500\u25ba Agent     \u2502\n\u2502           \u25b2___________________________________|      
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n

\u5728\u8fd9\u79cd\u8303\u5f0f\u4e0b\uff1a

  • \u6bcf\u6b21\u7528\u6237\u4ea4\u4e92\u90fd\u662f\u4e00\u4e2a\u8bad\u7ec3\u6837\u672c
  • \u6bcf\u6b21\u6a21\u578b\u66f4\u65b0\u90fd\u6539\u5584 Agent \u7684\u771f\u5b9e\u4e16\u754c\u8868\u73b0
  • Agent \u8fd0\u884c\u65f6\u95f4\u8d8a\u957f\uff0c\u5bf9\u5176\u7279\u5b9a\u7528\u6237\u548c\u73af\u5883\u7684\u8868\u73b0\u8d8a\u597d
"},{"location":"configuration/","title":"Configuration Reference","text":"

Claw-R1 \u4f7f\u7528 Hydra \u8fdb\u884c\u5c42\u6b21\u5316\u914d\u7f6e\u7ba1\u7406\u3002\u6240\u6709 YAML \u914d\u7f6e\u4f4d\u4e8e claw_r1/config/\u3002

"},{"location":"configuration/#_1","title":"\u914d\u7f6e\u6587\u4ef6","text":"\u6587\u4ef6 \u7528\u9014 agent_ppo_trainer.yaml \u57fa\u7840 PPO trainer \u914d\u7f6e\uff08\u7ee7\u627f veRL \u7684 ppo_trainer\uff09 async_ppo_trainer.yaml \u5f02\u6b65\u8bad\u7ec3\u4e13\u7528\u914d\u7f6e overrides/rollout.yaml Rollout worker \u8bbe\u7f6e\uff08\u5f02\u6b65\u6a21\u5f0f\u3001Agent Flow\uff09"},{"location":"configuration/#async_ppo_traineryaml","title":"async_ppo_trainer.yaml","text":"

\u5f02\u6b65\u8bad\u7ec3\u7684\u6838\u5fc3\u914d\u7f6e\u6587\u4ef6\uff1a

defaults:\n  - ppo_trainer\n  - /overrides/rollout@actor_rollout_ref.rollout\n  - _self_\n\n# -- \u5f02\u6b65\u8bad\u7ec3\u8bbe\u7f6e --\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\u5230 Rollouter\n  require_batches: 1                 # \u6bcf\u6b21\u4ece DataPool \u53d6\u7684 batch \u6570\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad\u8fdb\u884c\u4e2d\u7684 rollout\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u6536\u96c6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n\n# -- Training GPU Pool --\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 4\n\n# -- Rollout GPU Pool --\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 4\n  total_epochs: 10\n  test_freq: 1\n\n# -- Actor \u914d\u7f6e --\nactor_rollout_ref:\n  hybrid_engine: false\n  actor:\n    use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, true}\n  checkpoint_engine: ${oc.select:async_training.checkpoint_engine, null}\n

GPU \u5206\u914d

trainer \u548c rollout \u90fd\u5fc5\u987b\u5206\u914d GPU\u3002\u603b GPU \u6570 = trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node\u3002

"},{"location":"configuration/#overridesrolloutyaml","title":"overrides/rollout.yaml","text":"

Rollout worker \u7684\u914d\u7f6e\u8986\u76d6\uff1a

name: vllm\nmode: async\n\nagent:\n  default_agent_flow: single_step_single_turn_agent\n  agent_flow_config_path: null\n
"},{"location":"configuration/#gateway","title":"Gateway \u914d\u7f6e","text":"

Gateway \u4f5c\u4e3a\u72ec\u7acb\u8fdb\u7a0b\u8fd0\u884c\uff0c\u901a\u8fc7 CLI \u53c2\u6570\u914d\u7f6e\uff08\u975e Hydra\uff09\uff1a

python -m claw_r1.gateway.gateway \\\n    --data-pool-name   data_pool \\\n    --vllm-addresses   host1:8001,host2:8001 \\\n    --tokenizer-path   /path/to/model \\\n    --prompt-length    4096 \\\n    --response-length  1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address      auto \\\n    --ray-namespace    default \\\n    --host             0.0.0.0 \\\n    --port             8100\n

Gateway \u542f\u52a8\u8d85\u65f6\u53ef\u901a\u8fc7 Hydra \u914d\u7f6e\uff1a

trainer:\n  gateway_startup_timeout: 300   # \u79d2\uff0c\u9ed8\u8ba4 300\n
"},{"location":"configuration/#agent-flow","title":"Agent Flow \u914d\u7f6e","text":""},{"location":"configuration/#agent-flow_1","title":"\u767d\u76d2 Agent Flow","text":"

\u5728 overrides/rollout.yaml \u4e2d\u6307\u5b9a\uff1a

agent:\n  default_agent_flow: single_step_single_turn_agent\n
"},{"location":"configuration/#agent-flow_2","title":"\u9ed1\u76d2 Agent Flow","text":"

\u901a\u8fc7\u5916\u90e8 YAML \u6587\u4ef6\u6ce8\u518c\uff1a

# claw_r1/blackbox_agent/agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n

\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u5f15\u7528\uff1a

actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n
"},{"location":"configuration/#gpu","title":"\u591a GPU \u914d\u7f6e","text":"
# \u72ec\u7acb\u7684 GPU \u6c60\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2    # 2 GPU \u7528\u4e8e\u8bad\u7ec3\uff08Actor + Critic\uff09\n\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1    # 1 GPU \u7528\u4e8e\u63a8\u7406\uff08vLLM\uff09\n

\u8d44\u6e90\u6c60\u9694\u79bb

Claw-R1 \u4f7f\u7528 Ray \u7684\u8d44\u6e90\u7ec4\u673a\u5236\u786e\u4fdd Trainer \u548c Rollouter \u7684 GPU \u4e0d\u91cd\u53e0\u3002\u4f7f\u7528 async_ppo_trainer.yaml \u65f6\u81ea\u52a8\u914d\u7f6e\u3002\u8be6\u89c1 Async Training\u3002

"},{"location":"configuration/#_2","title":"\u5b8c\u6574\u8bad\u7ec3\u811a\u672c\u793a\u4f8b","text":"
python3 -m claw_r1.async_main \\\n    algorithm.adv_estimator=grpo \\\n    data.train_files=$TRAIN_FILE \\\n    data.val_files=$VAL_FILE \\\n    data.train_batch_size=128 \\\n    data.max_prompt_length=512 \\\n    data.max_response_length=1024 \\\n    data.return_raw_chat=True \\\n    actor_rollout_ref.model.path=$MODEL \\\n    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n    actor_rollout_ref.rollout.name=vllm \\\n    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \\\n    actor_rollout_ref.rollout.n=5 \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    async_training.use_rollout_log_probs=true\n

\u66f4\u591a\u793a\u4f8b\u89c1 example/ \u76ee\u5f55\u3002

"},{"location":"getting-started/","title":"Getting Started","text":"
  • Installation

    \u73af\u5883\u914d\u7f6e\u3001\u4f9d\u8d56\u5b89\u88c5\u548c\u9a8c\u8bc1\u3002

    Installation

  • Quick Start

    5 \u5206\u949f\u5185\u8fd0\u884c\u4f60\u7684\u7b2c\u4e00\u4e2a\u5f02\u6b65\u8bad\u7ec3\u5b9e\u9a8c\u3002

    Quick Start

"},{"location":"getting-started/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"\u4f9d\u8d56 \u6700\u4f4e\u7248\u672c Python 3.10+ PyTorch 2.0+ CUDA 12.1+ Ray 2.10+ GPU 3 \u5f20\uff082 \u8bad\u7ec3 + 1 \u63a8\u7406\uff09"},{"location":"getting-started/#_2","title":"\u67b6\u6784\u4e00\u89c8","text":"
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502   Agent     \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Gateway  \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 DataPool \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Trainer  \u2502\n\u2502 (\u9ed1\u76d2/\u767d\u76d2) \u2502\u25c4\u2500\u2500\u2500\u2500\u2502 (:8100)  \u2502     \u2502          \u2502     \u2502          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                                                           \u2502 \u6743\u91cd\u540c\u6b65\n                                                           \u25bc\n                                                     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                     \u2502  vLLM    \u2502\n                                                     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"getting-started/installation/","title":"Installation Guide","text":"

Claw-R1 uses the same environment setup as verl.

"},{"location":"getting-started/installation/#base-environment","title":"Base Environment","text":"

Follow the official verl installation guide, but make sure the environment ends up with verl==0.7.0.

If you want a broader overview of the base training workflow, the verl quickstart is also useful.

"},{"location":"getting-started/installation/#what-this-means-for-claw-r1","title":"What This Means for Claw-R1","text":"

Once the verl environment is working, Claw-R1 should run in the same environment. In practice, that means you can:

  • prepare a Python environment with verl==0.7.0
  • clone this repository
  • run Claw-R1 commands directly from the repository root

You do not need to install Claw-R1 as a separate package.

The documentation in this repository intentionally does not duplicate a separate environment guide, so that the infrastructure setup stays aligned with verl.

"},{"location":"getting-started/quickstart/","title":"Quick Start","text":"

\u672c\u6307\u5357\u5c55\u793a\u5982\u4f55\u5feb\u901f\u8fd0\u884c Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u3002

"},{"location":"getting-started/quickstart/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"
  • \u5df2\u5b8c\u6210 \u5b89\u88c5
  • \u81f3\u5c11 3 \u5f20 GPU\uff082 \u5f20\u8bad\u7ec3 + 1 \u5f20\u63a8\u7406\uff09
  • \u8bad\u7ec3\u6570\u636e\uff08parquet \u683c\u5f0f\uff09
"},{"location":"getting-started/quickstart/#black-box","title":"Black-box \u6a21\u5f0f\uff08\u63a8\u8350\u5165\u95e8\uff09","text":"

\u9ed1\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u4f7f\u7528\u6807\u51c6 OpenAI API \u4e0e Gateway \u4ea4\u4e92\uff0c\u65e0\u9700\u4fee\u6539 Agent \u4ee3\u7801\u3002\u4ee5 GSM8K \u6570\u5b66\u9898\u4e3a\u4f8b\uff1a

"},{"location":"getting-started/quickstart/#1","title":"1. \u51c6\u5907\u6570\u636e","text":"
# \u4e0b\u8f7d GSM8K \u6570\u636e\u96c6\uff08parquet \u683c\u5f0f\uff09\n# \u786e\u4fdd train.parquet \u548c test.parquet \u5728 ~/data/gsm8k/ \u4e0b\n
"},{"location":"getting-started/quickstart/#2","title":"2. \u8fd0\u884c\u8bad\u7ec3","text":"
export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async_blackbox.sh\n

\u8be5\u811a\u672c\u4f1a\uff1a

  1. \u542f\u52a8 Ray \u96c6\u7fa4
  2. \u521b\u5efa DataPool\uff08Ray Actor\uff09
  3. \u5728 GPU 0-1 \u4e0a\u90e8\u7f72 Actor + Critic\uff08\u8bad\u7ec3\uff09
  4. \u5728 GPU 2 \u4e0a\u90e8\u7f72 vLLM\uff08\u63a8\u7406\uff09
  5. \u542f\u52a8 Gateway\uff08\u7aef\u53e3 8100\uff09
  6. \u8fd0\u884c BlackBoxGSM8KAgentFlow\uff1a
    • \u4e3a\u6bcf\u4e2a\u6837\u672c\u8c03\u7528 init_trajectory \u83b7\u53d6 base_url
    • \u521b\u5efa GSM8KAgent\uff0c\u4f7f\u7528 base_url \u4f5c\u4e3a OpenAI API \u7684 endpoint
    • Agent \u901a\u8fc7\u591a\u8f6e tool calling \u89e3\u9898
    • Gateway \u81ea\u52a8\u6536\u96c6\u6bcf\u8f6e\u5bf9\u8bdd\u4e3a Step \u5e76\u63d0\u4ea4\u5230 DataPool
  7. AsyncTrainer \u4ece DataPool \u62c9\u53d6 batch \u8fdb\u884c PPO \u8bad\u7ec3
  8. \u5b9a\u671f\u540c\u6b65\u6743\u91cd\u5230 vLLM
"},{"location":"getting-started/quickstart/#3","title":"3. \u5173\u952e\u914d\u7f6e\u53c2\u6570","text":"
# GPU \u5206\u914d\ntrainer.n_gpus_per_node=2        # \u8bad\u7ec3\u7528 2 \u5f20 GPU\nrollout.n_gpus_per_node=1        # \u63a8\u7406\u7528 1 \u5f20 GPU\n\n# Agent Flow\nactor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n\n# \u5f02\u6b65\u8bad\u7ec3\nasync_training.trigger_parameter_sync_step=1   # \u6bcf\u6b65\u540c\u6b65\u6743\u91cd\nactor_rollout_ref.rollout.n=5                  # \u6bcf\u4e2a prompt \u751f\u6210 5 \u6761 trajectory\n
"},{"location":"getting-started/quickstart/#white-box","title":"White-box \u6a21\u5f0f","text":"

\u767d\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 /generate \u548c /submit_steps \u7aef\u70b9\u4ea4\u4e92\u3002

export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async.sh\n

\u767d\u76d2\u6a21\u5f0f\u4f7f\u7528 MultiStepAgentFlow \u6216 SingleStepSingleTurnAgentFlow\uff0cAgent \u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002

"},{"location":"getting-started/quickstart/#agent","title":"\u81ea\u5b9a\u4e49 Agent","text":""},{"location":"getting-started/quickstart/#agent_1","title":"\u6dfb\u52a0\u9ed1\u76d2 Agent","text":"
  1. \u5b9e\u73b0 Agent \u7c7b\uff08\u53ea\u9700 base_url \u548c OpenAI API\uff09
  2. \u5b9e\u73b0 BlackBoxAgentFlowBase \u5b50\u7c7b
  3. \u5728 agent_flow_config.yaml \u4e2d\u6ce8\u518c
  4. \u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a

\u8be6\u7ec6\u6b65\u9aa4\u89c1 Black-box Agent\u3002

"},{"location":"getting-started/quickstart/#agent_2","title":"\u6dfb\u52a0\u767d\u76d2 Agent","text":"
  1. \u7ee7\u627f AgentFlowBase\uff08\u6216 MultiStepAgentFlow\uff09
  2. \u5b9e\u73b0 run() \u65b9\u6cd5
  3. \u4f7f\u7528 @register(\"name\") \u6ce8\u518c

\u8be6\u7ec6\u6b65\u9aa4\u89c1 Agent Flow\u3002

"},{"location":"getting-started/quickstart/#_2","title":"\u76d1\u63a7\u8bad\u7ec3","text":"

\u8bad\u7ec3\u65e5\u5fd7\u9ed8\u8ba4\u8f93\u51fa\u5230\u63a7\u5236\u53f0\u3002\u53ef\u914d\u7f6e SwanLab \u7b49\u65e5\u5fd7\u540e\u7aef\uff1a

trainer.logger='[\"console\",\"swanlab\"]'\ntrainer.project_name='my_project'\ntrainer.experiment_name='my_experiment'\n
"},{"location":"getting-started/quickstart/#_3","title":"\u4e0b\u4e00\u6b65","text":"
  • Components \u2014 \u4e86\u89e3\u5404\u7ec4\u4ef6\u7684\u8be6\u7ec6\u8bbe\u8ba1
  • Configuration \u2014 \u5b8c\u6574\u914d\u7f6e\u53c2\u8003
  • Gateway API \u2014 HTTP \u7aef\u70b9\u6587\u6863
"}]} \ No newline at end of file +{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"","title":"Claw-R1","text":"

The Data Foundation for Agentic Reinforcement Learning

Claw-R1 \u662f Agentic RL \u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd \u2014 \u4e13\u6ce8\u4e8e\u4ece\u4efb\u610f Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff0c\u5e76\u652f\u6301\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002

  • Universal Data Collection

    \u4ece\u767d\u76d2\u3001\u9ed1\u76d2\u5230\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u901a\u8fc7 base_url \u673a\u5236\u96f6\u4ee3\u7801\u63a5\u5165\uff0c\u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 LangChain\u3001AutoGen\u3001CrewAI \u7b49\u4efb\u610f OpenAI \u517c\u5bb9 Agent\u3002

    Base URL Integration

  • Data Middleware Layer

    Gateway + DataPool \u6570\u636e\u4e2d\u95f4\u4ef6\uff1aGateway \u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0cDataPool \u7ba1\u7406\u6570\u636e\u8d28\u91cf\u3001\u5206\u533a\u7f13\u51b2\u3001\u6309\u9700\u4f9b\u7ed9\u8bad\u7ec3\u5f15\u64ce\u3002

    Middleware Layer

  • Data Evaluation & Curation

    \u591a\u7ef4 Reward \u7cfb\u7edf\uff08\u89c4\u5219/\u5224\u522b\u5f0f RM/\u751f\u6210\u5f0f RM\uff09+ \u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u6574\u5408 + \u7b56\u7565\u7248\u672c\u8ffd\u8e2a\uff0c\u7cfb\u7edf\u6027\u8bc4\u4f30\u548c\u7b5b\u9009\u6570\u636e\u8d28\u91cf\u3002

    Reward System

  • Production Agent Scenario

    \"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent \u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\uff08\u91c7\u7eb3\u3001\u4fee\u6539\u3001\u8ffd\u95ee\uff09\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\u3002

    Production Scenario

"},{"location":"#why-claw-r1","title":"Why Claw-R1?","text":"

Agentic RL \u751f\u6001\u6b63\u84ec\u52c3\u53d1\u5c55 \u2014 verl\u3001Agent-R1\u3001Forge \u7b49\u4f18\u79c0\u6846\u67b6\u5728 Runtime \u548c\u8bad\u7ec3\u7b97\u6cd5\u65b9\u9762\u6301\u7eed\u63a8\u8fdb\u3002\u7136\u800c\uff0c\u968f\u7740 Agent \u4ece\u7b80\u5355 ReAct \u6f14\u8fdb\u5230 Claude Code\u3001OpenClaw \u7b49\u901a\u7528\u67b6\u6784\uff0c\u4e00\u4e2a\u76f8\u5bf9\u6b20\u7f3a\u3001\u503c\u5f97\u6df1\u8015\u7684\u65b9\u5411\u9010\u6e10\u6d6e\u73b0\uff1a\u5982\u4f55\u4ece\u591a\u6837\u7684 Agent \u4ea4\u4e92\u4e2d\u7cfb\u7edf\u6027\u5730\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff1f

Claw-R1 \u805a\u7126\u4e8e\u8fd9\u4e00\u65b9\u5411\uff0c\u63d0\u4f9b Agent \u4e0e Trainer \u4e4b\u95f4\u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002

\u7ef4\u5ea6 \u4f20\u7edf Agentic RL \u6846\u67b6 Claw-R1 \u6838\u5fc3\u5173\u6ce8 \u8bad\u7ec3\u7b97\u6cd5\u4e0e Runtime \u6570\u636e\u7684\u91c7\u96c6\u3001\u8bc4\u4f30\u4e0e\u7b5b\u9009 Agent \u63a5\u5165 \u9700\u8981\u7528\u6846\u67b6 API \u91cd\u5199 \u53ea\u6539 base_url\uff0c\u96f6\u4ee3\u7801\u4fb5\u5165 \u6570\u636e\u6765\u6e90 \u9884\u6536\u96c6\u7684\u79bb\u7ebf\u6570\u636e \u5b9e\u65f6\u4ea4\u4e92\u81ea\u52a8\u91c7\u96c6 + \u79bb\u7ebf\u6570\u636e\u96c6 \u6570\u636e\u8d28\u91cf\u7ba1\u63a7 \u8f83\u5c11\u5173\u6ce8 \u591a\u7ef4 Reward + \u4eba\u7c7b\u53cd\u9988 + \u65b0\u9c9c\u5ea6\u68c0\u6d4b \u8bad\u7ec3\u5f15\u64ce \u5185\u7f6e\u7ed1\u5b9a \u53ef\u63d2\u62d4 TrainingBackend\uff0c\u5bf9\u63a5\u4efb\u610f\u5f15\u64ce"},{"location":"#_1","title":"\u5feb\u901f\u5f00\u59cb","text":"
# \u514b\u9686\u4ed3\u5e93\ngit clone https://github.com/AgentR1/Claw-R1 && cd Claw-R1\n\n# \u8fd0\u884c\u9ed1\u76d2 GSM8K \u8bad\u7ec3\nexport CUDA_VISIBLE_DEVICES=0,1,2\nsh example/test_async_blackbox.sh\n

\u5b8c\u6574\u5b89\u88c5\u6307\u5357 \u00b7 Quick Start

"},{"location":"#_2","title":"\u9879\u76ee\u72b6\u6001","text":"\u80fd\u529b \u72b6\u6001 \u767d\u76d2 Agent \u6570\u636e\u91c7\u96c6 \u5df2\u5b9e\u73b0 \u9ed1\u76d2 Agent \u6570\u636e\u91c7\u96c6 \u5df2\u5b9e\u73b0 \u5728\u7ebf\u670d\u52a1\u6570\u636e\u91c7\u96c6 \u5f00\u53d1\u4e2d \u5f02\u6b65\u8bad\u7ec3\u4f9b\u7ed9 \u5df2\u5b9e\u73b0 \u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf \u89c4\u5212\u4e2d \u6570\u636e\u8d28\u91cf Dashboard \u89c4\u5212\u4e2d"},{"location":"#team","title":"Team","text":"

State Key Laboratory of Cognitive Intelligence, USTC

"},{"location":"#citation","title":"Citation","text":"
@misc{clawr1-2026,\n  title={Claw-R1: The Data Foundation for Agentic Reinforcement Learning},\n  author={Wang, Daoyu and Li, Qingchuan and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi},\n  year={2026},\n  howpublished={\\url{https://github.com/AgentR1/Claw-R1}},\n  note={GitHub repository}\n}\n
"},{"location":"contributing/","title":"Contributing","text":"

\u611f\u8c22\u4f60\u5bf9 Claw-R1 \u7684\u5173\u6ce8\uff01\u6b22\u8fce\u8d21\u732e\u4ee3\u7801\u3001\u6587\u6863\u548c\u60f3\u6cd5\u3002

"},{"location":"contributing/#_1","title":"\u9879\u76ee\u7ed3\u6784","text":"
claw_r1/\n\u251c\u2500\u2500 agent_flow/           # Agent \u6267\u884c\u6846\u67b6\uff08\u767d\u76d2 + \u7ba1\u7406\u5668\uff09\n\u251c\u2500\u2500 blackbox_agent/       # \u9ed1\u76d2 Agent \u7cfb\u7edf\uff08Flow + Agent \u5b9e\u73b0\uff09\n\u251c\u2500\u2500 config/               # Hydra \u914d\u7f6e\u6587\u4ef6\n\u251c\u2500\u2500 data_pool/            # DataPool\uff08Ray Actor + Training Backend\uff09\n\u251c\u2500\u2500 gateway/              # Gateway Server\uff08FastAPI\uff09\n\u251c\u2500\u2500 async_main.py         # \u5f02\u6b65\u8bad\u7ec3\u5165\u53e3\n\u251c\u2500\u2500 async_rollouter.py    # AsyncRollouter\uff08Rollout GPU Pool\uff09\n\u251c\u2500\u2500 async_trainer.py      # AsyncTrainer\uff08Training GPU Pool\uff09\n\u251c\u2500\u2500 param_sync.py         # ParameterSynchronizer\n\u251c\u2500\u2500 detach_workers.py     # \u5206\u79bb\u5f0f Actor/Rollout Worker\n\u251c\u2500\u2500 core_algos.py         # PPO/GAE/GRPO \u6838\u5fc3\u7b97\u6cd5\n\u251c\u2500\u2500 reward_loop.py        # RewardLoopWorker\n\u251c\u2500\u2500 metric_utils.py       # \u6307\u6807\u805a\u5408\n\u251c\u2500\u2500 ray_agent_trainer.py  # \u540c\u6b65 Ray PPO Trainer\n\u2514\u2500\u2500 main_agent_ppo.py     # \u540c\u6b65\u8bad\u7ec3\u5165\u53e3\n
"},{"location":"contributing/#_2","title":"\u4ee3\u7801\u98ce\u683c","text":"
  • \u4f7f\u7528 Ruff \u8fdb\u884c lint \u548c\u683c\u5f0f\u5316
  • \u9075\u5faa PEP 8
  • \u7c7b\u578b\u6ce8\u89e3\uff08Python 3.10+ \u8bed\u6cd5\uff09
# \u5b89\u88c5 pre-commit hooks\npip install pre-commit\npre-commit install\n\n# \u624b\u52a8\u68c0\u67e5\nruff check .\nruff format .\n
"},{"location":"contributing/#_3","title":"\u8d21\u732e\u65b9\u5411","text":""},{"location":"contributing/#_4","title":"\u9ad8\u4f18\u5148\u7ea7","text":"
  • \u65b0\u7684\u9ed1\u76d2 Agent \u5b9e\u73b0\uff08\u53c2\u8003 blackbox_agent/gsm8k_agent.py\uff09
  • \u65b0\u7684 Reward \u51fd\u6570
  • \u6027\u80fd\u4f18\u5316\uff08DataPool \u541e\u5410\u3001Gateway \u5ef6\u8fdf\uff09
"},{"location":"contributing/#_5","title":"\u6587\u6863","text":"
  • \u6559\u7a0b\u548c\u793a\u4f8b
  • API \u6587\u6863\u8865\u5145
  • \u4e2d\u82f1\u6587\u7ffb\u8bd1
"},{"location":"contributing/#_6","title":"\u7814\u7a76","text":"
  • \u65b0\u7684 advantage \u8ba1\u7b97\u7b97\u6cd5
  • \u5728\u7ebf\u5b66\u4e60\u7b56\u7565
  • \u591a Agent \u534f\u4f5c\u8bad\u7ec3
"},{"location":"contributing/#pr","title":"PR \u6d41\u7a0b","text":"
  1. Fork \u4ed3\u5e93
  2. \u521b\u5efa feature branch\uff1agit checkout -b feature/my-feature
  3. \u7f16\u5199\u4ee3\u7801\u548c\u6d4b\u8bd5
  4. \u786e\u4fdd ruff check . \u901a\u8fc7
  5. \u63d0\u4ea4 PR\uff0c\u63cf\u8ff0\u6539\u52a8\u5185\u5bb9\u548c\u52a8\u673a
"},{"location":"contributing/#_7","title":"\u672c\u5730\u6784\u5efa\u6587\u6863","text":"
pip install mkdocs-material\nmkdocs serve\n# \u8bbf\u95ee http://localhost:8000\n
"},{"location":"contributing/#_8","title":"\u8054\u7cfb","text":"
  • GitHub Issues: AgentR1/Claw-R1
"},{"location":"api/","title":"API Reference","text":"

\u672c\u8282\u6587\u6863\u5316 Claw-R1 \u5404\u7ec4\u4ef6\u66b4\u9732\u7684 HTTP \u548c Python API\u3002

  • Gateway HTTP API

    REST \u7aef\u70b9\uff0c\u7528\u4e8e Agent \u96c6\u6210\u548c Step \u63d0\u4ea4\u3002\u5305\u62ec\u767d\u76d2\u7aef\u70b9\uff08/generate\u3001/submit_steps\uff09\u548c\u9ed1\u76d2\u7aef\u70b9\uff08{base_url}/v1/chat/completions\uff09\u3002

    Gateway API

"},{"location":"api/#python","title":"Python \u63a5\u53e3","text":""},{"location":"api/#datapool-ray-actor","title":"DataPool (Ray Actor)","text":"
import ray\nfrom claw_r1.data_pool import DataPool\n\ndata_pool = ray.get_actor(\"data_pool\")\n\n# Producer\uff08\u7531 Gateway \u5185\u90e8\u8c03\u7528\uff09\nray.get(data_pool.submit_step.remote(step, channel=\"train\"))\nray.get(data_pool.submit_steps.remote(steps, channel=\"train\"))\nray.get(data_pool.complete_trajectory.remote(trajectory_uid, channel=\"train\"))\n\n# Consumer\uff08\u7531 Trainer \u8c03\u7528\uff09\nbatch = ray.get(data_pool.fetch_batch.remote(n_rollouts=5, channel=\"train\"))\n
"},{"location":"api/#rewardloopworker-ray-actor","title":"RewardLoopWorker (Ray Actor)","text":"
from claw_r1.reward_loop import RewardLoopWorker\n\nreward_worker = ray.get_actor(\"reward_loop_worker\")\nrewards = ray.get(reward_worker.compute_score_batch.remote(steps))\n
"},{"location":"api/#agentflowbase-python-class","title":"AgentFlowBase (Python class)","text":"
from claw_r1.agent_flow import SingleStepSingleTurnAgentFlow\n\nclass MyFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -> int:\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=[{\"role\": \"user\", \"content\": kwargs[\"question\"]}],\n        )\n        # \u6784\u5efa Step \u5e76\u63d0\u4ea4 ...\n        return 1\n
"},{"location":"api/#blackboxagentflowbase-python-class","title":"BlackBoxAgentFlowBase (Python class)","text":"
from claw_r1.agent_flow.agent_flow import register\nfrom claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"my_blackbox_agent\")\nclass MyBlackBoxFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url: str, kwargs: dict) -> int:\n        # \u521b\u5efa Agent\uff0c\u4f7f\u7528 base_url \u4f5c\u4e3a OpenAI API endpoint\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=kwargs[\"raw_prompt\"])\n
"},{"location":"api/gateway/","title":"Gateway API","text":"

Gateway \u9ed8\u8ba4\u76d1\u542c\u7aef\u53e3 8100\uff08\u901a\u8fc7 --port \u914d\u7f6e\uff09\u3002\u9664 GET /docs \u5916\uff0c\u6240\u6709\u7aef\u70b9\u5747\u8fd4\u56de JSON\uff1b\u5e26\u8bf7\u6c42\u4f53\u7684\u7aef\u70b9\u63a5\u53d7 JSON\u3002

"},{"location":"api/gateway/#base-url","title":"Base URL","text":"
http://<gateway-host>:8100\n
"},{"location":"api/gateway/#white-box","title":"White-box \u7aef\u70b9","text":"

\u8fd9\u4e9b\u7aef\u70b9\u7531 AgentFlowBase \u7684\u767d\u76d2 Agent \u8c03\u7528\u3002

"},{"location":"api/gateway/#post-generate","title":"POST /generate","text":"

\u5c06\u751f\u6210\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u5e76\u8fd4\u56de\u5e26 token ID \u7684\u54cd\u5e94\u3002

\u8c03\u7528\u65b9: AgentFlowBase.gateway_generate()

"},{"location":"api/gateway/#request","title":"Request","text":"
{\n  \"trajectory_uid\": \"string\",\n  \"prompt_uid\": \"string\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"string\" }\n  ],\n  \"max_tokens\": 1024,\n  \"temperature\": 1.0,\n  \"top_p\": 1.0\n}\n
\u5b57\u6bb5 | \u7c7b\u578b | \u5fc5\u586b | \u8bf4\u660e\ntrajectory_uid | string | \u662f | \u5f53\u524d\u5bf9\u8bdd\u7684\u552f\u4e00 ID\nprompt_uid | string | \u662f | Prompt \u7ec4 ID\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09\nmessages | array | \u662f | OpenAI \u683c\u5f0f\u7684\u804a\u5929\u6d88\u606f\nmax_tokens | int | \u5426 | \u6700\u5927\u54cd\u5e94\u957f\u5ea6\uff08\u9ed8\u8ba4\u53d6 --response-length\uff09\ntemperature | float | \u5426 | \u91c7\u6837\u6e29\u5ea6\uff08\u9ed8\u8ba4 1.0\uff09\ntop_p | float | \u5426 | Top-p \u91c7\u6837\uff08\u9ed8\u8ba4 1.0\uff09"},{"location":"api/gateway/#response","title":"Response","text":"
{\n  \"response_text\": \"string\",\n  \"response_ids\": [101, 202, 303],\n  \"prompt_ids\": [50, 60, 70, 80]\n}\n
"},{"location":"api/gateway/#post-submit_steps","title":"POST /submit_steps","text":"

\u63d0\u4ea4\u4e00\u4e2a\u6216\u591a\u4e2a Step \u5bf9\u8c61\u5230 DataPool\u3002

\u8c03\u7528\u65b9: AgentFlowBase.gateway_submit_steps()

"},{"location":"api/gateway/#request_1","title":"Request","text":"
{\n  \"steps\": [\n    {\n      \"trajectory_uid\": \"string\",\n      \"prompt_uid\": \"string\",\n      \"prompt_ids\": [50, 60, 70],\n      \"response_ids\": [101, 202],\n      \"reward\": 0.0,\n      \"step_index\": 0,\n      \"policy_version\": 42,\n      \"is_last\": true,\n      \"metadata\": {}\n    }\n  ]\n}\n
"},{"location":"api/gateway/#response_1","title":"Response","text":"
{\n  \"accepted\": 1\n}\n
"},{"location":"api/gateway/#post-compute_reward","title":"POST /compute_reward","text":"

\u4e3a\u4e00\u4e2a step \u8ba1\u7b97 reward\uff08\u7531 Trainer \u8c03\u7528\uff0c\u4e0d\u7531 Agent \u8c03\u7528\uff09\u3002

"},{"location":"api/gateway/#request_2","title":"Request","text":"
{\n  \"trajectory_uid\": \"string\",\n  \"messages\": [...],\n  \"dataset_fields\": {\n    \"ground_truth\": \"string\",\n    \"task_type\": \"string\"\n  }\n}\n
"},{"location":"api/gateway/#response_2","title":"Response","text":"
{\n  \"reward\": 0.85\n}\n
"},{"location":"api/gateway/#black-box","title":"Black-box \u7aef\u70b9","text":"

\u8fd9\u4e9b\u7aef\u70b9\u4f9b\u9ed1\u76d2 Agent \u4f7f\u7528\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u77e5\u9053\u4e00\u4e2a base_url\uff0c\u6240\u6709\u4ea4\u4e92\u90fd\u901a\u8fc7\u8be5 URL \u5b8c\u6210\u3002

base_url \u7684\u683c\u5f0f\u4e3a http://<host>:<port>/<trajectory_uid>/<prompt_uid>\uff0c\u7531 POST /init_trajectory \u8fd4\u56de\u3002

"},{"location":"api/gateway/#post-init_trajectory","title":"POST /init_trajectory","text":"

\u5206\u914d\u4e00\u6761\u65b0\u7684 trajectory \u5e76\u8fd4\u56de base_url\u3002

"},{"location":"api/gateway/#request_3","title":"Request","text":"

\u65e0\u8bf7\u6c42\u4f53\u3002

"},{"location":"api/gateway/#response_3","title":"Response","text":"
{\n  \"trajectory_uid\": \"a1b2c3d4e5f6...\",\n  \"base_url\": \"http://0.0.0.0:8100/a1b2c3d4e5f6.../1\"\n}\n
"},{"location":"api/gateway/#post-base_urlv1register_trajectory","title":"POST {base_url}/v1/register_trajectory","text":"

\u6ce8\u518c trajectory \u7684 channel \u548c metadata\u3002\u5728 Agent \u5f00\u59cb\u4ea4\u4e92\u4e4b\u524d\u8c03\u7528\u3002

trajectory_uid \u4ece URL path \u4e2d\u63d0\u53d6\uff0c\u65e0\u9700\u5728 body \u4e2d\u4f20\u9012\u3002

"},{"location":"api/gateway/#request_4","title":"Request","text":"
{\n  \"channel\": \"train\",\n  \"metadata\": {\n    \"data_source\": \"gsm8k\",\n    \"ground_truth\": \"42\"\n  }\n}\n

\u6240\u6709\u5b57\u6bb5\u5747\u4e3a\u53ef\u9009\u3002channel \u9ed8\u8ba4\u4e3a \"train\"\u3002

"},{"location":"api/gateway/#response_4","title":"Response","text":"
{ \"status\": \"ok\" }\n
"},{"location":"api/gateway/#post-base_urlv1chatcompletions","title":"POST {base_url}/v1/chat/completions","text":"

OpenAI \u517c\u5bb9\u7684\u804a\u5929\u8865\u5168\u7aef\u70b9\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u5c06 base_url \u8bbe\u4e3a OpenAI SDK \u7684 base_url\uff0c\u5373\u53ef\u900f\u660e\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002

Gateway \u4f1a\uff1a

  1. \u5c06\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u670d\u52a1\u5668
  2. \u5bf9 prompt \u548c response \u8fdb\u884c tokenize
  3. \u81ea\u52a8\u6784\u5efa Step \u5e76\u63d0\u4ea4\u5230 DataPool
  4. \u8fd4\u56de\u6807\u51c6 OpenAI \u683c\u5f0f\u7684\u54cd\u5e94
"},{"location":"api/gateway/#request_5","title":"Request","text":"

\u6807\u51c6 OpenAI chat/completions \u8bf7\u6c42\u4f53\u3002

{\n  \"model\": \"qwen\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"What is 2+2?\" }\n  ],\n  \"temperature\": 0.7\n}\n
"},{"location":"api/gateway/#response_5","title":"Response","text":"

\u6807\u51c6 OpenAI chat/completions \u54cd\u5e94\u4f53\u3002

{\n  \"id\": \"chatcmpl-...\",\n  \"object\": \"chat.completion\",\n  \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\": \"assistant\",\n        \"content\": \"4\"\n      },\n      \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 10,\n    \"completion_tokens\": 1,\n    \"total_tokens\": 11\n  }\n}\n
"},{"location":"api/gateway/#post-base_urlv1complete_trajectory","title":"POST {base_url}/v1/complete_trajectory","text":"

\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002Agent \u5b8c\u6210\u6240\u6709\u4ea4\u4e92\u540e\u8c03\u7528\u3002

"},{"location":"api/gateway/#request_6","title":"Request","text":"

\u65e0\u8bf7\u6c42\u4f53\u3002

"},{"location":"api/gateway/#response_6","title":"Response","text":"
{ \"status\": \"ok\" }\n
"},{"location":"api/gateway/#post-complete_trajectorytrajectory_uid","title":"POST /complete_trajectory/{trajectory_uid}","text":"

\u5185\u90e8\u7aef\u70b9\uff0c\u901a\u8fc7 trajectory_uid \u76f4\u63a5\u6807\u8bb0\u5b8c\u6210\u3002\u53ef\u9009\u4f20\u5165 reward \u548c channel\u3002

"},{"location":"api/gateway/#request_7","title":"Request","text":"
{\n  \"channel\": \"train\",\n  \"reward\": 0.9\n}\n
"},{"location":"api/gateway/#response_7","title":"Response","text":"
{ \"status\": \"ok\" }\n
"},{"location":"api/gateway/#_1","title":"\u5c31\u7eea\u68c0\u67e5","text":""},{"location":"api/gateway/#get-ready","title":"GET /ready","text":"

\u5f53 Gateway \u5b8c\u5168\u521d\u59cb\u5316\uff08\u5305\u62ec tokenizer \u52a0\u8f7d\u5b8c\u6210\uff09\u540e\u8fd4\u56de 200\u3002\u7528\u4e8e Rollouter \u542f\u52a8\u65f6\u7684\u5065\u5eb7\u68c0\u67e5\u3002

"},{"location":"api/gateway/#response-200","title":"Response (200)","text":"
{ \"status\": \"ready\" }\n
"},{"location":"api/gateway/#response-503","title":"Response (503)","text":"
{ \"detail\": \"Gateway not ready (tokenizer still loading)\" }\n
"},{"location":"api/gateway/#get-docs","title":"GET /docs","text":"

FastAPI \u81ea\u52a8\u751f\u6210\u7684 Swagger UI \u6587\u6863\u9875\u9762\u3002

"},{"location":"components/","title":"Components","text":"

Claw-R1 \u7684\u7ec4\u4ef6\u56f4\u7ed5\u6570\u636e\u6d41\u7ec4\u7ec7\uff1a\u4ece Agent \u4ea4\u4e92\u7684\u91c7\u96c6\uff0c\u5230\u6570\u636e\u7684\u7ba1\u7406\u4e0e\u8d28\u91cf\u8bc4\u4f30\uff0c\u518d\u5230\u5411\u8bad\u7ec3\u5f15\u64ce\u7684\u4f9b\u7ed9\u3002\u5404\u7ec4\u4ef6\u901a\u8fc7 HTTP \u548c Ray RPC \u901a\u4fe1\u3002

  • Gateway Server \u00b7 \u6570\u636e\u91c7\u96c6\u5165\u53e3

    FastAPI HTTP \u670d\u52a1\u3002\u6240\u6709 Agent LLM \u8c03\u7528\u7684\u7edf\u4e00\u5165\u53e3\uff0c\u81ea\u52a8\u4ece\u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff08Step\uff09\u5e76\u63d0\u4ea4\u5230 DataPool\u3002\u652f\u6301\u767d\u76d2\u663e\u5f0f\u63d0\u4ea4\u548c\u9ed1\u76d2\u81ea\u52a8\u91c7\u96c6\u4e24\u79cd\u6a21\u5f0f\u3002

    Gateway Server

  • DataPool \u00b7 \u6570\u636e\u7ba1\u7406\u6838\u5fc3

    Ray Actor\u3002Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2 \u2014 \u5b58\u50a8\u3001\u7d22\u5f15\u3001\u5206\u533a\u548c\u4f9b\u7ed9\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 Channel \u9694\u79bb\u3001GRPO \u5206\u7ec4\u3001\u5bb9\u91cf\u80cc\u538b\u63a7\u5236\u548c\u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7\u3002

    DataPool

  • Reward System \u00b7 \u6570\u636e\u8d28\u91cf\u8bc4\u4f30

    RewardLoopWorker Ray Actor\u3002\u591a\u7ef4\u5ea6\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\uff1arule-based\u3001discriminative RM\u3001generative RM\uff0c\u4ee5\u53ca\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002

    Reward System

  • Agent Flow \u00b7 \u767d\u76d2\u6570\u636e\u91c7\u96c6

    Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7ba1\u7406\u3002\u767d\u76d2 Agent \u901a\u8fc7 Python API \u663e\u5f0f\u63d0\u4ea4 Step\uff0c\u5b8c\u6574\u63a7\u5236\u6570\u636e\u91c7\u96c6\u8fc7\u7a0b\u3002

    Agent Flow

  • Black-box Agent \u00b7 \u9ed1\u76d2\u6570\u636e\u91c7\u96c6

    \u96f6\u4ee3\u7801\u4fb5\u5165\u7684\u9ed1\u76d2 Agent \u63a5\u5165\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u901a\u8fc7 base_url \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002

    Black-box Agent

  • Async Training \u00b7 \u6570\u636e\u6d88\u8d39\u4e0e\u8bad\u7ec3

    AsyncTrainer \u548c AsyncRollouter Ray Actor\u3002\u6301\u7eed\u4ece DataPool \u6d88\u8d39\u9ad8\u8d28\u91cf\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\uff0c\u5e26\u53c2\u6570\u540c\u6b65\u3002

    Async Training

"},{"location":"components/#_1","title":"\u6570\u636e\u6d41\u5168\u666f","text":"
                        \u6570\u636e\u91c7\u96c6\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n  \u9ed1\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502                                         \u2502\n  (base_url)          \u2502         GATEWAY SERVER                  \u2502\n                      \u2502         (FastAPI, \u7aef\u53e3 8100)             \u2502\n  \u767d\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502         \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92 Step                 \u2502\n  (AgentFlow)         \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                   \u2502 Ray RPC (submit_steps)\n                                   \u25bc\n                        \u6570\u636e\u7ba1\u7406\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         DATAPOOL                         \u2502\n                      \u2502         (Ray Actor)                      \u2502\n                      \u2502                                          \u2502\n                      \u2502  \u2022 \u5b58\u50a8\u4e0e\u7d22\u5f15    \u2022 Channel \u5206\u533a            \u2502\n                      \u2502  \u2022 GRPO \u5206\u7ec4     \u2022 \u5bb9\u91cf\u80cc\u538b\u63a7\u5236            \u2502\n          
            \u2502  \u2022 \u8d28\u91cf\u8bc4\u4f30      \u2022 \u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7            \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                         \u2502 fetch_batch()\n                                         \u25bc\n                        \u6570\u636e\u6d88\u8d39\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC TRAINER                    \u2502\n                      \u2502         (Ray Actor, Training GPU Pool)   \u2502\n                      \u2502   \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n                      \u2502   \u2502  Actor \u2502 Critic \u2502 RefPolicy      \u2502   \u2502\n                      \u2502   \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                       \u2502 NCCL weight sync\n                                       
\u25bc\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC ROLLOUTER                  \u2502\n                      \u2502         (Ray Actor, Rollout GPU Pool)    \u2502\n                      \u2502         vLLM servers                     \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"components/agent-flow/","title":"Agent Flow","text":"

Agent Flow \u662f Claw-R1 \u4e2d\u7ba1\u7406 Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7684\u6846\u67b6\u3002\u5b83\u5206\u4e3a\u4e24\u5927\u7c7b\uff1a

  • \u767d\u76d2 Agent Flow\uff1aAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 /generate\u3001/submit_steps \u7b49\u7aef\u70b9\u4ea4\u4e92\uff0c\u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002
  • \u9ed1\u76d2 Agent Flow\uff1aAgent \u4f7f\u7528\u6807\u51c6 OpenAI API\uff0c\u901a\u8fc7 base_url \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u5904\u7406 tokenize \u548c Step \u63d0\u4ea4\u3002
"},{"location":"components/agent-flow/#_1","title":"\u7c7b\u5c42\u6b21","text":"
AgentFlowBase                              (abstract base)\n    \u2502\n    \u251c\u2500\u2500 SingleStepSingleTurnAgentFlow      (\u767d\u76d2\uff1a\u5355\u8f6e\u95ee\u7b54)\n    \u251c\u2500\u2500 MultiStepAgentFlow                 (\u767d\u76d2\uff1a\u591a\u8f6e\u5de5\u5177\u8c03\u7528)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase              (\u9ed1\u76d2\u57fa\u7c7b)\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow     (\u9ed1\u76d2\uff1aGSM8K \u6570\u5b66\u9898)\n
"},{"location":"components/agent-flow/#agentflowbase","title":"AgentFlowBase","text":"

\u6240\u6709 Agent Flow \u7684\u62bd\u8c61\u57fa\u7c7b\uff0c\u63d0\u4f9b\uff1a

  • Gateway URL \u7ba1\u7406
  • \u914d\u7f6e\u8bbf\u95ee\uff08self.config\uff09
  • \u62bd\u8c61\u65b9\u6cd5 run(sampling_params, **kwargs) -> int
"},{"location":"components/agent-flow/#_2","title":"\u767d\u76d2\u8f85\u52a9\u65b9\u6cd5","text":"

\u767d\u76d2 Agent Flow \u53ef\u4f7f\u7528\u4ee5\u4e0b\u65b9\u6cd5\u4e0e Gateway \u4ea4\u4e92\uff1a

"},{"location":"components/agent-flow/#gateway_generatetrajectory_uid-prompt_uid-messages-kwargs","title":"gateway_generate(trajectory_uid, prompt_uid, messages, **kwargs)","text":"

\u5411 Gateway /generate \u53d1\u9001\u5f02\u6b65 HTTP POST\uff0c\u8fd4\u56de\u751f\u6210\u6587\u672c\u548c token IDs\u3002

text, response_ids, prompt_ids = await self.gateway_generate(\n    trajectory_uid=\"traj-abc\",\n    prompt_uid=\"prompt-xyz\",\n    messages=[{\"role\": \"user\", \"content\": \"Summarize this document.\"}],\n    max_tokens=512,\n    temperature=0.8,\n)\n
"},{"location":"components/agent-flow/#gateway_submit_stepssteps-channeltrain","title":"gateway_submit_steps(steps, channel=\"train\")","text":"

\u5411 Gateway /submit_steps \u63d0\u4ea4 Step \u5217\u8868\u3002

"},{"location":"components/agent-flow/#gateway_compute_rewardtrajectory_uid-messages-dataset_fields","title":"gateway_compute_reward(trajectory_uid, messages, dataset_fields)","text":"

\u5411 Gateway /compute_reward \u8bf7\u6c42 reward \u8ba1\u7b97\u3002

"},{"location":"components/agent-flow/#singlestepsingleturnagentflow","title":"SingleStepSingleTurnAgentFlow","text":"

\u6700\u7b80\u5355\u7684\u767d\u76d2\u5b9e\u73b0\uff1a\u5355\u4e2a prompt \u4ea7\u751f\u5355\u4e2a response\u3002\u9002\u7528\u4e8e\u6bcf\u4e2a\u6837\u672c\u90fd\u662f\u72ec\u7acb\u95ee\u7b54\u5bf9\u7684\u6570\u636e\u96c6\u3002

class MyAgentFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -> int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"raw_prompt\"]}]\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=messages,\n        )\n        step = Step(\n            prompt_ids=prompt_ids,\n            response_ids=response_ids,\n            reward=0.0,\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            step_index=0,\n            is_last=True,\n        )\n        await self.gateway_submit_steps([step])\n        return 1\n
"},{"location":"components/agent-flow/#multistepagentflow","title":"MultiStepAgentFlow","text":"

\u591a\u8f6e Agent Flow\uff0c\u652f\u6301\u5de5\u5177\u8c03\u7528\u3001\u89c4\u5212\u7b49\u573a\u666f\u3002\u6bcf\u8f6e\u4ea7\u751f\u4e00\u4e2a Step\uff0c\u901a\u8fc7 trajectory_uid \u4e32\u8054\u3002

class ToolAgentFlow(MultiStepAgentFlow):\n    async def run(self, sampling_params, **kwargs) -> int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"task\"]}]\n        step_index = 0\n\n        while True:\n            text, response_ids, prompt_ids = await self.gateway_generate(...)\n            is_last = self.is_terminal(text)\n\n            step = Step(\n                prompt_ids=prompt_ids,\n                response_ids=response_ids,\n                step_index=step_index,\n                is_last=is_last,\n                ...\n            )\n            await self.gateway_submit_steps([step])\n\n            if is_last:\n                break\n\n            messages.append({\"role\": \"assistant\", \"content\": text})\n            tool_result = await self.execute_tool(text)\n            messages.append({\"role\": \"tool\", \"content\": tool_result})\n            step_index += 1\n\n        return step_index + 1\n
"},{"location":"components/agent-flow/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"

\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\u3002\u5904\u7406\u4e0e Gateway \u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u5c06 Agent \u6267\u884c\u59d4\u6258\u7ed9\u5b50\u7c7b\u7684 _run_agent \u65b9\u6cd5\u3002

\u8be6\u7ec6\u6587\u6863\u89c1 Black-box Agent\u3002

"},{"location":"components/agent-flow/#_3","title":"\u6ce8\u518c\u673a\u5236","text":"

Agent Flow \u901a\u8fc7 @register(\"name\") \u88c5\u9970\u5668\u6ce8\u518c\u5230\u5168\u5c40\u6ce8\u518c\u8868\uff1a

from claw_r1.agent_flow.agent_flow import register\n\n@register(\"my_agent_flow\")\nclass MyAgentFlow(AgentFlowBase):\n    ...\n

\u4e5f\u53ef\u901a\u8fc7 YAML \u914d\u7f6e\u6587\u4ef6\u6ce8\u518c\uff08\u7528\u4e8e\u9ed1\u76d2 Agent\uff09\uff1a

# agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n
"},{"location":"components/agent-flow/#agentflowmanager-agentflowworker","title":"AgentFlowManager \u548c AgentFlowWorker","text":"
  • AgentFlowManager\uff1a\u7ba1\u7406\u591a\u4e2a AgentFlowWorker\uff0c\u5c06 batch \u4e2d\u7684\u6bcf\u4e2a\u6837\u672c\u5206\u53d1\u7ed9\u5bf9\u5e94\u7684 Agent Flow \u6267\u884c\u3002
  • AgentFlowWorker\uff1aRay Actor\uff0c\u6301\u6709 tokenizer \u548c\u914d\u7f6e\uff0c\u6267\u884c\u5177\u4f53\u7684 Agent Flow\u3002
AsyncRollouter\n    \u2514\u2500\u2500 AgentFlowManager\n            \u2514\u2500\u2500 AgentFlowWorker (Ray Actor, \u53ef\u591a\u4e2a)\n                    \u2514\u2500\u2500 AgentFlowBase \u5b50\u7c7b\u5b9e\u4f8b\n
"},{"location":"components/agent-flow/#_4","title":"\u914d\u7f6e","text":"

\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a Agent Flow\uff1a

python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n
"},{"location":"components/async-training/","title":"Async Training","text":"

Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u67b6\u6784\u5c06 rollout\uff08trajectory \u751f\u6210\uff09\u548c training\uff08\u6743\u91cd\u66f4\u65b0\uff09\u5206\u79bb\u4e3a\u4e24\u4e2a\u72ec\u7acb\u7684 Ray Actor\uff0c\u8fd0\u884c\u5728\u4e0d\u540c\u7684 GPU \u6c60\u4e0a\u3002

"},{"location":"components/async-training/#_1","title":"\u67b6\u6784","text":"
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Rollout GPU Pool                                        \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncRollouter (Ray Actor)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 DataLoader (\u904d\u5386\u6570\u636e\u96c6)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 vLLM replicas (\u63a8\u7406\u5f15\u64ce)                     \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 AgentFlowManager (\u7ba1\u7406 Agent \u6267\u884c)           \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Gateway (FastAPI \u5b50\u8fdb\u7a0b, \u7aef\u53e3 8100)          \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RewardLoopWorker (\u8ba1\u7b97 reward)               \u2502   \u2502\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  submit_step (via Gateway \u2192 DataPool)\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502   DataPool       \u2502   \u2190 \u5171\u4eab Ray Actor\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  fetch_batch()\n                       \u25bc\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Training GPU Pool                                       \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncTrainer (Ray Actor)                        \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Actor worker group (\u7b56\u7565\u6a21\u578b)                \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Critic worker group (\u4ef7\u503c\u6a21\u578b)               \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RefPolicy 
worker group (KL baseline)        \u2502   \u2502\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  NCCL weight broadcast\n                       \u25bc\n              AsyncRollouter.update_weights()\n
"},{"location":"components/async-training/#asynctrainer","title":"AsyncTrainer","text":"

AsyncTrainer \u662f\u8fd0\u884c\u5728 Training GPU Pool \u4e0a\u7684 Ray Actor\uff0c\u6267\u884c\u6301\u7eed\u7684 PPO \u8bad\u7ec3\u5faa\u73af\uff1a

  1. \u4ece DataPool fetch_batch() \u2014 \u963b\u585e\u7b49\u5f85\u5b8c\u6574\u7684 prompt_uid \u7ec4
  2. \u901a\u8fc7 RewardLoopWorker \u8ba1\u7b97 batch \u7684 reward
  3. \u8ba1\u7b97 advantage\uff08GAE \u6216 GRPO\uff09
  4. \u6267\u884c PPO Actor + Critic \u66f4\u65b0\uff08\u4f7f\u7528 GRPO \u65f6\u4e0d\u9700\u8981 Critic\uff09
  5. \u6bcf trigger_parameter_sync_step \u6b65\u89e6\u53d1\u6743\u91cd\u540c\u6b65
"},{"location":"components/async-training/#worker","title":"Worker \u521d\u59cb\u5316","text":"

AsyncTrainer \u5728 init_workers() \u4e2d\u521b\u5efa Actor\u3001Critic\u3001RefPolicy \u7684 worker group\uff0c\u5e76\u5c06\u5b83\u4eec\u90e8\u7f72\u5230 Training GPU Pool\uff1a

# \u521b\u5efa\u987a\u5e8f\uff1aCritic \u2192 RefPolicy \u2192 Actor\uff08\u6700\u540e\u521b\u5efa Actor \u4ee5\u514d\u5f71\u54cd vLLM \u5185\u5b58\u4f30\u7b97\uff09\nself.critic_wg.init_model()\nself.ref_policy_wg.init_model()\nself.actor_wg.init_model()\n
"},{"location":"components/async-training/#asyncrollouter","title":"AsyncRollouter","text":"

AsyncRollouter \u8fd0\u884c\u5728 Rollout GPU Pool \u4e0a\uff0c\u6301\u6709\uff1a

  • DataLoader\uff1a\u904d\u5386\u8bad\u7ec3\u6570\u636e\u96c6
  • vLLM replicas\uff1a\u9ad8\u541e\u5410\u63a8\u7406\u670d\u52a1\u5668
  • AgentFlowManager\uff1a\u7ba1\u7406 AgentFlowBase worker
  • Gateway\uff1aFastAPI HTTP \u670d\u52a1\u5668\uff08\u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff09
  • RewardLoopWorker\uff1a\u5728 rollout \u671f\u95f4\u8ba1\u7b97 reward
"},{"location":"components/async-training/#gateway","title":"Gateway \u542f\u52a8\u6d41\u7a0b","text":"

Rollouter \u5c06 Gateway \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff1a

  1. \u5feb\u901f\u521d\u59cb\u5316\uff08Ray \u8fde\u63a5\u3001DataPool\u3001vLLM \u5730\u5740\uff09\u2192 HTTP \u7acb\u5373\u53ef\u7528
  2. Tokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d
  3. Rollouter \u8f6e\u8be2 GET /ready \u7b49\u5f85 Gateway \u5b8c\u5168\u5c31\u7eea
  4. \u8d85\u65f6\u65f6\u95f4\u53ef\u901a\u8fc7 trainer.gateway_startup_timeout \u914d\u7f6e\uff08\u9ed8\u8ba4 300 \u79d2\uff09
"},{"location":"components/async-training/#_2","title":"\u6682\u505c/\u6062\u590d\uff08\u6743\u91cd\u540c\u6b65\uff09","text":"

\u6743\u91cd\u540c\u6b65\u671f\u95f4\uff0cRollouter \u6682\u505c\u751f\u6210\uff1a

rollouter.pause()                          # \u505c\u6b62\u65b0\u751f\u6210\uff0c\u7b49\u5f85\u8fdb\u884c\u4e2d\u7684\u8bf7\u6c42\u5b8c\u6210\n# NCCL broadcast: Actor weights \u2192 vLLM\nrollouter.update_param_version(new_version)\nrollouter.resume()                         # \u4f7f\u7528\u66f4\u65b0\u540e\u7684\u6743\u91cd\u6062\u590d\u751f\u6210\n
"},{"location":"components/async-training/#parametersynchronizer","title":"ParameterSynchronizer","text":"

\u8f7b\u91cf\u7ea7 Ray Actor\uff0c\u534f\u8c03 AsyncTrainer \u548c AsyncRollouter \u4e4b\u95f4\u7684\u6743\u91cd\u540c\u6b65\uff1a

class ParameterSynchronizer:\n    def sync_weights(self, version, validate=False):\n        # 1. \u6682\u505c rollout\n        # 2. NCCL broadcast: trainer Actor \u2192 vLLM\n        # 3. \u66f4\u65b0 rollouter \u7684 param_version\n        # 4. \u53ef\u9009\uff1a\u8fd0\u884c\u9a8c\u8bc1\n        # 5. \u6062\u590d rollout\n
"},{"location":"components/async-training/#advantage","title":"Advantage \u8ba1\u7b97","text":""},{"location":"components/async-training/#gae-generalized-advantage-estimation","title":"GAE (Generalized Advantage Estimation)","text":"

\u7528\u4e8e trajectory \u7ea7\u522b\u7684 value baseline\u3002\u5728 step \u7ea7\u522b \u8ba1\u7b97 advantage\uff0c\u7136\u540e\u5e7f\u64ad\u5230 token \u7ea7\u522b\uff08\u540c\u4e00 step \u5185\u6240\u6709 response token \u5171\u4eab\u76f8\u540c\u7684 advantage\uff09\u3002

"},{"location":"components/async-training/#grpo-group-relative-policy-optimization","title":"GRPO (Group Relative Policy Optimization)","text":"

\u7528\u4e8e prompt \u7ea7\u522b\u7684 baseline\u3002\u5c06\u6765\u81ea\u540c\u4e00 prompt_uid \u7684\u591a\u4e2a rollout \u5206\u7ec4\uff0c\u5728\u7ec4\u5185\u5f52\u4e00\u5316 advantage\u3002\u4e0d\u9700\u8981\u5355\u72ec\u7684 Critic \u6a21\u578b\uff0c\u66f4\u8282\u7701\u5185\u5b58\u3002

"},{"location":"components/async-training/#_3","title":"\u8d44\u6e90\u6c60\u914d\u7f6e","text":"

Trainer \u548c Rollouter \u8fd0\u884c\u5728\u72ec\u7acb\u7684 GPU \u6c60\u4e0a\uff0c\u9632\u6b62\u8d44\u6e90\u7ade\u4e89\uff1a

# async_ppo_trainer.yaml\n\n# Training GPU Pool (Actor, Critic, RefPolicy)\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2\n\n# Rollout GPU Pool (vLLM)\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1\n

\u603b GPU \u6570 = trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node\u3002

GPU \u5206\u914d

\u5fc5\u987b\u540c\u65f6\u4e3a trainer \u548c rollout \u914d\u7f6e GPU\u3002\u5982\u679c trainer \u6ca1\u6709\u5206\u914d GPU\uff0c\u8bad\u7ec3\u6a21\u578b\uff08Actor\u3001Critic\uff09\u5c06\u65e0\u6cd5\u90e8\u7f72\u5230 GPU \u4e0a\u3002

"},{"location":"components/async-training/#_4","title":"\u5173\u952e\u914d\u7f6e","text":"
# async_ppo_trainer.yaml\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\n  require_batches: 1                 # \u6bcf\u6b21\u4ece DataPool \u53d6\u591a\u5c11\u4e2a batch\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad rollout\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n
"},{"location":"components/async-training/#_5","title":"\u5165\u53e3","text":"
python3 -m claw_r1.async_main \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    ...\n

\u5b8c\u6574\u793a\u4f8b\u89c1 example/test_async_blackbox.sh\u3002

"},{"location":"components/blackbox-agent/","title":"Black-box Agent","text":"

Black-box Agent \u7cfb\u7edf\u5141\u8bb8\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u63a5\u5165 Claw-R1 \u7684\u8bad\u7ec3\u5faa\u73af\uff0c\u65e0\u9700\u4fee\u6539 Agent \u5185\u90e8\u903b\u8f91\u3002Agent \u53ea\u9700\u5c06 base_url \u6307\u5411 Gateway\uff0c\u5373\u53ef\u900f\u660e\u5730\u6536\u96c6\u8bad\u7ec3\u6570\u636e\u3002

"},{"location":"components/blackbox-agent/#_1","title":"\u67b6\u6784\u6982\u89c8","text":"
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  BlackBoxAgentFlowBase (\u8bad\u7ec3\u4fa7\u7f16\u6392)                           \u2502\n\u2502                                                               \u2502\n\u2502  1. POST /init_trajectory          \u2192 \u83b7\u53d6 base_url            \u2502\n\u2502  2. POST {base_url}/v1/register_trajectory \u2192 \u6ce8\u518c metadata    \u2502\n\u2502  3. \u8c03\u7528 _run_agent(base_url, kwargs)                         \u2502\n\u2502     \u2502                                                         \u2502\n\u2502     \u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510                    \u2502\n\u2502     \u2514\u2500\u2500\u2502  \u5177\u4f53 Agent (\u5982 GSM8KAgent)      \u2502                    \u2502\n\u2502        \u2502  \u53ea\u77e5\u9053 base_url\uff0c\u4f7f\u7528 OpenAI API \u2502                    \u2502\n\u2502        \u2502  POST {base_url}/v1/chat/completions (\u591a\u8f6e)          \u2502\n\u2502        \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518                    \u2502\n\u2502  4. 
POST {base_url}/v1/complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"components/blackbox-agent/#_2","title":"\u6838\u5fc3\u8bbe\u8ba1","text":""},{"location":"components/blackbox-agent/#_3","title":"\u5173\u6ce8\u70b9\u5206\u79bb","text":"
  • BlackBoxAgentFlowBase\uff1a\u5904\u7406\u4e0e Gateway \u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u662f\u8bad\u7ec3\u4fa7\u7684\u7f16\u6392\u5c42\u3002
  • \u5177\u4f53 Agent\uff08\u5982 GSM8KAgent\uff09\uff1a\u53ea\u63a5\u6536 base_url \u548c\u4efb\u52a1\u53c2\u6570\uff0c\u4f7f\u7528\u6807\u51c6 OpenAI API \u5b8c\u6210\u4efb\u52a1\u3002Agent \u5b8c\u5168\u4e0d\u77e5\u9053\u8bad\u7ec3\u7cfb\u7edf\u7684\u5b58\u5728\u3002

\u8fd9\u79cd\u5206\u79bb\u4f7f\u5f97\uff1a

  • \u540c\u4e00\u4e2a Agent \u53ef\u4ee5\u5728\u8bad\u7ec3\u6a21\u5f0f\u548c\u72ec\u7acb\u670d\u52a1\u6a21\u5f0f\u4e0b\u590d\u7528
  • \u65b0\u589e\u4efb\u52a1\u53ea\u9700\u5b9e\u73b0 Agent + \u5bf9\u5e94\u7684 Flow \u5b50\u7c7b
  • Agent \u53ef\u4ee5\u7528\u4efb\u4f55\u8bed\u8a00/\u6846\u67b6\u5b9e\u73b0\uff0c\u53ea\u8981\u652f\u6301 OpenAI API
"},{"location":"components/blackbox-agent/#_4","title":"\u6ce8\u518c\u673a\u5236","text":"

Agent Flow \u901a\u8fc7 @register(\"name\") \u88c5\u9970\u5668\u6ce8\u518c\uff0c\u5e76\u5728 YAML \u914d\u7f6e\u4e2d\u5f15\u7528\uff1a

# agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n
"},{"location":"components/blackbox-agent/#_5","title":"\u7c7b\u5c42\u6b21","text":"
AgentFlowBase                         (agent_flow/agent_flow.py)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase          (blackbox_agent/blackbox_agent_flow.py)\n            \u2502\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow (blackbox_agent/gsm8k_agent_flow.py)\n
"},{"location":"components/blackbox-agent/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"

\u6240\u6709\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\uff0c\u5b9e\u73b0\u4e86\u5b8c\u6574\u7684 Gateway \u534f\u8bae\uff1a

class BlackBoxAgentFlowBase(AgentFlowBase):\n\n    async def run(self, sampling_params, **kwargs) -> int:\n        # 1. \u63d0\u53d6 channel\u3001prompt_uid\u3001metadata\n        channel, prompt_uid, metadata = self._prepare_params(kwargs)\n\n        # 2. init_trajectory \u2192 \u83b7\u53d6 base_url\n        init_resp = await http.post(f\"{self.gateway_url}/init_trajectory\")\n        base_url = ...\n\n        # 3. register_trajectory \u2192 \u6ce8\u518c channel \u548c metadata\n        await http.post(f\"{base_url}/v1/register_trajectory\", json={...})\n\n        # 4. \u8c03\u7528\u5b50\u7c7b\u5b9e\u73b0\u7684 _run_agent\n        num_turns = await self._run_agent(base_url, kwargs)\n\n        # 5. complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210\n        await http.post(f\"{base_url}/v1/complete_trajectory\")\n\n        return num_turns\n\n    @abstractmethod\n    async def _run_agent(self, base_url: str, kwargs: dict) -> int:\n        \"\"\"\u5b50\u7c7b\u5b9e\u73b0\uff1a\u521b\u5efa\u5e76\u8fd0\u884c\u5177\u4f53 Agent\u3002\"\"\"\n        ...\n

\u5b50\u7c7b\u53ea\u9700\u5b9e\u73b0 _run_agent\uff1a\u4ece kwargs \u4e2d\u63d0\u53d6\u4efb\u52a1\u53c2\u6570\uff0c\u521b\u5efa Agent \u5b9e\u4f8b\uff0c\u8c03\u7528 Agent \u7684\u6267\u884c\u65b9\u6cd5\u3002

"},{"location":"components/blackbox-agent/#blackboxgsm8kagentflow","title":"BlackBoxGSM8KAgentFlow","text":"

GSM8K \u6570\u5b66\u9898\u7684\u5177\u4f53\u5b9e\u73b0\uff1a

@register(\"blackbox_gsm8k_agent\")\nclass BlackBoxGSM8KAgentFlow(BlackBoxAgentFlowBase):\n\n    async def _run_agent(self, base_url: str, kwargs: dict) -> int:\n        from claw_r1.blackbox_agent.gsm8k_agent import GSM8KAgent\n\n        question = ...   # \u4ece kwargs[\"raw_prompt\"] \u63d0\u53d6\n        ground_truth = ...  # \u4ece kwargs[\"reward_model\"] \u63d0\u53d6\n        max_turns = self.config.actor_rollout_ref.rollout.get(\"max_turns\", 3)\n\n        agent = GSM8KAgent(base_url=base_url)\n        return await agent.solve(\n            question=question,\n            ground_truth=ground_truth,\n            max_turns=max_turns,\n        )\n
"},{"location":"components/blackbox-agent/#gsm8kagent","title":"GSM8KAgent","text":"

\u4e00\u4e2a\u8bad\u7ec3\u65e0\u5173\u7684 Agent\uff0c\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u89e3\u51b3 GSM8K \u6570\u5b66\u9898\uff1a

  • \u63a5\u6536 base_url\uff08\u6307\u5411 Gateway\uff09\u548c\u4efb\u52a1\u53c2\u6570
  • \u4f7f\u7528 tool calling\uff08check_answer \u5de5\u5177\uff09\u8fdb\u884c\u591a\u8f6e\u63a8\u7406
  • \u652f\u6301 Qwen \u98ce\u683c\u7684 tool call \u89e3\u6790\uff08\u273fFUNCTION\u273f \u683c\u5f0f\uff09
  • \u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570
agent = GSM8KAgent(base_url=\"http://gateway:8100/traj123/1\")\nnum_turns = await agent.solve(\n    question=\"What is 15 * 23?\",\n    ground_truth=\"345\",\n    max_turns=3,\n)\n
"},{"location":"components/blackbox-agent/#agent","title":"\u6dfb\u52a0\u65b0\u7684\u9ed1\u76d2 Agent","text":"
  1. \u5b9e\u73b0 Agent \u7c7b\uff08\u8bad\u7ec3\u65e0\u5173\uff09\uff1a
# claw_r1/blackbox_agent/my_agent.py\nclass MyAgent:\n    def __init__(self, base_url: str):\n        self.client = AsyncOpenAI(base_url=base_url, api_key=\"x\")\n\n    async def solve(self, task: str, **kwargs) -> int:\n        # \u4f7f\u7528 self.client \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd\n        # \u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570\n        ...\n
  2. \u5b9e\u73b0 Flow \u5b50\u7c7b\uff1a
# claw_r1/blackbox_agent/my_agent_flow.py\nfrom claw_r1.agent_flow.agent_flow import register\nfrom claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"blackbox_my_agent\")\nclass BlackBoxMyAgentFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url, kwargs):\n        from claw_r1.blackbox_agent.my_agent import MyAgent\n        task = kwargs.get(\"raw_prompt\", \"\")\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=task)\n
  3. \u6ce8\u518c\u5230\u914d\u7f6e\uff1a
# agent_flow_config.yaml\n- name: blackbox_my_agent\n  _target_: claw_r1.blackbox_agent.my_agent_flow.BlackBoxMyAgentFlow\n
  4. \u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u4f7f\u7528\uff1a
python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_my_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n
"},{"location":"components/blackbox-agent/#_6","title":"\u6587\u4ef6\u7ed3\u6784","text":"
claw_r1/blackbox_agent/\n\u251c\u2500\u2500 blackbox_agent_flow.py      # BlackBoxAgentFlowBase \u57fa\u7c7b\n\u251c\u2500\u2500 gsm8k_agent_flow.py         # GSM8K Flow \u5b50\u7c7b\n\u251c\u2500\u2500 gsm8k_agent.py              # GSM8K Agent\uff08\u8bad\u7ec3\u65e0\u5173\uff09\n\u2514\u2500\u2500 agent_flow_config.yaml      # Agent Flow \u6ce8\u518c\u914d\u7f6e\n
"},{"location":"components/datapool/","title":"DataPool","text":"

DataPool \u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u6838\u5fc3 \u2014 \u4e00\u4e2a Ray Actor\uff0c\u627f\u62c5\u7740 Agent \u4ea4\u4e92\u6570\u636e\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u3001\u8d28\u91cf\u8ffd\u8e2a\u3001\u5206\u533a\u7ba1\u7406\u548c\u6309\u9700\u4f9b\u7ed9\u3002\u5b83\u4e0d\u4ec5\u662f Agent \u4fa7\u4e0e Training \u4fa7\u4e4b\u95f4\u7684\u7f13\u51b2\u533a\uff0c\u66f4\u662f\u6574\u4e2a\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u7684\u4e2d\u67a2\u3002

"},{"location":"components/datapool/#_1","title":"\u5728\u67b6\u6784\u4e2d\u7684\u89d2\u8272","text":"
Gateway \u2500\u2500\u25ba DataPool.submit_steps()     (\u6570\u636e\u91c7\u96c6\uff1a\u5f02\u6b65\u5199\u5165)\nTrainer \u25c4\u2500\u2500 DataPool.fetch_batch()      (\u6570\u636e\u4f9b\u7ed9\uff1a\u963b\u585e\u62c9\u53d6\u5c31\u7eea\u7ec4)\n            DataPool.get_statistics()   (\u6570\u636e\u76d1\u63a7\uff1a\u5b9e\u65f6\u7edf\u8ba1)\n

DataPool \u5b8c\u5168\u89e3\u8026\u4e86\u6570\u636e\u91c7\u96c6\u901f\u5ea6\uff08\u7531 Agent \u8bf7\u6c42\u9891\u7387\u9a71\u52a8\uff09\u548c\u6570\u636e\u6d88\u8d39\u901f\u5ea6\uff08\u7531\u8bad\u7ec3\u541e\u5410\u91cf\u9a71\u52a8\uff09\u3002\u53cc\u65b9\u4e92\u4e0d\u7b49\u5f85\u3002

"},{"location":"components/datapool/#channel","title":"Channel \u7cfb\u7edf\uff08\u6570\u636e\u5206\u533a\uff09","text":"

DataPool \u901a\u8fc7 channel \u5bf9\u6570\u636e\u8fdb\u884c\u5206\u533a\u7ba1\u7406\u3002\u9ed8\u8ba4 channel \u4e3a \"train\"\uff0c\u9a8c\u8bc1\u6d41\u7a0b\u4f7f\u7528 \"val\" channel \u4ee5\u9694\u79bb\u6570\u636e\u3002

# \u8bad\u7ec3\u6570\u636e\ndata_pool.submit_step(step, channel=\"train\")\n\n# \u9a8c\u8bc1\u6570\u636e\ndata_pool.submit_step(step, channel=\"val\")\n

\u6bcf\u4e2a channel \u62e5\u6709\u72ec\u7acb\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u548c FIFO \u961f\u5217\u3002

"},{"location":"components/datapool/#_2","title":"\u6570\u636e\u6a21\u578b","text":"

DataPool \u4ee5 step \u7c92\u5ea6 \u5b58\u50a8 trajectory\u3002\u6bcf\u4e2a step \u662f\u4e00\u4e2a (s, a, r) \u5143\u7ec4\uff1a

@dataclass\nclass Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n    reward:         float       # \u8be5 step \u7684\u5373\u65f6 reward\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u4e2d\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\uff08\u7528\u4e8e GRPO\uff09\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\uff080-indexed\uff09\n    policy_version: int         # \u751f\u6210\u8be5 step \u65f6\u7684\u7b56\u7565\u7248\u672c\n    is_last:        bool        # \u662f\u5426\u4e3a trajectory \u7684\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6570\u636e\u96c6\u5b57\u6bb5\u3001\u6765\u6e90\u4fe1\u606f\u7b49\uff09\n
"},{"location":"components/datapool/#_3","title":"\u5185\u90e8\u7d22\u5f15","text":"\u7d22\u5f15 \u7c7b\u578b \u7528\u9014 trajectory_index dict[str, list[int]] trajectory_uid \u2192 step \u7d22\u5f15\u5217\u8868 trajectory_complete dict[str, bool] \u8ffd\u8e2a trajectory \u662f\u5426\u5df2\u6536\u5230 is_last step prompt_groups dict[str, PromptGroup] prompt_uid \u2192 trajectory \u5217\u8868\u548c\u5b8c\u6210\u72b6\u6001"},{"location":"components/datapool/#producer-api","title":"Producer API","text":""},{"location":"components/datapool/#submit_stepstep-step-channeltrain","title":"submit_step(step: Step, channel=\"train\")","text":"

\u6dfb\u52a0\u5355\u4e2a step \u5230\u6307\u5b9a channel\u3002\u7531 Gateway \u901a\u8fc7 Ray RPC \u8c03\u7528\u3002

"},{"location":"components/datapool/#submit_stepssteps-liststep-channeltrain","title":"submit_steps(steps: list[Step], channel=\"train\")","text":"

\u6279\u91cf\u63d0\u4ea4\u591a\u4e2a step\u3002\u6bd4\u5faa\u73af\u8c03\u7528 submit_step \u66f4\u9ad8\u6548\u3002

"},{"location":"components/datapool/#complete_trajectorytrajectory_uid-rewardnone-channeltrain","title":"complete_trajectory(trajectory_uid, reward=None, channel=\"train\")","text":"

\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002\u7528\u4e8e\u9ed1\u76d2\u6a21\u5f0f\uff0c\u7531 BlackBoxAgentFlow \u901a\u8fc7 Gateway \u7684 v1/complete_trajectory \u7aef\u70b9\u89e6\u53d1\u3002

"},{"location":"components/datapool/#consumer-api","title":"Consumer API","text":""},{"location":"components/datapool/#fetch_batchn_rollouts-channeltrain-liststep-none","title":"fetch_batch(n_rollouts, channel=\"train\") \u2192 list[Step] | None","text":"

FIFO \u62c9\u53d6\u4e0b\u4e00\u4e2a\u5c31\u7eea\u7684 prompt_uid \u7ec4\u3002\u4e00\u4e2a\u7ec4\u5728\u6240\u6709 trajectory \u90fd\u6536\u5230 is_last step \u540e\u53d8\u4e3a\"\u5c31\u7eea\"\u3002

\u5f53\u6ca1\u6709\u5b8c\u6574\u7ec4\u53ef\u7528\u65f6\u8fd4\u56de None\u3002

# Trainer \u4fa7\nwhile True:\n    batch = await data_pool.fetch_batch.remote(n_rollouts=5)\n    if batch is not None:\n        train_on_batch(batch)\n
"},{"location":"components/datapool/#_4","title":"\u5bb9\u91cf\u7ba1\u7406\u4e0e\u80cc\u538b\u63a7\u5236","text":"

\u5f53\u8bbe\u7f6e max_queue_size \u65f6\uff0cDataPool \u5728\u961f\u5217\u6ee1\u65f6\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u7684\u5c31\u7eea\u7ec4\uff0c\u9632\u6b62\u6570\u636e\u5806\u79ef\u5bfc\u81f4\u5185\u5b58\u65e0\u9650\u589e\u957f\u3002\u8fd9\u79cd\u80cc\u538b\u673a\u5236\u4e5f\u786e\u4fdd\u4e86\u8bad\u7ec3\u4fa7\u6d88\u8d39\u7684\u6570\u636e\u5c3d\u53ef\u80fd\u65b0\u9c9c\uff1a

async_training:\n  max_queue_size: null   # null = \u65e0\u9650\n
"},{"location":"components/datapool/#training-backend","title":"Training Backend\uff08\u6570\u636e\u4f9b\u7ed9\u9002\u914d\uff09","text":"

DataPool \u901a\u8fc7\u53ef\u63d2\u62d4\u7684 TrainingBackend \u5c06 list[Step] \u8f6c\u6362\u4e3a\u4efb\u610f\u8bad\u7ec3\u5f15\u64ce\u7684\u539f\u751f\u683c\u5f0f\uff0c\u5b9e\u73b0\u6570\u636e\u7ba1\u7406\u4e0e\u8bad\u7ec3\u6846\u67b6\u7684\u89e3\u8026\uff1a

class VerlBackend(TrainingBackend):\n    \"\"\"\u5c06 Step \u5217\u8868\u8f6c\u6362\u4e3a verl DataProto\u3002\"\"\"\n\n    def convert(self, steps: list[Step]) -> DataProto:\n        # prompt_ids: \u5de6\u586b\u5145\u5230 prompt_length\n        # response_ids: \u53f3\u586b\u5145\u5230 response_length\n        # input_ids: [prompt_ids | response_ids]\n        # attention_mask, position_ids, response_mask \u7b49\n        ...\n
"},{"location":"components/datapool/#off-policy","title":"Off-policy \u652f\u6301\uff08\u6570\u636e\u65b0\u9c9c\u5ea6\u7ba1\u63a7\uff09","text":"

\u6bcf\u4e2a Step \u90fd\u8bb0\u5f55\u4e86\u751f\u6210\u65f6\u7684 policy_version\uff0cDataPool \u548c Trainer \u53ef\u4ee5\u636e\u6b64\u5224\u65ad\u6570\u636e\u7684\u65b0\u9c9c\u5ea6\u3002Trainer \u901a\u8fc7 staleness threshold \u914d\u7f6e\u6765\u5904\u7406\u5386\u53f2\uff08off-policy\uff09\u6570\u636e\uff1a

async_training:\n  staleness_threshold: 0.1   # policy_version \u6ede\u540e > threshold \u7684 step \u4e3a off-policy\n

Off-policy step \u4ecd\u5305\u542b\u5728 batch \u4e2d\uff0c\u4f46\u5728 loss \u8ba1\u7b97\u65f6\u901a\u8fc7 importance sampling \u8fdb\u884c\u964d\u6743\u3002

"},{"location":"components/gateway/","title":"Gateway Server","text":"

Gateway Server \u662f\u4e00\u4e2a FastAPI HTTP \u670d\u52a1\uff0c\u4f5c\u4e3a Agent \u4e0e Claw-R1 \u8bad\u7ec3\u57fa\u7840\u8bbe\u65bd\u4e4b\u95f4\u7684\u7f51\u7edc\u5c42\u4ee3\u7406\u3002

"},{"location":"components/gateway/#_1","title":"\u8bbe\u8ba1\u539f\u5219","text":"
  • \u72ec\u7acb\u8fdb\u7a0b\uff1aGateway \u4f5c\u4e3a\u666e\u901a OS \u8fdb\u7a0b\u8fd0\u884c\uff08\u975e Ray Actor\uff09\uff0c\u53ef\u4ee5\u72ec\u7acb\u4e8e Ray \u96c6\u7fa4\u91cd\u542f\u3002
  • \u7eaf\u4ee3\u7406\uff1aGateway \u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8d1f\u8d23\u8f6c\u53d1\u8bf7\u6c42\u3001\u6536\u96c6 Step\u3001\u63d0\u4ea4\u5230 DataPool\u3002
  • OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2\u7aef\u70b9\u5b9e\u73b0\u4e0e OpenAI chat completions API \u76f8\u540c\u7684\u63a5\u53e3\uff0c\u53ef\u4f5c\u4e3a drop-in \u66ff\u6362\u3002
  • \u5ef6\u8fdf\u521d\u59cb\u5316\uff1a\u542f\u52a8\u65f6\u5148\u5feb\u901f\u521d\u59cb\u5316 Ray \u8fde\u63a5\u548c\u914d\u7f6e\uff0cHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff1btokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d\uff0c\u901a\u8fc7 /ready \u7aef\u70b9\u62a5\u544a\u5c31\u7eea\u72b6\u6001\u3002
"},{"location":"components/gateway/#_2","title":"\u542f\u52a8\u65b9\u5f0f","text":"

Gateway \u901a\u5e38\u7531 AsyncRollouter \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u81ea\u52a8\u542f\u52a8\u3002\u4e5f\u53ef\u624b\u52a8\u542f\u52a8\uff1a

python -m claw_r1.gateway.gateway \\\n    --data-pool-name  data_pool \\\n    --vllm-addresses  http://host1:8001,http://host2:8001 \\\n    --tokenizer-path  /path/to/model \\\n    --prompt-length   4096 \\\n    --response-length 1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address     auto \\\n    --ray-namespace   default \\\n    --host            0.0.0.0 \\\n    --port            8100\n
"},{"location":"components/gateway/#_3","title":"\u53c2\u6570","text":"\u53c2\u6570 \u5fc5\u586b \u8bf4\u660e --data-pool-name \u662f DataPool \u7684 Ray Actor \u540d\u79f0 --vllm-addresses \u662f \u9017\u53f7\u5206\u9694\u7684 vLLM \u670d\u52a1\u5668\u5730\u5740\u5217\u8868\uff08\u8f6e\u8be2\u8d1f\u8f7d\u5747\u8861\uff09 --tokenizer-path \u662f HuggingFace tokenizer \u8def\u5f84 --prompt-length \u662f \u6700\u5927 prompt token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 --response-length \u662f \u6700\u5927 response token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 --reward-worker-name \u5426 RewardLoopWorker \u7684 Ray Actor \u540d\u79f0 --ray-address \u5426 Ray GCS \u5730\u5740\uff08\u9ed8\u8ba4 auto\uff09 --ray-namespace \u5426 Ray namespace --host \u5426 \u76d1\u542c\u5730\u5740\uff08\u9ed8\u8ba4 0.0.0.0\uff09 --port \u5426 \u76d1\u542c\u7aef\u53e3\uff08\u9ed8\u8ba4 8100\uff09"},{"location":"components/gateway/#_4","title":"\u4e24\u79cd\u5de5\u4f5c\u6a21\u5f0f","text":""},{"location":"components/gateway/#white-box","title":"White-box \u6a21\u5f0f","text":"

\u767d\u76d2 Agent\uff08AgentFlowBase \u5b50\u7c7b\uff09\u901a\u8fc7 Gateway \u6839\u8def\u5f84\u7aef\u70b9\u4ea4\u4e92\uff1a

AgentFlow \u2192 POST /generate        \u2192 vLLM \u2192 \u8fd4\u56de token IDs\nAgentFlow \u2192 POST /submit_steps    \u2192 DataPool\nAgentFlow \u2192 POST /compute_reward  \u2192 RewardLoopWorker\n

Agent \u81ea\u5df1\u7ba1\u7406 tokenize\u3001Step \u6784\u5efa\u548c\u63d0\u4ea4\u3002

"},{"location":"components/gateway/#black-box","title":"Black-box \u6a21\u5f0f","text":"

\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u4e00\u4e2a base_url\uff0c\u901a\u8fc7\u6807\u51c6 OpenAI \u63a5\u53e3\u4ea4\u4e92\uff1a

1. BlackBoxAgentFlow \u2192 POST /init_trajectory           \u2192 \u83b7\u53d6 base_url\n2. BlackBoxAgentFlow \u2192 POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent             \u2192 POST {base_url}/v1/chat/completions     \u2192 \u6807\u51c6 OpenAI \u8c03\u7528\uff08\u53ef\u591a\u8f6e\uff09\n4. BlackBoxAgentFlow \u2192 POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n

Gateway \u5728 v1/chat/completions \u5185\u90e8\u81ea\u52a8\u5b8c\u6210 tokenize\u3001Step \u6784\u5efa\u548c DataPool \u63d0\u4ea4\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002

"},{"location":"components/gateway/#base_url","title":"base_url \u673a\u5236","text":"

base_url \u7684\u683c\u5f0f\u4e3a\uff1a

http://<host>:<port>/<trajectory_uid>/<prompt_uid>\n

trajectory_uid \u548c prompt_uid \u7f16\u7801\u5728 URL path \u4e2d\uff0c\u4f7f\u5f97 Gateway \u80fd\u5c06\u8bf7\u6c42\u5173\u8054\u5230\u6b63\u786e\u7684 trajectory\uff0c\u800c Agent \u7aef\u53ea\u9700\u4fee\u6539 base_url \u5373\u53ef\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002

from openai import OpenAI\n\nclient = OpenAI(\n    base_url=\"http://gateway:8100/abc123/1\",  # base_url \u7531 init_trajectory \u8fd4\u56de\n    api_key=\"not-needed\",\n)\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\n)\n
"},{"location":"components/gateway/#_5","title":"\u5185\u90e8\u72b6\u6001\u7ba1\u7406","text":"

Gateway \u4e3a\u6bcf\u6761 trajectory \u7ef4\u62a4\u4ee5\u4e0b\u72b6\u6001\uff1a

\u72b6\u6001 \u8bf4\u660e _trajectory_step_counter \u6bcf\u6761 trajectory \u7684\u4e0b\u4e00\u4e2a step_index _trajectory_channel trajectory \u5bf9\u5e94\u7684 DataPool channel\uff08\u9ed8\u8ba4 \"train\"\uff09 _trajectory_metadata trajectory \u5173\u8054\u7684 metadata\uff08\u5982 reward_model\u3001data_source \u7b49\uff09

\u8fd9\u4e9b\u72b6\u6001\u5728 register_trajectory \u65f6\u8bbe\u7f6e\uff0c\u5728 complete_trajectory \u65f6\u6e05\u7406\u3002

"},{"location":"components/gateway/#_6","title":"\u8d1f\u8f7d\u5747\u8861","text":"

\u5f53\u63d0\u4f9b\u591a\u4e2a --vllm-addresses \u65f6\uff0cGateway \u4f7f\u7528 round-robin \u8f6e\u8be2\u5206\u53d1\u8bf7\u6c42\uff1a

_vllm_cycle = itertools.cycle(vllm_addresses)\nvllm_url = next(_vllm_cycle)\n
"},{"location":"components/gateway/#api","title":"API \u53c2\u8003","text":"

\u5b8c\u6574\u7684\u7aef\u70b9\u6587\u6863\u89c1 Gateway API\u3002

"},{"location":"components/reward-system/","title":"Reward System","text":"

The RewardLoopWorker is a Ray Actor responsible for assigning reward scores to trajectory steps. It bridges the gap between raw agent interactions and trainable reward signals.

"},{"location":"components/reward-system/#three-reward-sources","title":"Three Reward Sources","text":"

Claw-R1 supports three types of reward computation, which can be combined:

Type Description Best For Rule-based Deterministic function of step output Verifiable tasks (math, code execution) Discriminative RM Binary classifier reward model Preference learning, safety evaluation Generative RM LLM-based evaluator via custom scoring function Complex quality assessment, nuanced feedback"},{"location":"components/reward-system/#reward-in-production-vs-research-settings","title":"Reward in Production vs. Research Settings","text":"

In research settings (white-box offline mode), rewards are computed from known ground truth:

Trajectory:   [user msg] \u2192 [agent think] \u2192 [tool call] \u2192 [tool result] \u2192 [final reply]\nReward:            0.0            0.3            0.7            0.9            0.8\n
  • Rule-based: is the final answer correct? does the code pass tests?
  • Model-based: is each step logically sound? is the tool use appropriate?

In production settings (online mode), rewards come from real user signals:

Signal Type Interpretation User sends follow-up Implicit positive Agent answer was relevant but incomplete User corrects the agent Negative feedback Factual or task error User says \"thanks\" Positive signal Task completed satisfactorily No follow-up after task Neutral / estimated Reward Model estimates step quality

Claw-R1 uses a Reward Model to convert these soft signals into scalar process rewards, filling the gap between verifiable task rewards and open-ended conversational rewards.

"},{"location":"components/reward-system/#rewardloopworker-api","title":"RewardLoopWorker API","text":""},{"location":"components/reward-system/#compute_score_batchsteps-liststep-listfloat","title":"compute_score_batch(steps: list[Step]) \u2192 list[float]","text":"

Computes rewards for a batch of steps. This is the primary interface used by the Trainer.

# In AsyncTrainer\nrewards = await reward_worker.compute_score_batch.remote(batch_steps)\nfor step, reward in zip(batch_steps, rewards):\n    step.reward = reward\n
"},{"location":"components/reward-system/#custom-reward-function","title":"Custom Reward Function","text":"

Register a custom generative reward model by implementing the reward_loop_manager interface:

# custom_reward.py\ndef compute_reward(step: dict, model, tokenizer) -> float:\n    \"\"\"\n    Args:\n        step: dict with keys 'messages', 'response', 'metadata'\n        model: loaded reward model\n        tokenizer: model tokenizer\n    Returns:\n        scalar reward in [0.0, 1.0]\n    \"\"\"\n    prompt = build_evaluation_prompt(step)\n    score = model.score(prompt)\n    return score\n

Then register it in the configuration:

reward:\n  type: genrm\n  reward_loop_manager: path.to.custom_reward.compute_reward\n  model_path: /path/to/reward/model\n
"},{"location":"components/reward-system/#reward-in-the-training-loop","title":"Reward in the Training Loop","text":"

Reward computation is decoupled from the agent service:

  1. The Gateway does not compute rewards before submitting steps to DataPool
  2. DataPool stores steps with reward=0.0 initially
  3. The Trainer calls RewardLoopWorker.compute_score_batch() before the PPO update
  4. Updated rewards are used for advantage computation

This ensures that even slow generative reward models (which may call an external LLM) do not affect agent service latency.

Reward Design

For new tasks, start with simple rule-based rewards (e.g., exact match, code execution pass rate). Generative reward models are more expressive but introduce variance and computational cost. Use discriminative models as a middle ground.

"},{"location":"concepts/","title":"Core Concepts","text":"

Claw-R1 \u7684\u8bbe\u8ba1\u56f4\u7ed5\u4e09\u4e2a\u6838\u5fc3\u6982\u5ff5\u5c55\u5f00\uff1a\u901a\u7528\u6570\u636e\u91c7\u96c6\u3001\u6570\u636e\u4e2d\u95f4\u4ef6\u7ba1\u7406\u548c\u6570\u636e\u9a71\u52a8\u7684\u6301\u7eed\u8fdb\u5316\u3002\u5b83\u4eec\u5171\u540c\u6784\u6210\u4e00\u4e2a\u4ece\u91c7\u96c6\u5230\u8bad\u7ec3\u7684\u6570\u636e\u98de\u8f6e\u3002

  • Base URL Integration \u00b7 \u901a\u7528\u6570\u636e\u91c7\u96c6

    \u96f6\u4ee3\u7801\u4fb5\u5165\u7684 Agent \u6570\u636e\u91c7\u96c6\u673a\u5236\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u53ea\u9700\u4fee\u6539 base_url\uff0cGateway \u5373\u53ef\u81ea\u52a8\u91c7\u96c6\u5176\u4ea4\u4e92\u6570\u636e\u3002

    Base URL Integration

  • Middleware Layer \u00b7 \u6570\u636e\u4e2d\u95f4\u4ef6

    Gateway + DataPool \u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u5165\u53e3\u3001\u8d28\u91cf\u7ba1\u7406\u3001\u5206\u533a\u7f13\u51b2\u548c\u6309\u9700\u4f9b\u7ed9\u3002

    Middleware Layer

  • Production Scenario \u00b7 \u6570\u636e\u9a71\u52a8\u8fdb\u5316

    \"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent \u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316\u3002

    Production Scenario

"},{"location":"concepts/#_1","title":"\u6570\u636e\u98de\u8f6e","text":"
                    base_url\n                 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                 \u2502 \u4efb\u610f Agent  \u2502\n                 \u2502 (\u767d\u76d2/\u9ed1\u76d2) \u2502\n                 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                        \u2502 OpenAI API\n                        \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Gateway       \u2502 \u2190 \u6570\u636e\u91c7\u96c6\u5165\u53e3\n              \u2502  (\u81ea\u52a8\u91c7\u96c6 Step)  \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    DataPool      \u2502 \u2190 \u6570\u636e\u7ba1\u7406\u6838\u5fc3\n              \u2502  (\u8bc4\u4f30\u00b7\u7b5b\u9009\u00b7\u4f9b\u7ed9) \u2502    (\u8d28\u91cf\u8bc4\u4f30 + \u5206\u533a\u7ba1\u7406)\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Trainer       \u2502 \u2190 \u6570\u636e\u6d88\u8d39\n              \u2502  (\u6301\u7eed\u8bad\u7ec3)       \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502 \u6743\u91cd\u540c\u6b65\n                       \u25bc\n              
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    vLLM          \u2502\n              \u2502  (\u66f4\u597d\u7684\u6a21\u578b)     \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n

\u4e09\u4e2a\u6982\u5ff5\u7684\u534f\u540c\uff1a

  1. Base URL \u8ba9\u4efb\u4f55 Agent \u7684\u4ea4\u4e92\u6570\u636e\u96f6\u6210\u672c\u88ab\u91c7\u96c6
  2. Middleware \u7ba1\u7406\u6570\u636e\u7684\u8d28\u91cf\u3001\u5206\u533a\u548c\u4f9b\u7ed9
  3. Production Scenario \u8ba9\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u81ea\u7136\u878d\u5165\u6570\u636e\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316
"},{"location":"concepts/base-url-integration/","title":"Base URL Integration","text":""},{"location":"concepts/base-url-integration/#agent-llm","title":"\u95ee\u9898\uff1a\u5982\u4f55\u62e6\u622a\u9ed1\u76d2 Agent \u7684 LLM \u8c03\u7528\uff1f","text":"

\u5728 Agentic RL \u4e2d\uff0c\u8bad\u7ec3\u7cfb\u7edf\u9700\u8981\u62e6\u622a Agent \u4e0e LLM \u4e4b\u95f4\u7684\u6bcf\u6b21\u4ea4\u4e92\uff0c\u4ee5\u6536\u96c6 (state, action, reward) \u6570\u636e\u3002\u5bf9\u4e8e\u767d\u76d2 Agent\uff08\u6e90\u7801\u53ef\u63a7\uff09\uff0c\u8fd9\u5f88\u7b80\u5355\u3002\u4f46\u5bf9\u4e8e\u9ed1\u76d2 Agent\uff08\u5982\u7b2c\u4e09\u65b9\u670d\u52a1\u3001\u7f16\u8bd1\u540e\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\uff09\uff0c\u5982\u4f55\u5728\u4e0d\u4fee\u6539 Agent \u4ee3\u7801\u7684\u60c5\u51b5\u4e0b\u62e6\u622a\uff1f

"},{"location":"concepts/base-url-integration/#_1","title":"\u65b9\u6848\u5bf9\u6bd4","text":"\u65b9\u6848 \u4fb5\u5165\u6027 \u53ef\u9760\u6027 \u9002\u7528\u8303\u56f4 SDK monkey-patch \u4e2d \u4f4e\uff08\u7248\u672c\u66f4\u65b0\u6613\u5931\u6548\uff09 \u4ec5\u9650\u7279\u5b9a SDK \u4ee3\u7406\u5c42\uff08Proxy\uff09 \u9ad8 \u4e2d\uff08\u9700\u914d\u7f6e\u7f51\u7edc\uff09 \u901a\u7528 base_url \u66ff\u6362 \u6781\u4f4e \u9ad8 \u6240\u6709 OpenAI \u517c\u5bb9 SDK"},{"location":"concepts/base-url-integration/#base_url","title":"base_url \u673a\u5236","text":"

\u51e0\u4e4e\u6240\u6709 OpenAI \u517c\u5bb9\u7684 SDK \u90fd\u652f\u6301\u81ea\u5b9a\u4e49 base_url\u3002Claw-R1 \u5229\u7528\u8fd9\u4e00\u70b9\uff1a

  1. Gateway \u66b4\u9732 POST {base_url}/v1/chat/completions \u7aef\u70b9
  2. Agent \u53ea\u9700\u5c06 base_url \u4ece https://api.openai.com/v1 \u6539\u4e3a Gateway \u7684\u5730\u5740
  3. Gateway \u900f\u660e\u5730\u8f6c\u53d1\u8bf7\u6c42\u5230 vLLM\uff0c\u540c\u65f6\u81ea\u52a8\u6536\u96c6\u8bad\u7ec3\u6570\u636e
from openai import OpenAI\n\n# \u539f\u59cb\u4ee3\u7801\nclient = OpenAI(base_url=\"https://api.openai.com/v1\")\n\n# \u63a5\u5165 Claw-R1\uff1a\u53ea\u6539\u4e00\u884c\nclient = OpenAI(\n    base_url=\"http://gateway:8100/traj123/prompt1/v1\",\n    api_key=\"not-needed\",\n)\n\n# \u540e\u7eed\u4ee3\u7801\u5b8c\u5168\u4e0d\u53d8\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\n)\n
"},{"location":"concepts/base-url-integration/#base_url_1","title":"base_url \u7684\u7ed3\u6784","text":"
http://<host>:<port>/<trajectory_uid>/<prompt_uid>\n
  • trajectory_uid\uff1a\u6807\u8bc6\u4e00\u6761\u5b8c\u6574\u7684\u5bf9\u8bdd\u8f68\u8ff9
  • prompt_uid\uff1a\u6807\u8bc6\u540c\u4e00 prompt \u7684\u591a\u6b21 rollout\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09

\u8fd9\u4e24\u4e2a ID \u7f16\u7801\u5728 URL path \u4e2d\uff0cGateway \u4ece path \u4e2d\u63d0\u53d6\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002

"},{"location":"concepts/base-url-integration/#claw-r1","title":"\u5728 Claw-R1 \u4e2d\u7684\u4f7f\u7528","text":""},{"location":"concepts/base-url-integration/#_2","title":"\u9ed1\u76d2\u79bb\u7ebf\u6a21\u5f0f","text":"

BlackBoxAgentFlowBase \u81ea\u52a8\u7ba1\u7406 base_url \u7684\u751f\u547d\u5468\u671f\uff1a

1. POST /init_trajectory              \u2192 \u83b7\u53d6 base_url\n2. POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent \u4f7f\u7528 base_url \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd     \u2192 Gateway \u81ea\u52a8\u6536\u96c6 Step\n4. POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n

Agent \u53ea\u9700\u8981\u63a5\u6536 base_url \u53c2\u6570\uff0c\u5176\u4f59\u7531\u8bad\u7ec3\u6846\u67b6\u5904\u7406\u3002

"},{"location":"concepts/base-url-integration/#_3","title":"\u9ed1\u76d2\u5728\u7ebf\u6a21\u5f0f","text":"

\u5728\u7ebf\u6a21\u5f0f\u4e0b\uff0c\u5916\u90e8\u670d\u52a1\u76f4\u63a5\u8c03\u7528 Gateway \u7684 init_trajectory \u83b7\u53d6 base_url\uff0c\u7136\u540e\u5c06\u5176\u4f20\u9012\u7ed9 Agent\u3002Agent \u7684\u6bcf\u6b21 LLM \u8c03\u7528\u90fd\u81ea\u52a8\u88ab Gateway \u8bb0\u5f55\u3002

"},{"location":"concepts/base-url-integration/#sdk-hook","title":"\u4e3a\u4ec0\u4e48\u4f18\u4e8e SDK Hook","text":"\u7ef4\u5ea6 SDK Hook base_url Agent \u4ee3\u7801\u4fee\u6539 \u9700\u8981\u6ce8\u5165 hook \u4ee3\u7801 \u53ea\u6539\u4e00\u4e2a\u53c2\u6570 \u591a\u8bed\u8a00\u652f\u6301 \u6bcf\u79cd\u8bed\u8a00\u9700\u8981\u5355\u72ec\u5b9e\u73b0 \u6240\u6709\u8bed\u8a00\u901a\u7528 \u7248\u672c\u517c\u5bb9\u6027 SDK \u66f4\u65b0\u53ef\u80fd\u7834\u574f hook HTTP \u534f\u8bae\u7a33\u5b9a \u8c03\u8bd5\u96be\u5ea6 Hook \u5c42\u589e\u52a0\u8c03\u8bd5\u590d\u6742\u5ea6 \u6807\u51c6 HTTP \u8bf7\u6c42\uff0c\u6613\u4e8e\u8c03\u8bd5 \u751f\u4ea7\u53ef\u9760\u6027 \u4e2d\u7b49 \u9ad8"},{"location":"concepts/base-url-integration/#sdk","title":"\u652f\u6301\u7684 SDK \u548c\u6846\u67b6","text":"

\u4efb\u4f55\u652f\u6301\u81ea\u5b9a\u4e49 base_url \u7684 OpenAI \u517c\u5bb9 SDK \u90fd\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\uff1a

  • Python: openai, httpx, requests
  • JavaScript/TypeScript: openai-node
  • Go: go-openai
  • \u6846\u67b6: LangChain, LlamaIndex, AutoGen, CrewAI \u7b49
"},{"location":"concepts/middleware-layer/","title":"Middleware Layer","text":""},{"location":"concepts/middleware-layer/#_1","title":"\u4e3a\u4ec0\u4e48\u9700\u8981\u6570\u636e\u4e2d\u95f4\u4ef6\uff1f","text":"

Agentic RL \u4e2d\uff0cAgent \u4ea7\u751f\u4ea4\u4e92\u6570\u636e\uff0cTrainer \u6d88\u8d39\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\u3002\u7136\u800c\u5728\u5b9e\u9645\u573a\u666f\u4e2d\uff0c\u4e24\u8005\u4e4b\u95f4\u5b58\u5728\u663e\u8457\u7684\u4e0d\u5bf9\u79f0\uff1a

  • \u6570\u636e\u6765\u6e90\u591a\u6837\uff1a\u767d\u76d2 Agent\u3001\u9ed1\u76d2 Agent\u3001\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u4ea7\u51fa\u7684\u6570\u636e\u683c\u5f0f\u548c\u9891\u7387\u5404\u4e0d\u76f8\u540c
  • \u6570\u636e\u8d28\u91cf\u53c2\u5dee\uff1a\u5e76\u975e\u6240\u6709\u4ea4\u4e92\u90fd\u6709\u8bad\u7ec3\u4ef7\u503c\uff0c\u9700\u8981\u8bc4\u4f30\u548c\u7b5b\u9009
  • \u4ea7\u6d88\u901f\u7387\u4e0d\u5339\u914d\uff1aAgent \u4fa7\u7684\u6570\u636e\u4ea7\u751f\u901f\u7387\u4e0e Trainer \u4fa7\u7684\u6d88\u8d39\u901f\u7387\u5f80\u5f80\u4e0d\u540c\u6b65
  • \u6570\u636e\u9700\u8981\u7ba1\u7406\uff1a\u5206\u533a\u3001\u7d22\u5f15\u3001\u80cc\u538b\u63a7\u5236\u3001\u7edf\u8ba1\u76d1\u63a7 \u2014 \u8fd9\u4e9b\u4e0d\u662f\u7b80\u5355\u7684\u961f\u5217\u80fd\u89e3\u51b3\u7684

Claw-R1 \u901a\u8fc7 Middleware Layer\uff08Gateway + DataPool\uff09\u5728 Agent \u4fa7\u548c Training \u4fa7\u4e4b\u95f4\u5efa\u7acb\u4e00\u5c42\u6570\u636e\u57fa\u7840\u8bbe\u65bd\uff0c\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u3001\u7ba1\u7406\u548c\u4f9b\u7ed9\u95ee\u9898\u3002

"},{"location":"concepts/middleware-layer/#gateway-datapool","title":"Gateway + DataPool \u67b6\u6784","text":"
Agent \u4fa7                    Middleware                    Training \u4fa7\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502 Agent    \u2502\u2500\u2500HTTP\u2500\u2500\u25ba  \u2502  Gateway         \u2502\u2500\u2500Ray RPC\u2500\u2500\u25ba\u2502  DataPool    \u2502\n\u2502 (\u4efb\u610f)   \u2502\u25c4\u2500\u2500HTTP\u2500\u2500  \u2502  (FastAPI, 8100) \u2502           \u2502  (Ray Actor) \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                                             \u2502 fetch_batch()\n                                                             \u25bc\n                                                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                      \u2502  Trainer     \u2502\n                                                      \u2502  (Ray Actor) \u2502\n                                                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"concepts/middleware-layer/#gateway","title":"Gateway\uff1a\u6570\u636e\u91c7\u96c6\u5165\u53e3","text":"

Gateway \u662f\u4e00\u4e2a\u72ec\u7acb\u8fdb\u7a0b\uff08FastAPI\uff09\uff0c\u8d1f\u8d23\u4ece Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff1a

  • \u7eaf\u4ee3\u7406\uff1a\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u91c7\u96c6\u6570\u636e
  • OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2 Agent \u901a\u8fc7 base_url \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u4ece\u5bf9\u8bdd\u4e2d\u6784\u5efa Step
  • \u5ef6\u8fdf\u521d\u59cb\u5316\uff1aHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff0ctokenizer \u5728\u540e\u53f0\u52a0\u8f7d

Gateway \u652f\u6301\u4e24\u79cd\u6570\u636e\u91c7\u96c6\u6a21\u5f0f\uff1a

\u6a21\u5f0f \u7aef\u70b9 \u6570\u636e\u91c7\u96c6\u65b9\u5f0f \u767d\u76d2 /generate, /submit_steps Agent \u81ea\u884c\u6784\u5efa Step \u5e76\u63d0\u4ea4 \u9ed1\u76d2 {base_url}/v1/chat/completions Gateway \u81ea\u52a8 tokenize \u5e76\u6784\u5efa Step

\u8be6\u89c1 Gateway Server\u3002

"},{"location":"concepts/middleware-layer/#datapool","title":"DataPool\uff1a\u6570\u636e\u7ba1\u7406\u6838\u5fc3","text":"

DataPool \u662f\u4e00\u4e2a Ray Actor\uff0c\u4e0d\u4ec5\u662f trajectory \u7f13\u51b2\u533a\uff0c\u66f4\u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2\uff1a

\u80fd\u529b \u8bf4\u660e \u6570\u636e\u5b58\u50a8 \u4ee5 Step \u7c92\u5ea6\u5b58\u50a8\u4ea4\u4e92\u6570\u636e\uff0c\u652f\u6301\u591a\u7ef4\u7d22\u5f15 \u8d28\u91cf\u8ffd\u8e2a \u6bcf\u4e2a Step \u8bb0\u5f55 policy_version\uff0c\u652f\u6301\u65b0\u9c9c\u5ea6\u68c0\u6d4b Channel \u5206\u533a \"train\" \u548c \"val\" \u6570\u636e\u9694\u79bb\uff0c\u4e92\u4e0d\u5e72\u6270 GRPO \u5206\u7ec4 \u6309 prompt_uid \u5206\u7ec4\uff0c\u51d1\u9f50\u6240\u6709 rollout \u540e\u624d\u4f9b\u7ed9\u8bad\u7ec3 \u5bb9\u91cf\u7ba1\u7406 \u53ef\u914d\u7f6e max_queue_size\uff0c\u8d85\u9650\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u6570\u636e \u7edf\u8ba1\u76d1\u63a7 \u5b9e\u65f6\u63d0\u4f9b\u961f\u5217\u6df1\u5ea6\u3001produce/consume/drop \u901f\u7387\u7b49\u6307\u6807

\u8be6\u89c1 DataPool\u3002

"},{"location":"concepts/middleware-layer/#step","title":"Step \u6570\u636e\u6a21\u578b","text":"

Step \u662f\u6570\u636e\u7ba1\u7406\u7684\u539f\u5b50\u5355\u4f4d\uff0c\u8bb0\u5f55\u4e86\u4e00\u6b21 Agent \u4ea4\u4e92\u7684\u5b8c\u6574\u4fe1\u606f\uff1a

@dataclass\nclass Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n    reward:         float       # \u5373\u65f6 reward\uff08\u8d28\u91cf\u8bc4\u5206\uff09\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\n    policy_version: int         # \u751f\u6210\u65f6\u7684\u7b56\u7565\u7248\u672c\uff08\u65b0\u9c9c\u5ea6\u8ffd\u8e2a\uff09\n    is_last:        bool        # \u662f\u5426\u4e3a\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6765\u6e90\u3001\u6570\u636e\u96c6\u5b57\u6bb5\u7b49\uff09\n
"},{"location":"concepts/middleware-layer/#reward","title":"Reward \u6807\u6ce8\u4e0e\u6570\u636e\u8d28\u91cf\u8bc4\u4f30","text":"

Reward \u8ba1\u7b97\u4e0e Agent \u670d\u52a1\u89e3\u8026\uff0c\u786e\u4fdd\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\u4e0d\u5f71\u54cd Agent \u670d\u52a1\u5ef6\u8fdf\uff1a

  1. Gateway \u91c7\u96c6 Step \u65f6 reward=0.0\uff08\u539f\u59cb\u6570\u636e\uff09
  2. DataPool \u5b58\u50a8\u539f\u59cb Step
  3. Trainer \u5728\u6d88\u8d39\u6570\u636e\u524d\u901a\u8fc7 RewardLoopWorker \u8bc4\u4f30\u6570\u636e\u8d28\u91cf\uff08\u8ba1\u7b97 reward\uff09
  4. \u8bc4\u4f30\u540e\u7684 reward \u7528\u4e8e advantage \u8ba1\u7b97\u548c\u6570\u636e\u7b5b\u9009

\u8fd9\u79cd\u8bbe\u8ba1\u4f7f\u5f97\u5373\u4f7f\u662f\u6162\u901f\u7684 generative reward model \u6216\u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf\u4e5f\u4e0d\u4f1a\u5f71\u54cd Agent \u7684\u6b63\u5e38\u670d\u52a1\u3002

"},{"location":"concepts/production-scenario/","title":"Production Agent Scenario","text":""},{"location":"concepts/production-scenario/#agentic-rl","title":"Agentic RL \u4e2d\u7684\u9690\u542b\u5047\u8bbe","text":"

\u51e0\u4e4e\u6240\u6709 Agentic RL \u6846\u67b6\u90fd\u5efa\u7acb\u5728\u4e00\u4e2a\u9690\u542b\u5047\u8bbe\u4e0a\uff1a

\u8bad\u7ec3\u9636\u6bb5 \u2260 \u90e8\u7f72\u9636\u6bb5

\u6807\u51c6\u6d41\u7a0b\uff1a\u5728\u79bb\u7ebf/\u6a21\u62df\u6570\u636e\u4e0a\u8bad\u7ec3 \u2192 \u90e8\u7f72\u56fa\u5b9a\u6a21\u578b \u2192 \u5b9a\u671f\u91cd\u8bad\u3002

\u8fd9\u5728\u7814\u7a76\u573a\u666f\u4e0b\u53ef\u884c\uff0c\u4f46\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u9047\u5230\u6839\u672c\u6027\u969c\u788d\uff1a

\u95ee\u9898 \u8868\u73b0 \u5206\u5e03\u504f\u79fb \u8bad\u7ec3\u6570\u636e\u662f\u5408\u6210\u7684\uff1b\u771f\u5b9e\u7528\u6237\u8bf7\u6c42\u5206\u5e03\u4e0d\u540c \u2192 \u90e8\u7f72\u540e\u80fd\u529b\u9000\u5316 \u51b7\u542f\u52a8 \u65b0\u90e8\u7f72\u7684\u6a21\u578b\u5bf9\u7279\u5b9a\u7528\u6237\u7684\u4e60\u60ef\u3001\u5de5\u5177\u3001\u5de5\u4f5c\u6d41\u4e00\u65e0\u6240\u77e5 \u2192 \u6f2b\u957f\u7684\"\u9884\u70ed\"\u671f \u957f\u5c3e\u4efb\u52a1 Benchmark \u8986\u76d6\u5e38\u89c1\u4efb\u52a1\uff1b\u7528\u6237\u7684\u5c0f\u4f17\u9700\u6c42\u65e0\u6cd5\u88ab\u79bb\u7ebf\u8bad\u7ec3\u8986\u76d6 \u73af\u5883\u6f02\u79fb \u5de5\u5177 API \u66f4\u65b0\u3001\u7528\u6237\u884c\u4e3a\u53d8\u5316 \u2192 \u9759\u6001\u6a21\u578b\u65e0\u6cd5\u81ea\u9002\u5e94"},{"location":"concepts/production-scenario/#claw-r1-agent","title":"Claw-R1 \u7684\u6838\u5fc3\u573a\u666f\uff1a\u4e2a\u4eba Agent \u81ea\u6211\u8fdb\u5316","text":"

Claw-R1 \u7684\u9996\u4e2a\u9a8c\u8bc1\u573a\u666f\u662f OpenClaw \u4e2a\u4eba\u52a9\u624b\uff1a

\u8bbe\u7f6e\uff1a\n  \u7528\u6237\u5728 Mac Mini \u4e0a\u90e8\u7f72 OpenClaw\uff0c\u8fde\u63a5 Slack / \u5fae\u4fe1 / \u90ae\u4ef6\u3002\n  \u6bcf\u5929\u901a\u8fc7\u6d88\u606f\u4e0e OpenClaw \u4ea4\u4e92\uff1a\u65e5\u7a0b\u5b89\u6392\u3001\u4fe1\u606f\u68c0\u7d22\u3001\u4ee3\u7801\u8f85\u52a9\u7b49\u3002\n\n\u4f20\u7edf\u65b9\u6848\uff1a\n  OpenClaw \u4f7f\u7528\u56fa\u5b9a\u7684 GPT-4o / Claude 3.5\u3002\n  \u80fd\u529b\u4e0d\u4f1a\u968f\u4f7f\u7528\u800c\u589e\u957f\u3002\n\nClaw-R1 \u65b9\u6848\uff1a\n  1. \u7528\u6237\u6d88\u606f \u2192 OpenClaw \u2192 Gateway\uff08\u62e6\u622a LLM \u8c03\u7528\uff09\n  2. Gateway \u8bb0\u5f55\u6bcf\u6b21\u4ea4\u4e92 \u2192 DataPool\uff08\u672c\u5730\uff09\n  3. Reward Model \u5bf9\u6bcf\u6b21\u4ea4\u4e92\u8bc4\u5206\n  4. \u8fdc\u7a0b\u670d\u52a1\u5668\u4e0a\u7684\u8bad\u7ec3\u5f15\u64ce\u6301\u7eed\u6d88\u8d39 DataPool\uff0c\u66f4\u65b0\u6a21\u578b\u6743\u91cd\n  5. \u66f4\u65b0\u7684\u6743\u91cd\u63a8\u9001\u56de Gateway\uff1b\u4e0b\u6b21\u8c03\u7528\u4f7f\u7528\u6539\u8fdb\u540e\u7684\u6a21\u578b\n\n\u7ed3\u679c\uff1a\n  \u7528\u6237 Mac Mini \u4e0a\u7684 OpenClaw \u4f1a\u968f\u65f6\u95f4\u63a8\u79fb\u8d8a\u6765\u8d8a\u4e86\u89e3\u8be5\u7528\u6237\u3002\n
"},{"location":"concepts/production-scenario/#rl","title":"\u4f20\u7edf RL \u6846\u67b6\u65e0\u6cd5\u6ee1\u8db3\u7684\u4e09\u4e2a\u9700\u6c42","text":""},{"location":"concepts/production-scenario/#1","title":"\u2460 \u670d\u52a1\u8fde\u7eed\u6027","text":"

\u6a21\u578b\u6743\u91cd\u66f4\u65b0\u4e0d\u80fd\u4e2d\u65ad Gateway \u7684\u8bf7\u6c42\u5904\u7406\u3002\u5728 Claw-R1 \u4e2d\uff1a

  • Trainer \u76f4\u63a5\u7ba1\u7406 Rollout Engine \u548c Reward Model \u7684\u751f\u547d\u5468\u671f\uff08wake_up / sleep / \u6743\u91cd\u540c\u6b65\uff09
  • Gateway \u662f\u7eaf HTTP \u4ee3\u7406 \u2014 \u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u63d0\u4ea4 step\uff1b\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f
  • \u8fd9\u4fdd\u8bc1\u4e86\u5373\u4f7f\u5728\u6743\u91cd\u66f4\u65b0\u671f\u95f4\uff0c\u8bf7\u6c42\u8f6c\u53d1\u548c\u6570\u636e\u6536\u96c6\u4e5f\u80fd\u6301\u7eed\u8fdb\u884c
"},{"location":"concepts/production-scenario/#2","title":"\u2461 \u65e0\u9884\u8bbe\u6570\u636e","text":"

\u4f20\u7edf\u6846\u67b6\u9700\u8981\u9884\u5148\u6536\u96c6\u7684\u6570\u636e\u96c6\u3002Claw-R1 \u7684\u8bad\u7ec3\u6570\u636e\u5b8c\u5168\u6765\u81ea\u5b9e\u65f6\u7528\u6237\u4ea4\u4e92\uff1a

  • \u7528\u6237\u95ee\u4e86\u4ec0\u4e48\u3001Agent \u5982\u4f55\u56de\u7b54\u3001\u8c03\u7528\u4e86\u54ea\u4e9b\u5de5\u5177 \u2014 \u8fd9\u4e9b\u81ea\u52a8\u6210\u4e3a\u8bad\u7ec3\u6570\u636e
  • \u96f6\u6570\u636e\u5de5\u7a0b\uff1b\u6570\u636e\u968f\u670d\u52a1\u8fd0\u884c\u81ea\u7136\u79ef\u7d2f
"},{"location":"concepts/production-scenario/#3-reward","title":"\u2462 \u771f\u5b9e\u73af\u5883\u7684 Reward \u4fe1\u53f7","text":"

\u4f20\u7edf RLVR \u7684 reward \u6765\u81ea\u53ef\u9a8c\u8bc1\u7684\u4efb\u52a1\u7ed3\u679c\u3002\u751f\u4ea7\u73af\u5883\u7684 reward \u66f4\u52a0\u5fae\u5999\uff1a

  • \u7528\u6237\u7ee7\u7eed\u8ffd\u95ee \u2192 \u9690\u5f0f\u6b63\u4fe1\u53f7
  • \u7528\u6237\u7ea0\u6b63 Agent \u2192 \u8d1f\u53cd\u9988
  • \u4efb\u52a1\u5b8c\u6210\u540e\u65e0\u540e\u7eed \u2192 Reward Model \u4f30\u8ba1\u4e2d\u95f4\u6b65\u9aa4\u8d28\u91cf

Claw-R1 \u4f7f\u7528 Reward Model \u5c06\u8fd9\u4e9b\u8f6f\u4fe1\u53f7\u8f6c\u6362\u4e3a\u53ef\u8bad\u7ec3\u7684 process reward\u3002

"},{"location":"concepts/production-scenario/#_1","title":"\u4e09\u79cd\u8fd0\u884c\u6a21\u5f0f","text":"\u6a21\u5f0f Agent \u7c7b\u578b \u6570\u636e\u6765\u6e90 \u8bf4\u660e \u767d\u76d2\u79bb\u7ebf AgentFlow (Python) \u5408\u6210\u6570\u636e\u96c6\u6216\u9884\u6536\u96c6\u7684 trajectory \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u63a8\u8350\u7528\u4e8e\u7814\u7a76 \u9ed1\u76d2\u79bb\u7ebf \u4efb\u4f55 HTTP Agent \u9884\u6536\u96c6\u7684\u6570\u636e\u96c6 \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u901a\u8fc7 base_url \u63a5\u5165 \u9ed1\u76d2\u5728\u7ebf \u4efb\u4f55 HTTP Agent \u5b9e\u65f6\u7528\u6237\u4ea4\u4e92 \u76ee\u6807\u751f\u4ea7\u6a21\u5f0f\uff1bGateway \u7aef\u70b9\u5df2\u5b9e\u73b0"},{"location":"concepts/production-scenario/#_2","title":"\u90e8\u7f72 = \u8bad\u7ec3","text":"

Claw-R1 \u5f15\u5165\u4e86\u4e00\u79cd\u65b0\u8303\u5f0f\uff1a

\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         \u4f20\u7edf\uff1a\u8bad\u7ec3 \u2192 \u90e8\u7f72\uff08\u56fa\u5b9a\uff09                      \u2502\n\u2502                                                      \u2502\n\u2502  [\u5408\u6210\u6570\u636e] \u2192 [\u8bad\u7ec3] \u2192 [\u56fa\u5b9a\u6a21\u578b] \u2192 \u7528\u6237               \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         Claw-R1\uff1a\u90e8\u7f72 = \u8bad\u7ec3\uff08\u6301\u7eed\uff09                   \u2502\n\u2502                                                      \u2502\n\u2502  \u7528\u6237 \u2500\u2500\u25ba Agent \u2500\u2500\u25ba [\u5b9e\u65f6\u6570\u636e] \u2500\u2500\u25ba \u8bad\u7ec3 \u2500\u2500\u25ba Agent     \u2502\n\u2502           \u25b2___________________________________|      
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n

\u5728\u8fd9\u79cd\u8303\u5f0f\u4e0b\uff1a

  • \u6bcf\u6b21\u7528\u6237\u4ea4\u4e92\u90fd\u662f\u4e00\u4e2a\u8bad\u7ec3\u6837\u672c
  • \u6bcf\u6b21\u6a21\u578b\u66f4\u65b0\u90fd\u6539\u5584 Agent \u7684\u771f\u5b9e\u4e16\u754c\u8868\u73b0
  • Agent \u8fd0\u884c\u65f6\u95f4\u8d8a\u957f\uff0c\u5bf9\u5176\u7279\u5b9a\u7528\u6237\u548c\u73af\u5883\u7684\u8868\u73b0\u8d8a\u597d
"},{"location":"configuration/","title":"Configuration Reference","text":"

Claw-R1 \u4f7f\u7528 Hydra \u8fdb\u884c\u5c42\u6b21\u5316\u914d\u7f6e\u7ba1\u7406\u3002\u6240\u6709 YAML \u914d\u7f6e\u4f4d\u4e8e claw_r1/config/\u3002

"},{"location":"configuration/#_1","title":"\u914d\u7f6e\u6587\u4ef6","text":"\u6587\u4ef6 \u7528\u9014 agent_ppo_trainer.yaml \u57fa\u7840 PPO trainer \u914d\u7f6e\uff08\u7ee7\u627f veRL \u7684 ppo_trainer\uff09 async_ppo_trainer.yaml \u5f02\u6b65\u8bad\u7ec3\u4e13\u7528\u914d\u7f6e overrides/rollout.yaml Rollout worker \u8bbe\u7f6e\uff08\u5f02\u6b65\u6a21\u5f0f\u3001Agent Flow\uff09"},{"location":"configuration/#async_ppo_traineryaml","title":"async_ppo_trainer.yaml","text":"

\u5f02\u6b65\u8bad\u7ec3\u7684\u6838\u5fc3\u914d\u7f6e\u6587\u4ef6\uff1a

defaults:\n  - ppo_trainer\n  - /overrides/rollout@actor_rollout_ref.rollout\n  - _self_\n\n# -- \u5f02\u6b65\u8bad\u7ec3\u8bbe\u7f6e --\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\u5230 Rollouter\n  require_batches: 1                 # \u6bcf\u6b21\u4ece DataPool \u53d6\u7684 batch \u6570\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad\u8fdb\u884c\u4e2d\u7684 rollout\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u6536\u96c6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n\n# -- Training GPU Pool --\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 4\n\n# -- Rollout GPU Pool --\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 4\n  total_epochs: 10\n  test_freq: 1\n\n# -- Actor \u914d\u7f6e --\nactor_rollout_ref:\n  hybrid_engine: false\n  actor:\n    use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, true}\n  checkpoint_engine: ${oc.select:async_training.checkpoint_engine, null}\n

GPU \u5206\u914d

trainer \u548c rollout \u90fd\u5fc5\u987b\u5206\u914d GPU\u3002\u603b GPU \u6570 = trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node\u3002

"},{"location":"configuration/#overridesrolloutyaml","title":"overrides/rollout.yaml","text":"

Rollout worker \u7684\u914d\u7f6e\u8986\u76d6\uff1a

name: vllm\nmode: async\n\nagent:\n  default_agent_flow: single_step_single_turn_agent\n  agent_flow_config_path: null\n
"},{"location":"configuration/#gateway","title":"Gateway \u914d\u7f6e","text":"

Gateway \u4f5c\u4e3a\u72ec\u7acb\u8fdb\u7a0b\u8fd0\u884c\uff0c\u901a\u8fc7 CLI \u53c2\u6570\u914d\u7f6e\uff08\u975e Hydra\uff09\uff1a

python -m claw_r1.gateway.gateway \\\n    --data-pool-name   data_pool \\\n    --vllm-addresses   host1:8001,host2:8001 \\\n    --tokenizer-path   /path/to/model \\\n    --prompt-length    4096 \\\n    --response-length  1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address      auto \\\n    --ray-namespace    default \\\n    --host             0.0.0.0 \\\n    --port             8100\n

Gateway \u542f\u52a8\u8d85\u65f6\u53ef\u901a\u8fc7 Hydra \u914d\u7f6e\uff1a

trainer:\n  gateway_startup_timeout: 300   # \u79d2\uff0c\u9ed8\u8ba4 300\n
"},{"location":"configuration/#agent-flow","title":"Agent Flow \u914d\u7f6e","text":""},{"location":"configuration/#agent-flow_1","title":"\u767d\u76d2 Agent Flow","text":"

\u5728 overrides/rollout.yaml \u4e2d\u6307\u5b9a\uff1a

agent:\n  default_agent_flow: single_step_single_turn_agent\n
"},{"location":"configuration/#agent-flow_2","title":"\u9ed1\u76d2 Agent Flow","text":"

\u901a\u8fc7\u5916\u90e8 YAML \u6587\u4ef6\u6ce8\u518c\uff1a

# claw_r1/blackbox_agent/agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n

\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u5f15\u7528\uff1a

actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n
"},{"location":"configuration/#gpu","title":"\u591a GPU \u914d\u7f6e","text":"
# \u72ec\u7acb\u7684 GPU \u6c60\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2    # 2 GPU \u7528\u4e8e\u8bad\u7ec3\uff08Actor + Critic\uff09\n\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1    # 1 GPU \u7528\u4e8e\u63a8\u7406\uff08vLLM\uff09\n

\u8d44\u6e90\u6c60\u9694\u79bb

Claw-R1 \u4f7f\u7528 Ray \u7684\u8d44\u6e90\u7ec4\u673a\u5236\u786e\u4fdd Trainer \u548c Rollouter \u7684 GPU \u4e0d\u91cd\u53e0\u3002\u4f7f\u7528 async_ppo_trainer.yaml \u65f6\u81ea\u52a8\u914d\u7f6e\u3002\u8be6\u89c1 Async Training\u3002

"},{"location":"configuration/#_2","title":"\u5b8c\u6574\u8bad\u7ec3\u811a\u672c\u793a\u4f8b","text":"
python3 -m claw_r1.async_main \\\n    algorithm.adv_estimator=grpo \\\n    data.train_files=$TRAIN_FILE \\\n    data.val_files=$VAL_FILE \\\n    data.train_batch_size=128 \\\n    data.max_prompt_length=512 \\\n    data.max_response_length=1024 \\\n    data.return_raw_chat=True \\\n    actor_rollout_ref.model.path=$MODEL \\\n    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n    actor_rollout_ref.rollout.name=vllm \\\n    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \\\n    actor_rollout_ref.rollout.n=5 \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    async_training.use_rollout_log_probs=true\n

\u66f4\u591a\u793a\u4f8b\u89c1 example/ \u76ee\u5f55\u3002

"},{"location":"getting-started/","title":"Getting Started","text":"
  • Installation

    \u73af\u5883\u914d\u7f6e\u3001\u4f9d\u8d56\u5b89\u88c5\u548c\u9a8c\u8bc1\u3002

    Installation

  • Quick Start

    5 \u5206\u949f\u5185\u8fd0\u884c\u4f60\u7684\u7b2c\u4e00\u4e2a\u5f02\u6b65\u8bad\u7ec3\u5b9e\u9a8c\u3002

    Quick Start

"},{"location":"getting-started/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"\u4f9d\u8d56 \u6700\u4f4e\u7248\u672c Python 3.10+ PyTorch 2.0+ CUDA 12.1+ Ray 2.10+ GPU 3 \u5f20\uff082 \u8bad\u7ec3 + 1 \u63a8\u7406\uff09"},{"location":"getting-started/#_2","title":"\u67b6\u6784\u4e00\u89c8","text":"
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502   Agent     \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Gateway  \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 DataPool \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Trainer  \u2502\n\u2502 (\u9ed1\u76d2/\u767d\u76d2) \u2502\u25c4\u2500\u2500\u2500\u2500\u2502 (:8100)  \u2502     \u2502          \u2502     \u2502          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                                                           \u2502 \u6743\u91cd\u540c\u6b65\n                                                           \u25bc\n                                                     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                     \u2502  vLLM    \u2502\n                                                     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n
"},{"location":"getting-started/installation/","title":"Installation Guide","text":"

Claw-R1 uses the same environment setup as verl.

"},{"location":"getting-started/installation/#base-environment","title":"Base Environment","text":"

Follow the official verl installation guide, but make sure the environment ends up with verl==0.7.0.

If you want a broader overview of the base training workflow, the verl quickstart is also useful.

"},{"location":"getting-started/installation/#what-this-means-for-claw-r1","title":"What This Means for Claw-R1","text":"

Once the verl environment is working, Claw-R1 should run in the same environment. In practice, that means you can:

  • prepare a Python environment with verl==0.7.0
  • clone this repository
  • run Claw-R1 commands directly from the repository root

You do not need to install Claw-R1 as a separate package.

This repository intentionally does not maintain its own environment setup guide, so that the infrastructure setup stays aligned with verl.

"},{"location":"getting-started/quickstart/","title":"Quick Start","text":"

\u672c\u6307\u5357\u5c55\u793a\u5982\u4f55\u5feb\u901f\u8fd0\u884c Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u3002

"},{"location":"getting-started/quickstart/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"
  • \u5df2\u5b8c\u6210 \u5b89\u88c5
  • \u81f3\u5c11 3 \u5f20 GPU\uff082 \u5f20\u8bad\u7ec3 + 1 \u5f20\u63a8\u7406\uff09
  • \u8bad\u7ec3\u6570\u636e\uff08parquet \u683c\u5f0f\uff09
"},{"location":"getting-started/quickstart/#black-box","title":"Black-box \u6a21\u5f0f\uff08\u63a8\u8350\u5165\u95e8\uff09","text":"

\u9ed1\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u4f7f\u7528\u6807\u51c6 OpenAI API \u4e0e Gateway \u4ea4\u4e92\uff0c\u65e0\u9700\u4fee\u6539 Agent \u4ee3\u7801\u3002\u4ee5 GSM8K \u6570\u5b66\u9898\u4e3a\u4f8b\uff1a

"},{"location":"getting-started/quickstart/#1","title":"1. \u51c6\u5907\u6570\u636e","text":"
# \u4e0b\u8f7d GSM8K \u6570\u636e\u96c6\uff08parquet \u683c\u5f0f\uff09\n# \u786e\u4fdd train.parquet \u548c test.parquet \u5728 ~/data/gsm8k/ \u4e0b\n
"},{"location":"getting-started/quickstart/#2","title":"2. \u8fd0\u884c\u8bad\u7ec3","text":"
export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async_blackbox.sh\n

\u8be5\u811a\u672c\u4f1a\uff1a

  1. \u542f\u52a8 Ray \u96c6\u7fa4
  2. \u521b\u5efa DataPool\uff08Ray Actor\uff09
  3. \u5728 GPU 0-1 \u4e0a\u90e8\u7f72 Actor + Critic\uff08\u8bad\u7ec3\uff09
  4. \u5728 GPU 2 \u4e0a\u90e8\u7f72 vLLM\uff08\u63a8\u7406\uff09
  5. \u542f\u52a8 Gateway\uff08\u7aef\u53e3 8100\uff09
  6. \u8fd0\u884c BlackBoxGSM8KAgentFlow\uff1a
    • \u4e3a\u6bcf\u4e2a\u6837\u672c\u8c03\u7528 init_trajectory \u83b7\u53d6 base_url
    • \u521b\u5efa GSM8KAgent\uff0c\u4f7f\u7528 base_url \u4f5c\u4e3a OpenAI API \u7684 endpoint
    • Agent \u901a\u8fc7\u591a\u8f6e tool calling \u89e3\u9898
    • Gateway \u81ea\u52a8\u6536\u96c6\u6bcf\u8f6e\u5bf9\u8bdd\u4e3a Step \u5e76\u63d0\u4ea4\u5230 DataPool
  7. AsyncTrainer \u4ece DataPool \u62c9\u53d6 batch \u8fdb\u884c PPO \u8bad\u7ec3
  8. \u5b9a\u671f\u540c\u6b65\u6743\u91cd\u5230 vLLM
"},{"location":"getting-started/quickstart/#3","title":"3. \u5173\u952e\u914d\u7f6e\u53c2\u6570","text":"
# GPU \u5206\u914d\ntrainer.n_gpus_per_node=2        # \u8bad\u7ec3\u7528 2 \u5f20 GPU\nrollout.n_gpus_per_node=1        # \u63a8\u7406\u7528 1 \u5f20 GPU\n\n# Agent Flow\nactor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n\n# \u5f02\u6b65\u8bad\u7ec3\nasync_training.trigger_parameter_sync_step=1   # \u6bcf\u6b65\u540c\u6b65\u6743\u91cd\nactor_rollout_ref.rollout.n=5                  # \u6bcf\u4e2a prompt \u751f\u6210 5 \u6761 trajectory\n
"},{"location":"getting-started/quickstart/#white-box","title":"White-box \u6a21\u5f0f","text":"

\u767d\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 /generate \u548c /submit_steps \u7aef\u70b9\u4ea4\u4e92\u3002

export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async.sh\n

\u767d\u76d2\u6a21\u5f0f\u4f7f\u7528 MultiStepAgentFlow \u6216 SingleStepSingleTurnAgentFlow\uff0cAgent \u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002

"},{"location":"getting-started/quickstart/#agent","title":"\u81ea\u5b9a\u4e49 Agent","text":""},{"location":"getting-started/quickstart/#agent_1","title":"\u6dfb\u52a0\u9ed1\u76d2 Agent","text":"
  1. \u5b9e\u73b0 Agent \u7c7b\uff08\u53ea\u9700 base_url \u548c OpenAI API\uff09
  2. \u5b9e\u73b0 BlackBoxAgentFlowBase \u5b50\u7c7b
  3. \u5728 agent_flow_config.yaml \u4e2d\u6ce8\u518c
  4. \u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a

\u8be6\u7ec6\u6b65\u9aa4\u89c1 Black-box Agent\u3002

"},{"location":"getting-started/quickstart/#agent_2","title":"\u6dfb\u52a0\u767d\u76d2 Agent","text":"
  1. \u7ee7\u627f AgentFlowBase\uff08\u6216 MultiStepAgentFlow\uff09
  2. \u5b9e\u73b0 run() \u65b9\u6cd5
  3. \u4f7f\u7528 @register(\"name\") \u6ce8\u518c

\u8be6\u7ec6\u6b65\u9aa4\u89c1 Agent Flow\u3002

"},{"location":"getting-started/quickstart/#_2","title":"\u76d1\u63a7\u8bad\u7ec3","text":"

\u8bad\u7ec3\u65e5\u5fd7\u9ed8\u8ba4\u8f93\u51fa\u5230\u63a7\u5236\u53f0\u3002\u53ef\u914d\u7f6e SwanLab \u7b49\u65e5\u5fd7\u540e\u7aef\uff1a

trainer.logger='[\"console\",\"swanlab\"]'\ntrainer.project_name='my_project'\ntrainer.experiment_name='my_experiment'\n
"},{"location":"getting-started/quickstart/#_3","title":"\u4e0b\u4e00\u6b65","text":"
  • Components \u2014 \u4e86\u89e3\u5404\u7ec4\u4ef6\u7684\u8be6\u7ec6\u8bbe\u8ba1
  • Configuration \u2014 \u5b8c\u6574\u914d\u7f6e\u53c2\u8003
  • Gateway API \u2014 HTTP \u7aef\u70b9\u6587\u6863
"}]} \ No newline at end of file