From 6d489a351085f6fedacdc046e746fd99bc351479 Mon Sep 17 00:00:00 2001
From: lqc <chouli@mail.ustc.edu.cn>
Date: Tue, 31 Mar 2026 11:32:58 +0800
Subject: [PATCH] update docs: add Li, Qingchuan to citation authors

---
 index.html               | 2 +-
 search/search_index.json | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/index.html b/index.html
index 54eab3a..c330c57 100644
--- a/index.html
+++ b/index.html
@@ -1509,7 +1509,7 @@ <h2 id="team">Team<a class="headerlink" href="#team" title="Permanent link">&par
 <h2 id="citation">Citation<a class="headerlink" href="#citation" title="Permanent link">&para;</a></h2>
 <div class="language-bibtex highlight"><pre><span></span><code><span id="__span-1-1"><a id="__codelineno-1-1" name="__codelineno-1-1" href="#__codelineno-1-1"></a><span class="nc">@misc</span><span class="p">{</span><span class="nl">clawr1-2026</span><span class="p">,</span>
 </span><span id="__span-1-2"><a id="__codelineno-1-2" name="__codelineno-1-2" href="#__codelineno-1-2"></a><span class="w">  </span><span class="na">title</span><span class="p">=</span><span class="s">{Claw-R1: The Data Foundation for Agentic Reinforcement Learning}</span><span class="p">,</span>
-</span><span id="__span-1-3"><a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a><span class="w">  </span><span class="na">author</span><span class="p">=</span><span class="s">{Wang, Daoyu and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi}</span><span class="p">,</span>
+</span><span id="__span-1-3"><a id="__codelineno-1-3" name="__codelineno-1-3" href="#__codelineno-1-3"></a><span class="w">  </span><span class="na">author</span><span class="p">=</span><span class="s">{Wang, Daoyu and Li, Qingchuan and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi}</span><span class="p">,</span>
 </span><span id="__span-1-4"><a id="__codelineno-1-4" name="__codelineno-1-4" href="#__codelineno-1-4"></a><span class="w">  </span><span class="na">year</span><span class="p">=</span><span class="s">{2025}</span><span class="p">,</span>
 </span><span id="__span-1-5"><a id="__codelineno-1-5" name="__codelineno-1-5" href="#__codelineno-1-5"></a><span class="w">  </span><span class="na">howpublished</span><span class="p">=</span><span class="s">{\url{https://github.com/AgentR1/Claw-R1}}</span><span class="p">,</span>
 </span><span id="__span-1-6"><a id="__codelineno-1-6" name="__codelineno-1-6" href="#__codelineno-1-6"></a><span class="w">  </span><span class="na">note</span><span class="p">=</span><span class="s">{GitHub repository}</span>
diff --git a/search/search_index.json b/search/search_index.json
index b657128..6eba891 100644
--- a/search/search_index.json
+++ b/search/search_index.json
@@ -1 +1 @@
-{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"","title":"Claw-R1","text":"<p>The Data Foundation for Agentic Reinforcement Learning</p> <p>Claw-R1 \u662f Agentic RL \u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd \u2014 \u4e13\u6ce8\u4e8e\u4ece\u4efb\u610f Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff0c\u5e76\u652f\u6301\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002</p> <ul> <li> <p> Universal Data Collection</p> <p>\u4ece\u767d\u76d2\u3001\u9ed1\u76d2\u5230\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u901a\u8fc7 <code>base_url</code> \u673a\u5236\u96f6\u4ee3\u7801\u63a5\u5165\uff0c\u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 LangChain\u3001AutoGen\u3001CrewAI \u7b49\u4efb\u610f OpenAI \u517c\u5bb9 Agent\u3002</p> <p> Base URL Integration</p> </li> <li> <p> Data Middleware Layer</p> <p>Gateway + DataPool \u6570\u636e\u4e2d\u95f4\u4ef6\uff1aGateway \u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0cDataPool \u7ba1\u7406\u6570\u636e\u8d28\u91cf\u3001\u5206\u533a\u7f13\u51b2\u3001\u6309\u9700\u4f9b\u7ed9\u8bad\u7ec3\u5f15\u64ce\u3002</p> <p> Middleware Layer</p> </li> <li> <p> Data Evaluation &amp; Curation</p> <p>\u591a\u7ef4 Reward \u7cfb\u7edf\uff08\u89c4\u5219/\u5224\u522b\u5f0f RM/\u751f\u6210\u5f0f RM\uff09+ \u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u6574\u5408 + \u7b56\u7565\u7248\u672c\u8ffd\u8e2a\uff0c\u7cfb\u7edf\u6027\u8bc4\u4f30\u548c\u7b5b\u9009\u6570\u636e\u8d28\u91cf\u3002</p> <p> Reward System</p> </li> <li> <p> Production Agent Scenario</p> <p>\"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent 
\u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\uff08\u91c7\u7eb3\u3001\u4fee\u6539\u3001\u8ffd\u95ee\uff09\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\u3002</p> <p> Production Scenario</p> </li> </ul>"},{"location":"#why-claw-r1","title":"Why Claw-R1?","text":"<p>Agentic RL \u751f\u6001\u6b63\u84ec\u52c3\u53d1\u5c55 \u2014 verl\u3001Agent-R1\u3001Forge \u7b49\u4f18\u79c0\u6846\u67b6\u5728 Runtime \u548c\u8bad\u7ec3\u7b97\u6cd5\u65b9\u9762\u6301\u7eed\u63a8\u8fdb\u3002\u7136\u800c\uff0c\u968f\u7740 Agent \u4ece\u7b80\u5355 ReAct \u6f14\u8fdb\u5230 Claude Code\u3001OpenClaw \u7b49\u901a\u7528\u67b6\u6784\uff0c\u4e00\u4e2a\u76f8\u5bf9\u6b20\u7f3a\u3001\u503c\u5f97\u6df1\u8015\u7684\u65b9\u5411\u9010\u6e10\u6d6e\u73b0\uff1a\u5982\u4f55\u4ece\u591a\u6837\u7684 Agent \u4ea4\u4e92\u4e2d\u7cfb\u7edf\u6027\u5730\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff1f</p> <p>Claw-R1 \u805a\u7126\u4e8e\u8fd9\u4e00\u65b9\u5411\uff0c\u63d0\u4f9b Agent \u4e0e Trainer \u4e4b\u95f4\u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002</p> \u7ef4\u5ea6 \u4f20\u7edf Agentic RL \u6846\u67b6 Claw-R1 \u6838\u5fc3\u5173\u6ce8 \u8bad\u7ec3\u7b97\u6cd5\u4e0e Runtime \u6570\u636e\u7684\u91c7\u96c6\u3001\u8bc4\u4f30\u4e0e\u7b5b\u9009 Agent \u63a5\u5165 \u9700\u8981\u7528\u6846\u67b6 API \u91cd\u5199 \u53ea\u6539 <code>base_url</code>\uff0c\u96f6\u4ee3\u7801\u4fb5\u5165 \u6570\u636e\u6765\u6e90 \u9884\u6536\u96c6\u7684\u79bb\u7ebf\u6570\u636e \u5b9e\u65f6\u4ea4\u4e92\u81ea\u52a8\u91c7\u96c6 + \u79bb\u7ebf\u6570\u636e\u96c6 \u6570\u636e\u8d28\u91cf\u7ba1\u63a7 \u8f83\u5c11\u5173\u6ce8 \u591a\u7ef4 Reward + \u4eba\u7c7b\u53cd\u9988 + \u65b0\u9c9c\u5ea6\u68c0\u6d4b \u8bad\u7ec3\u5f15\u64ce \u5185\u7f6e\u7ed1\u5b9a \u53ef\u63d2\u62d4 TrainingBackend\uff0c\u5bf9\u63a5\u4efb\u610f\u5f15\u64ce"},{"location":"#_1","title":"\u5feb\u901f\u5f00\u59cb","text":"<pre><code># 
\u514b\u9686\u4ed3\u5e93\ngit clone https://github.com/AgentR1/Claw-R1 &amp;&amp; cd Claw-R1\n\n# \u8fd0\u884c\u9ed1\u76d2 GSM8K \u8bad\u7ec3\nexport CUDA_VISIBLE_DEVICES=0,1,2\nsh example/test_async_blackbox.sh\n</code></pre> <p> \u5b8c\u6574\u5b89\u88c5\u6307\u5357 \u00b7  Quick Start</p>"},{"location":"#_2","title":"\u9879\u76ee\u72b6\u6001","text":"\u80fd\u529b \u72b6\u6001 \u767d\u76d2 Agent \u6570\u636e\u91c7\u96c6  \u5df2\u5b9e\u73b0 \u9ed1\u76d2 Agent \u6570\u636e\u91c7\u96c6  \u5df2\u5b9e\u73b0 \u5728\u7ebf\u670d\u52a1\u6570\u636e\u91c7\u96c6  \u5f00\u53d1\u4e2d \u5f02\u6b65\u8bad\u7ec3\u4f9b\u7ed9  \u5df2\u5b9e\u73b0 \u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf  \u89c4\u5212\u4e2d \u6570\u636e\u8d28\u91cf Dashboard  \u89c4\u5212\u4e2d"},{"location":"#team","title":"Team","text":"<p>State Key Laboratory of Cognitive Intelligence, USTC</p>"},{"location":"#citation","title":"Citation","text":"<pre><code>@misc{clawr1-2026,\n  title={Claw-R1: The Data Foundation for Agentic Reinforcement Learning},\n  author={Wang, Daoyu and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi},\n  year={2025},\n  howpublished={\\url{https://github.com/AgentR1/Claw-R1}},\n  note={GitHub repository}\n}\n</code></pre>"},{"location":"contributing/","title":"Contributing","text":"<p>\u611f\u8c22\u4f60\u5bf9 Claw-R1 \u7684\u5173\u6ce8\uff01\u6b22\u8fce\u8d21\u732e\u4ee3\u7801\u3001\u6587\u6863\u548c\u60f3\u6cd5\u3002</p>"},{"location":"contributing/#_1","title":"\u9879\u76ee\u7ed3\u6784","text":"<pre><code>claw_r1/\n\u251c\u2500\u2500 agent_flow/           # Agent \u6267\u884c\u6846\u67b6\uff08\u767d\u76d2 + \u7ba1\u7406\u5668\uff09\n\u251c\u2500\u2500 blackbox_agent/       # \u9ed1\u76d2 Agent \u7cfb\u7edf\uff08Flow + Agent \u5b9e\u73b0\uff09\n\u251c\u2500\u2500 config/               # Hydra \u914d\u7f6e\u6587\u4ef6\n\u251c\u2500\u2500 data_pool/            # DataPool\uff08Ray Actor + Training Backend\uff09\n\u251c\u2500\u2500 gateway/              # Gateway 
Server\uff08FastAPI\uff09\n\u251c\u2500\u2500 async_main.py         # \u5f02\u6b65\u8bad\u7ec3\u5165\u53e3\n\u251c\u2500\u2500 async_rollouter.py    # AsyncRollouter\uff08Rollout GPU Pool\uff09\n\u251c\u2500\u2500 async_trainer.py      # AsyncTrainer\uff08Training GPU Pool\uff09\n\u251c\u2500\u2500 param_sync.py         # ParameterSynchronizer\n\u251c\u2500\u2500 detach_workers.py     # \u5206\u79bb\u5f0f Actor/Rollout Worker\n\u251c\u2500\u2500 core_algos.py         # PPO/GAE/GRPO \u6838\u5fc3\u7b97\u6cd5\n\u251c\u2500\u2500 reward_loop.py        # RewardLoopWorker\n\u251c\u2500\u2500 metric_utils.py       # \u6307\u6807\u805a\u5408\n\u251c\u2500\u2500 ray_agent_trainer.py  # \u540c\u6b65 Ray PPO Trainer\n\u2514\u2500\u2500 main_agent_ppo.py     # \u540c\u6b65\u8bad\u7ec3\u5165\u53e3\n</code></pre>"},{"location":"contributing/#_2","title":"\u4ee3\u7801\u98ce\u683c","text":"<ul> <li>\u4f7f\u7528 Ruff \u8fdb\u884c lint \u548c\u683c\u5f0f\u5316</li> <li>\u9075\u5faa PEP 8</li> <li>\u7c7b\u578b\u6ce8\u89e3\uff08Python 3.10+ \u8bed\u6cd5\uff09</li> </ul> <pre><code># \u5b89\u88c5 pre-commit hooks\npip install pre-commit\npre-commit install\n\n# \u624b\u52a8\u68c0\u67e5\nruff check .\nruff format .\n</code></pre>"},{"location":"contributing/#_3","title":"\u8d21\u732e\u65b9\u5411","text":""},{"location":"contributing/#_4","title":"\u9ad8\u4f18\u5148\u7ea7","text":"<ul> <li>\u65b0\u7684\u9ed1\u76d2 Agent \u5b9e\u73b0\uff08\u53c2\u8003 <code>blackbox_agent/gsm8k_agent.py</code>\uff09</li> <li>\u65b0\u7684 Reward \u51fd\u6570</li> <li>\u6027\u80fd\u4f18\u5316\uff08DataPool \u541e\u5410\u3001Gateway \u5ef6\u8fdf\uff09</li> </ul>"},{"location":"contributing/#_5","title":"\u6587\u6863","text":"<ul> <li>\u6559\u7a0b\u548c\u793a\u4f8b</li> <li>API \u6587\u6863\u8865\u5145</li> <li>\u4e2d\u82f1\u6587\u7ffb\u8bd1</li> </ul>"},{"location":"contributing/#_6","title":"\u7814\u7a76","text":"<ul> <li>\u65b0\u7684 advantage \u8ba1\u7b97\u7b97\u6cd5</li> 
<li>\u5728\u7ebf\u5b66\u4e60\u7b56\u7565</li> <li>\u591a Agent \u534f\u4f5c\u8bad\u7ec3</li> </ul>"},{"location":"contributing/#pr","title":"PR \u6d41\u7a0b","text":"<ol> <li>Fork \u4ed3\u5e93</li> <li>\u521b\u5efa feature branch\uff1a<code>git checkout -b feature/my-feature</code></li> <li>\u7f16\u5199\u4ee3\u7801\u548c\u6d4b\u8bd5</li> <li>\u786e\u4fdd <code>ruff check .</code> \u901a\u8fc7</li> <li>\u63d0\u4ea4 PR\uff0c\u63cf\u8ff0\u6539\u52a8\u5185\u5bb9\u548c\u52a8\u673a</li> </ol>"},{"location":"contributing/#_7","title":"\u672c\u5730\u6784\u5efa\u6587\u6863","text":"<pre><code>pip install mkdocs-material\nmkdocs serve\n# \u8bbf\u95ee http://localhost:8000\n</code></pre>"},{"location":"contributing/#_8","title":"\u8054\u7cfb","text":"<ul> <li>GitHub Issues: AgentR1/Claw-R1</li> </ul>"},{"location":"api/","title":"API Reference","text":"<p>\u672c\u8282\u6587\u6863\u5316 Claw-R1 \u5404\u7ec4\u4ef6\u66b4\u9732\u7684 HTTP \u548c Python API\u3002</p> <ul> <li> <p>Gateway HTTP API</p> <p>REST \u7aef\u70b9\uff0c\u7528\u4e8e Agent \u96c6\u6210\u548c Step \u63d0\u4ea4\u3002\u5305\u62ec\u767d\u76d2\u7aef\u70b9\uff08<code>/generate</code>\u3001<code>/submit_steps</code>\uff09\u548c\u9ed1\u76d2\u7aef\u70b9\uff08<code>{base_url}/v1/chat/completions</code>\uff09\u3002</p> <p> Gateway API</p> </li> </ul>"},{"location":"api/#python","title":"Python \u63a5\u53e3","text":""},{"location":"api/#datapool-ray-actor","title":"DataPool (Ray Actor)","text":"<pre><code>import ray\nfrom claw_r1.data_pool import DataPool\n\ndata_pool = ray.get_actor(\"data_pool\")\n\n# Producer\uff08\u7531 Gateway \u5185\u90e8\u8c03\u7528\uff09\nray.get(data_pool.submit_step.remote(step, channel=\"train\"))\nray.get(data_pool.submit_steps.remote(steps, channel=\"train\"))\nray.get(data_pool.complete_trajectory.remote(trajectory_uid, channel=\"train\"))\n\n# Consumer\uff08\u7531 Trainer \u8c03\u7528\uff09\nbatch = ray.get(data_pool.fetch_batch.remote(n_rollouts=5, 
channel=\"train\"))\n</code></pre>"},{"location":"api/#rewardloopworker-ray-actor","title":"RewardLoopWorker (Ray Actor)","text":"<pre><code>from claw_r1.reward_loop import RewardLoopWorker\n\nreward_worker = ray.get_actor(\"reward_loop_worker\")\nrewards = ray.get(reward_worker.compute_score_batch.remote(steps))\n</code></pre>"},{"location":"api/#agentflowbase-python-class","title":"AgentFlowBase (Python class)","text":"<pre><code>from claw_r1.agent_flow import SingleStepSingleTurnAgentFlow\n\nclass MyFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=[{\"role\": \"user\", \"content\": kwargs[\"question\"]}],\n        )\n        # \u6784\u5efa Step \u5e76\u63d0\u4ea4 ...\n        return 1\n</code></pre>"},{"location":"api/#blackboxagentflowbase-python-class","title":"BlackBoxAgentFlowBase (Python class)","text":"<pre><code>from claw_r1.agent_flow.agent_flow import register\nfrom claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"my_blackbox_agent\")\nclass MyBlackBoxFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url: str, kwargs: dict) -&gt; int:\n        # \u521b\u5efa Agent\uff0c\u4f7f\u7528 base_url \u4f5c\u4e3a OpenAI API endpoint\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=kwargs[\"raw_prompt\"])\n</code></pre>"},{"location":"api/gateway/","title":"Gateway API","text":"<p>Gateway \u9ed8\u8ba4\u76d1\u542c\u7aef\u53e3 8100\uff08\u901a\u8fc7 <code>--port</code> \u914d\u7f6e\uff09\u3002\u6240\u6709\u7aef\u70b9\u5747\u63a5\u53d7\u548c\u8fd4\u56de JSON\u3002</p>"},{"location":"api/gateway/#base-url","title":"Base 
URL","text":"<pre><code>http://&lt;gateway-host&gt;:8100\n</code></pre>"},{"location":"api/gateway/#white-box","title":"White-box \u7aef\u70b9","text":"<p>\u8fd9\u4e9b\u7aef\u70b9\u7531 <code>AgentFlowBase</code> \u7684\u767d\u76d2 Agent \u8c03\u7528\u3002</p>"},{"location":"api/gateway/#post-generate","title":"<code>POST /generate</code>","text":"<p>\u5c06\u751f\u6210\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u5e76\u8fd4\u56de\u5e26 token ID \u7684\u54cd\u5e94\u3002</p> <p>\u8c03\u7528\u65b9: <code>AgentFlowBase.gateway_generate()</code></p>"},{"location":"api/gateway/#request","title":"Request","text":"<pre><code>{\n  \"trajectory_uid\": \"string\",\n  \"prompt_uid\": \"string\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"string\" }\n  ],\n  \"max_tokens\": 1024,\n  \"temperature\": 1.0,\n  \"top_p\": 1.0\n}\n</code></pre> \u5b57\u6bb5 \u7c7b\u578b \u5fc5\u586b \u8bf4\u660e <code>trajectory_uid</code> string \u662f \u5f53\u524d\u5bf9\u8bdd\u7684\u552f\u4e00 ID <code>prompt_uid</code> string \u662f Prompt \u7ec4 ID\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09 <code>messages</code> array \u662f OpenAI \u683c\u5f0f\u7684\u804a\u5929\u6d88\u606f <code>max_tokens</code> int \u5426 \u6700\u5927\u54cd\u5e94\u957f\u5ea6\uff08\u9ed8\u8ba4\u53d6 <code>--response-length</code>\uff09 <code>temperature</code> float \u5426 \u91c7\u6837\u6e29\u5ea6\uff08\u9ed8\u8ba4 1.0\uff09 <code>top_p</code> float \u5426 Top-p \u91c7\u6837\uff08\u9ed8\u8ba4 1.0\uff09"},{"location":"api/gateway/#response","title":"Response","text":"<pre><code>{\n  \"response_text\": \"string\",\n  \"response_ids\": [101, 202, 303],\n  \"prompt_ids\": [50, 60, 70, 80]\n}\n</code></pre>"},{"location":"api/gateway/#post-submit_steps","title":"<code>POST /submit_steps</code>","text":"<p>\u63d0\u4ea4\u4e00\u4e2a\u6216\u591a\u4e2a <code>Step</code> \u5bf9\u8c61\u5230 DataPool\u3002</p> <p>\u8c03\u7528\u65b9: 
<code>AgentFlowBase.gateway_submit_steps()</code></p>"},{"location":"api/gateway/#request_1","title":"Request","text":"<pre><code>{\n  \"steps\": [\n    {\n      \"trajectory_uid\": \"string\",\n      \"prompt_uid\": \"string\",\n      \"prompt_ids\": [50, 60, 70],\n      \"response_ids\": [101, 202],\n      \"reward\": 0.0,\n      \"step_index\": 0,\n      \"policy_version\": 42,\n      \"is_last\": true,\n      \"metadata\": {}\n    }\n  ]\n}\n</code></pre>"},{"location":"api/gateway/#response_1","title":"Response","text":"<pre><code>{\n  \"accepted\": 1\n}\n</code></pre>"},{"location":"api/gateway/#post-compute_reward","title":"<code>POST /compute_reward</code>","text":"<p>\u4e3a\u4e00\u4e2a step \u8ba1\u7b97 reward\uff08\u7531 Trainer \u8c03\u7528\uff0c\u4e0d\u7531 Agent \u8c03\u7528\uff09\u3002</p>"},{"location":"api/gateway/#request_2","title":"Request","text":"<pre><code>{\n  \"trajectory_uid\": \"string\",\n  \"messages\": [...],\n  \"dataset_fields\": {\n    \"ground_truth\": \"string\",\n    \"task_type\": \"string\"\n  }\n}\n</code></pre>"},{"location":"api/gateway/#response_2","title":"Response","text":"<pre><code>{\n  \"reward\": 0.85\n}\n</code></pre>"},{"location":"api/gateway/#black-box","title":"Black-box \u7aef\u70b9","text":"<p>\u8fd9\u4e9b\u7aef\u70b9\u4f9b\u9ed1\u76d2 Agent \u4f7f\u7528\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u77e5\u9053\u4e00\u4e2a <code>base_url</code>\uff0c\u6240\u6709\u4ea4\u4e92\u90fd\u901a\u8fc7\u8be5 URL \u5b8c\u6210\u3002</p> <p><code>base_url</code> \u7684\u683c\u5f0f\u4e3a <code>http://&lt;host&gt;:&lt;port&gt;/&lt;trajectory_uid&gt;/&lt;prompt_uid&gt;</code>\uff0c\u7531 <code>POST /init_trajectory</code> \u8fd4\u56de\u3002</p>"},{"location":"api/gateway/#post-init_trajectory","title":"<code>POST /init_trajectory</code>","text":"<p>\u5206\u914d\u4e00\u6761\u65b0\u7684 trajectory \u5e76\u8fd4\u56de 
<code>base_url</code>\u3002</p>"},{"location":"api/gateway/#request_3","title":"Request","text":"<p>\u65e0\u8bf7\u6c42\u4f53\u3002</p>"},{"location":"api/gateway/#response_3","title":"Response","text":"<pre><code>{\n  \"trajectory_uid\": \"a1b2c3d4e5f6...\",\n  \"base_url\": \"http://0.0.0.0:8100/a1b2c3d4e5f6.../1\"\n}\n</code></pre>"},{"location":"api/gateway/#post-base_urlv1register_trajectory","title":"<code>POST {base_url}/v1/register_trajectory</code>","text":"<p>\u6ce8\u518c trajectory \u7684 channel \u548c metadata\u3002\u5728 Agent \u5f00\u59cb\u4ea4\u4e92\u4e4b\u524d\u8c03\u7528\u3002</p> <p><code>trajectory_uid</code> \u4ece URL path \u4e2d\u63d0\u53d6\uff0c\u65e0\u9700\u5728 body \u4e2d\u4f20\u9012\u3002</p>"},{"location":"api/gateway/#request_4","title":"Request","text":"<pre><code>{\n  \"channel\": \"train\",\n  \"metadata\": {\n    \"data_source\": \"gsm8k\",\n    \"ground_truth\": \"42\"\n  }\n}\n</code></pre> <p>\u6240\u6709\u5b57\u6bb5\u5747\u4e3a\u53ef\u9009\u3002<code>channel</code> \u9ed8\u8ba4\u4e3a <code>\"train\"</code>\u3002</p>"},{"location":"api/gateway/#response_4","title":"Response","text":"<pre><code>{ \"status\": \"ok\" }\n</code></pre>"},{"location":"api/gateway/#post-base_urlv1chatcompletions","title":"<code>POST {base_url}/v1/chat/completions</code>","text":"<p>OpenAI \u517c\u5bb9\u7684\u804a\u5929\u8865\u5168\u7aef\u70b9\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u5c06 <code>base_url</code> \u8bbe\u4e3a OpenAI SDK \u7684 <code>base_url</code>\uff0c\u5373\u53ef\u900f\u660e\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002</p> <p>Gateway \u4f1a\uff1a</p> <ol> <li>\u5c06\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u670d\u52a1\u5668</li> <li>\u5bf9 prompt \u548c response \u8fdb\u884c tokenize</li> <li>\u81ea\u52a8\u6784\u5efa <code>Step</code> \u5e76\u63d0\u4ea4\u5230 DataPool</li> <li>\u8fd4\u56de\u6807\u51c6 OpenAI \u683c\u5f0f\u7684\u54cd\u5e94</li> </ol>"},{"location":"api/gateway/#request_5","title":"Request","text":"<p>\u6807\u51c6 OpenAI 
<code>chat/completions</code> \u8bf7\u6c42\u4f53\u3002</p> <pre><code>{\n  \"model\": \"qwen\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"What is 2+2?\" }\n  ],\n  \"temperature\": 0.7\n}\n</code></pre>"},{"location":"api/gateway/#response_5","title":"Response","text":"<p>\u6807\u51c6 OpenAI <code>chat/completions</code> \u54cd\u5e94\u4f53\u3002</p> <pre><code>{\n  \"id\": \"chatcmpl-...\",\n  \"object\": \"chat.completion\",\n  \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\": \"assistant\",\n        \"content\": \"4\"\n      },\n      \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 10,\n    \"completion_tokens\": 1,\n    \"total_tokens\": 11\n  }\n}\n</code></pre>"},{"location":"api/gateway/#post-base_urlv1complete_trajectory","title":"<code>POST {base_url}/v1/complete_trajectory</code>","text":"<p>\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002Agent \u5b8c\u6210\u6240\u6709\u4ea4\u4e92\u540e\u8c03\u7528\u3002</p>"},{"location":"api/gateway/#request_6","title":"Request","text":"<p>\u65e0\u8bf7\u6c42\u4f53\u3002</p>"},{"location":"api/gateway/#response_6","title":"Response","text":"<pre><code>{ \"status\": \"ok\" }\n</code></pre>"},{"location":"api/gateway/#post-complete_trajectorytrajectory_uid","title":"<code>POST /complete_trajectory/{trajectory_uid}</code>","text":"<p>\u5185\u90e8\u7aef\u70b9\uff0c\u901a\u8fc7 trajectory_uid \u76f4\u63a5\u6807\u8bb0\u5b8c\u6210\u3002\u53ef\u9009\u4f20\u5165 reward \u548c channel\u3002</p>"},{"location":"api/gateway/#request_7","title":"Request","text":"<pre><code>{\n  \"channel\": \"train\",\n  \"reward\": 0.9\n}\n</code></pre>"},{"location":"api/gateway/#response_7","title":"Response","text":"<pre><code>{ \"status\": \"ok\" }\n</code></pre>"},{"location":"api/gateway/#_1","title":"\u5c31\u7eea\u68c0\u67e5","text":""},{"location":"api/gateway/#get-ready","title":"<code>GET /ready</code>","text":"<p>\u5f53 Gateway 
\u5b8c\u5168\u521d\u59cb\u5316\uff08\u5305\u62ec tokenizer \u52a0\u8f7d\u5b8c\u6210\uff09\u540e\u8fd4\u56de 200\u3002\u7528\u4e8e Rollouter \u542f\u52a8\u65f6\u7684\u5065\u5eb7\u68c0\u67e5\u3002</p>"},{"location":"api/gateway/#response-200","title":"Response (200)","text":"<pre><code>{ \"status\": \"ready\" }\n</code></pre>"},{"location":"api/gateway/#response-503","title":"Response (503)","text":"<pre><code>{ \"detail\": \"Gateway not ready (tokenizer still loading)\" }\n</code></pre>"},{"location":"api/gateway/#get-docs","title":"<code>GET /docs</code>","text":"<p>FastAPI \u81ea\u52a8\u751f\u6210\u7684 Swagger UI \u6587\u6863\u9875\u9762\u3002</p>"},{"location":"components/","title":"Components","text":"<p>Claw-R1 \u7684\u7ec4\u4ef6\u56f4\u7ed5\u6570\u636e\u6d41\u7ec4\u7ec7\uff1a\u4ece Agent \u4ea4\u4e92\u7684\u91c7\u96c6\uff0c\u5230\u6570\u636e\u7684\u7ba1\u7406\u4e0e\u8d28\u91cf\u8bc4\u4f30\uff0c\u518d\u5230\u5411\u8bad\u7ec3\u5f15\u64ce\u7684\u4f9b\u7ed9\u3002\u5404\u7ec4\u4ef6\u901a\u8fc7 HTTP \u548c Ray RPC \u901a\u4fe1\u3002</p> <ul> <li> <p>Gateway Server \u00b7 \u6570\u636e\u91c7\u96c6\u5165\u53e3</p> <p>FastAPI HTTP \u670d\u52a1\u3002\u6240\u6709 Agent LLM \u8c03\u7528\u7684\u7edf\u4e00\u5165\u53e3\uff0c\u81ea\u52a8\u4ece\u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff08Step\uff09\u5e76\u63d0\u4ea4\u5230 DataPool\u3002\u652f\u6301\u767d\u76d2\u663e\u5f0f\u63d0\u4ea4\u548c\u9ed1\u76d2\u81ea\u52a8\u91c7\u96c6\u4e24\u79cd\u6a21\u5f0f\u3002</p> <p> Gateway Server</p> </li> <li> <p>DataPool \u00b7 \u6570\u636e\u7ba1\u7406\u6838\u5fc3</p> <p>Ray Actor\u3002Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2 \u2014 \u5b58\u50a8\u3001\u7d22\u5f15\u3001\u5206\u533a\u548c\u4f9b\u7ed9\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 Channel \u9694\u79bb\u3001GRPO \u5206\u7ec4\u3001\u5bb9\u91cf\u80cc\u538b\u63a7\u5236\u548c\u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7\u3002</p> <p> DataPool</p> </li> <li> <p>Reward System \u00b7 \u6570\u636e\u8d28\u91cf\u8bc4\u4f30</p> 
<p><code>RewardLoopWorker</code> Ray Actor\u3002\u591a\u7ef4\u5ea6\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\uff1arule-based\u3001discriminative RM\u3001generative RM\uff0c\u4ee5\u53ca\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002</p> <p> Reward System</p> </li> <li> <p>Agent Flow \u00b7 \u767d\u76d2\u6570\u636e\u91c7\u96c6</p> <p>Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7ba1\u7406\u3002\u767d\u76d2 Agent \u901a\u8fc7 Python API \u663e\u5f0f\u63d0\u4ea4 Step\uff0c\u5b8c\u6574\u63a7\u5236\u6570\u636e\u91c7\u96c6\u8fc7\u7a0b\u3002</p> <p> Agent Flow</p> </li> <li> <p>Black-box Agent \u00b7 \u9ed1\u76d2\u6570\u636e\u91c7\u96c6</p> <p>\u96f6\u4ee3\u7801\u4fb5\u5165\u7684\u9ed1\u76d2 Agent \u63a5\u5165\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u901a\u8fc7 <code>base_url</code> \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002</p> <p> Black-box Agent</p> </li> <li> <p>Async Training \u00b7 \u6570\u636e\u6d88\u8d39\u4e0e\u8bad\u7ec3</p> <p><code>AsyncTrainer</code> \u548c <code>AsyncRollouter</code> Ray Actor\u3002\u6301\u7eed\u4ece DataPool \u6d88\u8d39\u9ad8\u8d28\u91cf\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\uff0c\u5e26\u53c2\u6570\u540c\u6b65\u3002</p> <p> Async Training</p> </li> </ul>"},{"location":"components/#_1","title":"\u6570\u636e\u6d41\u5168\u666f","text":"<pre><code>                        \u6570\u636e\u91c7\u96c6\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n  \u9ed1\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502                                         \u2502\n  (base_url)          \u2502         GATEWAY SERVER                  \u2502\n                      \u2502         (FastAPI, \u7aef\u53e3 8100)             
\u2502\n  \u767d\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502         \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92 Step                 \u2502\n  (AgentFlow)         \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                   \u2502 Ray RPC (submit_steps)\n                                   \u25bc\n                        \u6570\u636e\u7ba1\u7406\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         DATAPOOL                         \u2502\n                      \u2502         (Ray Actor)                      \u2502\n                      \u2502                                          \u2502\n                      \u2502  \u2022 \u5b58\u50a8\u4e0e\u7d22\u5f15    \u2022 Channel \u5206\u533a            \u2502\n                      \u2502  \u2022 GRPO \u5206\u7ec4     \u2022 \u5bb9\u91cf\u80cc\u538b\u63a7\u5236            \u2502\n                      \u2502  \u2022 \u8d28\u91cf\u8bc4\u4f30      \u2022 \u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7            \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                         \u2502 fetch_batch()\n                                         \u25bc\n                        \u6570\u636e\u6d88\u8d39\u5c42\n                      
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC TRAINER                    \u2502\n                      \u2502         (Ray Actor, Training GPU Pool)   \u2502\n                      \u2502   \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n                      \u2502   \u2502  Actor \u2502 Critic \u2502 RefPolicy      \u2502   \u2502\n                      \u2502   \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                       \u2502 NCCL weight sync\n                                       \u25bc\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC ROLLOUTER                  \u2502\n                      \u2502         (Ray Actor, Rollout GPU Pool)    \u2502\n                      \u2502         vLLM servers                     \u2502\n                      
\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"components/agent-flow/","title":"Agent Flow","text":"<p>Agent Flow \u662f Claw-R1 \u4e2d\u7ba1\u7406 Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7684\u6846\u67b6\u3002\u5b83\u5206\u4e3a\u4e24\u5927\u7c7b\uff1a</p> <ul> <li>\u767d\u76d2 Agent Flow\uff1aAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 <code>/generate</code>\u3001<code>/submit_steps</code> \u7b49\u7aef\u70b9\u4ea4\u4e92\uff0c\u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002</li> <li>\u9ed1\u76d2 Agent Flow\uff1aAgent \u4f7f\u7528\u6807\u51c6 OpenAI API\uff0c\u901a\u8fc7 <code>base_url</code> \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u5904\u7406 tokenize \u548c Step \u63d0\u4ea4\u3002</li> </ul>"},{"location":"components/agent-flow/#_1","title":"\u7c7b\u5c42\u6b21","text":"<pre><code>AgentFlowBase                              (abstract base)\n    \u2502\n    \u251c\u2500\u2500 SingleStepSingleTurnAgentFlow      (\u767d\u76d2\uff1a\u5355\u8f6e\u95ee\u7b54)\n    \u251c\u2500\u2500 MultiStepAgentFlow                 (\u767d\u76d2\uff1a\u591a\u8f6e\u5de5\u5177\u8c03\u7528)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase              (\u9ed1\u76d2\u57fa\u7c7b)\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow     (\u9ed1\u76d2\uff1aGSM8K \u6570\u5b66\u9898)\n</code></pre>"},{"location":"components/agent-flow/#agentflowbase","title":"AgentFlowBase","text":"<p>\u6240\u6709 Agent Flow \u7684\u62bd\u8c61\u57fa\u7c7b\uff0c\u63d0\u4f9b\uff1a</p> <ul> <li>Gateway URL \u7ba1\u7406</li> <li>\u914d\u7f6e\u8bbf\u95ee\uff08<code>self.config</code>\uff09</li> <li>\u62bd\u8c61\u65b9\u6cd5 <code>run(sampling_params, **kwargs) -&gt; int</code></li> 
</ul>"},{"location":"components/agent-flow/#_2","title":"\u767d\u76d2\u8f85\u52a9\u65b9\u6cd5","text":"<p>\u767d\u76d2 Agent Flow \u53ef\u4f7f\u7528\u4ee5\u4e0b\u65b9\u6cd5\u4e0e Gateway \u4ea4\u4e92\uff1a</p>"},{"location":"components/agent-flow/#gateway_generatetrajectory_uid-prompt_uid-messages-kwargs","title":"<code>gateway_generate(trajectory_uid, prompt_uid, messages, **kwargs)</code>","text":"<p>\u5411 Gateway <code>/generate</code> \u53d1\u9001\u5f02\u6b65 HTTP POST\uff0c\u8fd4\u56de\u751f\u6210\u6587\u672c\u548c token IDs\u3002</p> <pre><code>text, response_ids, prompt_ids = await self.gateway_generate(\n    trajectory_uid=\"traj-abc\",\n    prompt_uid=\"prompt-xyz\",\n    messages=[{\"role\": \"user\", \"content\": \"Summarize this document.\"}],\n    max_tokens=512,\n    temperature=0.8,\n)\n</code></pre>"},{"location":"components/agent-flow/#gateway_submit_stepssteps-channeltrain","title":"<code>gateway_submit_steps(steps, channel=\"train\")</code>","text":"<p>\u5411 Gateway <code>/submit_steps</code> \u63d0\u4ea4 Step \u5217\u8868\u3002</p>"},{"location":"components/agent-flow/#gateway_compute_rewardtrajectory_uid-messages-dataset_fields","title":"<code>gateway_compute_reward(trajectory_uid, messages, dataset_fields)</code>","text":"<p>\u5411 Gateway <code>/compute_reward</code> \u8bf7\u6c42 reward \u8ba1\u7b97\u3002</p>"},{"location":"components/agent-flow/#singlestepsingleturnagentflow","title":"SingleStepSingleTurnAgentFlow","text":"<p>\u6700\u7b80\u5355\u7684\u767d\u76d2\u5b9e\u73b0\uff1a\u5355\u4e2a prompt \u4ea7\u751f\u5355\u4e2a response\u3002\u9002\u7528\u4e8e\u6bcf\u4e2a\u6837\u672c\u90fd\u662f\u72ec\u7acb\u95ee\u7b54\u5bf9\u7684\u6570\u636e\u96c6\u3002</p> <pre><code>class MyAgentFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"raw_prompt\"]}]\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            
trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=messages,\n        )\n        step = Step(\n            prompt_ids=prompt_ids,\n            response_ids=response_ids,\n            reward=0.0,\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            step_index=0,\n            is_last=True,\n        )\n        await self.gateway_submit_steps([step])\n        return 1\n</code></pre>"},{"location":"components/agent-flow/#multistepagentflow","title":"MultiStepAgentFlow","text":"<p>\u591a\u8f6e Agent Flow\uff0c\u652f\u6301\u5de5\u5177\u8c03\u7528\u3001\u89c4\u5212\u7b49\u573a\u666f\u3002\u6bcf\u8f6e\u4ea7\u751f\u4e00\u4e2a Step\uff0c\u901a\u8fc7 <code>trajectory_uid</code> \u4e32\u8054\u3002</p> <pre><code>class ToolAgentFlow(MultiStepAgentFlow):\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"task\"]}]\n        step_index = 0\n\n        while True:\n            text, response_ids, prompt_ids = await self.gateway_generate(...)\n            is_last = self.is_terminal(text)\n\n            step = Step(\n                prompt_ids=prompt_ids,\n                response_ids=response_ids,\n                step_index=step_index,\n                is_last=is_last,\n                ...\n            )\n            await self.gateway_submit_steps([step])\n\n            if is_last:\n                break\n\n            messages.append({\"role\": \"assistant\", \"content\": text})\n            tool_result = await self.execute_tool(text)\n            messages.append({\"role\": \"tool\", \"content\": tool_result})\n            step_index += 1\n\n        return step_index + 1\n</code></pre>"},{"location":"components/agent-flow/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"<p>\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\u3002\u5904\u7406\u4e0e Gateway 
\u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u5c06 Agent \u6267\u884c\u59d4\u6258\u7ed9\u5b50\u7c7b\u7684 <code>_run_agent</code> \u65b9\u6cd5\u3002</p> <p>\u8be6\u7ec6\u6587\u6863\u89c1 Black-box Agent\u3002</p>"},{"location":"components/agent-flow/#_3","title":"\u6ce8\u518c\u673a\u5236","text":"<p>Agent Flow \u901a\u8fc7 <code>@register(\"name\")</code> \u88c5\u9970\u5668\u6ce8\u518c\u5230\u5168\u5c40\u6ce8\u518c\u8868\uff1a</p> <pre><code>from claw_r1.agent_flow.agent_flow import register\n\n@register(\"my_agent_flow\")\nclass MyAgentFlow(AgentFlowBase):\n    ...\n</code></pre> <p>\u4e5f\u53ef\u901a\u8fc7 YAML \u914d\u7f6e\u6587\u4ef6\u6ce8\u518c\uff08\u7528\u4e8e\u9ed1\u76d2 Agent\uff09\uff1a</p> <pre><code># agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n</code></pre>"},{"location":"components/agent-flow/#agentflowmanager-agentflowworker","title":"AgentFlowManager \u548c AgentFlowWorker","text":"<ul> <li>AgentFlowManager\uff1a\u7ba1\u7406\u591a\u4e2a <code>AgentFlowWorker</code>\uff0c\u5c06 batch \u4e2d\u7684\u6bcf\u4e2a\u6837\u672c\u5206\u53d1\u7ed9\u5bf9\u5e94\u7684 Agent Flow \u6267\u884c\u3002</li> <li>AgentFlowWorker\uff1aRay Actor\uff0c\u6301\u6709 tokenizer \u548c\u914d\u7f6e\uff0c\u6267\u884c\u5177\u4f53\u7684 Agent Flow\u3002</li> </ul> <pre><code>AsyncRollouter\n    \u2514\u2500\u2500 AgentFlowManager\n            \u2514\u2500\u2500 AgentFlowWorker (Ray Actor, \u53ef\u591a\u4e2a)\n                    \u2514\u2500\u2500 AgentFlowBase \u5b50\u7c7b\u5b9e\u4f8b\n</code></pre>"},{"location":"components/agent-flow/#_4","title":"\u914d\u7f6e","text":"<p>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a Agent Flow\uff1a</p> <pre><code>python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    
actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n</code></pre>"},{"location":"components/async-training/","title":"Async Training","text":"<p>Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u67b6\u6784\u5c06 rollout\uff08trajectory \u751f\u6210\uff09\u548c training\uff08\u6743\u91cd\u66f4\u65b0\uff09\u5206\u79bb\u4e3a\u4e24\u4e2a\u72ec\u7acb\u7684 Ray Actor\uff0c\u8fd0\u884c\u5728\u4e0d\u540c\u7684 GPU \u6c60\u4e0a\u3002</p>"},{"location":"components/async-training/#_1","title":"\u67b6\u6784","text":"<pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Rollout GPU Pool                                        \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncRollouter (Ray Actor)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 DataLoader (\u904d\u5386\u6570\u636e\u96c6)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 vLLM replicas (\u63a8\u7406\u5f15\u64ce)                     \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 AgentFlowManager (\u7ba1\u7406 Agent \u6267\u884c)           \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Gateway (FastAPI \u5b50\u8fdb\u7a0b, \u7aef\u53e3 8100)          \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RewardLoopWorker (\u8ba1\u7b97 reward)               \u2502   \u2502\n\u2502  
\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  submit_step (via Gateway \u2192 DataPool)\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502   DataPool       \u2502   \u2190 \u5171\u4eab Ray Actor\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  fetch_batch()\n                       \u25bc\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Training GPU Pool                                       \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncTrainer (Ray Actor)  
                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Actor worker group (\u7b56\u7565\u6a21\u578b)                \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Critic worker group (\u4ef7\u503c\u6a21\u578b)               \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RefPolicy worker group (KL baseline)        \u2502   \u2502\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  NCCL weight broadcast\n                       \u25bc\n              AsyncRollouter.update_weights()\n</code></pre>"},{"location":"components/async-training/#asynctrainer","title":"AsyncTrainer","text":"<p><code>AsyncTrainer</code> \u662f\u8fd0\u884c\u5728 Training GPU Pool \u4e0a\u7684 Ray Actor\uff0c\u6267\u884c\u6301\u7eed\u7684 PPO \u8bad\u7ec3\u5faa\u73af\uff1a</p> <ol> <li>\u4ece DataPool <code>fetch_batch()</code> \u2014 \u963b\u585e\u7b49\u5f85\u5b8c\u6574\u7684 <code>prompt_uid</code> \u7ec4</li> <li>\u901a\u8fc7 <code>RewardLoopWorker</code> \u8ba1\u7b97 batch \u7684 reward</li> <li>\u8ba1\u7b97 advantage\uff08GAE \u6216 GRPO\uff09</li> <li>\u6267\u884c PPO Actor + Critic \u66f4\u65b0</li> <li>\u6bcf <code>trigger_parameter_sync_step</code> \u6b65\u89e6\u53d1\u6743\u91cd\u540c\u6b65</li> </ol>"},{"location":"components/async-training/#worker","title":"Worker 
\u521d\u59cb\u5316","text":"<p>AsyncTrainer \u5728 <code>init_workers()</code> \u4e2d\u521b\u5efa Actor\u3001Critic\u3001RefPolicy \u7684 worker group\uff0c\u5e76\u5c06\u5b83\u4eec\u90e8\u7f72\u5230 Training GPU Pool\uff1a</p> <pre><code># \u521b\u5efa\u987a\u5e8f\uff1aCritic \u2192 RefPolicy \u2192 Actor\uff08\u6700\u540e\u521b\u5efa Actor \u4ee5\u514d\u5f71\u54cd vLLM \u5185\u5b58\u4f30\u7b97\uff09\nself.critic_wg.init_model()\nself.ref_policy_wg.init_model()\nself.actor_wg.init_model()\n</code></pre>"},{"location":"components/async-training/#asyncrollouter","title":"AsyncRollouter","text":"<p><code>AsyncRollouter</code> \u8fd0\u884c\u5728 Rollout GPU Pool \u4e0a\uff0c\u6301\u6709\uff1a</p> <ul> <li>DataLoader\uff1a\u904d\u5386\u8bad\u7ec3\u6570\u636e\u96c6</li> <li>vLLM replicas\uff1a\u9ad8\u541e\u5410\u63a8\u7406\u670d\u52a1\u5668</li> <li>AgentFlowManager\uff1a\u7ba1\u7406 <code>AgentFlowBase</code> worker</li> <li>Gateway\uff1aFastAPI HTTP \u670d\u52a1\u5668\uff08\u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff09</li> <li>RewardLoopWorker\uff1a\u5728 rollout \u671f\u95f4\u8ba1\u7b97 reward</li> </ul>"},{"location":"components/async-training/#gateway","title":"Gateway \u542f\u52a8\u6d41\u7a0b","text":"<p>Rollouter \u5c06 Gateway \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff1a</p> <ol> <li>\u5feb\u901f\u521d\u59cb\u5316\uff08Ray \u8fde\u63a5\u3001DataPool\u3001vLLM \u5730\u5740\uff09\u2192 HTTP \u7acb\u5373\u53ef\u7528</li> <li>Tokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d</li> <li>Rollouter \u8f6e\u8be2 <code>GET /ready</code> \u7b49\u5f85 Gateway \u5b8c\u5168\u5c31\u7eea</li> <li>\u8d85\u65f6\u65f6\u95f4\u53ef\u901a\u8fc7 <code>trainer.gateway_startup_timeout</code> \u914d\u7f6e\uff08\u9ed8\u8ba4 300 \u79d2\uff09</li> </ol>"},{"location":"components/async-training/#_2","title":"\u6682\u505c/\u6062\u590d\uff08\u6743\u91cd\u540c\u6b65\uff09","text":"<p>\u6743\u91cd\u540c\u6b65\u671f\u95f4\uff0cRollouter \u6682\u505c\u751f\u6210\uff1a</p> 
<pre><code>rollouter.pause()                          # \u505c\u6b62\u65b0\u751f\u6210\uff0c\u7b49\u5f85\u8fdb\u884c\u4e2d\u7684\u8bf7\u6c42\u5b8c\u6210\n# NCCL broadcast: Actor weights \u2192 vLLM\nrollouter.update_param_version(new_version)\nrollouter.resume()                         # \u4f7f\u7528\u66f4\u65b0\u540e\u7684\u6743\u91cd\u6062\u590d\u751f\u6210\n</code></pre>"},{"location":"components/async-training/#parametersynchronizer","title":"ParameterSynchronizer","text":"<p>\u8f7b\u91cf\u7ea7 Ray Actor\uff0c\u534f\u8c03 AsyncTrainer \u548c AsyncRollouter \u4e4b\u95f4\u7684\u6743\u91cd\u540c\u6b65\uff1a</p> <pre><code>class ParameterSynchronizer:\n    def sync_weights(self, version, validate=False):\n        # 1. \u6682\u505c rollout\n        # 2. NCCL broadcast: trainer Actor \u2192 vLLM\n        # 3. \u66f4\u65b0 rollouter \u7684 param_version\n        # 4. \u53ef\u9009\uff1a\u8fd0\u884c\u9a8c\u8bc1\n        # 5. \u6062\u590d rollout\n</code></pre>"},{"location":"components/async-training/#advantage","title":"Advantage \u8ba1\u7b97","text":""},{"location":"components/async-training/#gae-generalized-advantage-estimation","title":"GAE (Generalized Advantage Estimation)","text":"<p>\u7528\u4e8e trajectory \u7ea7\u522b\u7684 value baseline\u3002\u5728 step \u7ea7\u522b \u8ba1\u7b97 advantage\uff0c\u7136\u540e\u5e7f\u64ad\u5230 token \u7ea7\u522b\uff08\u540c\u4e00 step \u5185\u6240\u6709 response token \u5171\u4eab\u76f8\u540c\u7684 advantage\uff09\u3002</p>"},{"location":"components/async-training/#grpo-group-relative-policy-optimization","title":"GRPO (Group Relative Policy Optimization)","text":"<p>\u7528\u4e8e prompt \u7ea7\u522b\u7684 baseline\u3002\u5c06\u6765\u81ea\u540c\u4e00 <code>prompt_uid</code> \u7684\u591a\u4e2a rollout \u5206\u7ec4\uff0c\u5728\u7ec4\u5185\u5f52\u4e00\u5316 advantage\u3002\u4e0d\u9700\u8981\u5355\u72ec\u7684 Critic 
\u6a21\u578b\uff0c\u66f4\u8282\u7701\u5185\u5b58\u3002</p>"},{"location":"components/async-training/#_3","title":"\u8d44\u6e90\u6c60\u914d\u7f6e","text":"<p>Trainer \u548c Rollouter \u8fd0\u884c\u5728\u72ec\u7acb\u7684 GPU \u6c60\u4e0a\uff0c\u9632\u6b62\u8d44\u6e90\u7ade\u4e89\uff1a</p> <pre><code># async_ppo_trainer.yaml\n\n# Training GPU Pool (Actor, Critic, RefPolicy)\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2\n\n# Rollout GPU Pool (vLLM)\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1\n</code></pre> <p>\u603b GPU \u6570 = <code>trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node</code>\u3002</p> <p>GPU \u5206\u914d</p> <p>\u5fc5\u987b\u540c\u65f6\u4e3a trainer \u548c rollout \u914d\u7f6e GPU\u3002\u5982\u679c trainer \u6ca1\u6709\u5206\u914d GPU\uff0c\u8bad\u7ec3\u53c2\u6570\uff08Actor\u3001Critic\uff09\u5c06\u65e0\u6cd5\u90e8\u7f72\u5230 GPU \u4e0a\u3002</p>"},{"location":"components/async-training/#_4","title":"\u5173\u952e\u914d\u7f6e","text":"<pre><code># async_ppo_trainer.yaml\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\n  require_batches: 1                 # \u6bcf\u6b21\u4ece DataPool \u53d6\u591a\u5c11\u4e2a batch\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad rollout\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n</code></pre>"},{"location":"components/async-training/#_5","title":"\u5165\u53e3","text":"<pre><code>python3 -m claw_r1.async_main \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    ...\n</code></pre> 
<p>\u5b8c\u6574\u793a\u4f8b\u89c1 <code>example/test_async_blackbox.sh</code>\u3002</p>"},{"location":"components/blackbox-agent/","title":"Black-box Agent","text":"<p>Black-box Agent \u7cfb\u7edf\u5141\u8bb8\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u63a5\u5165 Claw-R1 \u7684\u8bad\u7ec3\u5faa\u73af\uff0c\u65e0\u9700\u4fee\u6539 Agent \u5185\u90e8\u903b\u8f91\u3002Agent \u53ea\u9700\u5c06 <code>base_url</code> \u6307\u5411 Gateway\uff0c\u5373\u53ef\u900f\u660e\u5730\u6536\u96c6\u8bad\u7ec3\u6570\u636e\u3002</p>"},{"location":"components/blackbox-agent/#_1","title":"\u67b6\u6784\u6982\u89c8","text":"<pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  BlackBoxAgentFlowBase (\u8bad\u7ec3\u4fa7\u7f16\u6392)                           \u2502\n\u2502                                                               \u2502\n\u2502  1. POST /init_trajectory          \u2192 \u83b7\u53d6 base_url            \u2502\n\u2502  2. POST {base_url}/v1/register_trajectory \u2192 \u6ce8\u518c metadata    \u2502\n\u2502  3. 
\u8c03\u7528 _run_agent(base_url, kwargs)                         \u2502\n\u2502     \u2502                                                         \u2502\n\u2502     \u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510                    \u2502\n\u2502     \u2514\u2500\u2500\u2502  \u5177\u4f53 Agent (\u5982 GSM8KAgent)      \u2502                    \u2502\n\u2502        \u2502  \u53ea\u77e5\u9053 base_url\uff0c\u4f7f\u7528 OpenAI API \u2502                    \u2502\n\u2502        \u2502  POST {base_url}/v1/chat/completions (\u591a\u8f6e)          \u2502\n\u2502        \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518                    \u2502\n\u2502  4. POST {base_url}/v1/complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"components/blackbox-agent/#_2","title":"\u6838\u5fc3\u8bbe\u8ba1","text":""},{"location":"components/blackbox-agent/#_3","title":"\u5173\u6ce8\u70b9\u5206\u79bb","text":"<ul> <li>BlackBoxAgentFlowBase\uff1a\u5904\u7406\u4e0e Gateway \u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u662f\u8bad\u7ec3\u4fa7\u7684\u7f16\u6392\u5c42\u3002</li> <li>\u5177\u4f53 Agent\uff08\u5982 <code>GSM8KAgent</code>\uff09\uff1a\u53ea\u63a5\u6536 <code>base_url</code> 
\u548c\u4efb\u52a1\u53c2\u6570\uff0c\u4f7f\u7528\u6807\u51c6 OpenAI API \u5b8c\u6210\u4efb\u52a1\u3002Agent \u5b8c\u5168\u4e0d\u77e5\u9053\u8bad\u7ec3\u7cfb\u7edf\u7684\u5b58\u5728\u3002</li> </ul> <p>\u8fd9\u79cd\u5206\u79bb\u4f7f\u5f97\uff1a</p> <ul> <li>\u540c\u4e00\u4e2a Agent \u53ef\u4ee5\u5728\u8bad\u7ec3\u6a21\u5f0f\u548c\u72ec\u7acb\u670d\u52a1\u6a21\u5f0f\u4e0b\u590d\u7528</li> <li>\u65b0\u589e\u4efb\u52a1\u53ea\u9700\u5b9e\u73b0 Agent + \u5bf9\u5e94\u7684 Flow \u5b50\u7c7b</li> <li>Agent \u53ef\u4ee5\u7528\u4efb\u4f55\u8bed\u8a00/\u6846\u67b6\u5b9e\u73b0\uff0c\u53ea\u8981\u652f\u6301 OpenAI API</li> </ul>"},{"location":"components/blackbox-agent/#_4","title":"\u6ce8\u518c\u673a\u5236","text":"<p>Agent Flow \u901a\u8fc7 <code>@register(\"name\")</code> \u88c5\u9970\u5668\u6ce8\u518c\uff0c\u5e76\u5728 YAML \u914d\u7f6e\u4e2d\u5f15\u7528\uff1a</p> <pre><code># agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n</code></pre>"},{"location":"components/blackbox-agent/#_5","title":"\u7c7b\u5c42\u6b21","text":"<pre><code>AgentFlowBase                         (agent_flow/agent_flow.py)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase          (blackbox_agent/blackbox_agent_flow.py)\n            \u2502\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow (blackbox_agent/gsm8k_agent_flow.py)\n</code></pre>"},{"location":"components/blackbox-agent/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"<p>\u6240\u6709\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\uff0c\u5b9e\u73b0\u4e86\u5b8c\u6574\u7684 Gateway \u534f\u8bae\uff1a</p> <pre><code>class BlackBoxAgentFlowBase(AgentFlowBase):\n\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        # 1. \u63d0\u53d6 channel\u3001prompt_uid\u3001metadata\n        channel, prompt_uid, metadata = self._prepare_params(kwargs)\n\n        # 2. 
init_trajectory \u2192 \u83b7\u53d6 base_url\n        init_resp = await http.post(f\"{self.gateway_url}/init_trajectory\")\n        base_url = ...\n\n        # 3. register_trajectory \u2192 \u6ce8\u518c channel \u548c metadata\n        await http.post(f\"{base_url}/v1/register_trajectory\", json={...})\n\n        # 4. \u8c03\u7528\u5b50\u7c7b\u5b9e\u73b0\u7684 _run_agent\n        num_turns = await self._run_agent(base_url, kwargs)\n\n        # 5. complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210\n        await http.post(f\"{base_url}/v1/complete_trajectory\")\n\n        return num_turns\n\n    @abstractmethod\n    async def _run_agent(self, base_url: str, kwargs: dict) -&gt; int:\n        \"\"\"\u5b50\u7c7b\u5b9e\u73b0\uff1a\u521b\u5efa\u5e76\u8fd0\u884c\u5177\u4f53 Agent\u3002\"\"\"\n        ...\n</code></pre> <p>\u5b50\u7c7b\u53ea\u9700\u5b9e\u73b0 <code>_run_agent</code>\uff1a\u4ece <code>kwargs</code> \u4e2d\u63d0\u53d6\u4efb\u52a1\u53c2\u6570\uff0c\u521b\u5efa Agent \u5b9e\u4f8b\uff0c\u8c03\u7528 Agent \u7684\u6267\u884c\u65b9\u6cd5\u3002</p>"},{"location":"components/blackbox-agent/#blackboxgsm8kagentflow","title":"BlackBoxGSM8KAgentFlow","text":"<p>GSM8K \u6570\u5b66\u9898\u7684\u5177\u4f53\u5b9e\u73b0\uff1a</p> <pre><code>@register(\"blackbox_gsm8k_agent\")\nclass BlackBoxGSM8KAgentFlow(BlackBoxAgentFlowBase):\n\n    async def _run_agent(self, base_url: str, kwargs: dict) -&gt; int:\n        from claw_r1.blackbox_agent.gsm8k_agent import GSM8KAgent\n\n        question = ...   # \u4ece kwargs[\"raw_prompt\"] \u63d0\u53d6\n        ground_truth = ...  
# \u4ece kwargs[\"reward_model\"] \u63d0\u53d6\n        max_turns = self.config.actor_rollout_ref.rollout.get(\"max_turns\", 3)\n\n        agent = GSM8KAgent(base_url=base_url)\n        return await agent.solve(\n            question=question,\n            ground_truth=ground_truth,\n            max_turns=max_turns,\n        )\n</code></pre>"},{"location":"components/blackbox-agent/#gsm8kagent","title":"GSM8KAgent","text":"<p>\u4e00\u4e2a\u8bad\u7ec3\u65e0\u5173\u7684 Agent\uff0c\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u89e3\u51b3 GSM8K \u6570\u5b66\u9898\uff1a</p> <ul> <li>\u63a5\u6536 <code>base_url</code>\uff08\u6307\u5411 Gateway\uff09\u548c\u4efb\u52a1\u53c2\u6570</li> <li>\u4f7f\u7528 tool calling\uff08<code>check_answer</code> \u5de5\u5177\uff09\u8fdb\u884c\u591a\u8f6e\u63a8\u7406</li> <li>\u652f\u6301 Qwen \u98ce\u683c\u7684 tool call \u89e3\u6790\uff08<code>\u273fFUNCTION\u273f</code> \u683c\u5f0f\uff09</li> <li>\u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570</li> </ul> <pre><code>agent = GSM8KAgent(base_url=\"http://gateway:8100/traj123/1\")\nnum_turns = await agent.solve(\n    question=\"What is 15 * 23?\",\n    ground_truth=\"345\",\n    max_turns=3,\n)\n</code></pre>"},{"location":"components/blackbox-agent/#agent","title":"\u6dfb\u52a0\u65b0\u7684\u9ed1\u76d2 Agent","text":"<ol> <li>\u5b9e\u73b0 Agent \u7c7b\uff08\u8bad\u7ec3\u65e0\u5173\uff09\uff1a</li> </ol> <pre><code># claw_r1/blackbox_agent/my_agent.py\nclass MyAgent:\n    def __init__(self, base_url: str):\n        self.client = AsyncOpenAI(base_url=base_url, api_key=\"x\")\n\n    async def solve(self, task: str, **kwargs) -&gt; int:\n        # \u4f7f\u7528 self.client \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd\n        # \u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570\n        ...\n</code></pre> <ol> <li>\u5b9e\u73b0 Flow \u5b50\u7c7b\uff1a</li> </ol> <pre><code># claw_r1/blackbox_agent/my_agent_flow.py\nfrom claw_r1.agent_flow.agent_flow import register\nfrom 
claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"blackbox_my_agent\")\nclass BlackBoxMyAgentFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url, kwargs):\n        from claw_r1.blackbox_agent.my_agent import MyAgent\n        task = kwargs.get(\"raw_prompt\", \"\")\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=task)\n</code></pre> <ol> <li>\u6ce8\u518c\u5230\u914d\u7f6e\uff1a</li> </ol> <pre><code># agent_flow_config.yaml\n- name: blackbox_my_agent\n  _target_: claw_r1.blackbox_agent.my_agent_flow.BlackBoxMyAgentFlow\n</code></pre> <ol> <li>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u4f7f\u7528\uff1a</li> </ol> <pre><code>python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_my_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n</code></pre>"},{"location":"components/blackbox-agent/#_6","title":"\u6587\u4ef6\u7ed3\u6784","text":"<pre><code>claw_r1/blackbox_agent/\n\u251c\u2500\u2500 blackbox_agent_flow.py      # BlackBoxAgentFlowBase \u57fa\u7c7b\n\u251c\u2500\u2500 gsm8k_agent_flow.py         # GSM8K Flow \u5b50\u7c7b\n\u251c\u2500\u2500 gsm8k_agent.py              # GSM8K Agent\uff08\u8bad\u7ec3\u65e0\u5173\uff09\n\u2514\u2500\u2500 agent_flow_config.yaml      # Agent Flow \u6ce8\u518c\u914d\u7f6e\n</code></pre>"},{"location":"components/datapool/","title":"DataPool","text":"<p>DataPool \u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u6838\u5fc3 \u2014 \u4e00\u4e2a Ray Actor\uff0c\u627f\u62c5\u7740 Agent \u4ea4\u4e92\u6570\u636e\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u3001\u8d28\u91cf\u8ffd\u8e2a\u3001\u5206\u533a\u7ba1\u7406\u548c\u6309\u9700\u4f9b\u7ed9\u3002\u5b83\u4e0d\u4ec5\u662f Agent \u4fa7\u4e0e Training 
\u4fa7\u4e4b\u95f4\u7684\u7f13\u51b2\u533a\uff0c\u66f4\u662f\u6574\u4e2a\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u7684\u4e2d\u67a2\u3002</p>"},{"location":"components/datapool/#_1","title":"\u5728\u67b6\u6784\u4e2d\u7684\u89d2\u8272","text":"<pre><code>Gateway \u2500\u2500\u25ba DataPool.submit_steps()     (\u6570\u636e\u91c7\u96c6\uff1a\u5f02\u6b65\u5199\u5165)\nTrainer \u25c4\u2500\u2500 DataPool.fetch_batch()      (\u6570\u636e\u4f9b\u7ed9\uff1a\u963b\u585e\u62c9\u53d6\u5c31\u7eea\u7ec4)\n            DataPool.get_statistics()   (\u6570\u636e\u76d1\u63a7\uff1a\u5b9e\u65f6\u7edf\u8ba1)\n</code></pre> <p>DataPool \u5b8c\u5168\u89e3\u8026\u4e86\u6570\u636e\u91c7\u96c6\u901f\u5ea6\uff08\u7531 Agent \u8bf7\u6c42\u9891\u7387\u9a71\u52a8\uff09\u548c\u6570\u636e\u6d88\u8d39\u901f\u5ea6\uff08\u7531\u8bad\u7ec3\u541e\u5410\u91cf\u9a71\u52a8\uff09\u3002\u53cc\u65b9\u4e92\u4e0d\u7b49\u5f85\u3002</p>"},{"location":"components/datapool/#channel","title":"Channel \u7cfb\u7edf\uff08\u6570\u636e\u5206\u533a\uff09","text":"<p>DataPool \u901a\u8fc7 channel \u5bf9\u6570\u636e\u8fdb\u884c\u5206\u533a\u7ba1\u7406\u3002\u9ed8\u8ba4 channel \u4e3a <code>\"train\"</code>\uff0c\u9a8c\u8bc1\u6d41\u7a0b\u4f7f\u7528 <code>\"val\"</code> channel \u4ee5\u9694\u79bb\u6570\u636e\u3002</p> <pre><code># \u8bad\u7ec3\u6570\u636e\ndata_pool.submit_step(step, channel=\"train\")\n\n# \u9a8c\u8bc1\u6570\u636e\ndata_pool.submit_step(step, channel=\"val\")\n</code></pre> <p>\u6bcf\u4e2a channel \u62e5\u6709\u72ec\u7acb\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u548c FIFO \u961f\u5217\u3002</p>"},{"location":"components/datapool/#_2","title":"\u6570\u636e\u6a21\u578b","text":"<p>DataPool \u4ee5 step \u7c92\u5ea6 \u5b58\u50a8 trajectory\u3002\u6bcf\u4e2a step \u662f\u4e00\u4e2a <code>(s, a, r)</code> \u5143\u7ec4\uff1a</p> <pre><code>@dataclass\nclass Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n  
  reward:         float       # \u8be5 step \u7684\u5373\u65f6 reward\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u4e2d\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\uff08\u7528\u4e8e GRPO\uff09\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\uff080-indexed\uff09\n    policy_version: int         # \u751f\u6210\u8be5 step \u65f6\u7684\u7b56\u7565\u7248\u672c\n    is_last:        bool        # \u662f\u5426\u4e3a trajectory \u7684\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6570\u636e\u96c6\u5b57\u6bb5\u3001\u6765\u6e90\u4fe1\u606f\u7b49\uff09\n</code></pre>"},{"location":"components/datapool/#_3","title":"\u5185\u90e8\u7d22\u5f15","text":"\u7d22\u5f15 \u7c7b\u578b \u7528\u9014 <code>trajectory_index</code> <code>dict[str, list[int]]</code> <code>trajectory_uid</code> \u2192 step \u7d22\u5f15\u5217\u8868 <code>trajectory_complete</code> <code>dict[str, bool]</code> \u8ffd\u8e2a trajectory \u662f\u5426\u5df2\u6536\u5230 <code>is_last</code> step <code>prompt_groups</code> <code>dict[str, PromptGroup]</code> <code>prompt_uid</code> \u2192 trajectory \u5217\u8868\u548c\u5b8c\u6210\u72b6\u6001"},{"location":"components/datapool/#producer-api","title":"Producer API","text":""},{"location":"components/datapool/#submit_stepstep-step-channeltrain","title":"<code>submit_step(step: Step, channel=\"train\")</code>","text":"<p>\u6dfb\u52a0\u5355\u4e2a step \u5230\u6307\u5b9a channel\u3002\u7531 Gateway \u901a\u8fc7 Ray RPC \u8c03\u7528\u3002</p>"},{"location":"components/datapool/#submit_stepssteps-liststep-channeltrain","title":"<code>submit_steps(steps: list[Step], channel=\"train\")</code>","text":"<p>\u6279\u91cf\u63d0\u4ea4\u591a\u4e2a step\u3002\u6bd4\u5faa\u73af\u8c03\u7528 <code>submit_step</code> 
\u66f4\u9ad8\u6548\u3002</p>"},{"location":"components/datapool/#complete_trajectorytrajectory_uid-rewardnone-channeltrain","title":"<code>complete_trajectory(trajectory_uid, reward=None, channel=\"train\")</code>","text":"<p>\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002\u7528\u4e8e\u9ed1\u76d2\u6a21\u5f0f\uff0cAgent \u901a\u8fc7 Gateway \u7684 <code>v1/complete_trajectory</code> \u7aef\u70b9\u89e6\u53d1\u3002</p>"},{"location":"components/datapool/#consumer-api","title":"Consumer API","text":""},{"location":"components/datapool/#fetch_batchn_rollouts-channeltrain-liststep-none","title":"<code>fetch_batch(n_rollouts, channel=\"train\") \u2192 list[Step] | None</code>","text":"<p>FIFO \u62c9\u53d6\u4e0b\u4e00\u4e2a\u5c31\u7eea\u7684 <code>prompt_uid</code> \u7ec4\u3002\u4e00\u4e2a\u7ec4\u5728\u6240\u6709 trajectory \u90fd\u6536\u5230 <code>is_last</code> step \u540e\u53d8\u4e3a\"\u5c31\u7eea\"\u3002</p> <p>\u5f53\u6ca1\u6709\u5b8c\u6574\u7ec4\u53ef\u7528\u65f6\u8fd4\u56de <code>None</code>\u3002</p> <pre><code># Trainer \u4fa7\nwhile True:\n    batch = await data_pool.fetch_batch.remote(n_rollouts=5)\n    if batch is not None:\n        train_on_batch(batch)\n</code></pre>"},{"location":"components/datapool/#_4","title":"\u5bb9\u91cf\u7ba1\u7406\u4e0e\u80cc\u538b\u63a7\u5236","text":"<p>\u5f53\u8bbe\u7f6e <code>max_queue_size</code> \u65f6\uff0cDataPool \u5728\u961f\u5217\u6ee1\u65f6\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u7684\u5c31\u7eea\u7ec4\uff0c\u9632\u6b62\u6570\u636e\u5806\u79ef\u5bfc\u81f4\u5185\u5b58\u65e0\u9650\u589e\u957f\u3002\u8fd9\u79cd\u80cc\u538b\u673a\u5236\u4e5f\u786e\u4fdd\u4e86\u8bad\u7ec3\u4fa7\u6d88\u8d39\u7684\u6570\u636e\u5c3d\u53ef\u80fd\u65b0\u9c9c\uff1a</p> <pre><code>async_training:\n  max_queue_size: null   # null = \u65e0\u9650\n</code></pre>"},{"location":"components/datapool/#training-backend","title":"Training Backend\uff08\u6570\u636e\u4f9b\u7ed9\u9002\u914d\uff09","text":"<p>DataPool \u901a\u8fc7\u53ef\u63d2\u62d4\u7684 
<code>TrainingBackend</code> \u5c06 <code>list[Step]</code> \u8f6c\u6362\u4e3a\u4efb\u610f\u8bad\u7ec3\u5f15\u64ce\u7684\u539f\u751f\u683c\u5f0f\uff0c\u5b9e\u73b0\u6570\u636e\u7ba1\u7406\u4e0e\u8bad\u7ec3\u6846\u67b6\u7684\u89e3\u8026\uff1a</p> <pre><code>class VerlBackend(TrainingBackend):\n    \"\"\"\u5c06 Step \u5217\u8868\u8f6c\u6362\u4e3a verl DataProto\u3002\"\"\"\n\n    def convert(self, steps: list[Step]) -&gt; DataProto:\n        # prompt_ids: \u5de6\u586b\u5145\u5230 prompt_length\n        # response_ids: \u53f3\u586b\u5145\u5230 response_length\n        # input_ids: [prompt_ids | response_ids]\n        # attention_mask, position_ids, response_mask \u7b49\n        ...\n</code></pre>"},{"location":"components/datapool/#off-policy","title":"Off-policy \u652f\u6301\uff08\u6570\u636e\u65b0\u9c9c\u5ea6\u7ba1\u63a7\uff09","text":"<p>\u6bcf\u4e2a Step \u90fd\u8bb0\u5f55\u4e86\u751f\u6210\u65f6\u7684 <code>policy_version</code>\uff0cDataPool \u548c Trainer \u53ef\u4ee5\u636e\u6b64\u5224\u65ad\u6570\u636e\u7684\u65b0\u9c9c\u5ea6\u3002Trainer \u901a\u8fc7 staleness threshold \u914d\u7f6e\u6765\u5904\u7406\u5386\u53f2\uff08off-policy\uff09\u6570\u636e\uff1a</p> <pre><code>async_training:\n  staleness_threshold: 0.1   # policy_version \u6ede\u540e &gt; threshold \u7684 step \u4e3a off-policy\n</code></pre> <p>Off-policy step \u4ecd\u5305\u542b\u5728 batch \u4e2d\uff0c\u4f46\u5728 loss \u8ba1\u7b97\u65f6\u901a\u8fc7 importance sampling \u8fdb\u884c\u964d\u6743\u3002</p>"},{"location":"components/gateway/","title":"Gateway Server","text":"<p>Gateway Server \u662f\u4e00\u4e2a FastAPI HTTP \u670d\u52a1\uff0c\u4f5c\u4e3a Agent \u4e0e Claw-R1 \u8bad\u7ec3\u57fa\u7840\u8bbe\u65bd\u4e4b\u95f4\u7684\u7f51\u7edc\u5c42\u4ee3\u7406\u3002</p>"},{"location":"components/gateway/#_1","title":"\u8bbe\u8ba1\u539f\u5219","text":"<ul> <li>\u72ec\u7acb\u8fdb\u7a0b\uff1aGateway \u4f5c\u4e3a\u666e\u901a OS \u8fdb\u7a0b\u8fd0\u884c\uff08\u975e Ray 
Actor\uff09\uff0c\u53ef\u4ee5\u72ec\u7acb\u4e8e Ray \u96c6\u7fa4\u91cd\u542f\u3002</li> <li>\u7eaf\u4ee3\u7406\uff1aGateway \u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8d1f\u8d23\u8f6c\u53d1\u8bf7\u6c42\u3001\u6536\u96c6 Step\u3001\u63d0\u4ea4\u5230 DataPool\u3002</li> <li>OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2\u7aef\u70b9\u5b9e\u73b0\u4e0e OpenAI chat completions API \u76f8\u540c\u7684\u63a5\u53e3\uff0c\u53ef\u4f5c\u4e3a drop-in \u66ff\u6362\u3002</li> <li>\u5ef6\u8fdf\u521d\u59cb\u5316\uff1a\u542f\u52a8\u65f6\u5148\u5feb\u901f\u521d\u59cb\u5316 Ray \u8fde\u63a5\u548c\u914d\u7f6e\uff0cHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff1btokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d\uff0c\u901a\u8fc7 <code>/ready</code> \u7aef\u70b9\u62a5\u544a\u5c31\u7eea\u72b6\u6001\u3002</li> </ul>"},{"location":"components/gateway/#_2","title":"\u542f\u52a8\u65b9\u5f0f","text":"<p>Gateway \u901a\u5e38\u7531 <code>AsyncRollouter</code> \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u81ea\u52a8\u542f\u52a8\u3002\u4e5f\u53ef\u624b\u52a8\u542f\u52a8\uff1a</p> <pre><code>python -m claw_r1.gateway.gateway \\\n    --data-pool-name  data_pool \\\n    --vllm-addresses  http://host1:8001,http://host2:8001 \\\n    --tokenizer-path  /path/to/model \\\n    --prompt-length   4096 \\\n    --response-length 1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address     auto \\\n    --ray-namespace   default \\\n    --host            0.0.0.0 \\\n    --port            8100\n</code></pre>"},{"location":"components/gateway/#_3","title":"\u53c2\u6570","text":"\u53c2\u6570 \u5fc5\u586b \u8bf4\u660e <code>--data-pool-name</code> \u662f DataPool \u7684 Ray Actor \u540d\u79f0 <code>--vllm-addresses</code> \u662f \u9017\u53f7\u5206\u9694\u7684 vLLM \u670d\u52a1\u5668\u5730\u5740\u5217\u8868\uff08\u8f6e\u8be2\u8d1f\u8f7d\u5747\u8861\uff09 <code>--tokenizer-path</code> \u662f HuggingFace tokenizer \u8def\u5f84 <code>--prompt-length</code> \u662f \u6700\u5927 prompt 
token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 <code>--response-length</code> \u662f \u6700\u5927 response token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 <code>--reward-worker-name</code> \u5426 RewardLoopWorker \u7684 Ray Actor \u540d\u79f0 <code>--ray-address</code> \u5426 Ray GCS \u5730\u5740\uff08\u9ed8\u8ba4 <code>auto</code>\uff09 <code>--ray-namespace</code> \u5426 Ray namespace <code>--host</code> \u5426 \u76d1\u542c\u5730\u5740\uff08\u9ed8\u8ba4 <code>0.0.0.0</code>\uff09 <code>--port</code> \u5426 \u76d1\u542c\u7aef\u53e3\uff08\u9ed8\u8ba4 <code>8100</code>\uff09"},{"location":"components/gateway/#_4","title":"\u4e24\u79cd\u5de5\u4f5c\u6a21\u5f0f","text":""},{"location":"components/gateway/#white-box","title":"White-box \u6a21\u5f0f","text":"<p>\u767d\u76d2 Agent\uff08<code>AgentFlowBase</code> \u5b50\u7c7b\uff09\u901a\u8fc7 Gateway \u6839\u8def\u5f84\u7aef\u70b9\u4ea4\u4e92\uff1a</p> <pre><code>AgentFlow \u2192 POST /generate        \u2192 vLLM \u2192 \u8fd4\u56de token IDs\nAgentFlow \u2192 POST /submit_steps    \u2192 DataPool\nAgentFlow \u2192 POST /compute_reward  \u2192 RewardLoopWorker\n</code></pre> <p>Agent \u81ea\u5df1\u7ba1\u7406 tokenize\u3001Step \u6784\u5efa\u548c\u63d0\u4ea4\u3002</p>"},{"location":"components/gateway/#black-box","title":"Black-box \u6a21\u5f0f","text":"<p>\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u4e00\u4e2a <code>base_url</code>\uff0c\u901a\u8fc7\u6807\u51c6 OpenAI \u63a5\u53e3\u4ea4\u4e92\uff1a</p> <pre><code>1. BlackBoxAgentFlow \u2192 POST /init_trajectory           \u2192 \u83b7\u53d6 base_url\n2. BlackBoxAgentFlow \u2192 POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent             \u2192 POST {base_url}/v1/chat/completions     \u2192 \u6807\u51c6 OpenAI \u8c03\u7528\uff08\u53ef\u591a\u8f6e\uff09\n4. 
BlackBoxAgentFlow \u2192 POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n</code></pre> <p>Gateway \u5728 <code>v1/chat/completions</code> \u5185\u90e8\u81ea\u52a8\u5b8c\u6210 tokenize\u3001Step \u6784\u5efa\u548c DataPool \u63d0\u4ea4\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002</p>"},{"location":"components/gateway/#base_url","title":"base_url \u673a\u5236","text":"<p><code>base_url</code> \u7684\u683c\u5f0f\u4e3a\uff1a</p> <pre><code>http://&lt;host&gt;:&lt;port&gt;/&lt;trajectory_uid&gt;/&lt;prompt_uid&gt;\n</code></pre> <p><code>trajectory_uid</code> \u548c <code>prompt_uid</code> \u7f16\u7801\u5728 URL path \u4e2d\uff0c\u4f7f\u5f97 Gateway \u80fd\u5c06\u8bf7\u6c42\u5173\u8054\u5230\u6b63\u786e\u7684 trajectory\uff0c\u800c Agent \u7aef\u53ea\u9700\u4fee\u6539 <code>base_url</code> \u5373\u53ef\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002</p> <pre><code>from openai import OpenAI\n\nclient = OpenAI(\n    base_url=\"http://gateway:8100/abc123/1\",  # base_url \u7531 init_trajectory \u8fd4\u56de\n    api_key=\"not-needed\",\n)\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\n)\n</code></pre>"},{"location":"components/gateway/#_5","title":"\u5185\u90e8\u72b6\u6001\u7ba1\u7406","text":"<p>Gateway \u4e3a\u6bcf\u6761 trajectory \u7ef4\u62a4\u4ee5\u4e0b\u72b6\u6001\uff1a</p> \u72b6\u6001 \u8bf4\u660e <code>_trajectory_step_counter</code> \u6bcf\u6761 trajectory \u7684\u4e0b\u4e00\u4e2a step_index <code>_trajectory_channel</code> trajectory \u5bf9\u5e94\u7684 DataPool channel\uff08\u9ed8\u8ba4 <code>\"train\"</code>\uff09 <code>_trajectory_metadata</code> trajectory \u5173\u8054\u7684 metadata\uff08\u5982 reward_model\u3001data_source \u7b49\uff09 <p>\u8fd9\u4e9b\u72b6\u6001\u5728 <code>register_trajectory</code> \u65f6\u8bbe\u7f6e\uff0c\u5728 <code>complete_trajectory</code> 
\u65f6\u6e05\u7406\u3002</p>"},{"location":"components/gateway/#_6","title":"\u8d1f\u8f7d\u5747\u8861","text":"<p>\u5f53\u63d0\u4f9b\u591a\u4e2a <code>--vllm-addresses</code> \u65f6\uff0cGateway \u4f7f\u7528 round-robin \u8f6e\u8be2\u5206\u53d1\u8bf7\u6c42\uff1a</p> <pre><code>_vllm_cycle = itertools.cycle(vllm_addresses)\nvllm_url = next(_vllm_cycle)\n</code></pre>"},{"location":"components/gateway/#api","title":"API \u53c2\u8003","text":"<p>\u5b8c\u6574\u7684\u7aef\u70b9\u6587\u6863\u89c1 Gateway API\u3002</p>"},{"location":"components/reward-system/","title":"Reward System","text":"<p>The <code>RewardLoopWorker</code> is a Ray Actor responsible for assigning reward scores to trajectory steps. It bridges the gap between raw agent interactions and trainable reward signals.</p>"},{"location":"components/reward-system/#three-reward-sources","title":"Three Reward Sources","text":"<p>Claw-R1 supports three types of reward computation, which can be combined:</p> Type Description Best For Rule-based Deterministic function of step output Verifiable tasks (math, code execution) Discriminative RM Binary classifier reward model Preference learning, safety evaluation Generative RM LLM-based evaluator via custom scoring function Complex quality assessment, nuanced feedback"},{"location":"components/reward-system/#reward-in-production-vs-research-settings","title":"Reward in Production vs. Research Settings","text":"<p>In research settings (white-box offline mode), rewards are computed from known ground truth:</p> <pre><code>Trajectory:   [user msg] \u2192 [agent think] \u2192 [tool call] \u2192 [tool result] \u2192 [final reply]\nReward:            0.0            0.3            0.7            0.9            0.8\n</code></pre> <ul> <li>Rule-based: is the final answer correct? does the code pass tests?</li> <li>Model-based: is each step logically sound? 
is the tool use appropriate?</li> </ul> <p>In production settings (online mode), rewards come from real user signals:</p> Signal Type Interpretation User sends follow-up Implicit positive Agent answer was relevant but incomplete User corrects the agent Negative feedback Factual or task error User says \"thanks\" Positive signal Task completed satisfactorily No follow-up after task Neutral / estimated Reward Model estimates step quality <p>Claw-R1 uses a Reward Model to convert these soft signals into scalar process rewards, filling the gap between verifiable task rewards and open-ended conversational rewards.</p>"},{"location":"components/reward-system/#rewardloopworker-api","title":"RewardLoopWorker API","text":""},{"location":"components/reward-system/#compute_score_batchsteps-liststep-listfloat","title":"<code>compute_score_batch(steps: list[Step]) \u2192 list[float]</code>","text":"<p>Computes rewards for a batch of steps. This is the primary interface used by the Trainer.</p> <pre><code># In AsyncTrainer\nrewards = await reward_worker.compute_score_batch.remote(batch_steps)\nfor step, reward in zip(batch_steps, rewards):\n    step.reward = reward\n</code></pre>"},{"location":"components/reward-system/#custom-reward-function","title":"Custom Reward Function","text":"<p>Register a custom generative reward model by implementing the <code>reward_loop_manager</code> interface:</p> <pre><code># custom_reward.py\ndef compute_reward(step: dict, model, tokenizer) -&gt; float:\n    \"\"\"\n    Args:\n        step: dict with keys 'messages', 'response', 'metadata'\n        model: loaded reward model\n        tokenizer: model tokenizer\n    Returns:\n        scalar reward in [0.0, 1.0]\n    \"\"\"\n    prompt = build_evaluation_prompt(step)\n    score = model.score(prompt)\n    return score\n</code></pre> <p>Then register it in the configuration:</p> <pre><code>reward:\n  type: genrm\n  reward_loop_manager: path.to.custom_reward.compute_reward\n  model_path: 
/path/to/reward/model\n</code></pre>"},{"location":"components/reward-system/#reward-in-the-training-loop","title":"Reward in the Training Loop","text":"<p>Reward computation is decoupled from the agent service:</p> <ol> <li>The Gateway does not compute rewards before submitting steps to DataPool</li> <li>DataPool stores steps with <code>reward=0.0</code> initially</li> <li>The Trainer calls <code>RewardLoopWorker.compute_score_batch()</code> before the PPO update</li> <li>Updated rewards are used for advantage computation</li> </ol> <p>This ensures that even slow generative reward models (which may call an external LLM) do not affect agent service latency.</p> <p>Reward Design</p> <p>For new tasks, start with simple rule-based rewards (e.g., exact match, code execution pass rate). Generative reward models are more expressive but introduce variance and computational cost. Use discriminative models as a middle ground.</p>"},{"location":"concepts/","title":"Core Concepts","text":"<p>Claw-R1 \u7684\u8bbe\u8ba1\u56f4\u7ed5\u4e09\u4e2a\u6838\u5fc3\u6982\u5ff5\u5c55\u5f00\uff1a\u901a\u7528\u6570\u636e\u91c7\u96c6\u3001\u6570\u636e\u4e2d\u95f4\u4ef6\u7ba1\u7406\u548c\u6570\u636e\u9a71\u52a8\u7684\u6301\u7eed\u8fdb\u5316\u3002\u5b83\u4eec\u5171\u540c\u6784\u6210\u4e00\u4e2a\u4ece\u91c7\u96c6\u5230\u8bad\u7ec3\u7684\u6570\u636e\u98de\u8f6e\u3002</p> <ul> <li> <p>Base URL Integration \u00b7 \u901a\u7528\u6570\u636e\u91c7\u96c6</p> <p>\u96f6\u4ee3\u7801\u4fb5\u5165\u7684 Agent \u6570\u636e\u91c7\u96c6\u673a\u5236\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u53ea\u9700\u4fee\u6539 <code>base_url</code>\uff0cGateway \u5373\u53ef\u81ea\u52a8\u91c7\u96c6\u5176\u4ea4\u4e92\u6570\u636e\u3002</p> <p> Base URL Integration</p> </li> <li> <p>Middleware Layer \u00b7 \u6570\u636e\u4e2d\u95f4\u4ef6</p> <p>Gateway + DataPool 
\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u5165\u53e3\u3001\u8d28\u91cf\u7ba1\u7406\u3001\u5206\u533a\u7f13\u51b2\u548c\u6309\u9700\u4f9b\u7ed9\u3002</p> <p> Middleware Layer</p> </li> <li> <p>Production Scenario \u00b7 \u6570\u636e\u9a71\u52a8\u8fdb\u5316</p> <p>\"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent \u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316\u3002</p> <p> Production Scenario</p> </li> </ul>"},{"location":"concepts/#_1","title":"\u6570\u636e\u98de\u8f6e","text":"<pre><code>                    base_url\n                 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                 \u2502 \u4efb\u610f Agent  \u2502\n                 \u2502 (\u767d\u76d2/\u9ed1\u76d2) \u2502\n                 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                        \u2502 OpenAI API\n                        \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Gateway       \u2502 \u2190 \u6570\u636e\u91c7\u96c6\u5165\u53e3\n              \u2502  (\u81ea\u52a8\u91c7\u96c6 Step)  \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    DataPool      \u2502 \u2190 \u6570\u636e\u7ba1\u7406\u6838\u5fc3\n              \u2502  (\u8bc4\u4f30\u00b7\u7b5b\u9009\u00b7\u4f9b\u7ed9) \u2502    (\u8d28\u91cf\u8bc4\u4f30 + 
\u5206\u533a\u7ba1\u7406)\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Trainer       \u2502 \u2190 \u6570\u636e\u6d88\u8d39\n              \u2502  (\u6301\u7eed\u8bad\u7ec3)       \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502 \u6743\u91cd\u540c\u6b65\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    vLLM          \u2502\n              \u2502  (\u66f4\u597d\u7684\u6a21\u578b)     \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre> <p>\u4e09\u4e2a\u6982\u5ff5\u7684\u534f\u540c\uff1a</p> <ol> <li>Base URL \u8ba9\u4efb\u4f55 Agent \u7684\u4ea4\u4e92\u6570\u636e\u96f6\u6210\u672c\u88ab\u91c7\u96c6</li> <li>Middleware \u7ba1\u7406\u6570\u636e\u7684\u8d28\u91cf\u3001\u5206\u533a\u548c\u4f9b\u7ed9</li> <li>Production Scenario \u8ba9\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u81ea\u7136\u878d\u5165\u6570\u636e\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316</li> </ol>"},{"location":"concepts/base-url-integration/","title":"Base URL Integration","text":""},{"location":"concepts/base-url-integration/#agent-llm","title":"\u95ee\u9898\uff1a\u5982\u4f55\u62e6\u622a\u9ed1\u76d2 Agent \u7684 LLM \u8c03\u7528\uff1f","text":"<p>\u5728 Agentic RL \u4e2d\uff0c\u8bad\u7ec3\u7cfb\u7edf\u9700\u8981\u62e6\u622a Agent \u4e0e LLM \u4e4b\u95f4\u7684\u6bcf\u6b21\u4ea4\u4e92\uff0c\u4ee5\u6536\u96c6 <code>(state, 
action, reward)</code> \u6570\u636e\u3002\u5bf9\u4e8e\u767d\u76d2 Agent\uff08\u6e90\u7801\u53ef\u63a7\uff09\uff0c\u8fd9\u5f88\u7b80\u5355\u3002\u4f46\u5bf9\u4e8e\u9ed1\u76d2 Agent\uff08\u5982\u7b2c\u4e09\u65b9\u670d\u52a1\u3001\u7f16\u8bd1\u540e\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\uff09\uff0c\u5982\u4f55\u5728\u4e0d\u4fee\u6539 Agent \u4ee3\u7801\u7684\u60c5\u51b5\u4e0b\u62e6\u622a\uff1f</p>"},{"location":"concepts/base-url-integration/#_1","title":"\u65b9\u6848\u5bf9\u6bd4","text":"\u65b9\u6848 \u4fb5\u5165\u6027 \u53ef\u9760\u6027 \u9002\u7528\u8303\u56f4 SDK monkey-patch \u4e2d \u4f4e\uff08\u7248\u672c\u66f4\u65b0\u6613\u5931\u6548\uff09 \u4ec5\u9650\u7279\u5b9a SDK \u4ee3\u7406\u5c42\uff08Proxy\uff09 \u9ad8 \u4e2d\uff08\u9700\u914d\u7f6e\u7f51\u7edc\uff09 \u901a\u7528 base_url \u66ff\u6362 \u6781\u4f4e \u9ad8 \u6240\u6709 OpenAI \u517c\u5bb9 SDK"},{"location":"concepts/base-url-integration/#base_url","title":"base_url \u673a\u5236","text":"<p>\u51e0\u4e4e\u6240\u6709 OpenAI \u517c\u5bb9\u7684 SDK \u90fd\u652f\u6301\u81ea\u5b9a\u4e49 <code>base_url</code>\u3002Claw-R1 \u5229\u7528\u8fd9\u4e00\u70b9\uff1a</p> <ol> <li>Gateway \u66b4\u9732 <code>POST {base_url}/v1/chat/completions</code> \u7aef\u70b9</li> <li>Agent \u53ea\u9700\u5c06 <code>base_url</code> \u4ece <code>https://api.openai.com</code> \u6539\u4e3a Gateway \u7684\u5730\u5740</li> <li>Gateway \u900f\u660e\u5730\u8f6c\u53d1\u8bf7\u6c42\u5230 vLLM\uff0c\u540c\u65f6\u81ea\u52a8\u6536\u96c6\u8bad\u7ec3\u6570\u636e</li> </ol> <pre><code>from openai import OpenAI\n\n# \u539f\u59cb\u4ee3\u7801\nclient = OpenAI(base_url=\"https://api.openai.com/v1\")\n\n# \u63a5\u5165 Claw-R1\uff1a\u53ea\u6539\u4e00\u884c\nclient = OpenAI(\n    base_url=\"http://gateway:8100/traj123/prompt1\",\n    api_key=\"not-needed\",\n)\n\n# \u540e\u7eed\u4ee3\u7801\u5b8c\u5168\u4e0d\u53d8\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": 
\"Hello\"}],\n)\n</code></pre>"},{"location":"concepts/base-url-integration/#base_url_1","title":"base_url \u7684\u7ed3\u6784","text":"<pre><code>http://&lt;host&gt;:&lt;port&gt;/&lt;trajectory_uid&gt;/&lt;prompt_uid&gt;\n</code></pre> <ul> <li><code>trajectory_uid</code>\uff1a\u6807\u8bc6\u4e00\u6761\u5b8c\u6574\u7684\u5bf9\u8bdd\u8f68\u8ff9</li> <li><code>prompt_uid</code>\uff1a\u6807\u8bc6\u540c\u4e00 prompt \u7684\u591a\u6b21 rollout\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09</li> </ul> <p>\u8fd9\u4e24\u4e2a ID \u7f16\u7801\u5728 URL path \u4e2d\uff0cGateway \u4ece path \u4e2d\u63d0\u53d6\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002</p>"},{"location":"concepts/base-url-integration/#claw-r1","title":"\u5728 Claw-R1 \u4e2d\u7684\u4f7f\u7528","text":""},{"location":"concepts/base-url-integration/#_2","title":"\u9ed1\u76d2\u79bb\u7ebf\u6a21\u5f0f","text":"<p><code>BlackBoxAgentFlowBase</code> \u81ea\u52a8\u7ba1\u7406 <code>base_url</code> \u7684\u751f\u547d\u5468\u671f\uff1a</p> <pre><code>1. POST /init_trajectory              \u2192 \u83b7\u53d6 base_url\n2. POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent \u4f7f\u7528 base_url \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd     \u2192 Gateway \u81ea\u52a8\u6536\u96c6 Step\n4. 
POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n</code></pre> <p>Agent \u53ea\u9700\u8981\u63a5\u6536 <code>base_url</code> \u53c2\u6570\uff0c\u5176\u4f59\u7531\u8bad\u7ec3\u6846\u67b6\u5904\u7406\u3002</p>"},{"location":"concepts/base-url-integration/#_3","title":"\u9ed1\u76d2\u5728\u7ebf\u6a21\u5f0f","text":"<p>\u5728\u7ebf\u6a21\u5f0f\u4e0b\uff0c\u5916\u90e8\u670d\u52a1\u76f4\u63a5\u8c03\u7528 Gateway \u7684 <code>init_trajectory</code> \u83b7\u53d6 <code>base_url</code>\uff0c\u7136\u540e\u5c06\u5176\u4f20\u9012\u7ed9 Agent\u3002Agent \u7684\u6bcf\u6b21 LLM \u8c03\u7528\u90fd\u81ea\u52a8\u88ab Gateway \u8bb0\u5f55\u3002</p>"},{"location":"concepts/base-url-integration/#sdk-hook","title":"\u4e3a\u4ec0\u4e48\u4f18\u4e8e SDK Hook","text":"\u7ef4\u5ea6 SDK Hook base_url Agent \u4ee3\u7801\u4fee\u6539 \u9700\u8981\u6ce8\u5165 hook \u4ee3\u7801 \u53ea\u6539\u4e00\u4e2a\u53c2\u6570 \u591a\u8bed\u8a00\u652f\u6301 \u6bcf\u79cd\u8bed\u8a00\u9700\u8981\u5355\u72ec\u5b9e\u73b0 \u6240\u6709\u8bed\u8a00\u901a\u7528 \u7248\u672c\u517c\u5bb9\u6027 SDK \u66f4\u65b0\u53ef\u80fd\u7834\u574f hook HTTP \u534f\u8bae\u7a33\u5b9a \u8c03\u8bd5\u96be\u5ea6 Hook \u5c42\u589e\u52a0\u8c03\u8bd5\u590d\u6742\u5ea6 \u6807\u51c6 HTTP \u8bf7\u6c42\uff0c\u6613\u4e8e\u8c03\u8bd5 \u751f\u4ea7\u53ef\u9760\u6027 \u4e2d\u7b49 \u9ad8"},{"location":"concepts/base-url-integration/#sdk","title":"\u652f\u6301\u7684 SDK \u548c\u6846\u67b6","text":"<p>\u4efb\u4f55\u652f\u6301\u81ea\u5b9a\u4e49 <code>base_url</code> \u7684 OpenAI \u517c\u5bb9 SDK \u90fd\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\uff1a</p> <ul> <li>Python: <code>openai</code>, <code>httpx</code>, <code>requests</code></li> <li>JavaScript/TypeScript: <code>openai-node</code></li> <li>Go: <code>go-openai</code></li> <li>\u6846\u67b6: LangChain, LlamaIndex, AutoGen, CrewAI \u7b49</li> </ul>"},{"location":"concepts/middleware-layer/","title":"Middleware 
Layer","text":""},{"location":"concepts/middleware-layer/#_1","title":"\u4e3a\u4ec0\u4e48\u9700\u8981\u6570\u636e\u4e2d\u95f4\u4ef6\uff1f","text":"<p>Agentic RL \u4e2d\uff0cAgent \u4ea7\u751f\u4ea4\u4e92\u6570\u636e\uff0cTrainer \u6d88\u8d39\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\u3002\u7136\u800c\u5728\u5b9e\u9645\u573a\u666f\u4e2d\uff0c\u4e24\u8005\u4e4b\u95f4\u5b58\u5728\u663e\u8457\u7684\u4e0d\u5bf9\u79f0\uff1a</p> <ul> <li>\u6570\u636e\u6765\u6e90\u591a\u6837\uff1a\u767d\u76d2 Agent\u3001\u9ed1\u76d2 Agent\u3001\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u4ea7\u51fa\u7684\u6570\u636e\u683c\u5f0f\u548c\u9891\u7387\u5404\u4e0d\u76f8\u540c</li> <li>\u6570\u636e\u8d28\u91cf\u53c2\u5dee\uff1a\u5e76\u975e\u6240\u6709\u4ea4\u4e92\u90fd\u6709\u8bad\u7ec3\u4ef7\u503c\uff0c\u9700\u8981\u8bc4\u4f30\u548c\u7b5b\u9009</li> <li>\u4ea7\u6d88\u901f\u7387\u4e0d\u5339\u914d\uff1aAgent \u4fa7\u7684\u6570\u636e\u4ea7\u751f\u901f\u7387\u4e0e Trainer \u4fa7\u7684\u6d88\u8d39\u901f\u7387\u5f80\u5f80\u4e0d\u540c\u6b65</li> <li>\u6570\u636e\u9700\u8981\u7ba1\u7406\uff1a\u5206\u533a\u3001\u7d22\u5f15\u3001\u80cc\u538b\u63a7\u5236\u3001\u7edf\u8ba1\u76d1\u63a7 \u2014 \u8fd9\u4e9b\u4e0d\u662f\u7b80\u5355\u7684\u961f\u5217\u80fd\u89e3\u51b3\u7684</li> </ul> <p>Claw-R1 \u901a\u8fc7 Middleware Layer\uff08Gateway + DataPool\uff09\u5728 Agent \u4fa7\u548c Training \u4fa7\u4e4b\u95f4\u5efa\u7acb\u4e00\u5c42\u6570\u636e\u57fa\u7840\u8bbe\u65bd\uff0c\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u3001\u7ba1\u7406\u548c\u4f9b\u7ed9\u95ee\u9898\u3002</p>"},{"location":"concepts/middleware-layer/#gateway-datapool","title":"Gateway + DataPool \u67b6\u6784","text":"<pre><code>Agent \u4fa7                    Middleware                    Training \u4fa7\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502 Agent    \u2502\u2500\u2500HTTP\u2500\u2500\u25ba  \u2502  Gateway         \u2502\u2500\u2500Ray RPC\u2500\u2500\u25ba\u2502  DataPool    \u2502\n\u2502 (\u4efb\u610f)   \u2502\u25c4\u2500\u2500HTTP\u2500\u2500  \u2502  (FastAPI, 8100) \u2502           \u2502  (Ray Actor) \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                                             \u2502 fetch_batch()\n                                                             \u25bc\n                                                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                      \u2502  Trainer     \u2502\n                                                      \u2502  (Ray Actor) \u2502\n                                                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"concepts/middleware-layer/#gateway","title":"Gateway\uff1a\u6570\u636e\u91c7\u96c6\u5165\u53e3","text":"<p>Gateway \u662f\u4e00\u4e2a\u72ec\u7acb\u8fdb\u7a0b\uff08FastAPI\uff09\uff0c\u8d1f\u8d23\u4ece Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff1a</p> <ul> <li>\u7eaf\u4ee3\u7406\uff1a\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u91c7\u96c6\u6570\u636e</li> <li>OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2 Agent \u901a\u8fc7 <code>base_url</code> \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u4ece\u5bf9\u8bdd\u4e2d\u6784\u5efa Step</li> 
<li>\u5ef6\u8fdf\u521d\u59cb\u5316\uff1aHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff0ctokenizer \u5728\u540e\u53f0\u52a0\u8f7d</li> </ul> <p>Gateway \u652f\u6301\u4e24\u79cd\u6570\u636e\u91c7\u96c6\u6a21\u5f0f\uff1a</p> \u6a21\u5f0f \u7aef\u70b9 \u6570\u636e\u91c7\u96c6\u65b9\u5f0f \u767d\u76d2 <code>/generate</code>, <code>/submit_steps</code> Agent \u81ea\u884c\u6784\u5efa Step \u5e76\u63d0\u4ea4 \u9ed1\u76d2 <code>{base_url}/v1/chat/completions</code> Gateway \u81ea\u52a8 tokenize \u5e76\u6784\u5efa Step <p>\u8be6\u89c1 Gateway Server\u3002</p>"},{"location":"concepts/middleware-layer/#datapool","title":"DataPool\uff1a\u6570\u636e\u7ba1\u7406\u6838\u5fc3","text":"<p>DataPool \u662f\u4e00\u4e2a Ray Actor\uff0c\u4e0d\u4ec5\u662f trajectory \u7f13\u51b2\u533a\uff0c\u66f4\u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2\uff1a</p> \u80fd\u529b \u8bf4\u660e \u6570\u636e\u5b58\u50a8 \u4ee5 Step \u7c92\u5ea6\u5b58\u50a8\u4ea4\u4e92\u6570\u636e\uff0c\u652f\u6301\u591a\u7ef4\u7d22\u5f15 \u8d28\u91cf\u8ffd\u8e2a \u6bcf\u4e2a Step \u8bb0\u5f55 <code>policy_version</code>\uff0c\u652f\u6301\u65b0\u9c9c\u5ea6\u68c0\u6d4b Channel \u5206\u533a <code>\"train\"</code> \u548c <code>\"val\"</code> \u6570\u636e\u9694\u79bb\uff0c\u4e92\u4e0d\u5e72\u6270 GRPO \u5206\u7ec4 \u6309 <code>prompt_uid</code> \u5206\u7ec4\uff0c\u51d1\u9f50\u6240\u6709 rollout \u540e\u624d\u4f9b\u7ed9\u8bad\u7ec3 \u5bb9\u91cf\u7ba1\u7406 \u53ef\u914d\u7f6e <code>max_queue_size</code>\uff0c\u8d85\u9650\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u6570\u636e \u7edf\u8ba1\u76d1\u63a7 \u5b9e\u65f6\u63d0\u4f9b\u961f\u5217\u6df1\u5ea6\u3001produce/consume/drop \u901f\u7387\u7b49\u6307\u6807 <p>\u8be6\u89c1 DataPool\u3002</p>"},{"location":"concepts/middleware-layer/#step","title":"Step \u6570\u636e\u6a21\u578b","text":"<p>Step \u662f\u6570\u636e\u7ba1\u7406\u7684\u539f\u5b50\u5355\u4f4d\uff0c\u8bb0\u5f55\u4e86\u4e00\u6b21 Agent \u4ea4\u4e92\u7684\u5b8c\u6574\u4fe1\u606f\uff1a</p> <pre><code>@dataclass\nclass 
Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n    reward:         float       # \u5373\u65f6 reward\uff08\u8d28\u91cf\u8bc4\u5206\uff09\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\n    policy_version: int         # \u751f\u6210\u65f6\u7684\u7b56\u7565\u7248\u672c\uff08\u65b0\u9c9c\u5ea6\u8ffd\u8e2a\uff09\n    is_last:        bool        # \u662f\u5426\u4e3a\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6765\u6e90\u3001\u6570\u636e\u96c6\u5b57\u6bb5\u7b49\uff09\n</code></pre>"},{"location":"concepts/middleware-layer/#reward","title":"Reward \u6807\u6ce8\u4e0e\u6570\u636e\u8d28\u91cf\u8bc4\u4f30","text":"<p>Reward \u8ba1\u7b97\u4e0e Agent \u670d\u52a1\u89e3\u8026\uff0c\u786e\u4fdd\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\u4e0d\u5f71\u54cd Agent \u670d\u52a1\u5ef6\u8fdf\uff1a</p> <ol> <li>Gateway \u91c7\u96c6 Step \u65f6 <code>reward=0.0</code>\uff08\u539f\u59cb\u6570\u636e\uff09</li> <li>DataPool \u5b58\u50a8\u539f\u59cb Step</li> <li>Trainer \u5728\u6d88\u8d39\u6570\u636e\u524d\u901a\u8fc7 <code>RewardLoopWorker</code> \u8bc4\u4f30\u6570\u636e\u8d28\u91cf\uff08\u8ba1\u7b97 reward\uff09</li> <li>\u8bc4\u4f30\u540e\u7684 reward \u7528\u4e8e advantage \u8ba1\u7b97\u548c\u6570\u636e\u7b5b\u9009</li> </ol> <p>\u8fd9\u79cd\u8bbe\u8ba1\u4f7f\u5f97\u5373\u4f7f\u662f\u6162\u901f\u7684 generative reward model \u6216\u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf\u4e5f\u4e0d\u4f1a\u5f71\u54cd Agent \u7684\u6b63\u5e38\u670d\u52a1\u3002</p>"},{"location":"concepts/production-scenario/","title":"Production Agent Scenario","text":""},{"location":"concepts/production-scenario/#agentic-rl","title":"Agentic RL 
\u4e2d\u7684\u9690\u542b\u5047\u8bbe","text":"<p>\u51e0\u4e4e\u6240\u6709 Agentic RL \u6846\u67b6\u90fd\u5efa\u7acb\u5728\u4e00\u4e2a\u9690\u542b\u5047\u8bbe\u4e0a\uff1a</p> <p>\u8bad\u7ec3\u9636\u6bb5 \u2260 \u90e8\u7f72\u9636\u6bb5</p> <p>\u6807\u51c6\u6d41\u7a0b\uff1a\u5728\u79bb\u7ebf/\u6a21\u62df\u6570\u636e\u4e0a\u8bad\u7ec3 \u2192 \u90e8\u7f72\u56fa\u5b9a\u6a21\u578b \u2192 \u5b9a\u671f\u91cd\u8bad\u3002</p> <p>\u8fd9\u5728\u7814\u7a76\u573a\u666f\u4e0b\u53ef\u884c\uff0c\u4f46\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u9047\u5230\u6839\u672c\u6027\u969c\u788d\uff1a</p> \u95ee\u9898 \u8868\u73b0 \u5206\u5e03\u504f\u79fb \u8bad\u7ec3\u6570\u636e\u662f\u5408\u6210\u7684\uff1b\u771f\u5b9e\u7528\u6237\u8bf7\u6c42\u5206\u5e03\u4e0d\u540c \u2192 \u90e8\u7f72\u540e\u80fd\u529b\u9000\u5316 \u51b7\u542f\u52a8 \u65b0\u90e8\u7f72\u7684\u6a21\u578b\u5bf9\u7279\u5b9a\u7528\u6237\u7684\u4e60\u60ef\u3001\u5de5\u5177\u3001\u5de5\u4f5c\u6d41\u4e00\u65e0\u6240\u77e5 \u2192 \u6f2b\u957f\u7684\"\u9884\u70ed\"\u671f \u957f\u5c3e\u4efb\u52a1 Benchmark \u8986\u76d6\u5e38\u89c1\u4efb\u52a1\uff1b\u7528\u6237\u7684\u5c0f\u4f17\u9700\u6c42\u65e0\u6cd5\u88ab\u79bb\u7ebf\u8bad\u7ec3\u8986\u76d6 \u73af\u5883\u6f02\u79fb \u5de5\u5177 API \u66f4\u65b0\u3001\u7528\u6237\u884c\u4e3a\u53d8\u5316 \u2192 \u9759\u6001\u6a21\u578b\u65e0\u6cd5\u81ea\u9002\u5e94"},{"location":"concepts/production-scenario/#claw-r1-agent","title":"Claw-R1 \u7684\u6838\u5fc3\u573a\u666f\uff1a\u4e2a\u4eba Agent \u81ea\u6211\u8fdb\u5316","text":"<p>Claw-R1 \u7684\u9996\u4e2a\u9a8c\u8bc1\u573a\u666f\u662f OpenClaw \u4e2a\u4eba\u52a9\u624b\uff1a</p> <pre><code>\u8bbe\u7f6e\uff1a\n  \u7528\u6237\u5728 Mac Mini \u4e0a\u90e8\u7f72 OpenClaw\uff0c\u8fde\u63a5 Slack / \u5fae\u4fe1 / \u90ae\u4ef6\u3002\n  \u6bcf\u5929\u901a\u8fc7\u6d88\u606f\u4e0e OpenClaw \u4ea4\u4e92\uff1a\u65e5\u7a0b\u5b89\u6392\u3001\u4fe1\u606f\u68c0\u7d22\u3001\u4ee3\u7801\u8f85\u52a9\u7b49\u3002\n\n\u4f20\u7edf\u65b9\u6848\uff1a\n  OpenClaw 
\u4f7f\u7528\u56fa\u5b9a\u7684 GPT-4o / Claude 3.5\u3002\n  \u80fd\u529b\u4e0d\u4f1a\u968f\u4f7f\u7528\u800c\u589e\u957f\u3002\n\nClaw-R1 \u65b9\u6848\uff1a\n  1. \u7528\u6237\u6d88\u606f \u2192 OpenClaw \u2192 Gateway\uff08\u62e6\u622a LLM \u8c03\u7528\uff09\n  2. Gateway \u8bb0\u5f55\u6bcf\u6b21\u4ea4\u4e92 \u2192 DataPool\uff08\u672c\u5730\uff09\n  3. Reward Model \u5bf9\u6bcf\u6b21\u4ea4\u4e92\u8bc4\u5206\n  4. \u8fdc\u7a0b\u670d\u52a1\u5668\u4e0a\u7684\u8bad\u7ec3\u5f15\u64ce\u6301\u7eed\u6d88\u8d39 DataPool\uff0c\u66f4\u65b0\u6a21\u578b\u6743\u91cd\n  5. \u66f4\u65b0\u7684\u6743\u91cd\u63a8\u9001\u56de Gateway\uff1b\u4e0b\u6b21\u8c03\u7528\u4f7f\u7528\u6539\u8fdb\u540e\u7684\u6a21\u578b\n\n\u7ed3\u679c\uff1a\n  \u7528\u6237 Mac Mini \u4e0a\u7684 OpenClaw \u4f1a\u968f\u65f6\u95f4\u63a8\u79fb\u8d8a\u6765\u8d8a\u4e86\u89e3\u8be5\u7528\u6237\u3002\n</code></pre>"},{"location":"concepts/production-scenario/#rl","title":"\u4f20\u7edf RL \u6846\u67b6\u65e0\u6cd5\u6ee1\u8db3\u7684\u4e09\u4e2a\u9700\u6c42","text":""},{"location":"concepts/production-scenario/#1","title":"\u2460 \u670d\u52a1\u8fde\u7eed\u6027","text":"<p>\u6a21\u578b\u6743\u91cd\u66f4\u65b0\u4e0d\u80fd\u4e2d\u65ad Gateway \u7684\u8bf7\u6c42\u5904\u7406\u3002\u5728 Claw-R1 \u4e2d\uff1a</p> <ul> <li>Trainer \u76f4\u63a5\u7ba1\u7406 Rollout Engine \u548c Reward Model \u7684\u751f\u547d\u5468\u671f\uff08<code>wake_up</code> / <code>sleep</code> / \u6743\u91cd\u540c\u6b65\uff09</li> <li>Gateway \u662f\u7eaf HTTP \u4ee3\u7406 \u2014 \u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u63d0\u4ea4 step\uff1b\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f</li> <li>\u8fd9\u4fdd\u8bc1\u4e86\u5373\u4f7f\u5728\u6743\u91cd\u66f4\u65b0\u671f\u95f4\uff0c\u8bf7\u6c42\u8f6c\u53d1\u548c\u6570\u636e\u6536\u96c6\u4e5f\u80fd\u6301\u7eed\u8fdb\u884c</li> </ul>"},{"location":"concepts/production-scenario/#2","title":"\u2461 
\u65e0\u9884\u8bbe\u6570\u636e","text":"<p>\u4f20\u7edf\u6846\u67b6\u9700\u8981\u9884\u5148\u6536\u96c6\u7684\u6570\u636e\u96c6\u3002Claw-R1 \u7684\u8bad\u7ec3\u6570\u636e\u5b8c\u5168\u6765\u81ea\u5b9e\u65f6\u7528\u6237\u4ea4\u4e92\uff1a</p> <ul> <li>\u7528\u6237\u95ee\u4e86\u4ec0\u4e48\u3001Agent \u5982\u4f55\u56de\u7b54\u3001\u8c03\u7528\u4e86\u54ea\u4e9b\u5de5\u5177 \u2014 \u8fd9\u4e9b\u81ea\u52a8\u6210\u4e3a\u8bad\u7ec3\u6570\u636e</li> <li>\u96f6\u6570\u636e\u5de5\u7a0b\uff1b\u6570\u636e\u968f\u670d\u52a1\u8fd0\u884c\u81ea\u7136\u79ef\u7d2f</li> </ul>"},{"location":"concepts/production-scenario/#3-reward","title":"\u2462 \u771f\u5b9e\u73af\u5883\u7684 Reward \u4fe1\u53f7","text":"<p>\u4f20\u7edf RLVR \u7684 reward \u6765\u81ea\u53ef\u9a8c\u8bc1\u7684\u4efb\u52a1\u7ed3\u679c\u3002\u751f\u4ea7\u73af\u5883\u7684 reward \u66f4\u52a0\u5fae\u5999\uff1a</p> <ul> <li>\u7528\u6237\u7ee7\u7eed\u8ffd\u95ee \u2192 \u9690\u5f0f\u6b63\u4fe1\u53f7</li> <li>\u7528\u6237\u7ea0\u6b63 Agent \u2192 \u8d1f\u53cd\u9988</li> <li>\u4efb\u52a1\u5b8c\u6210\u540e\u65e0\u540e\u7eed \u2192 Reward Model \u4f30\u8ba1\u4e2d\u95f4\u6b65\u9aa4\u8d28\u91cf</li> </ul> <p>Claw-R1 \u4f7f\u7528 Reward Model \u5c06\u8fd9\u4e9b\u8f6f\u4fe1\u53f7\u8f6c\u6362\u4e3a\u53ef\u8bad\u7ec3\u7684 process reward\u3002</p>"},{"location":"concepts/production-scenario/#_1","title":"\u4e09\u79cd\u8fd0\u884c\u6a21\u5f0f","text":"\u6a21\u5f0f Agent \u7c7b\u578b \u6570\u636e\u6765\u6e90 \u8bf4\u660e \u767d\u76d2\u79bb\u7ebf AgentFlow (Python) \u5408\u6210\u6570\u636e\u96c6\u6216\u9884\u6536\u96c6\u7684 trajectory \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u63a8\u8350\u7528\u4e8e\u7814\u7a76 \u9ed1\u76d2\u79bb\u7ebf \u4efb\u4f55 HTTP Agent \u9884\u6536\u96c6\u7684\u6570\u636e\u96c6 \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u901a\u8fc7 <code>base_url</code> \u63a5\u5165 \u9ed1\u76d2\u5728\u7ebf \u4efb\u4f55 HTTP Agent \u5b9e\u65f6\u7528\u6237\u4ea4\u4e92 \u76ee\u6807\u751f\u4ea7\u6a21\u5f0f\uff1bGateway 
\u7aef\u70b9\u5df2\u5b9e\u73b0"},{"location":"concepts/production-scenario/#_2","title":"\u90e8\u7f72 = \u8bad\u7ec3","text":"<p>Claw-R1 \u5f15\u5165\u4e86\u4e00\u79cd\u65b0\u8303\u5f0f\uff1a</p> <pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         \u4f20\u7edf\uff1a\u8bad\u7ec3 \u2192 \u90e8\u7f72\uff08\u56fa\u5b9a\uff09                      \u2502\n\u2502                                                      \u2502\n\u2502  [\u5408\u6210\u6570\u636e] \u2192 [\u8bad\u7ec3] \u2192 [\u56fa\u5b9a\u6a21\u578b] \u2192 \u7528\u6237               \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         Claw-R1\uff1a\u90e8\u7f72 = \u8bad\u7ec3\uff08\u6301\u7eed\uff09                   \u2502\n\u2502                                                      \u2502\n\u2502  \u7528\u6237 \u2500\u2500\u25ba Agent \u2500\u2500\u25ba [\u5b9e\u65f6\u6570\u636e] \u2500\u2500\u25ba \u8bad\u7ec3 \u2500\u2500\u25ba Agent     \u2502\n\u2502           \u25b2___________________________________|      
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre> <p>\u5728\u8fd9\u79cd\u8303\u5f0f\u4e0b\uff1a</p> <ul> <li>\u6bcf\u6b21\u7528\u6237\u4ea4\u4e92\u90fd\u662f\u4e00\u4e2a\u8bad\u7ec3\u6837\u672c</li> <li>\u6bcf\u6b21\u6a21\u578b\u66f4\u65b0\u90fd\u6539\u5584 Agent \u7684\u771f\u5b9e\u4e16\u754c\u8868\u73b0</li> <li>Agent \u8fd0\u884c\u65f6\u95f4\u8d8a\u957f\uff0c\u5bf9\u5176\u7279\u5b9a\u7528\u6237\u548c\u73af\u5883\u7684\u8868\u73b0\u8d8a\u597d</li> </ul>"},{"location":"configuration/","title":"Configuration Reference","text":"<p>Claw-R1 \u4f7f\u7528 Hydra \u8fdb\u884c\u5c42\u6b21\u5316\u914d\u7f6e\u7ba1\u7406\u3002\u6240\u6709 YAML \u914d\u7f6e\u4f4d\u4e8e <code>claw_r1/config/</code>\u3002</p>"},{"location":"configuration/#_1","title":"\u914d\u7f6e\u6587\u4ef6","text":"\u6587\u4ef6 \u7528\u9014 <code>agent_ppo_trainer.yaml</code> \u57fa\u7840 PPO trainer \u914d\u7f6e\uff08\u7ee7\u627f veRL \u7684 ppo_trainer\uff09 <code>async_ppo_trainer.yaml</code> \u5f02\u6b65\u8bad\u7ec3\u4e13\u7528\u914d\u7f6e <code>overrides/rollout.yaml</code> Rollout worker \u8bbe\u7f6e\uff08\u5f02\u6b65\u6a21\u5f0f\u3001Agent Flow\uff09"},{"location":"configuration/#async_ppo_traineryaml","title":"<code>async_ppo_trainer.yaml</code>","text":"<p>\u5f02\u6b65\u8bad\u7ec3\u7684\u6838\u5fc3\u914d\u7f6e\u6587\u4ef6\uff1a</p> <pre><code>defaults:\n  - ppo_trainer\n  - /overrides/rollout@actor_rollout_ref.rollout\n  - _self_\n\n# -- \u5f02\u6b65\u8bad\u7ec3\u8bbe\u7f6e --\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\u5230 Rollouter\n  require_batches: 1                 # 
\u6bcf\u6b21\u4ece DataPool \u53d6\u7684 batch \u6570\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad\u8fdb\u884c\u4e2d\u7684 rollout\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u6536\u96c6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n\n# -- Training GPU Pool --\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 4\n\n# -- Rollout GPU Pool --\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 4\n  total_epochs: 10\n  test_freq: 1\n\n# -- Actor \u914d\u7f6e --\nactor_rollout_ref:\n  hybrid_engine: false\n  actor:\n    use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, true}\n  checkpoint_engine: ${oc.select:async_training.checkpoint_engine, null}\n</code></pre> <p>GPU \u5206\u914d</p> <p><code>trainer</code> \u548c <code>rollout</code> \u90fd\u5fc5\u987b\u5206\u914d GPU\u3002\u603b GPU \u6570 = <code>trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node</code>\u3002</p>"},{"location":"configuration/#overridesrolloutyaml","title":"<code>overrides/rollout.yaml</code>","text":"<p>Rollout worker \u7684\u914d\u7f6e\u8986\u76d6\uff1a</p> <pre><code>name: vllm\nmode: async\n\nagent:\n  default_agent_flow: single_step_single_turn_agent\n  agent_flow_config_path: null\n</code></pre>"},{"location":"configuration/#gateway","title":"Gateway \u914d\u7f6e","text":"<p>Gateway \u4f5c\u4e3a\u72ec\u7acb\u8fdb\u7a0b\u8fd0\u884c\uff0c\u901a\u8fc7 CLI \u53c2\u6570\u914d\u7f6e\uff08\u975e Hydra\uff09\uff1a</p> <pre><code>python -m claw_r1.gateway.gateway \\\n    --data-pool-name   data_pool \\\n    --vllm-addresses   host1:8001,host2:8001 \\\n    --tokenizer-path   /path/to/model \\\n    --prompt-length    4096 \\\n    --response-length  1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address      auto \\\n    
--ray-namespace    default \\\n    --host             0.0.0.0 \\\n    --port             8100\n</code></pre> <p>Gateway \u542f\u52a8\u8d85\u65f6\u53ef\u901a\u8fc7 Hydra \u914d\u7f6e\uff1a</p> <pre><code>trainer:\n  gateway_startup_timeout: 300   # \u79d2\uff0c\u9ed8\u8ba4 300\n</code></pre>"},{"location":"configuration/#agent-flow","title":"Agent Flow \u914d\u7f6e","text":""},{"location":"configuration/#agent-flow_1","title":"\u767d\u76d2 Agent Flow","text":"<p>\u5728 <code>overrides/rollout.yaml</code> \u4e2d\u6307\u5b9a\uff1a</p> <pre><code>agent:\n  default_agent_flow: single_step_single_turn_agent\n</code></pre>"},{"location":"configuration/#agent-flow_2","title":"\u9ed1\u76d2 Agent Flow","text":"<p>\u901a\u8fc7\u5916\u90e8 YAML \u6587\u4ef6\u6ce8\u518c\uff1a</p> <pre><code># claw_r1/blackbox_agent/agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n</code></pre> <p>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u5f15\u7528\uff1a</p> <pre><code>actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n</code></pre>"},{"location":"configuration/#gpu","title":"\u591a GPU \u914d\u7f6e","text":"<pre><code># \u72ec\u7acb\u7684 GPU \u6c60\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2    # 2 GPU \u7528\u4e8e\u8bad\u7ec3\uff08Actor + Critic\uff09\n\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1    # 1 GPU \u7528\u4e8e\u63a8\u7406\uff08vLLM\uff09\n</code></pre> <p>\u8d44\u6e90\u6c60\u9694\u79bb</p> <p>Claw-R1 \u4f7f\u7528 Ray \u7684\u8d44\u6e90\u7ec4\u673a\u5236\u786e\u4fdd Trainer \u548c Rollouter \u7684 GPU \u4e0d\u91cd\u53e0\u3002\u4f7f\u7528 <code>async_ppo_trainer.yaml</code> \u65f6\u81ea\u52a8\u914d\u7f6e\u3002\u8be6\u89c1 Async Training\u3002</p>"},{"location":"configuration/#_2","title":"\u5b8c\u6574\u8bad\u7ec3\u811a\u672c\u793a\u4f8b","text":"<pre><code>python3 -m 
claw_r1.async_main \\\n    algorithm.adv_estimator=grpo \\\n    data.train_files=$TRAIN_FILE \\\n    data.val_files=$VAL_FILE \\\n    data.train_batch_size=128 \\\n    data.max_prompt_length=512 \\\n    data.max_response_length=1024 \\\n    data.return_raw_chat=True \\\n    actor_rollout_ref.model.path=$MODEL \\\n    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n    actor_rollout_ref.rollout.name=vllm \\\n    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \\\n    actor_rollout_ref.rollout.n=5 \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    async_training.use_rollout_log_probs=true\n</code></pre> <p>\u66f4\u591a\u793a\u4f8b\u89c1 <code>example/</code> \u76ee\u5f55\u3002</p>"},{"location":"getting-started/","title":"Getting Started","text":"<ul> <li> <p> Installation</p> <p>\u73af\u5883\u914d\u7f6e\u3001\u4f9d\u8d56\u5b89\u88c5\u548c\u9a8c\u8bc1\u3002</p> <p> Installation</p> </li> <li> <p> Quick Start</p> <p>5 \u5206\u949f\u5185\u8fd0\u884c\u4f60\u7684\u7b2c\u4e00\u4e2a\u5f02\u6b65\u8bad\u7ec3\u5b9e\u9a8c\u3002</p> <p> Quick Start</p> </li> </ul>"},{"location":"getting-started/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"\u4f9d\u8d56 \u6700\u4f4e\u7248\u672c Python 3.10+ PyTorch 2.0+ CUDA 12.1+ Ray 2.10+ GPU 3 \u5f20\uff082 \u8bad\u7ec3 + 1 \u63a8\u7406\uff09"},{"location":"getting-started/#_2","title":"\u67b6\u6784\u4e00\u89c8","text":"<pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502   Agent     \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Gateway  \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 DataPool \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Trainer  \u2502\n\u2502 (\u9ed1\u76d2/\u767d\u76d2) \u2502\u25c4\u2500\u2500\u2500\u2500\u2502 (:8100)  \u2502     \u2502          \u2502     \u2502          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                                                           \u2502 \u6743\u91cd\u540c\u6b65\n                                                           \u25bc\n                                                     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                     \u2502  vLLM    \u2502\n                                                     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"getting-started/installation/","title":"Installation Guide","text":"<p>Claw-R1 uses the same environment setup as <code>verl</code>.</p>"},{"location":"getting-started/installation/#base-environment","title":"Base Environment","text":"<p>Follow the official <code>verl</code> installation guide, but make sure the environment ends up with <code>verl==0.7.0</code>.</p> <p>If you want a broader overview of the base training workflow, the <code>verl</code> quickstart is also useful.</p>"},{"location":"getting-started/installation/#what-this-means-for-claw-r1","title":"What This Means for Claw-R1","text":"<p>Once the <code>verl</code> environment is working, Claw-R1 should run in the same environment. 
In practice, that means you can:</p> <ul> <li>prepare a Python environment with <code>verl==0.7.0</code></li> <li>clone this repository</li> <li>run Claw-R1 commands directly from the repository root</li> </ul> <p>You do not need to install Claw-R1 as a separate package.</p> <p>The documentation in this repository intentionally does not duplicate a separate environment guide, so that the infrastructure setup stays aligned with <code>verl</code>.</p>"},{"location":"getting-started/quickstart/","title":"Quick Start","text":"<p>\u672c\u6307\u5357\u5c55\u793a\u5982\u4f55\u5feb\u901f\u8fd0\u884c Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u3002</p>"},{"location":"getting-started/quickstart/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"<ul> <li>\u5df2\u5b8c\u6210 \u5b89\u88c5</li> <li>\u81f3\u5c11 3 \u5f20 GPU\uff082 \u5f20\u8bad\u7ec3 + 1 \u5f20\u63a8\u7406\uff09</li> <li>\u8bad\u7ec3\u6570\u636e\uff08parquet \u683c\u5f0f\uff09</li> </ul>"},{"location":"getting-started/quickstart/#black-box","title":"Black-box \u6a21\u5f0f\uff08\u63a8\u8350\u5165\u95e8\uff09","text":"<p>\u9ed1\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u4f7f\u7528\u6807\u51c6 OpenAI API \u4e0e Gateway \u4ea4\u4e92\uff0c\u65e0\u9700\u4fee\u6539 Agent \u4ee3\u7801\u3002\u4ee5 GSM8K \u6570\u5b66\u9898\u4e3a\u4f8b\uff1a</p>"},{"location":"getting-started/quickstart/#1","title":"1. \u51c6\u5907\u6570\u636e","text":"<pre><code># \u4e0b\u8f7d GSM8K \u6570\u636e\u96c6\uff08parquet \u683c\u5f0f\uff09\n# \u786e\u4fdd train.parquet \u548c test.parquet \u5728 ~/data/gsm8k/ \u4e0b\n</code></pre>"},{"location":"getting-started/quickstart/#2","title":"2. 
\u8fd0\u884c\u8bad\u7ec3","text":"<pre><code>export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async_blackbox.sh\n</code></pre> <p>\u8be5\u811a\u672c\u4f1a\uff1a</p> <ol> <li>\u542f\u52a8 Ray \u96c6\u7fa4</li> <li>\u521b\u5efa DataPool\uff08Ray Actor\uff09</li> <li>\u5728 GPU 0-1 \u4e0a\u90e8\u7f72 Actor + Critic\uff08\u8bad\u7ec3\uff09</li> <li>\u5728 GPU 2 \u4e0a\u90e8\u7f72 vLLM\uff08\u63a8\u7406\uff09</li> <li>\u542f\u52a8 Gateway\uff08\u7aef\u53e3 8100\uff09</li> <li>\u8fd0\u884c <code>BlackBoxGSM8KAgentFlow</code>\uff1a<ul> <li>\u4e3a\u6bcf\u4e2a\u6837\u672c\u8c03\u7528 <code>init_trajectory</code> \u83b7\u53d6 <code>base_url</code></li> <li>\u521b\u5efa <code>GSM8KAgent</code>\uff0c\u4f7f\u7528 <code>base_url</code> \u4f5c\u4e3a OpenAI API \u7684 endpoint</li> <li>Agent \u901a\u8fc7\u591a\u8f6e tool calling \u89e3\u9898</li> <li>Gateway \u81ea\u52a8\u6536\u96c6\u6bcf\u8f6e\u5bf9\u8bdd\u4e3a Step \u5e76\u63d0\u4ea4\u5230 DataPool</li> </ul> </li> <li>AsyncTrainer \u4ece DataPool \u62c9\u53d6 batch \u8fdb\u884c PPO \u8bad\u7ec3</li> <li>\u5b9a\u671f\u540c\u6b65\u6743\u91cd\u5230 vLLM</li> </ol>"},{"location":"getting-started/quickstart/#3","title":"3. 
\u5173\u952e\u914d\u7f6e\u53c2\u6570","text":"<pre><code># GPU \u5206\u914d\ntrainer.n_gpus_per_node=2        # \u8bad\u7ec3\u7528 2 \u5f20 GPU\nrollout.n_gpus_per_node=1        # \u63a8\u7406\u7528 1 \u5f20 GPU\n\n# Agent Flow\nactor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n\n# \u5f02\u6b65\u8bad\u7ec3\nasync_training.trigger_parameter_sync_step=1   # \u6bcf\u6b65\u540c\u6b65\u6743\u91cd\nactor_rollout_ref.rollout.n=5                  # \u6bcf\u4e2a prompt \u751f\u6210 5 \u6761 trajectory\n</code></pre>"},{"location":"getting-started/quickstart/#white-box","title":"White-box \u6a21\u5f0f","text":"<p>\u767d\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 <code>/generate</code> \u548c <code>/submit_steps</code> \u7aef\u70b9\u4ea4\u4e92\u3002</p> <pre><code>export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async.sh\n</code></pre> <p>\u767d\u76d2\u6a21\u5f0f\u4f7f\u7528 <code>MultiStepAgentFlow</code> \u6216 <code>SingleStepSingleTurnAgentFlow</code>\uff0cAgent \u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002</p>"},{"location":"getting-started/quickstart/#agent","title":"\u81ea\u5b9a\u4e49 Agent","text":""},{"location":"getting-started/quickstart/#agent_1","title":"\u6dfb\u52a0\u9ed1\u76d2 Agent","text":"<ol> <li>\u5b9e\u73b0 Agent \u7c7b\uff08\u53ea\u9700 <code>base_url</code> \u548c OpenAI API\uff09</li> <li>\u5b9e\u73b0 <code>BlackBoxAgentFlowBase</code> \u5b50\u7c7b</li> <li>\u5728 <code>agent_flow_config.yaml</code> \u4e2d\u6ce8\u518c</li> <li>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a</li> </ol> <p>\u8be6\u7ec6\u6b65\u9aa4\u89c1 Black-box Agent\u3002</p>"},{"location":"getting-started/quickstart/#agent_2","title":"\u6dfb\u52a0\u767d\u76d2 Agent","text":"<ol> <li>\u7ee7\u627f <code>AgentFlowBase</code>\uff08\u6216 
<code>MultiStepAgentFlow</code>\uff09</li> <li>\u5b9e\u73b0 <code>run()</code> \u65b9\u6cd5</li> <li>\u4f7f\u7528 <code>@register(\"name\")</code> \u6ce8\u518c</li> </ol> <p>\u8be6\u7ec6\u6b65\u9aa4\u89c1 Agent Flow\u3002</p>"},{"location":"getting-started/quickstart/#_2","title":"\u76d1\u63a7\u8bad\u7ec3","text":"<p>\u8bad\u7ec3\u65e5\u5fd7\u9ed8\u8ba4\u8f93\u51fa\u5230\u63a7\u5236\u53f0\u3002\u53ef\u914d\u7f6e SwanLab \u7b49\u65e5\u5fd7\u540e\u7aef\uff1a</p> <pre><code>trainer.logger='[\"console\",\"swanlab\"]'\ntrainer.project_name='my_project'\ntrainer.experiment_name='my_experiment'\n</code></pre>"},{"location":"getting-started/quickstart/#_3","title":"\u4e0b\u4e00\u6b65","text":"<ul> <li>Components \u2014 \u4e86\u89e3\u5404\u7ec4\u4ef6\u7684\u8be6\u7ec6\u8bbe\u8ba1</li> <li>Configuration \u2014 \u5b8c\u6574\u914d\u7f6e\u53c2\u8003</li> <li>Gateway API \u2014 HTTP \u7aef\u70b9\u6587\u6863</li> </ul>"}]}
\ No newline at end of file
+{"config":{"lang":["en"],"separator":"[\\s\\-]+","pipeline":["stopWordFilter"],"fields":{"title":{"boost":1000.0},"text":{"boost":1.0},"tags":{"boost":1000000.0}}},"docs":[{"location":"","title":"Claw-R1","text":"<p>The Data Foundation for Agentic Reinforcement Learning</p> <p>Claw-R1 \u662f Agentic RL \u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd \u2014 \u4e13\u6ce8\u4e8e\u4ece\u4efb\u610f Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff0c\u5e76\u652f\u6301\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002</p> <ul> <li> <p> Universal Data Collection</p> <p>\u4ece\u767d\u76d2\u3001\u9ed1\u76d2\u5230\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u901a\u8fc7 <code>base_url</code> \u673a\u5236\u96f6\u4ee3\u7801\u63a5\u5165\uff0c\u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 LangChain\u3001AutoGen\u3001CrewAI \u7b49\u4efb\u610f OpenAI \u517c\u5bb9 Agent\u3002</p> <p> Base URL Integration</p> </li> <li> <p> Data Middleware Layer</p> <p>Gateway + DataPool \u6570\u636e\u4e2d\u95f4\u4ef6\uff1aGateway \u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0cDataPool \u7ba1\u7406\u6570\u636e\u8d28\u91cf\u3001\u5206\u533a\u7f13\u51b2\u3001\u6309\u9700\u4f9b\u7ed9\u8bad\u7ec3\u5f15\u64ce\u3002</p> <p> Middleware Layer</p> </li> <li> <p> Data Evaluation &amp; Curation</p> <p>\u591a\u7ef4 Reward \u7cfb\u7edf\uff08\u89c4\u5219/\u5224\u522b\u5f0f RM/\u751f\u6210\u5f0f RM\uff09+ \u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u6574\u5408 + \u7b56\u7565\u7248\u672c\u8ffd\u8e2a\uff0c\u7cfb\u7edf\u6027\u8bc4\u4f30\u548c\u7b5b\u9009\u6570\u636e\u8d28\u91cf\u3002</p> <p> Reward System</p> </li> <li> <p> Production Agent Scenario</p> <p>\"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent 
\u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\uff08\u91c7\u7eb3\u3001\u4fee\u6539\u3001\u8ffd\u95ee\uff09\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\u3002</p> <p> Production Scenario</p> </li> </ul>"},{"location":"#why-claw-r1","title":"Why Claw-R1?","text":"<p>Agentic RL \u751f\u6001\u6b63\u84ec\u52c3\u53d1\u5c55 \u2014 verl\u3001Agent-R1\u3001Forge \u7b49\u4f18\u79c0\u6846\u67b6\u5728 Runtime \u548c\u8bad\u7ec3\u7b97\u6cd5\u65b9\u9762\u6301\u7eed\u63a8\u8fdb\u3002\u7136\u800c\uff0c\u968f\u7740 Agent \u4ece\u7b80\u5355 ReAct \u6f14\u8fdb\u5230 Claude Code\u3001OpenClaw \u7b49\u901a\u7528\u67b6\u6784\uff0c\u4e00\u4e2a\u76f8\u5bf9\u6b20\u7f3a\u3001\u503c\u5f97\u6df1\u8015\u7684\u65b9\u5411\u9010\u6e10\u6d6e\u73b0\uff1a\u5982\u4f55\u4ece\u591a\u6837\u7684 Agent \u4ea4\u4e92\u4e2d\u7cfb\u7edf\u6027\u5730\u91c7\u96c6\u3001\u8bc4\u4f30\u548c\u7b5b\u9009\u9ad8\u8d28\u91cf\u8bad\u7ec3\u6570\u636e\uff1f</p> <p>Claw-R1 \u805a\u7126\u4e8e\u8fd9\u4e00\u65b9\u5411\uff0c\u63d0\u4f9b Agent \u4e0e Trainer \u4e4b\u95f4\u7684\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002</p> \u7ef4\u5ea6 \u4f20\u7edf Agentic RL \u6846\u67b6 Claw-R1 \u6838\u5fc3\u5173\u6ce8 \u8bad\u7ec3\u7b97\u6cd5\u4e0e Runtime \u6570\u636e\u7684\u91c7\u96c6\u3001\u8bc4\u4f30\u4e0e\u7b5b\u9009 Agent \u63a5\u5165 \u9700\u8981\u7528\u6846\u67b6 API \u91cd\u5199 \u53ea\u6539 <code>base_url</code>\uff0c\u96f6\u4ee3\u7801\u4fb5\u5165 \u6570\u636e\u6765\u6e90 \u9884\u6536\u96c6\u7684\u79bb\u7ebf\u6570\u636e \u5b9e\u65f6\u4ea4\u4e92\u81ea\u52a8\u91c7\u96c6 + \u79bb\u7ebf\u6570\u636e\u96c6 \u6570\u636e\u8d28\u91cf\u7ba1\u63a7 \u8f83\u5c11\u5173\u6ce8 \u591a\u7ef4 Reward + \u4eba\u7c7b\u53cd\u9988 + \u65b0\u9c9c\u5ea6\u68c0\u6d4b \u8bad\u7ec3\u5f15\u64ce \u5185\u7f6e\u7ed1\u5b9a \u53ef\u63d2\u62d4 TrainingBackend\uff0c\u5bf9\u63a5\u4efb\u610f\u5f15\u64ce"},{"location":"#_1","title":"\u5feb\u901f\u5f00\u59cb","text":"<pre><code># 
\u514b\u9686\u4ed3\u5e93\ngit clone https://github.com/AgentR1/Claw-R1 &amp;&amp; cd Claw-R1\n\n# \u8fd0\u884c\u9ed1\u76d2 GSM8K \u8bad\u7ec3\nexport CUDA_VISIBLE_DEVICES=0,1,2\nsh example/test_async_blackbox.sh\n</code></pre> <p> \u5b8c\u6574\u5b89\u88c5\u6307\u5357 \u00b7  Quick Start</p>"},{"location":"#_2","title":"\u9879\u76ee\u72b6\u6001","text":"\u80fd\u529b \u72b6\u6001 \u767d\u76d2 Agent \u6570\u636e\u91c7\u96c6  \u5df2\u5b9e\u73b0 \u9ed1\u76d2 Agent \u6570\u636e\u91c7\u96c6  \u5df2\u5b9e\u73b0 \u5728\u7ebf\u670d\u52a1\u6570\u636e\u91c7\u96c6  \u5f00\u53d1\u4e2d \u5f02\u6b65\u8bad\u7ec3\u4f9b\u7ed9  \u5df2\u5b9e\u73b0 \u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf  \u89c4\u5212\u4e2d \u6570\u636e\u8d28\u91cf Dashboard  \u89c4\u5212\u4e2d"},{"location":"#team","title":"Team","text":"<p>State Key Laboratory of Cognitive Intelligence, USTC</p>"},{"location":"#citation","title":"Citation","text":"<pre><code>@misc{clawr1-2026,\n  title={Claw-R1: The Data Foundation for Agentic Reinforcement Learning},\n  author={Wang, Daoyu and Li, Qingchuan and Ouyang, Jie and Yu, Shuo and Cheng, Mingyue and Liu, Qi},\n  year={2025},\n  howpublished={\\url{https://github.com/AgentR1/Claw-R1}},\n  note={GitHub repository}\n}\n</code></pre>"},{"location":"contributing/","title":"Contributing","text":"<p>\u611f\u8c22\u4f60\u5bf9 Claw-R1 \u7684\u5173\u6ce8\uff01\u6b22\u8fce\u8d21\u732e\u4ee3\u7801\u3001\u6587\u6863\u548c\u60f3\u6cd5\u3002</p>"},{"location":"contributing/#_1","title":"\u9879\u76ee\u7ed3\u6784","text":"<pre><code>claw_r1/\n\u251c\u2500\u2500 agent_flow/           # Agent \u6267\u884c\u6846\u67b6\uff08\u767d\u76d2 + \u7ba1\u7406\u5668\uff09\n\u251c\u2500\u2500 blackbox_agent/       # \u9ed1\u76d2 Agent \u7cfb\u7edf\uff08Flow + Agent \u5b9e\u73b0\uff09\n\u251c\u2500\u2500 config/               # Hydra \u914d\u7f6e\u6587\u4ef6\n\u251c\u2500\u2500 data_pool/            # DataPool\uff08Ray Actor + Training Backend\uff09\n\u251c\u2500\u2500 gateway/              # Gateway 
Server\uff08FastAPI\uff09\n\u251c\u2500\u2500 async_main.py         # \u5f02\u6b65\u8bad\u7ec3\u5165\u53e3\n\u251c\u2500\u2500 async_rollouter.py    # AsyncRollouter\uff08Rollout GPU Pool\uff09\n\u251c\u2500\u2500 async_trainer.py      # AsyncTrainer\uff08Training GPU Pool\uff09\n\u251c\u2500\u2500 param_sync.py         # ParameterSynchronizer\n\u251c\u2500\u2500 detach_workers.py     # \u5206\u79bb\u5f0f Actor/Rollout Worker\n\u251c\u2500\u2500 core_algos.py         # PPO/GAE/GRPO \u6838\u5fc3\u7b97\u6cd5\n\u251c\u2500\u2500 reward_loop.py        # RewardLoopWorker\n\u251c\u2500\u2500 metric_utils.py       # \u6307\u6807\u805a\u5408\n\u251c\u2500\u2500 ray_agent_trainer.py  # \u540c\u6b65 Ray PPO Trainer\n\u2514\u2500\u2500 main_agent_ppo.py     # \u540c\u6b65\u8bad\u7ec3\u5165\u53e3\n</code></pre>"},{"location":"contributing/#_2","title":"\u4ee3\u7801\u98ce\u683c","text":"<ul> <li>\u4f7f\u7528 Ruff \u8fdb\u884c lint \u548c\u683c\u5f0f\u5316</li> <li>\u9075\u5faa PEP 8</li> <li>\u7c7b\u578b\u6ce8\u89e3\uff08Python 3.10+ \u8bed\u6cd5\uff09</li> </ul> <pre><code># \u5b89\u88c5 pre-commit hooks\npip install pre-commit\npre-commit install\n\n# \u624b\u52a8\u68c0\u67e5\nruff check .\nruff format .\n</code></pre>"},{"location":"contributing/#_3","title":"\u8d21\u732e\u65b9\u5411","text":""},{"location":"contributing/#_4","title":"\u9ad8\u4f18\u5148\u7ea7","text":"<ul> <li>\u65b0\u7684\u9ed1\u76d2 Agent \u5b9e\u73b0\uff08\u53c2\u8003 <code>blackbox_agent/gsm8k_agent.py</code>\uff09</li> <li>\u65b0\u7684 Reward \u51fd\u6570</li> <li>\u6027\u80fd\u4f18\u5316\uff08DataPool \u541e\u5410\u3001Gateway \u5ef6\u8fdf\uff09</li> </ul>"},{"location":"contributing/#_5","title":"\u6587\u6863","text":"<ul> <li>\u6559\u7a0b\u548c\u793a\u4f8b</li> <li>API \u6587\u6863\u8865\u5145</li> <li>\u4e2d\u82f1\u6587\u7ffb\u8bd1</li> </ul>"},{"location":"contributing/#_6","title":"\u7814\u7a76","text":"<ul> <li>\u65b0\u7684 advantage \u8ba1\u7b97\u7b97\u6cd5</li> 
<li>\u5728\u7ebf\u5b66\u4e60\u7b56\u7565</li> <li>\u591a Agent \u534f\u4f5c\u8bad\u7ec3</li> </ul>"},{"location":"contributing/#pr","title":"PR \u6d41\u7a0b","text":"<ol> <li>Fork \u4ed3\u5e93</li> <li>\u521b\u5efa feature branch\uff1a<code>git checkout -b feature/my-feature</code></li> <li>\u7f16\u5199\u4ee3\u7801\u548c\u6d4b\u8bd5</li> <li>\u786e\u4fdd <code>ruff check .</code> \u901a\u8fc7</li> <li>\u63d0\u4ea4 PR\uff0c\u63cf\u8ff0\u6539\u52a8\u5185\u5bb9\u548c\u52a8\u673a</li> </ol>"},{"location":"contributing/#_7","title":"\u672c\u5730\u6784\u5efa\u6587\u6863","text":"<pre><code>pip install mkdocs-material\nmkdocs serve\n# \u8bbf\u95ee http://localhost:8000\n</code></pre>"},{"location":"contributing/#_8","title":"\u8054\u7cfb","text":"<ul> <li>GitHub Issues: AgentR1/Claw-R1</li> </ul>"},{"location":"api/","title":"API Reference","text":"<p>\u672c\u8282\u6587\u6863\u5316 Claw-R1 \u5404\u7ec4\u4ef6\u66b4\u9732\u7684 HTTP \u548c Python API\u3002</p> <ul> <li> <p>Gateway HTTP API</p> <p>REST \u7aef\u70b9\uff0c\u7528\u4e8e Agent \u96c6\u6210\u548c Step \u63d0\u4ea4\u3002\u5305\u62ec\u767d\u76d2\u7aef\u70b9\uff08<code>/generate</code>\u3001<code>/submit_steps</code>\uff09\u548c\u9ed1\u76d2\u7aef\u70b9\uff08<code>{base_url}/v1/chat/completions</code>\uff09\u3002</p> <p> Gateway API</p> </li> </ul>"},{"location":"api/#python","title":"Python \u63a5\u53e3","text":""},{"location":"api/#datapool-ray-actor","title":"DataPool (Ray Actor)","text":"<pre><code>import ray\nfrom claw_r1.data_pool import DataPool\n\ndata_pool = ray.get_actor(\"data_pool\")\n\n# Producer\uff08\u7531 Gateway \u5185\u90e8\u8c03\u7528\uff09\nray.get(data_pool.submit_step.remote(step, channel=\"train\"))\nray.get(data_pool.submit_steps.remote(steps, channel=\"train\"))\nray.get(data_pool.complete_trajectory.remote(trajectory_uid, channel=\"train\"))\n\n# Consumer\uff08\u7531 Trainer \u8c03\u7528\uff09\nbatch = ray.get(data_pool.fetch_batch.remote(n_rollouts=5, 
channel=\"train\"))\n</code></pre>"},{"location":"api/#rewardloopworker-ray-actor","title":"RewardLoopWorker (Ray Actor)","text":"<pre><code>from claw_r1.reward_loop import RewardLoopWorker\n\nreward_worker = ray.get_actor(\"reward_loop_worker\")\nrewards = ray.get(reward_worker.compute_score_batch.remote(steps))\n</code></pre>"},{"location":"api/#agentflowbase-python-class","title":"AgentFlowBase (Python class)","text":"<pre><code>from claw_r1.agent_flow import SingleStepSingleTurnAgentFlow\n\nclass MyFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=[{\"role\": \"user\", \"content\": kwargs[\"question\"]}],\n        )\n        # \u6784\u5efa Step \u5e76\u63d0\u4ea4 ...\n        return 1\n</code></pre>"},{"location":"api/#blackboxagentflowbase-python-class","title":"BlackBoxAgentFlowBase (Python class)","text":"<pre><code>from claw_r1.agent_flow.agent_flow import register\nfrom claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"my_blackbox_agent\")\nclass MyBlackBoxFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url: str, kwargs: dict) -&gt; int:\n        # \u521b\u5efa Agent\uff0c\u4f7f\u7528 base_url \u4f5c\u4e3a OpenAI API endpoint\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=kwargs[\"raw_prompt\"])\n</code></pre>"},{"location":"api/gateway/","title":"Gateway API","text":"<p>Gateway \u9ed8\u8ba4\u76d1\u542c\u7aef\u53e3 8100\uff08\u901a\u8fc7 <code>--port</code> \u914d\u7f6e\uff09\u3002\u6240\u6709\u7aef\u70b9\u5747\u63a5\u53d7\u548c\u8fd4\u56de JSON\u3002</p>"},{"location":"api/gateway/#base-url","title":"Base 
URL","text":"<pre><code>http://&lt;gateway-host&gt;:8100\n</code></pre>"},{"location":"api/gateway/#white-box","title":"White-box \u7aef\u70b9","text":"<p>\u8fd9\u4e9b\u7aef\u70b9\u7531 <code>AgentFlowBase</code> \u7684\u767d\u76d2 Agent \u8c03\u7528\u3002</p>"},{"location":"api/gateway/#post-generate","title":"<code>POST /generate</code>","text":"<p>\u5c06\u751f\u6210\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u5e76\u8fd4\u56de\u5e26 token ID \u7684\u54cd\u5e94\u3002</p> <p>\u8c03\u7528\u65b9: <code>AgentFlowBase.gateway_generate()</code></p>"},{"location":"api/gateway/#request","title":"Request","text":"<pre><code>{\n  \"trajectory_uid\": \"string\",\n  \"prompt_uid\": \"string\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"string\" }\n  ],\n  \"max_tokens\": 1024,\n  \"temperature\": 1.0,\n  \"top_p\": 1.0\n}\n</code></pre> \u5b57\u6bb5 \u7c7b\u578b \u5fc5\u586b \u8bf4\u660e <code>trajectory_uid</code> string \u662f \u5f53\u524d\u5bf9\u8bdd\u7684\u552f\u4e00 ID <code>prompt_uid</code> string \u662f Prompt \u7ec4 ID\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09 <code>messages</code> array \u662f OpenAI \u683c\u5f0f\u7684\u804a\u5929\u6d88\u606f <code>max_tokens</code> int \u5426 \u6700\u5927\u54cd\u5e94\u957f\u5ea6\uff08\u9ed8\u8ba4\u53d6 <code>--response-length</code>\uff09 <code>temperature</code> float \u5426 \u91c7\u6837\u6e29\u5ea6\uff08\u9ed8\u8ba4 1.0\uff09 <code>top_p</code> float \u5426 Top-p \u91c7\u6837\uff08\u9ed8\u8ba4 1.0\uff09"},{"location":"api/gateway/#response","title":"Response","text":"<pre><code>{\n  \"response_text\": \"string\",\n  \"response_ids\": [101, 202, 303],\n  \"prompt_ids\": [50, 60, 70, 80]\n}\n</code></pre>"},{"location":"api/gateway/#post-submit_steps","title":"<code>POST /submit_steps</code>","text":"<p>\u63d0\u4ea4\u4e00\u4e2a\u6216\u591a\u4e2a <code>Step</code> \u5bf9\u8c61\u5230 DataPool\u3002</p> <p>\u8c03\u7528\u65b9: 
<code>AgentFlowBase.gateway_submit_steps()</code></p>"},{"location":"api/gateway/#request_1","title":"Request","text":"<pre><code>{\n  \"steps\": [\n    {\n      \"trajectory_uid\": \"string\",\n      \"prompt_uid\": \"string\",\n      \"prompt_ids\": [50, 60, 70],\n      \"response_ids\": [101, 202],\n      \"reward\": 0.0,\n      \"step_index\": 0,\n      \"policy_version\": 42,\n      \"is_last\": true,\n      \"metadata\": {}\n    }\n  ]\n}\n</code></pre>"},{"location":"api/gateway/#response_1","title":"Response","text":"<pre><code>{\n  \"accepted\": 1\n}\n</code></pre>"},{"location":"api/gateway/#post-compute_reward","title":"<code>POST /compute_reward</code>","text":"<p>\u4e3a\u4e00\u4e2a step \u8ba1\u7b97 reward\uff08\u7531 Trainer \u8c03\u7528\uff0c\u4e0d\u7531 Agent \u8c03\u7528\uff09\u3002</p>"},{"location":"api/gateway/#request_2","title":"Request","text":"<pre><code>{\n  \"trajectory_uid\": \"string\",\n  \"messages\": [...],\n  \"dataset_fields\": {\n    \"ground_truth\": \"string\",\n    \"task_type\": \"string\"\n  }\n}\n</code></pre>"},{"location":"api/gateway/#response_2","title":"Response","text":"<pre><code>{\n  \"reward\": 0.85\n}\n</code></pre>"},{"location":"api/gateway/#black-box","title":"Black-box \u7aef\u70b9","text":"<p>\u8fd9\u4e9b\u7aef\u70b9\u4f9b\u9ed1\u76d2 Agent \u4f7f\u7528\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u77e5\u9053\u4e00\u4e2a <code>base_url</code>\uff0c\u6240\u6709\u4ea4\u4e92\u90fd\u901a\u8fc7\u8be5 URL \u5b8c\u6210\u3002</p> <p><code>base_url</code> \u7684\u683c\u5f0f\u4e3a <code>http://&lt;host&gt;:&lt;port&gt;/&lt;trajectory_uid&gt;/&lt;prompt_uid&gt;</code>\uff0c\u7531 <code>POST /init_trajectory</code> \u8fd4\u56de\u3002</p>"},{"location":"api/gateway/#post-init_trajectory","title":"<code>POST /init_trajectory</code>","text":"<p>\u5206\u914d\u4e00\u6761\u65b0\u7684 trajectory \u5e76\u8fd4\u56de 
<code>base_url</code>\u3002</p>"},{"location":"api/gateway/#request_3","title":"Request","text":"<p>\u65e0\u8bf7\u6c42\u4f53\u3002</p>"},{"location":"api/gateway/#response_3","title":"Response","text":"<pre><code>{\n  \"trajectory_uid\": \"a1b2c3d4e5f6...\",\n  \"base_url\": \"http://0.0.0.0:8100/a1b2c3d4e5f6.../1\"\n}\n</code></pre>"},{"location":"api/gateway/#post-base_urlv1register_trajectory","title":"<code>POST {base_url}/v1/register_trajectory</code>","text":"<p>\u6ce8\u518c trajectory \u7684 channel \u548c metadata\u3002\u5728 Agent \u5f00\u59cb\u4ea4\u4e92\u4e4b\u524d\u8c03\u7528\u3002</p> <p><code>trajectory_uid</code> \u4ece URL path \u4e2d\u63d0\u53d6\uff0c\u65e0\u9700\u5728 body \u4e2d\u4f20\u9012\u3002</p>"},{"location":"api/gateway/#request_4","title":"Request","text":"<pre><code>{\n  \"channel\": \"train\",\n  \"metadata\": {\n    \"data_source\": \"gsm8k\",\n    \"ground_truth\": \"42\"\n  }\n}\n</code></pre> <p>\u6240\u6709\u5b57\u6bb5\u5747\u4e3a\u53ef\u9009\u3002<code>channel</code> \u9ed8\u8ba4\u4e3a <code>\"train\"</code>\u3002</p>"},{"location":"api/gateway/#response_4","title":"Response","text":"<pre><code>{ \"status\": \"ok\" }\n</code></pre>"},{"location":"api/gateway/#post-base_urlv1chatcompletions","title":"<code>POST {base_url}/v1/chat/completions</code>","text":"<p>OpenAI \u517c\u5bb9\u7684\u804a\u5929\u8865\u5168\u7aef\u70b9\u3002\u9ed1\u76d2 Agent \u53ea\u9700\u5c06 <code>base_url</code> \u8bbe\u4e3a OpenAI SDK \u7684 <code>base_url</code>\uff0c\u5373\u53ef\u900f\u660e\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002</p> <p>Gateway \u4f1a\uff1a</p> <ol> <li>\u5c06\u8bf7\u6c42\u8f6c\u53d1\u5230 vLLM \u670d\u52a1\u5668</li> <li>\u5bf9 prompt \u548c response \u8fdb\u884c tokenize</li> <li>\u81ea\u52a8\u6784\u5efa <code>Step</code> \u5e76\u63d0\u4ea4\u5230 DataPool</li> <li>\u8fd4\u56de\u6807\u51c6 OpenAI \u683c\u5f0f\u7684\u54cd\u5e94</li> </ol>"},{"location":"api/gateway/#request_5","title":"Request","text":"<p>\u6807\u51c6 OpenAI 
<code>chat/completions</code> \u8bf7\u6c42\u4f53\u3002</p> <pre><code>{\n  \"model\": \"qwen\",\n  \"messages\": [\n    { \"role\": \"user\", \"content\": \"What is 2+2?\" }\n  ],\n  \"temperature\": 0.7\n}\n</code></pre>"},{"location":"api/gateway/#response_5","title":"Response","text":"<p>\u6807\u51c6 OpenAI <code>chat/completions</code> \u54cd\u5e94\u4f53\u3002</p> <pre><code>{\n  \"id\": \"chatcmpl-...\",\n  \"object\": \"chat.completion\",\n  \"choices\": [\n    {\n      \"index\": 0,\n      \"message\": {\n        \"role\": \"assistant\",\n        \"content\": \"4\"\n      },\n      \"finish_reason\": \"stop\"\n    }\n  ],\n  \"usage\": {\n    \"prompt_tokens\": 10,\n    \"completion_tokens\": 1,\n    \"total_tokens\": 11\n  }\n}\n</code></pre>"},{"location":"api/gateway/#post-base_urlv1complete_trajectory","title":"<code>POST {base_url}/v1/complete_trajectory</code>","text":"<p>\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002Agent \u5b8c\u6210\u6240\u6709\u4ea4\u4e92\u540e\u8c03\u7528\u3002</p>"},{"location":"api/gateway/#request_6","title":"Request","text":"<p>\u65e0\u8bf7\u6c42\u4f53\u3002</p>"},{"location":"api/gateway/#response_6","title":"Response","text":"<pre><code>{ \"status\": \"ok\" }\n</code></pre>"},{"location":"api/gateway/#post-complete_trajectorytrajectory_uid","title":"<code>POST /complete_trajectory/{trajectory_uid}</code>","text":"<p>\u5185\u90e8\u7aef\u70b9\uff0c\u901a\u8fc7 trajectory_uid \u76f4\u63a5\u6807\u8bb0\u5b8c\u6210\u3002\u53ef\u9009\u4f20\u5165 reward \u548c channel\u3002</p>"},{"location":"api/gateway/#request_7","title":"Request","text":"<pre><code>{\n  \"channel\": \"train\",\n  \"reward\": 0.9\n}\n</code></pre>"},{"location":"api/gateway/#response_7","title":"Response","text":"<pre><code>{ \"status\": \"ok\" }\n</code></pre>"},{"location":"api/gateway/#_1","title":"\u5c31\u7eea\u68c0\u67e5","text":""},{"location":"api/gateway/#get-ready","title":"<code>GET /ready</code>","text":"<p>\u5f53 Gateway 
\u5b8c\u5168\u521d\u59cb\u5316\uff08\u5305\u62ec tokenizer \u52a0\u8f7d\u5b8c\u6210\uff09\u540e\u8fd4\u56de 200\u3002\u7528\u4e8e Rollouter \u542f\u52a8\u65f6\u7684\u5065\u5eb7\u68c0\u67e5\u3002</p>"},{"location":"api/gateway/#response-200","title":"Response (200)","text":"<pre><code>{ \"status\": \"ready\" }\n</code></pre>"},{"location":"api/gateway/#response-503","title":"Response (503)","text":"<pre><code>{ \"detail\": \"Gateway not ready (tokenizer still loading)\" }\n</code></pre>"},{"location":"api/gateway/#get-docs","title":"<code>GET /docs</code>","text":"<p>FastAPI \u81ea\u52a8\u751f\u6210\u7684 Swagger UI \u6587\u6863\u9875\u9762\u3002</p>"},{"location":"components/","title":"Components","text":"<p>Claw-R1 \u7684\u7ec4\u4ef6\u56f4\u7ed5\u6570\u636e\u6d41\u7ec4\u7ec7\uff1a\u4ece Agent \u4ea4\u4e92\u7684\u91c7\u96c6\uff0c\u5230\u6570\u636e\u7684\u7ba1\u7406\u4e0e\u8d28\u91cf\u8bc4\u4f30\uff0c\u518d\u5230\u5411\u8bad\u7ec3\u5f15\u64ce\u7684\u4f9b\u7ed9\u3002\u5404\u7ec4\u4ef6\u901a\u8fc7 HTTP \u548c Ray RPC \u901a\u4fe1\u3002</p> <ul> <li> <p>Gateway Server \u00b7 \u6570\u636e\u91c7\u96c6\u5165\u53e3</p> <p>FastAPI HTTP \u670d\u52a1\u3002\u6240\u6709 Agent LLM \u8c03\u7528\u7684\u7edf\u4e00\u5165\u53e3\uff0c\u81ea\u52a8\u4ece\u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff08Step\uff09\u5e76\u63d0\u4ea4\u5230 DataPool\u3002\u652f\u6301\u767d\u76d2\u663e\u5f0f\u63d0\u4ea4\u548c\u9ed1\u76d2\u81ea\u52a8\u91c7\u96c6\u4e24\u79cd\u6a21\u5f0f\u3002</p> <p> Gateway Server</p> </li> <li> <p>DataPool \u00b7 \u6570\u636e\u7ba1\u7406\u6838\u5fc3</p> <p>Ray Actor\u3002Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2 \u2014 \u5b58\u50a8\u3001\u7d22\u5f15\u3001\u5206\u533a\u548c\u4f9b\u7ed9\u4ea4\u4e92\u6570\u636e\u3002\u652f\u6301 Channel \u9694\u79bb\u3001GRPO \u5206\u7ec4\u3001\u5bb9\u91cf\u80cc\u538b\u63a7\u5236\u548c\u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7\u3002</p> <p> DataPool</p> </li> <li> <p>Reward System \u00b7 \u6570\u636e\u8d28\u91cf\u8bc4\u4f30</p> 
<p><code>RewardLoopWorker</code> Ray Actor\u3002\u591a\u7ef4\u5ea6\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\uff1arule-based\u3001discriminative RM\u3001generative RM\uff0c\u4ee5\u53ca\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u7684\u6574\u5408\u3002</p> <p> Reward System</p> </li> <li> <p>Agent Flow \u00b7 \u767d\u76d2\u6570\u636e\u91c7\u96c6</p> <p>Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7ba1\u7406\u3002\u767d\u76d2 Agent \u901a\u8fc7 Python API \u663e\u5f0f\u63d0\u4ea4 Step\uff0c\u5b8c\u6574\u63a7\u5236\u6570\u636e\u91c7\u96c6\u8fc7\u7a0b\u3002</p> <p> Agent Flow</p> </li> <li> <p>Black-box Agent \u00b7 \u9ed1\u76d2\u6570\u636e\u91c7\u96c6</p> <p>\u96f6\u4ee3\u7801\u4fb5\u5165\u7684\u9ed1\u76d2 Agent \u63a5\u5165\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u901a\u8fc7 <code>base_url</code> \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\u3002</p> <p> Black-box Agent</p> </li> <li> <p>Async Training \u00b7 \u6570\u636e\u6d88\u8d39\u4e0e\u8bad\u7ec3</p> <p><code>AsyncTrainer</code> \u548c <code>AsyncRollouter</code> Ray Actor\u3002\u6301\u7eed\u4ece DataPool \u6d88\u8d39\u9ad8\u8d28\u91cf\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\uff0c\u5e26\u53c2\u6570\u540c\u6b65\u3002</p> <p> Async Training</p> </li> </ul>"},{"location":"components/#_1","title":"\u6570\u636e\u6d41\u5168\u666f","text":"<pre><code>                        \u6570\u636e\u91c7\u96c6\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n  \u9ed1\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502                                         \u2502\n  (base_url)          \u2502         GATEWAY SERVER                  \u2502\n                      \u2502         (FastAPI, \u7aef\u53e3 8100)             
\u2502\n  \u767d\u76d2 Agent \u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u25ba\u2502         \u81ea\u52a8\u91c7\u96c6\u4ea4\u4e92 Step                 \u2502\n  (AgentFlow)         \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                   \u2502 Ray RPC (submit_steps)\n                                   \u25bc\n                        \u6570\u636e\u7ba1\u7406\u5c42\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         DATAPOOL                         \u2502\n                      \u2502         (Ray Actor)                      \u2502\n                      \u2502                                          \u2502\n                      \u2502  \u2022 \u5b58\u50a8\u4e0e\u7d22\u5f15    \u2022 Channel \u5206\u533a            \u2502\n                      \u2502  \u2022 GRPO \u5206\u7ec4     \u2022 \u5bb9\u91cf\u80cc\u538b\u63a7\u5236            \u2502\n                      \u2502  \u2022 \u8d28\u91cf\u8bc4\u4f30      \u2022 \u5b9e\u65f6\u7edf\u8ba1\u76d1\u63a7            \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                         \u2502 fetch_batch()\n                                         \u25bc\n                        \u6570\u636e\u6d88\u8d39\u5c42\n                      
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC TRAINER                    \u2502\n                      \u2502         (Ray Actor, Training GPU Pool)   \u2502\n                      \u2502   \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n                      \u2502   \u2502  Actor \u2502 Critic \u2502 RefPolicy      \u2502   \u2502\n                      \u2502   \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                       \u2502 NCCL weight sync\n                                       \u25bc\n                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                      \u2502         ASYNC ROLLOUTER                  \u2502\n                      \u2502         (Ray Actor, Rollout GPU Pool)    \u2502\n                      \u2502         vLLM servers                     \u2502\n                      
\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"components/agent-flow/","title":"Agent Flow","text":"<p>Agent Flow \u662f Claw-R1 \u4e2d\u7ba1\u7406 Agent \u6267\u884c\u751f\u547d\u5468\u671f\u7684\u6846\u67b6\u3002\u5b83\u5206\u4e3a\u4e24\u5927\u7c7b\uff1a</p> <ul> <li>\u767d\u76d2 Agent Flow\uff1aAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 <code>/generate</code>\u3001<code>/submit_steps</code> \u7b49\u7aef\u70b9\u4ea4\u4e92\uff0c\u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002</li> <li>\u9ed1\u76d2 Agent Flow\uff1aAgent \u4f7f\u7528\u6807\u51c6 OpenAI API\uff0c\u901a\u8fc7 <code>base_url</code> \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u5904\u7406 tokenize \u548c Step \u63d0\u4ea4\u3002</li> </ul>"},{"location":"components/agent-flow/#_1","title":"\u7c7b\u5c42\u6b21","text":"<pre><code>AgentFlowBase                              (abstract base)\n    \u2502\n    \u251c\u2500\u2500 SingleStepSingleTurnAgentFlow      (\u767d\u76d2\uff1a\u5355\u8f6e\u95ee\u7b54)\n    \u251c\u2500\u2500 MultiStepAgentFlow                 (\u767d\u76d2\uff1a\u591a\u8f6e\u5de5\u5177\u8c03\u7528)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase              (\u9ed1\u76d2\u57fa\u7c7b)\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow     (\u9ed1\u76d2\uff1aGSM8K \u6570\u5b66\u9898)\n</code></pre>"},{"location":"components/agent-flow/#agentflowbase","title":"AgentFlowBase","text":"<p>\u6240\u6709 Agent Flow \u7684\u62bd\u8c61\u57fa\u7c7b\uff0c\u63d0\u4f9b\uff1a</p> <ul> <li>Gateway URL \u7ba1\u7406</li> <li>\u914d\u7f6e\u8bbf\u95ee\uff08<code>self.config</code>\uff09</li> <li>\u62bd\u8c61\u65b9\u6cd5 <code>run(sampling_params, **kwargs) -&gt; int</code></li> 
</ul>"},{"location":"components/agent-flow/#_2","title":"\u767d\u76d2\u8f85\u52a9\u65b9\u6cd5","text":"<p>\u767d\u76d2 Agent Flow \u53ef\u4f7f\u7528\u4ee5\u4e0b\u65b9\u6cd5\u4e0e Gateway \u4ea4\u4e92\uff1a</p>"},{"location":"components/agent-flow/#gateway_generatetrajectory_uid-prompt_uid-messages-kwargs","title":"<code>gateway_generate(trajectory_uid, prompt_uid, messages, **kwargs)</code>","text":"<p>\u5411 Gateway <code>/generate</code> \u53d1\u9001\u5f02\u6b65 HTTP POST\uff0c\u8fd4\u56de\u751f\u6210\u6587\u672c\u548c token IDs\u3002</p> <pre><code>text, response_ids, prompt_ids = await self.gateway_generate(\n    trajectory_uid=\"traj-abc\",\n    prompt_uid=\"prompt-xyz\",\n    messages=[{\"role\": \"user\", \"content\": \"Summarize this document.\"}],\n    max_tokens=512,\n    temperature=0.8,\n)\n</code></pre>"},{"location":"components/agent-flow/#gateway_submit_stepssteps-channeltrain","title":"<code>gateway_submit_steps(steps, channel=\"train\")</code>","text":"<p>\u5411 Gateway <code>/submit_steps</code> \u63d0\u4ea4 Step \u5217\u8868\u3002</p>"},{"location":"components/agent-flow/#gateway_compute_rewardtrajectory_uid-messages-dataset_fields","title":"<code>gateway_compute_reward(trajectory_uid, messages, dataset_fields)</code>","text":"<p>\u5411 Gateway <code>/compute_reward</code> \u8bf7\u6c42 reward \u8ba1\u7b97\u3002</p>"},{"location":"components/agent-flow/#singlestepsingleturnagentflow","title":"SingleStepSingleTurnAgentFlow","text":"<p>\u6700\u7b80\u5355\u7684\u767d\u76d2\u5b9e\u73b0\uff1a\u5355\u4e2a prompt \u4ea7\u751f\u5355\u4e2a response\u3002\u9002\u7528\u4e8e\u6bcf\u4e2a\u6837\u672c\u90fd\u662f\u72ec\u7acb\u95ee\u7b54\u5bf9\u7684\u6570\u636e\u96c6\u3002</p> <pre><code>class MyAgentFlow(SingleStepSingleTurnAgentFlow):\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"raw_prompt\"]}]\n        text, response_ids, prompt_ids = await self.gateway_generate(\n            
trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            messages=messages,\n        )\n        step = Step(\n            prompt_ids=prompt_ids,\n            response_ids=response_ids,\n            reward=0.0,\n            trajectory_uid=kwargs[\"trajectory_uid\"],\n            prompt_uid=kwargs[\"prompt_uid\"],\n            step_index=0,\n            is_last=True,\n        )\n        await self.gateway_submit_steps([step])\n        return 1\n</code></pre>"},{"location":"components/agent-flow/#multistepagentflow","title":"MultiStepAgentFlow","text":"<p>\u591a\u8f6e Agent Flow\uff0c\u652f\u6301\u5de5\u5177\u8c03\u7528\u3001\u89c4\u5212\u7b49\u573a\u666f\u3002\u6bcf\u8f6e\u4ea7\u751f\u4e00\u4e2a Step\uff0c\u901a\u8fc7 <code>trajectory_uid</code> \u4e32\u8054\u3002</p> <pre><code>class ToolAgentFlow(MultiStepAgentFlow):\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        messages = [{\"role\": \"user\", \"content\": kwargs[\"task\"]}]\n        step_index = 0\n\n        while True:\n            text, response_ids, prompt_ids = await self.gateway_generate(...)\n            is_last = self.is_terminal(text)\n\n            step = Step(\n                prompt_ids=prompt_ids,\n                response_ids=response_ids,\n                step_index=step_index,\n                is_last=is_last,\n                ...\n            )\n            await self.gateway_submit_steps([step])\n\n            if is_last:\n                break\n\n            messages.append({\"role\": \"assistant\", \"content\": text})\n            tool_result = await self.execute_tool(text)\n            messages.append({\"role\": \"tool\", \"content\": tool_result})\n            step_index += 1\n\n        return step_index + 1\n</code></pre>"},{"location":"components/agent-flow/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"<p>\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\u3002\u5904\u7406\u4e0e Gateway 
\u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u5c06 Agent \u6267\u884c\u59d4\u6258\u7ed9\u5b50\u7c7b\u7684 <code>_run_agent</code> \u65b9\u6cd5\u3002</p> <p>\u8be6\u7ec6\u6587\u6863\u89c1 Black-box Agent\u3002</p>"},{"location":"components/agent-flow/#_3","title":"\u6ce8\u518c\u673a\u5236","text":"<p>Agent Flow \u901a\u8fc7 <code>@register(\"name\")</code> \u88c5\u9970\u5668\u6ce8\u518c\u5230\u5168\u5c40\u6ce8\u518c\u8868\uff1a</p> <pre><code>from claw_r1.agent_flow.agent_flow import register\n\n@register(\"my_agent_flow\")\nclass MyAgentFlow(AgentFlowBase):\n    ...\n</code></pre> <p>\u4e5f\u53ef\u901a\u8fc7 YAML \u914d\u7f6e\u6587\u4ef6\u6ce8\u518c\uff08\u7528\u4e8e\u9ed1\u76d2 Agent\uff09\uff1a</p> <pre><code># agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n</code></pre>"},{"location":"components/agent-flow/#agentflowmanager-agentflowworker","title":"AgentFlowManager \u548c AgentFlowWorker","text":"<ul> <li>AgentFlowManager\uff1a\u7ba1\u7406\u591a\u4e2a <code>AgentFlowWorker</code>\uff0c\u5c06 batch \u4e2d\u7684\u6bcf\u4e2a\u6837\u672c\u5206\u53d1\u7ed9\u5bf9\u5e94\u7684 Agent Flow \u6267\u884c\u3002</li> <li>AgentFlowWorker\uff1aRay Actor\uff0c\u6301\u6709 tokenizer \u548c\u914d\u7f6e\uff0c\u6267\u884c\u5177\u4f53\u7684 Agent Flow\u3002</li> </ul> <pre><code>AsyncRollouter\n    \u2514\u2500\u2500 AgentFlowManager\n            \u2514\u2500\u2500 AgentFlowWorker (Ray Actor, \u53ef\u591a\u4e2a)\n                    \u2514\u2500\u2500 AgentFlowBase \u5b50\u7c7b\u5b9e\u4f8b\n</code></pre>"},{"location":"components/agent-flow/#_4","title":"\u914d\u7f6e","text":"<p>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a Agent Flow\uff1a</p> <pre><code>python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    
actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n</code></pre>"},{"location":"components/async-training/","title":"Async Training","text":"<p>Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u67b6\u6784\u5c06 rollout\uff08trajectory \u751f\u6210\uff09\u548c training\uff08\u6743\u91cd\u66f4\u65b0\uff09\u5206\u79bb\u4e3a\u4e24\u4e2a\u72ec\u7acb\u7684 Ray Actor\uff0c\u8fd0\u884c\u5728\u4e0d\u540c\u7684 GPU \u6c60\u4e0a\u3002</p>"},{"location":"components/async-training/#_1","title":"\u67b6\u6784","text":"<pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Rollout GPU Pool                                        \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncRollouter (Ray Actor)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 DataLoader (\u904d\u5386\u6570\u636e\u96c6)                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 vLLM replicas (\u63a8\u7406\u5f15\u64ce)                     \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 AgentFlowManager (\u7ba1\u7406 Agent \u6267\u884c)           \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Gateway (FastAPI \u5b50\u8fdb\u7a0b, \u7aef\u53e3 8100)          \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RewardLoopWorker (\u8ba1\u7b97 reward)               \u2502   \u2502\n\u2502  
\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  submit_step (via Gateway \u2192 DataPool)\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502   DataPool       \u2502   \u2190 \u5171\u4eab Ray Actor\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  fetch_batch()\n                       \u25bc\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  Training GPU Pool                                       \u2502\n\u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510   \u2502\n\u2502  \u2502  AsyncTrainer (Ray Actor)  
                      \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Actor worker group (\u7b56\u7565\u6a21\u578b)                \u2502   \u2502\n\u2502  \u2502  \u251c\u2500\u2500 Critic worker group (\u4ef7\u503c\u6a21\u578b)               \u2502   \u2502\n\u2502  \u2502  \u2514\u2500\u2500 RefPolicy worker group (KL baseline)        \u2502   \u2502\n\u2502  \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518   \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502  NCCL weight broadcast\n                       \u25bc\n              AsyncRollouter.update_weights()\n</code></pre>"},{"location":"components/async-training/#asynctrainer","title":"AsyncTrainer","text":"<p><code>AsyncTrainer</code> \u662f\u8fd0\u884c\u5728 Training GPU Pool \u4e0a\u7684 Ray Actor\uff0c\u6267\u884c\u6301\u7eed\u7684 PPO \u8bad\u7ec3\u5faa\u73af\uff1a</p> <ol> <li>\u4ece DataPool <code>fetch_batch()</code> \u2014 \u963b\u585e\u7b49\u5f85\u5b8c\u6574\u7684 <code>prompt_uid</code> \u7ec4</li> <li>\u901a\u8fc7 <code>RewardLoopWorker</code> \u8ba1\u7b97 batch \u7684 reward</li> <li>\u8ba1\u7b97 advantage\uff08GAE \u6216 GRPO\uff09</li> <li>\u6267\u884c PPO Actor + Critic \u66f4\u65b0</li> <li>\u6bcf <code>trigger_parameter_sync_step</code> \u6b65\u89e6\u53d1\u6743\u91cd\u540c\u6b65</li> </ol>"},{"location":"components/async-training/#worker","title":"Worker 
\u521d\u59cb\u5316","text":"<p>AsyncTrainer \u5728 <code>init_workers()</code> \u4e2d\u521b\u5efa Actor\u3001Critic\u3001RefPolicy \u7684 worker group\uff0c\u5e76\u5c06\u5b83\u4eec\u90e8\u7f72\u5230 Training GPU Pool\uff1a</p> <pre><code># \u521b\u5efa\u987a\u5e8f\uff1aCritic \u2192 RefPolicy \u2192 Actor\uff08\u6700\u540e\u521b\u5efa Actor \u4ee5\u514d\u5f71\u54cd vLLM \u5185\u5b58\u4f30\u7b97\uff09\nself.critic_wg.init_model()\nself.ref_policy_wg.init_model()\nself.actor_wg.init_model()\n</code></pre>"},{"location":"components/async-training/#asyncrollouter","title":"AsyncRollouter","text":"<p><code>AsyncRollouter</code> \u8fd0\u884c\u5728 Rollout GPU Pool \u4e0a\uff0c\u6301\u6709\uff1a</p> <ul> <li>DataLoader\uff1a\u904d\u5386\u8bad\u7ec3\u6570\u636e\u96c6</li> <li>vLLM replicas\uff1a\u9ad8\u541e\u5410\u63a8\u7406\u670d\u52a1\u5668</li> <li>AgentFlowManager\uff1a\u7ba1\u7406 <code>AgentFlowBase</code> worker</li> <li>Gateway\uff1aFastAPI HTTP \u670d\u52a1\u5668\uff08\u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff09</li> <li>RewardLoopWorker\uff1a\u5728 rollout \u671f\u95f4\u8ba1\u7b97 reward</li> </ul>"},{"location":"components/async-training/#gateway","title":"Gateway \u542f\u52a8\u6d41\u7a0b","text":"<p>Rollouter \u5c06 Gateway \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u542f\u52a8\uff1a</p> <ol> <li>\u5feb\u901f\u521d\u59cb\u5316\uff08Ray \u8fde\u63a5\u3001DataPool\u3001vLLM \u5730\u5740\uff09\u2192 HTTP \u7acb\u5373\u53ef\u7528</li> <li>Tokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d</li> <li>Rollouter \u8f6e\u8be2 <code>GET /ready</code> \u7b49\u5f85 Gateway \u5b8c\u5168\u5c31\u7eea</li> <li>\u8d85\u65f6\u65f6\u95f4\u53ef\u901a\u8fc7 <code>trainer.gateway_startup_timeout</code> \u914d\u7f6e\uff08\u9ed8\u8ba4 300 \u79d2\uff09</li> </ol>"},{"location":"components/async-training/#_2","title":"\u6682\u505c/\u6062\u590d\uff08\u6743\u91cd\u540c\u6b65\uff09","text":"<p>\u6743\u91cd\u540c\u6b65\u671f\u95f4\uff0cRollouter \u6682\u505c\u751f\u6210\uff1a</p> 
<pre><code>rollouter.pause()                          # \u505c\u6b62\u65b0\u751f\u6210\uff0c\u7b49\u5f85\u8fdb\u884c\u4e2d\u7684\u8bf7\u6c42\u5b8c\u6210\n# NCCL broadcast: Actor weights \u2192 vLLM\nrollouter.update_param_version(new_version)\nrollouter.resume()                         # \u4f7f\u7528\u66f4\u65b0\u540e\u7684\u6743\u91cd\u6062\u590d\u751f\u6210\n</code></pre>"},{"location":"components/async-training/#parametersynchronizer","title":"ParameterSynchronizer","text":"<p>\u8f7b\u91cf\u7ea7 Ray Actor\uff0c\u534f\u8c03 AsyncTrainer \u548c AsyncRollouter \u4e4b\u95f4\u7684\u6743\u91cd\u540c\u6b65\uff1a</p> <pre><code>class ParameterSynchronizer:\n    def sync_weights(self, version, validate=False):\n        # 1. \u6682\u505c rollout\n        # 2. NCCL broadcast: trainer Actor \u2192 vLLM\n        # 3. \u66f4\u65b0 rollouter \u7684 param_version\n        # 4. \u53ef\u9009\uff1a\u8fd0\u884c\u9a8c\u8bc1\n        # 5. \u6062\u590d rollout\n</code></pre>"},{"location":"components/async-training/#advantage","title":"Advantage \u8ba1\u7b97","text":""},{"location":"components/async-training/#gae-generalized-advantage-estimation","title":"GAE (Generalized Advantage Estimation)","text":"<p>\u7528\u4e8e trajectory \u7ea7\u522b\u7684 value baseline\u3002\u5728 step \u7ea7\u522b \u8ba1\u7b97 advantage\uff0c\u7136\u540e\u5e7f\u64ad\u5230 token \u7ea7\u522b\uff08\u540c\u4e00 step \u5185\u6240\u6709 response token \u5171\u4eab\u76f8\u540c\u7684 advantage\uff09\u3002</p>"},{"location":"components/async-training/#grpo-group-relative-policy-optimization","title":"GRPO (Group Relative Policy Optimization)","text":"<p>\u7528\u4e8e prompt \u7ea7\u522b\u7684 baseline\u3002\u5c06\u6765\u81ea\u540c\u4e00 <code>prompt_uid</code> \u7684\u591a\u4e2a rollout \u5206\u7ec4\uff0c\u5728\u7ec4\u5185\u5f52\u4e00\u5316 advantage\u3002\u4e0d\u9700\u8981\u5355\u72ec\u7684 Critic 
\u6a21\u578b\uff0c\u66f4\u8282\u7701\u5185\u5b58\u3002</p>"},{"location":"components/async-training/#_3","title":"\u8d44\u6e90\u6c60\u914d\u7f6e","text":"<p>Trainer \u548c Rollouter \u8fd0\u884c\u5728\u72ec\u7acb\u7684 GPU \u6c60\u4e0a\uff0c\u9632\u6b62\u8d44\u6e90\u7ade\u4e89\uff1a</p> <pre><code># async_ppo_trainer.yaml\n\n# Training GPU Pool (Actor, Critic, RefPolicy)\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2\n\n# Rollout GPU Pool (vLLM)\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1\n</code></pre> <p>\u603b GPU \u6570 = <code>trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node</code>\u3002</p> <p>GPU \u5206\u914d</p> <p>\u5fc5\u987b\u540c\u65f6\u4e3a trainer \u548c rollout \u914d\u7f6e GPU\u3002\u5982\u679c trainer \u6ca1\u6709\u5206\u914d GPU\uff0c\u8bad\u7ec3\u53c2\u6570\uff08Actor\u3001Critic\uff09\u5c06\u65e0\u6cd5\u90e8\u7f72\u5230 GPU \u4e0a\u3002</p>"},{"location":"components/async-training/#_4","title":"\u5173\u952e\u914d\u7f6e","text":"<pre><code># async_ppo_trainer.yaml\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\n  require_batches: 1                 # \u6bcf\u6b21\u4ece DataPool \u53d6\u591a\u5c11\u4e2a batch\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad rollout\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n</code></pre>"},{"location":"components/async-training/#_5","title":"\u5165\u53e3","text":"<pre><code>python3 -m claw_r1.async_main \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    ...\n</code></pre> 
<p>\u5b8c\u6574\u793a\u4f8b\u89c1 <code>example/test_async_blackbox.sh</code>\u3002</p>"},{"location":"components/blackbox-agent/","title":"Black-box Agent","text":"<p>Black-box Agent \u7cfb\u7edf\u5141\u8bb8\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u63a5\u5165 Claw-R1 \u7684\u8bad\u7ec3\u5faa\u73af\uff0c\u65e0\u9700\u4fee\u6539 Agent \u5185\u90e8\u903b\u8f91\u3002Agent \u53ea\u9700\u5c06 <code>base_url</code> \u6307\u5411 Gateway\uff0c\u5373\u53ef\u900f\u660e\u5730\u6536\u96c6\u8bad\u7ec3\u6570\u636e\u3002</p>"},{"location":"components/blackbox-agent/#_1","title":"\u67b6\u6784\u6982\u89c8","text":"<pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502  BlackBoxAgentFlowBase (\u8bad\u7ec3\u4fa7\u7f16\u6392)                           \u2502\n\u2502                                                               \u2502\n\u2502  1. POST /init_trajectory          \u2192 \u83b7\u53d6 base_url            \u2502\n\u2502  2. POST {base_url}/v1/register_trajectory \u2192 \u6ce8\u518c metadata    \u2502\n\u2502  3. 
\u8c03\u7528 _run_agent(base_url, kwargs)                         \u2502\n\u2502     \u2502                                                         \u2502\n\u2502     \u2502  \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510                    \u2502\n\u2502     \u2514\u2500\u2500\u2502  \u5177\u4f53 Agent (\u5982 GSM8KAgent)      \u2502                    \u2502\n\u2502        \u2502  \u53ea\u77e5\u9053 base_url\uff0c\u4f7f\u7528 OpenAI API \u2502                    \u2502\n\u2502        \u2502  POST {base_url}/v1/chat/completions (\u591a\u8f6e)          \u2502\n\u2502        \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518                    \u2502\n\u2502  4. POST {base_url}/v1/complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"components/blackbox-agent/#_2","title":"\u6838\u5fc3\u8bbe\u8ba1","text":""},{"location":"components/blackbox-agent/#_3","title":"\u5173\u6ce8\u70b9\u5206\u79bb","text":"<ul> <li>BlackBoxAgentFlowBase\uff1a\u5904\u7406\u4e0e Gateway \u7684\u5b8c\u6574\u534f\u8bae\uff08init \u2192 register \u2192 complete\uff09\uff0c\u662f\u8bad\u7ec3\u4fa7\u7684\u7f16\u6392\u5c42\u3002</li> <li>\u5177\u4f53 Agent\uff08\u5982 <code>GSM8KAgent</code>\uff09\uff1a\u53ea\u63a5\u6536 <code>base_url</code> 
\u548c\u4efb\u52a1\u53c2\u6570\uff0c\u4f7f\u7528\u6807\u51c6 OpenAI API \u5b8c\u6210\u4efb\u52a1\u3002Agent \u5b8c\u5168\u4e0d\u77e5\u9053\u8bad\u7ec3\u7cfb\u7edf\u7684\u5b58\u5728\u3002</li> </ul> <p>\u8fd9\u79cd\u5206\u79bb\u4f7f\u5f97\uff1a</p> <ul> <li>\u540c\u4e00\u4e2a Agent \u53ef\u4ee5\u5728\u8bad\u7ec3\u6a21\u5f0f\u548c\u72ec\u7acb\u670d\u52a1\u6a21\u5f0f\u4e0b\u590d\u7528</li> <li>\u65b0\u589e\u4efb\u52a1\u53ea\u9700\u5b9e\u73b0 Agent + \u5bf9\u5e94\u7684 Flow \u5b50\u7c7b</li> <li>Agent \u53ef\u4ee5\u7528\u4efb\u4f55\u8bed\u8a00/\u6846\u67b6\u5b9e\u73b0\uff0c\u53ea\u8981\u652f\u6301 OpenAI API</li> </ul>"},{"location":"components/blackbox-agent/#_4","title":"\u6ce8\u518c\u673a\u5236","text":"<p>Agent Flow \u901a\u8fc7 <code>@register(\"name\")</code> \u88c5\u9970\u5668\u6ce8\u518c\uff0c\u5e76\u5728 YAML \u914d\u7f6e\u4e2d\u5f15\u7528\uff1a</p> <pre><code># agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n</code></pre>"},{"location":"components/blackbox-agent/#_5","title":"\u7c7b\u5c42\u6b21","text":"<pre><code>AgentFlowBase                         (agent_flow/agent_flow.py)\n    \u2502\n    \u2514\u2500\u2500 BlackBoxAgentFlowBase          (blackbox_agent/blackbox_agent_flow.py)\n            \u2502\n            \u2514\u2500\u2500 BlackBoxGSM8KAgentFlow (blackbox_agent/gsm8k_agent_flow.py)\n</code></pre>"},{"location":"components/blackbox-agent/#blackboxagentflowbase","title":"BlackBoxAgentFlowBase","text":"<p>\u6240\u6709\u9ed1\u76d2 Agent Flow \u7684\u57fa\u7c7b\uff0c\u5b9e\u73b0\u4e86\u5b8c\u6574\u7684 Gateway \u534f\u8bae\uff1a</p> <pre><code>class BlackBoxAgentFlowBase(AgentFlowBase):\n\n    async def run(self, sampling_params, **kwargs) -&gt; int:\n        # 1. \u63d0\u53d6 channel\u3001prompt_uid\u3001metadata\n        channel, prompt_uid, metadata = self._prepare_params(kwargs)\n\n        # 2. 
init_trajectory \u2192 \u83b7\u53d6 base_url\n        init_resp = await http.post(f\"{self.gateway_url}/init_trajectory\")\n        base_url = ...\n\n        # 3. register_trajectory \u2192 \u6ce8\u518c channel \u548c metadata\n        await http.post(f\"{base_url}/v1/register_trajectory\", json={...})\n\n        # 4. \u8c03\u7528\u5b50\u7c7b\u5b9e\u73b0\u7684 _run_agent\n        num_turns = await self._run_agent(base_url, kwargs)\n\n        # 5. complete_trajectory \u2192 \u6807\u8bb0\u5b8c\u6210\n        await http.post(f\"{base_url}/v1/complete_trajectory\")\n\n        return num_turns\n\n    @abstractmethod\n    async def _run_agent(self, base_url: str, kwargs: dict) -&gt; int:\n        \"\"\"\u5b50\u7c7b\u5b9e\u73b0\uff1a\u521b\u5efa\u5e76\u8fd0\u884c\u5177\u4f53 Agent\u3002\"\"\"\n        ...\n</code></pre> <p>\u5b50\u7c7b\u53ea\u9700\u5b9e\u73b0 <code>_run_agent</code>\uff1a\u4ece <code>kwargs</code> \u4e2d\u63d0\u53d6\u4efb\u52a1\u53c2\u6570\uff0c\u521b\u5efa Agent \u5b9e\u4f8b\uff0c\u8c03\u7528 Agent \u7684\u6267\u884c\u65b9\u6cd5\u3002</p>"},{"location":"components/blackbox-agent/#blackboxgsm8kagentflow","title":"BlackBoxGSM8KAgentFlow","text":"<p>GSM8K \u6570\u5b66\u9898\u7684\u5177\u4f53\u5b9e\u73b0\uff1a</p> <pre><code>@register(\"blackbox_gsm8k_agent\")\nclass BlackBoxGSM8KAgentFlow(BlackBoxAgentFlowBase):\n\n    async def _run_agent(self, base_url: str, kwargs: dict) -&gt; int:\n        from claw_r1.blackbox_agent.gsm8k_agent import GSM8KAgent\n\n        question = ...   # \u4ece kwargs[\"raw_prompt\"] \u63d0\u53d6\n        ground_truth = ...  
# \u4ece kwargs[\"reward_model\"] \u63d0\u53d6\n        max_turns = self.config.actor_rollout_ref.rollout.get(\"max_turns\", 3)\n\n        agent = GSM8KAgent(base_url=base_url)\n        return await agent.solve(\n            question=question,\n            ground_truth=ground_truth,\n            max_turns=max_turns,\n        )\n</code></pre>"},{"location":"components/blackbox-agent/#gsm8kagent","title":"GSM8KAgent","text":"<p>\u4e00\u4e2a\u8bad\u7ec3\u65e0\u5173\u7684 Agent\uff0c\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u89e3\u51b3 GSM8K \u6570\u5b66\u9898\uff1a</p> <ul> <li>\u63a5\u6536 <code>base_url</code>\uff08\u6307\u5411 Gateway\uff09\u548c\u4efb\u52a1\u53c2\u6570</li> <li>\u4f7f\u7528 tool calling\uff08<code>check_answer</code> \u5de5\u5177\uff09\u8fdb\u884c\u591a\u8f6e\u63a8\u7406</li> <li>\u652f\u6301 Qwen \u98ce\u683c\u7684 tool call \u89e3\u6790\uff08<code>\u273fFUNCTION\u273f</code> \u683c\u5f0f\uff09</li> <li>\u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570</li> </ul> <pre><code>agent = GSM8KAgent(base_url=\"http://gateway:8100/traj123/1\")\nnum_turns = await agent.solve(\n    question=\"What is 15 * 23?\",\n    ground_truth=\"345\",\n    max_turns=3,\n)\n</code></pre>"},{"location":"components/blackbox-agent/#agent","title":"\u6dfb\u52a0\u65b0\u7684\u9ed1\u76d2 Agent","text":"<ol> <li>\u5b9e\u73b0 Agent \u7c7b\uff08\u8bad\u7ec3\u65e0\u5173\uff09\uff1a</li> </ol> <pre><code># claw_r1/blackbox_agent/my_agent.py\nclass MyAgent:\n    def __init__(self, base_url: str):\n        self.client = AsyncOpenAI(base_url=base_url, api_key=\"x\")\n\n    async def solve(self, task: str, **kwargs) -&gt; int:\n        # \u4f7f\u7528 self.client \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd\n        # \u8fd4\u56de\u4f7f\u7528\u7684\u8f6e\u6b21\u6570\n        ...\n</code></pre> <ol> <li>\u5b9e\u73b0 Flow \u5b50\u7c7b\uff1a</li> </ol> <pre><code># claw_r1/blackbox_agent/my_agent_flow.py\nfrom claw_r1.agent_flow.agent_flow import register\nfrom 
claw_r1.blackbox_agent.blackbox_agent_flow import BlackBoxAgentFlowBase\n\n@register(\"blackbox_my_agent\")\nclass BlackBoxMyAgentFlow(BlackBoxAgentFlowBase):\n    async def _run_agent(self, base_url, kwargs):\n        from claw_r1.blackbox_agent.my_agent import MyAgent\n        task = kwargs.get(\"raw_prompt\", \"\")\n        agent = MyAgent(base_url=base_url)\n        return await agent.solve(task=task)\n</code></pre> <ol> <li>\u6ce8\u518c\u5230\u914d\u7f6e\uff1a</li> </ol> <pre><code># agent_flow_config.yaml\n- name: blackbox_my_agent\n  _target_: claw_r1.blackbox_agent.my_agent_flow.BlackBoxMyAgentFlow\n</code></pre> <ol> <li>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u4f7f\u7528\uff1a</li> </ol> <pre><code>python3 -m claw_r1.async_main \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_my_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    ...\n</code></pre>"},{"location":"components/blackbox-agent/#_6","title":"\u6587\u4ef6\u7ed3\u6784","text":"<pre><code>claw_r1/blackbox_agent/\n\u251c\u2500\u2500 blackbox_agent_flow.py      # BlackBoxAgentFlowBase \u57fa\u7c7b\n\u251c\u2500\u2500 gsm8k_agent_flow.py         # GSM8K Flow \u5b50\u7c7b\n\u251c\u2500\u2500 gsm8k_agent.py              # GSM8K Agent\uff08\u8bad\u7ec3\u65e0\u5173\uff09\n\u2514\u2500\u2500 agent_flow_config.yaml      # Agent Flow \u6ce8\u518c\u914d\u7f6e\n</code></pre>"},{"location":"components/datapool/","title":"DataPool","text":"<p>DataPool \u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u6838\u5fc3 \u2014 \u4e00\u4e2a Ray Actor\uff0c\u627f\u62c5\u7740 Agent \u4ea4\u4e92\u6570\u636e\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u3001\u8d28\u91cf\u8ffd\u8e2a\u3001\u5206\u533a\u7ba1\u7406\u548c\u6309\u9700\u4f9b\u7ed9\u3002\u5b83\u4e0d\u4ec5\u662f Agent \u4fa7\u4e0e Training 
\u4fa7\u4e4b\u95f4\u7684\u7f13\u51b2\u533a\uff0c\u66f4\u662f\u6574\u4e2a\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u7684\u4e2d\u67a2\u3002</p>"},{"location":"components/datapool/#_1","title":"\u5728\u67b6\u6784\u4e2d\u7684\u89d2\u8272","text":"<pre><code>Gateway \u2500\u2500\u25ba DataPool.submit_steps()     (\u6570\u636e\u91c7\u96c6\uff1a\u5f02\u6b65\u5199\u5165)\nTrainer \u25c4\u2500\u2500 DataPool.fetch_batch()      (\u6570\u636e\u4f9b\u7ed9\uff1a\u963b\u585e\u62c9\u53d6\u5c31\u7eea\u7ec4)\n            DataPool.get_statistics()   (\u6570\u636e\u76d1\u63a7\uff1a\u5b9e\u65f6\u7edf\u8ba1)\n</code></pre> <p>DataPool \u5b8c\u5168\u89e3\u8026\u4e86\u6570\u636e\u91c7\u96c6\u901f\u5ea6\uff08\u7531 Agent \u8bf7\u6c42\u9891\u7387\u9a71\u52a8\uff09\u548c\u6570\u636e\u6d88\u8d39\u901f\u5ea6\uff08\u7531\u8bad\u7ec3\u541e\u5410\u91cf\u9a71\u52a8\uff09\u3002\u53cc\u65b9\u4e92\u4e0d\u7b49\u5f85\u3002</p>"},{"location":"components/datapool/#channel","title":"Channel \u7cfb\u7edf\uff08\u6570\u636e\u5206\u533a\uff09","text":"<p>DataPool \u901a\u8fc7 channel \u5bf9\u6570\u636e\u8fdb\u884c\u5206\u533a\u7ba1\u7406\u3002\u9ed8\u8ba4 channel \u4e3a <code>\"train\"</code>\uff0c\u9a8c\u8bc1\u6d41\u7a0b\u4f7f\u7528 <code>\"val\"</code> channel \u4ee5\u9694\u79bb\u6570\u636e\u3002</p> <pre><code># \u8bad\u7ec3\u6570\u636e\ndata_pool.submit_step(step, channel=\"train\")\n\n# \u9a8c\u8bc1\u6570\u636e\ndata_pool.submit_step(step, channel=\"val\")\n</code></pre> <p>\u6bcf\u4e2a channel \u62e5\u6709\u72ec\u7acb\u7684\u5b58\u50a8\u3001\u7d22\u5f15\u548c FIFO \u961f\u5217\u3002</p>"},{"location":"components/datapool/#_2","title":"\u6570\u636e\u6a21\u578b","text":"<p>DataPool \u4ee5 step \u7c92\u5ea6 \u5b58\u50a8 trajectory\u3002\u6bcf\u4e2a step \u662f\u4e00\u4e2a <code>(s, a, r)</code> \u5143\u7ec4\uff1a</p> <pre><code>@dataclass\nclass Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n  
  reward:         float       # \u8be5 step \u7684\u5373\u65f6 reward\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u4e2d\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\uff08\u7528\u4e8e GRPO\uff09\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\uff080-indexed\uff09\n    policy_version: int         # \u751f\u6210\u8be5 step \u65f6\u7684\u7b56\u7565\u7248\u672c\n    is_last:        bool        # \u662f\u5426\u4e3a trajectory \u7684\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6570\u636e\u96c6\u5b57\u6bb5\u3001\u6765\u6e90\u4fe1\u606f\u7b49\uff09\n</code></pre>"},{"location":"components/datapool/#_3","title":"\u5185\u90e8\u7d22\u5f15","text":"\u7d22\u5f15 \u7c7b\u578b \u7528\u9014 <code>trajectory_index</code> <code>dict[str, list[int]]</code> <code>trajectory_uid</code> \u2192 step \u7d22\u5f15\u5217\u8868 <code>trajectory_complete</code> <code>dict[str, bool]</code> \u8ffd\u8e2a trajectory \u662f\u5426\u5df2\u6536\u5230 <code>is_last</code> step <code>prompt_groups</code> <code>dict[str, PromptGroup]</code> <code>prompt_uid</code> \u2192 trajectory \u5217\u8868\u548c\u5b8c\u6210\u72b6\u6001"},{"location":"components/datapool/#producer-api","title":"Producer API","text":""},{"location":"components/datapool/#submit_stepstep-step-channeltrain","title":"<code>submit_step(step: Step, channel=\"train\")</code>","text":"<p>\u6dfb\u52a0\u5355\u4e2a step \u5230\u6307\u5b9a channel\u3002\u7531 Gateway \u901a\u8fc7 Ray RPC \u8c03\u7528\u3002</p>"},{"location":"components/datapool/#submit_stepssteps-liststep-channeltrain","title":"<code>submit_steps(steps: list[Step], channel=\"train\")</code>","text":"<p>\u6279\u91cf\u63d0\u4ea4\u591a\u4e2a step\u3002\u6bd4\u5faa\u73af\u8c03\u7528 <code>submit_step</code> 
\u66f4\u9ad8\u6548\u3002</p>"},{"location":"components/datapool/#complete_trajectorytrajectory_uid-rewardnone-channeltrain","title":"<code>complete_trajectory(trajectory_uid, reward=None, channel=\"train\")</code>","text":"<p>\u6807\u8bb0\u4e00\u6761 trajectory \u5b8c\u6210\u3002\u7528\u4e8e\u9ed1\u76d2\u6a21\u5f0f\uff0cAgent \u901a\u8fc7 Gateway \u7684 <code>v1/complete_trajectory</code> \u7aef\u70b9\u89e6\u53d1\u3002</p>"},{"location":"components/datapool/#consumer-api","title":"Consumer API","text":""},{"location":"components/datapool/#fetch_batchn_rollouts-channeltrain-liststep-none","title":"<code>fetch_batch(n_rollouts, channel=\"train\") \u2192 list[Step] | None</code>","text":"<p>FIFO \u62c9\u53d6\u4e0b\u4e00\u4e2a\u5c31\u7eea\u7684 <code>prompt_uid</code> \u7ec4\u3002\u4e00\u4e2a\u7ec4\u5728\u6240\u6709 trajectory \u90fd\u6536\u5230 <code>is_last</code> step \u540e\u53d8\u4e3a\"\u5c31\u7eea\"\u3002</p> <p>\u5f53\u6ca1\u6709\u5b8c\u6574\u7ec4\u53ef\u7528\u65f6\u8fd4\u56de <code>None</code>\u3002</p> <pre><code># Trainer \u4fa7\nwhile True:\n    batch = await data_pool.fetch_batch.remote(n_rollouts=5)\n    if batch is not None:\n        train_on_batch(batch)\n</code></pre>"},{"location":"components/datapool/#_4","title":"\u5bb9\u91cf\u7ba1\u7406\u4e0e\u80cc\u538b\u63a7\u5236","text":"<p>\u5f53\u8bbe\u7f6e <code>max_queue_size</code> \u65f6\uff0cDataPool \u5728\u961f\u5217\u6ee1\u65f6\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u7684\u5c31\u7eea\u7ec4\uff0c\u9632\u6b62\u6570\u636e\u5806\u79ef\u5bfc\u81f4\u5185\u5b58\u65e0\u9650\u589e\u957f\u3002\u8fd9\u79cd\u80cc\u538b\u673a\u5236\u4e5f\u786e\u4fdd\u4e86\u8bad\u7ec3\u4fa7\u6d88\u8d39\u7684\u6570\u636e\u5c3d\u53ef\u80fd\u65b0\u9c9c\uff1a</p> <pre><code>async_training:\n  max_queue_size: null   # null = \u65e0\u9650\n</code></pre>"},{"location":"components/datapool/#training-backend","title":"Training Backend\uff08\u6570\u636e\u4f9b\u7ed9\u9002\u914d\uff09","text":"<p>DataPool \u901a\u8fc7\u53ef\u63d2\u62d4\u7684 
<code>TrainingBackend</code> \u5c06 <code>list[Step]</code> \u8f6c\u6362\u4e3a\u4efb\u610f\u8bad\u7ec3\u5f15\u64ce\u7684\u539f\u751f\u683c\u5f0f\uff0c\u5b9e\u73b0\u6570\u636e\u7ba1\u7406\u4e0e\u8bad\u7ec3\u6846\u67b6\u7684\u89e3\u8026\uff1a</p> <pre><code>class VerlBackend(TrainingBackend):\n    \"\"\"\u5c06 Step \u5217\u8868\u8f6c\u6362\u4e3a verl DataProto\u3002\"\"\"\n\n    def convert(self, steps: list[Step]) -&gt; DataProto:\n        # prompt_ids: \u5de6\u586b\u5145\u5230 prompt_length\n        # response_ids: \u53f3\u586b\u5145\u5230 response_length\n        # input_ids: [prompt_ids | response_ids]\n        # attention_mask, position_ids, response_mask \u7b49\n        ...\n</code></pre>"},{"location":"components/datapool/#off-policy","title":"Off-policy \u652f\u6301\uff08\u6570\u636e\u65b0\u9c9c\u5ea6\u7ba1\u63a7\uff09","text":"<p>\u6bcf\u4e2a Step \u90fd\u8bb0\u5f55\u4e86\u751f\u6210\u65f6\u7684 <code>policy_version</code>\uff0cDataPool \u548c Trainer \u53ef\u4ee5\u636e\u6b64\u5224\u65ad\u6570\u636e\u7684\u65b0\u9c9c\u5ea6\u3002Trainer \u901a\u8fc7 staleness threshold \u914d\u7f6e\u6765\u5904\u7406\u5386\u53f2\uff08off-policy\uff09\u6570\u636e\uff1a</p> <pre><code>async_training:\n  staleness_threshold: 0.1   # policy_version \u6ede\u540e &gt; threshold \u7684 step \u4e3a off-policy\n</code></pre> <p>Off-policy step \u4ecd\u5305\u542b\u5728 batch \u4e2d\uff0c\u4f46\u5728 loss \u8ba1\u7b97\u65f6\u901a\u8fc7 importance sampling \u8fdb\u884c\u964d\u6743\u3002</p>"},{"location":"components/gateway/","title":"Gateway Server","text":"<p>Gateway Server \u662f\u4e00\u4e2a FastAPI HTTP \u670d\u52a1\uff0c\u4f5c\u4e3a Agent \u4e0e Claw-R1 \u8bad\u7ec3\u57fa\u7840\u8bbe\u65bd\u4e4b\u95f4\u7684\u7f51\u7edc\u5c42\u4ee3\u7406\u3002</p>"},{"location":"components/gateway/#_1","title":"\u8bbe\u8ba1\u539f\u5219","text":"<ul> <li>\u72ec\u7acb\u8fdb\u7a0b\uff1aGateway \u4f5c\u4e3a\u666e\u901a OS \u8fdb\u7a0b\u8fd0\u884c\uff08\u975e Ray 
Actor\uff09\uff0c\u53ef\u4ee5\u72ec\u7acb\u4e8e Ray \u96c6\u7fa4\u91cd\u542f\u3002</li> <li>\u7eaf\u4ee3\u7406\uff1aGateway \u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8d1f\u8d23\u8f6c\u53d1\u8bf7\u6c42\u3001\u6536\u96c6 Step\u3001\u63d0\u4ea4\u5230 DataPool\u3002</li> <li>OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2\u7aef\u70b9\u5b9e\u73b0\u4e0e OpenAI chat completions API \u76f8\u540c\u7684\u63a5\u53e3\uff0c\u53ef\u4f5c\u4e3a drop-in \u66ff\u6362\u3002</li> <li>\u5ef6\u8fdf\u521d\u59cb\u5316\uff1a\u542f\u52a8\u65f6\u5148\u5feb\u901f\u521d\u59cb\u5316 Ray \u8fde\u63a5\u548c\u914d\u7f6e\uff0cHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff1btokenizer \u5728\u540e\u53f0\u7ebf\u7a0b\u52a0\u8f7d\uff0c\u901a\u8fc7 <code>/ready</code> \u7aef\u70b9\u62a5\u544a\u5c31\u7eea\u72b6\u6001\u3002</li> </ul>"},{"location":"components/gateway/#_2","title":"\u542f\u52a8\u65b9\u5f0f","text":"<p>Gateway \u901a\u5e38\u7531 <code>AsyncRollouter</code> \u4f5c\u4e3a\u5b50\u8fdb\u7a0b\u81ea\u52a8\u542f\u52a8\u3002\u4e5f\u53ef\u624b\u52a8\u542f\u52a8\uff1a</p> <pre><code>python -m claw_r1.gateway.gateway \\\n    --data-pool-name  data_pool \\\n    --vllm-addresses  http://host1:8001,http://host2:8001 \\\n    --tokenizer-path  /path/to/model \\\n    --prompt-length   4096 \\\n    --response-length 1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address     auto \\\n    --ray-namespace   default \\\n    --host            0.0.0.0 \\\n    --port            8100\n</code></pre>"},{"location":"components/gateway/#_3","title":"\u53c2\u6570","text":"\u53c2\u6570 \u5fc5\u586b \u8bf4\u660e <code>--data-pool-name</code> \u662f DataPool \u7684 Ray Actor \u540d\u79f0 <code>--vllm-addresses</code> \u662f \u9017\u53f7\u5206\u9694\u7684 vLLM \u670d\u52a1\u5668\u5730\u5740\u5217\u8868\uff08\u8f6e\u8be2\u8d1f\u8f7d\u5747\u8861\uff09 <code>--tokenizer-path</code> \u662f HuggingFace tokenizer \u8def\u5f84 <code>--prompt-length</code> \u662f \u6700\u5927 prompt 
token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 <code>--response-length</code> \u662f \u6700\u5927 response token \u957f\u5ea6\uff08\u7528\u4e8e padding\uff09 <code>--reward-worker-name</code> \u5426 RewardLoopWorker \u7684 Ray Actor \u540d\u79f0 <code>--ray-address</code> \u5426 Ray GCS \u5730\u5740\uff08\u9ed8\u8ba4 <code>auto</code>\uff09 <code>--ray-namespace</code> \u5426 Ray namespace <code>--host</code> \u5426 \u76d1\u542c\u5730\u5740\uff08\u9ed8\u8ba4 <code>0.0.0.0</code>\uff09 <code>--port</code> \u5426 \u76d1\u542c\u7aef\u53e3\uff08\u9ed8\u8ba4 <code>8100</code>\uff09"},{"location":"components/gateway/#_4","title":"\u4e24\u79cd\u5de5\u4f5c\u6a21\u5f0f","text":""},{"location":"components/gateway/#white-box","title":"White-box \u6a21\u5f0f","text":"<p>\u767d\u76d2 Agent\uff08<code>AgentFlowBase</code> \u5b50\u7c7b\uff09\u901a\u8fc7 Gateway \u6839\u8def\u5f84\u7aef\u70b9\u4ea4\u4e92\uff1a</p> <pre><code>AgentFlow \u2192 POST /generate        \u2192 vLLM \u2192 \u8fd4\u56de token IDs\nAgentFlow \u2192 POST /submit_steps    \u2192 DataPool\nAgentFlow \u2192 POST /compute_reward  \u2192 RewardLoopWorker\n</code></pre> <p>Agent \u81ea\u5df1\u7ba1\u7406 tokenize\u3001Step \u6784\u5efa\u548c\u63d0\u4ea4\u3002</p>"},{"location":"components/gateway/#black-box","title":"Black-box \u6a21\u5f0f","text":"<p>\u9ed1\u76d2 Agent \u53ea\u9700\u8981\u4e00\u4e2a <code>base_url</code>\uff0c\u901a\u8fc7\u6807\u51c6 OpenAI \u63a5\u53e3\u4ea4\u4e92\uff1a</p> <pre><code>1. BlackBoxAgentFlow \u2192 POST /init_trajectory           \u2192 \u83b7\u53d6 base_url\n2. BlackBoxAgentFlow \u2192 POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent             \u2192 POST {base_url}/v1/chat/completions     \u2192 \u6807\u51c6 OpenAI \u8c03\u7528\uff08\u53ef\u591a\u8f6e\uff09\n4. 
BlackBoxAgentFlow \u2192 POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n</code></pre> <p>Gateway \u5728 <code>v1/chat/completions</code> \u5185\u90e8\u81ea\u52a8\u5b8c\u6210 tokenize\u3001Step \u6784\u5efa\u548c DataPool \u63d0\u4ea4\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002</p>"},{"location":"components/gateway/#base_url","title":"base_url \u673a\u5236","text":"<p><code>base_url</code> \u7684\u683c\u5f0f\u4e3a\uff1a</p> <pre><code>http://&lt;host&gt;:&lt;port&gt;/&lt;trajectory_uid&gt;/&lt;prompt_uid&gt;\n</code></pre> <p><code>trajectory_uid</code> \u548c <code>prompt_uid</code> \u7f16\u7801\u5728 URL path \u4e2d\uff0c\u4f7f\u5f97 Gateway \u80fd\u5c06\u8bf7\u6c42\u5173\u8054\u5230\u6b63\u786e\u7684 trajectory\uff0c\u800c Agent \u7aef\u53ea\u9700\u4fee\u6539 <code>base_url</code> \u5373\u53ef\u63a5\u5165\u8bad\u7ec3\u7cfb\u7edf\u3002</p> <pre><code>from openai import OpenAI\n\nclient = OpenAI(\n    base_url=\"http://gateway:8100/abc123/1\",  # base_url \u7531 init_trajectory \u8fd4\u56de\n    api_key=\"not-needed\",\n)\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": \"Hello\"}],\n)\n</code></pre>"},{"location":"components/gateway/#_5","title":"\u5185\u90e8\u72b6\u6001\u7ba1\u7406","text":"<p>Gateway \u4e3a\u6bcf\u6761 trajectory \u7ef4\u62a4\u4ee5\u4e0b\u72b6\u6001\uff1a</p> \u72b6\u6001 \u8bf4\u660e <code>_trajectory_step_counter</code> \u6bcf\u6761 trajectory \u7684\u4e0b\u4e00\u4e2a step_index <code>_trajectory_channel</code> trajectory \u5bf9\u5e94\u7684 DataPool channel\uff08\u9ed8\u8ba4 <code>\"train\"</code>\uff09 <code>_trajectory_metadata</code> trajectory \u5173\u8054\u7684 metadata\uff08\u5982 reward_model\u3001data_source \u7b49\uff09 <p>\u8fd9\u4e9b\u72b6\u6001\u5728 <code>register_trajectory</code> \u65f6\u8bbe\u7f6e\uff0c\u5728 <code>complete_trajectory</code> 
\u65f6\u6e05\u7406\u3002</p>"},{"location":"components/gateway/#_6","title":"\u8d1f\u8f7d\u5747\u8861","text":"<p>\u5f53\u63d0\u4f9b\u591a\u4e2a <code>--vllm-addresses</code> \u65f6\uff0cGateway \u4f7f\u7528 round-robin \u8f6e\u8be2\u5206\u53d1\u8bf7\u6c42\uff1a</p> <pre><code>_vllm_cycle = itertools.cycle(vllm_addresses)\nvllm_url = next(_vllm_cycle)\n</code></pre>"},{"location":"components/gateway/#api","title":"API \u53c2\u8003","text":"<p>\u5b8c\u6574\u7684\u7aef\u70b9\u6587\u6863\u89c1 Gateway API\u3002</p>"},{"location":"components/reward-system/","title":"Reward System","text":"<p>The <code>RewardLoopWorker</code> is a Ray Actor responsible for assigning reward scores to trajectory steps. It bridges the gap between raw agent interactions and trainable reward signals.</p>"},{"location":"components/reward-system/#three-reward-sources","title":"Three Reward Sources","text":"<p>Claw-R1 supports three types of reward computation, which can be combined:</p> Type Description Best For Rule-based Deterministic function of step output Verifiable tasks (math, code execution) Discriminative RM Binary classifier reward model Preference learning, safety evaluation Generative RM LLM-based evaluator via custom scoring function Complex quality assessment, nuanced feedback"},{"location":"components/reward-system/#reward-in-production-vs-research-settings","title":"Reward in Production vs. Research Settings","text":"<p>In research settings (white-box offline mode), rewards are computed from known ground truth:</p> <pre><code>Trajectory:   [user msg] \u2192 [agent think] \u2192 [tool call] \u2192 [tool result] \u2192 [final reply]\nReward:            0.0            0.3            0.7            0.9            0.8\n</code></pre> <ul> <li>Rule-based: is the final answer correct? does the code pass tests?</li> <li>Model-based: is each step logically sound? 
is the tool use appropriate?</li> </ul> <p>In production settings (online mode), rewards come from real user signals:</p> Signal Type Interpretation User sends follow-up Implicit positive Agent answer was relevant but incomplete User corrects the agent Negative feedback Factual or task error User says \"thanks\" Positive signal Task completed satisfactorily No follow-up after task Neutral / estimated Reward Model estimates step quality <p>Claw-R1 uses a Reward Model to convert these soft signals into scalar process rewards, filling the gap between verifiable task rewards and open-ended conversational rewards.</p>"},{"location":"components/reward-system/#rewardloopworker-api","title":"RewardLoopWorker API","text":""},{"location":"components/reward-system/#compute_score_batchsteps-liststep-listfloat","title":"<code>compute_score_batch(steps: list[Step]) \u2192 list[float]</code>","text":"<p>Computes rewards for a batch of steps. This is the primary interface used by the Trainer.</p> <pre><code># In AsyncTrainer\nrewards = await reward_worker.compute_score_batch.remote(batch_steps)\nfor step, reward in zip(batch_steps, rewards):\n    step.reward = reward\n</code></pre>"},{"location":"components/reward-system/#custom-reward-function","title":"Custom Reward Function","text":"<p>Register a custom generative reward model by implementing the <code>reward_loop_manager</code> interface:</p> <pre><code># custom_reward.py\ndef compute_reward(step: dict, model, tokenizer) -&gt; float:\n    \"\"\"\n    Args:\n        step: dict with keys 'messages', 'response', 'metadata'\n        model: loaded reward model\n        tokenizer: model tokenizer\n    Returns:\n        scalar reward in [0.0, 1.0]\n    \"\"\"\n    prompt = build_evaluation_prompt(step)\n    score = model.score(prompt)\n    return score\n</code></pre> <p>Then register it in the configuration:</p> <pre><code>reward:\n  type: genrm\n  reward_loop_manager: path.to.custom_reward.compute_reward\n  model_path: 
/path/to/reward/model\n</code></pre>"},{"location":"components/reward-system/#reward-in-the-training-loop","title":"Reward in the Training Loop","text":"<p>Reward computation is decoupled from the agent service:</p> <ol> <li>The Gateway does not compute rewards before submitting steps to DataPool</li> <li>DataPool stores steps with <code>reward=0.0</code> initially</li> <li>The Trainer calls <code>RewardLoopWorker.compute_score_batch()</code> before the PPO update</li> <li>Updated rewards are used for advantage computation</li> </ol> <p>This ensures that even slow generative reward models (which may call an external LLM) do not affect agent service latency.</p> <p>Reward Design</p> <p>For new tasks, start with simple rule-based rewards (e.g., exact match, code execution pass rate). Generative reward models are more expressive but introduce variance and computational cost. Use discriminative models as a middle ground.</p>"},{"location":"concepts/","title":"Core Concepts","text":"<p>Claw-R1 \u7684\u8bbe\u8ba1\u56f4\u7ed5\u4e09\u4e2a\u6838\u5fc3\u6982\u5ff5\u5c55\u5f00\uff1a\u901a\u7528\u6570\u636e\u91c7\u96c6\u3001\u6570\u636e\u4e2d\u95f4\u4ef6\u7ba1\u7406\u548c\u6570\u636e\u9a71\u52a8\u7684\u6301\u7eed\u8fdb\u5316\u3002\u5b83\u4eec\u5171\u540c\u6784\u6210\u4e00\u4e2a\u4ece\u91c7\u96c6\u5230\u8bad\u7ec3\u7684\u6570\u636e\u98de\u8f6e\u3002</p> <ul> <li> <p>Base URL Integration \u00b7 \u901a\u7528\u6570\u636e\u91c7\u96c6</p> <p>\u96f6\u4ee3\u7801\u4fb5\u5165\u7684 Agent \u6570\u636e\u91c7\u96c6\u673a\u5236\u3002\u4efb\u4f55\u4f7f\u7528 OpenAI \u517c\u5bb9 API \u7684 Agent \u53ea\u9700\u4fee\u6539 <code>base_url</code>\uff0cGateway \u5373\u53ef\u81ea\u52a8\u91c7\u96c6\u5176\u4ea4\u4e92\u6570\u636e\u3002</p> <p> Base URL Integration</p> </li> <li> <p>Middleware Layer \u00b7 \u6570\u636e\u4e2d\u95f4\u4ef6</p> <p>Gateway + DataPool 
\u6570\u636e\u57fa\u7840\u8bbe\u65bd\u3002\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u5165\u53e3\u3001\u8d28\u91cf\u7ba1\u7406\u3001\u5206\u533a\u7f13\u51b2\u548c\u6309\u9700\u4f9b\u7ed9\u3002</p> <p> Middleware Layer</p> </li> <li> <p>Production Scenario \u00b7 \u6570\u636e\u9a71\u52a8\u8fdb\u5316</p> <p>\"\u90e8\u7f72 = \u8bad\u7ec3\" \u8303\u5f0f\u3002Agent \u5728\u670d\u52a1\u7528\u6237\u7684\u540c\u65f6\u6301\u7eed\u91c7\u96c6\u4ea4\u4e92\u6570\u636e\uff0c\u7528\u6237\u884c\u4e3a\u5929\u7136\u6210\u4e3a\u6570\u636e\u8d28\u91cf\u4fe1\u53f7\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316\u3002</p> <p> Production Scenario</p> </li> </ul>"},{"location":"concepts/#_1","title":"\u6570\u636e\u98de\u8f6e","text":"<pre><code>                    base_url\n                 \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                 \u2502 \u4efb\u610f Agent  \u2502\n                 \u2502 (\u767d\u76d2/\u9ed1\u76d2) \u2502\n                 \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                        \u2502 OpenAI API\n                        \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Gateway       \u2502 \u2190 \u6570\u636e\u91c7\u96c6\u5165\u53e3\n              \u2502  (\u81ea\u52a8\u91c7\u96c6 Step)  \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    DataPool      \u2502 \u2190 \u6570\u636e\u7ba1\u7406\u6838\u5fc3\n              \u2502  (\u8bc4\u4f30\u00b7\u7b5b\u9009\u00b7\u4f9b\u7ed9) \u2502    (\u8d28\u91cf\u8bc4\u4f30 + 
\u5206\u533a\u7ba1\u7406)\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    Trainer       \u2502 \u2190 \u6570\u636e\u6d88\u8d39\n              \u2502  (\u6301\u7eed\u8bad\u7ec3)       \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                       \u2502 \u6743\u91cd\u540c\u6b65\n                       \u25bc\n              \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n              \u2502    vLLM          \u2502\n              \u2502  (\u66f4\u597d\u7684\u6a21\u578b)     \u2502\n              \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre> <p>\u4e09\u4e2a\u6982\u5ff5\u7684\u534f\u540c\uff1a</p> <ol> <li>Base URL \u8ba9\u4efb\u4f55 Agent \u7684\u4ea4\u4e92\u6570\u636e\u96f6\u6210\u672c\u88ab\u91c7\u96c6</li> <li>Middleware \u7ba1\u7406\u6570\u636e\u7684\u8d28\u91cf\u3001\u5206\u533a\u548c\u4f9b\u7ed9</li> <li>Production Scenario \u8ba9\u4eba\u7c7b\u53cd\u9988\u4fe1\u53f7\u81ea\u7136\u878d\u5165\u6570\u636e\uff0c\u9a71\u52a8\u6a21\u578b\u6301\u7eed\u8fdb\u5316</li> </ol>"},{"location":"concepts/base-url-integration/","title":"Base URL Integration","text":""},{"location":"concepts/base-url-integration/#agent-llm","title":"\u95ee\u9898\uff1a\u5982\u4f55\u62e6\u622a\u9ed1\u76d2 Agent \u7684 LLM \u8c03\u7528\uff1f","text":"<p>\u5728 Agentic RL \u4e2d\uff0c\u8bad\u7ec3\u7cfb\u7edf\u9700\u8981\u62e6\u622a Agent \u4e0e LLM \u4e4b\u95f4\u7684\u6bcf\u6b21\u4ea4\u4e92\uff0c\u4ee5\u6536\u96c6 <code>(state, 
action, reward)</code> \u6570\u636e\u3002\u5bf9\u4e8e\u767d\u76d2 Agent\uff08\u6e90\u7801\u53ef\u63a7\uff09\uff0c\u8fd9\u5f88\u7b80\u5355\u3002\u4f46\u5bf9\u4e8e\u9ed1\u76d2 Agent\uff08\u5982\u7b2c\u4e09\u65b9\u670d\u52a1\u3001\u7f16\u8bd1\u540e\u7684\u4e8c\u8fdb\u5236\u6587\u4ef6\uff09\uff0c\u5982\u4f55\u5728\u4e0d\u4fee\u6539 Agent \u4ee3\u7801\u7684\u60c5\u51b5\u4e0b\u62e6\u622a\uff1f</p>"},{"location":"concepts/base-url-integration/#_1","title":"\u65b9\u6848\u5bf9\u6bd4","text":"\u65b9\u6848 \u4fb5\u5165\u6027 \u53ef\u9760\u6027 \u9002\u7528\u8303\u56f4 SDK monkey-patch \u4e2d \u4f4e\uff08\u7248\u672c\u66f4\u65b0\u6613\u5931\u6548\uff09 \u4ec5\u9650\u7279\u5b9a SDK \u4ee3\u7406\u5c42\uff08Proxy\uff09 \u9ad8 \u4e2d\uff08\u9700\u914d\u7f6e\u7f51\u7edc\uff09 \u901a\u7528 base_url \u66ff\u6362 \u6781\u4f4e \u9ad8 \u6240\u6709 OpenAI \u517c\u5bb9 SDK"},{"location":"concepts/base-url-integration/#base_url","title":"base_url \u673a\u5236","text":"<p>\u51e0\u4e4e\u6240\u6709 OpenAI \u517c\u5bb9\u7684 SDK \u90fd\u652f\u6301\u81ea\u5b9a\u4e49 <code>base_url</code>\u3002Claw-R1 \u5229\u7528\u8fd9\u4e00\u70b9\uff1a</p> <ol> <li>Gateway \u66b4\u9732 <code>POST {base_url}/v1/chat/completions</code> \u7aef\u70b9</li> <li>Agent \u53ea\u9700\u5c06 <code>base_url</code> \u4ece <code>https://api.openai.com</code> \u6539\u4e3a Gateway \u7684\u5730\u5740</li> <li>Gateway \u900f\u660e\u5730\u8f6c\u53d1\u8bf7\u6c42\u5230 vLLM\uff0c\u540c\u65f6\u81ea\u52a8\u6536\u96c6\u8bad\u7ec3\u6570\u636e</li> </ol> <pre><code>from openai import OpenAI\n\n# \u539f\u59cb\u4ee3\u7801\nclient = OpenAI(base_url=\"https://api.openai.com/v1\")\n\n# \u63a5\u5165 Claw-R1\uff1a\u53ea\u6539\u4e00\u884c\nclient = OpenAI(\n    base_url=\"http://gateway:8100/traj123/prompt1\",\n    api_key=\"not-needed\",\n)\n\n# \u540e\u7eed\u4ee3\u7801\u5b8c\u5168\u4e0d\u53d8\nresponse = client.chat.completions.create(\n    model=\"qwen\",\n    messages=[{\"role\": \"user\", \"content\": 
\"Hello\"}],\n)\n</code></pre>"},{"location":"concepts/base-url-integration/#base_url_1","title":"base_url \u7684\u7ed3\u6784","text":"<pre><code>http://&lt;host&gt;:&lt;port&gt;/&lt;trajectory_uid&gt;/&lt;prompt_uid&gt;\n</code></pre> <ul> <li><code>trajectory_uid</code>\uff1a\u6807\u8bc6\u4e00\u6761\u5b8c\u6574\u7684\u5bf9\u8bdd\u8f68\u8ff9</li> <li><code>prompt_uid</code>\uff1a\u6807\u8bc6\u540c\u4e00 prompt \u7684\u591a\u6b21 rollout\uff08\u7528\u4e8e GRPO \u5206\u7ec4\uff09</li> </ul> <p>\u8fd9\u4e24\u4e2a ID \u7f16\u7801\u5728 URL path \u4e2d\uff0cGateway \u4ece path \u4e2d\u63d0\u53d6\uff0cAgent \u5b8c\u5168\u65e0\u611f\u77e5\u3002</p>"},{"location":"concepts/base-url-integration/#claw-r1","title":"\u5728 Claw-R1 \u4e2d\u7684\u4f7f\u7528","text":""},{"location":"concepts/base-url-integration/#_2","title":"\u9ed1\u76d2\u79bb\u7ebf\u6a21\u5f0f","text":"<p><code>BlackBoxAgentFlowBase</code> \u81ea\u52a8\u7ba1\u7406 <code>base_url</code> \u7684\u751f\u547d\u5468\u671f\uff1a</p> <pre><code>1. POST /init_trajectory              \u2192 \u83b7\u53d6 base_url\n2. POST {base_url}/v1/register_trajectory  \u2192 \u6ce8\u518c channel/metadata\n3. Agent \u4f7f\u7528 base_url \u8fdb\u884c\u591a\u8f6e\u5bf9\u8bdd     \u2192 Gateway \u81ea\u52a8\u6536\u96c6 Step\n4. 
POST {base_url}/v1/complete_trajectory  \u2192 \u6807\u8bb0\u5b8c\u6210\n</code></pre> <p>Agent \u53ea\u9700\u8981\u63a5\u6536 <code>base_url</code> \u53c2\u6570\uff0c\u5176\u4f59\u7531\u8bad\u7ec3\u6846\u67b6\u5904\u7406\u3002</p>"},{"location":"concepts/base-url-integration/#_3","title":"\u9ed1\u76d2\u5728\u7ebf\u6a21\u5f0f","text":"<p>\u5728\u7ebf\u6a21\u5f0f\u4e0b\uff0c\u5916\u90e8\u670d\u52a1\u76f4\u63a5\u8c03\u7528 Gateway \u7684 <code>init_trajectory</code> \u83b7\u53d6 <code>base_url</code>\uff0c\u7136\u540e\u5c06\u5176\u4f20\u9012\u7ed9 Agent\u3002Agent \u7684\u6bcf\u6b21 LLM \u8c03\u7528\u90fd\u81ea\u52a8\u88ab Gateway \u8bb0\u5f55\u3002</p>"},{"location":"concepts/base-url-integration/#sdk-hook","title":"\u4e3a\u4ec0\u4e48\u4f18\u4e8e SDK Hook","text":"\u7ef4\u5ea6 SDK Hook base_url Agent \u4ee3\u7801\u4fee\u6539 \u9700\u8981\u6ce8\u5165 hook \u4ee3\u7801 \u53ea\u6539\u4e00\u4e2a\u53c2\u6570 \u591a\u8bed\u8a00\u652f\u6301 \u6bcf\u79cd\u8bed\u8a00\u9700\u8981\u5355\u72ec\u5b9e\u73b0 \u6240\u6709\u8bed\u8a00\u901a\u7528 \u7248\u672c\u517c\u5bb9\u6027 SDK \u66f4\u65b0\u53ef\u80fd\u7834\u574f hook HTTP \u534f\u8bae\u7a33\u5b9a \u8c03\u8bd5\u96be\u5ea6 Hook \u5c42\u589e\u52a0\u8c03\u8bd5\u590d\u6742\u5ea6 \u6807\u51c6 HTTP \u8bf7\u6c42\uff0c\u6613\u4e8e\u8c03\u8bd5 \u751f\u4ea7\u53ef\u9760\u6027 \u4e2d\u7b49 \u9ad8"},{"location":"concepts/base-url-integration/#sdk","title":"\u652f\u6301\u7684 SDK \u548c\u6846\u67b6","text":"<p>\u4efb\u4f55\u652f\u6301\u81ea\u5b9a\u4e49 <code>base_url</code> \u7684 OpenAI \u517c\u5bb9 SDK \u90fd\u53ef\u4ee5\u76f4\u63a5\u4f7f\u7528\uff1a</p> <ul> <li>Python: <code>openai</code>, <code>httpx</code>, <code>requests</code></li> <li>JavaScript/TypeScript: <code>openai-node</code></li> <li>Go: <code>go-openai</code></li> <li>\u6846\u67b6: LangChain, LlamaIndex, AutoGen, CrewAI \u7b49</li> </ul>"},{"location":"concepts/middleware-layer/","title":"Middleware 
Layer","text":""},{"location":"concepts/middleware-layer/#_1","title":"\u4e3a\u4ec0\u4e48\u9700\u8981\u6570\u636e\u4e2d\u95f4\u4ef6\uff1f","text":"<p>Agentic RL \u4e2d\uff0cAgent \u4ea7\u751f\u4ea4\u4e92\u6570\u636e\uff0cTrainer \u6d88\u8d39\u6570\u636e\u8fdb\u884c\u8bad\u7ec3\u3002\u7136\u800c\u5728\u5b9e\u9645\u573a\u666f\u4e2d\uff0c\u4e24\u8005\u4e4b\u95f4\u5b58\u5728\u663e\u8457\u7684\u4e0d\u5bf9\u79f0\uff1a</p> <ul> <li>\u6570\u636e\u6765\u6e90\u591a\u6837\uff1a\u767d\u76d2 Agent\u3001\u9ed1\u76d2 Agent\u3001\u5728\u7ebf\u670d\u52a1 Agent\uff0c\u4ea7\u51fa\u7684\u6570\u636e\u683c\u5f0f\u548c\u9891\u7387\u5404\u4e0d\u76f8\u540c</li> <li>\u6570\u636e\u8d28\u91cf\u53c2\u5dee\uff1a\u5e76\u975e\u6240\u6709\u4ea4\u4e92\u90fd\u6709\u8bad\u7ec3\u4ef7\u503c\uff0c\u9700\u8981\u8bc4\u4f30\u548c\u7b5b\u9009</li> <li>\u4ea7\u6d88\u901f\u7387\u4e0d\u5339\u914d\uff1aAgent \u4fa7\u7684\u6570\u636e\u4ea7\u751f\u901f\u7387\u4e0e Trainer \u4fa7\u7684\u6d88\u8d39\u901f\u7387\u5f80\u5f80\u4e0d\u540c\u6b65</li> <li>\u6570\u636e\u9700\u8981\u7ba1\u7406\uff1a\u5206\u533a\u3001\u7d22\u5f15\u3001\u80cc\u538b\u63a7\u5236\u3001\u7edf\u8ba1\u76d1\u63a7 \u2014 \u8fd9\u4e9b\u4e0d\u662f\u7b80\u5355\u7684\u961f\u5217\u80fd\u89e3\u51b3\u7684</li> </ul> <p>Claw-R1 \u901a\u8fc7 Middleware Layer\uff08Gateway + DataPool\uff09\u5728 Agent \u4fa7\u548c Training \u4fa7\u4e4b\u95f4\u5efa\u7acb\u4e00\u5c42\u6570\u636e\u57fa\u7840\u8bbe\u65bd\uff0c\u7edf\u4e00\u89e3\u51b3\u6570\u636e\u7684\u91c7\u96c6\u3001\u7ba1\u7406\u548c\u4f9b\u7ed9\u95ee\u9898\u3002</p>"},{"location":"concepts/middleware-layer/#gateway-datapool","title":"Gateway + DataPool \u67b6\u6784","text":"<pre><code>Agent \u4fa7                    Middleware                    Training \u4fa7\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510           
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502 Agent    \u2502\u2500\u2500HTTP\u2500\u2500\u25ba  \u2502  Gateway         \u2502\u2500\u2500Ray RPC\u2500\u2500\u25ba\u2502  DataPool    \u2502\n\u2502 (\u4efb\u610f)   \u2502\u25c4\u2500\u2500HTTP\u2500\u2500  \u2502  (FastAPI, 8100) \u2502           \u2502  (Ray Actor) \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518           \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n                                                             \u2502 fetch_batch()\n                                                             \u25bc\n                                                      \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                      \u2502  Trainer     \u2502\n                                                      \u2502  (Ray Actor) \u2502\n                                                      \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"concepts/middleware-layer/#gateway","title":"Gateway\uff1a\u6570\u636e\u91c7\u96c6\u5165\u53e3","text":"<p>Gateway \u662f\u4e00\u4e2a\u72ec\u7acb\u8fdb\u7a0b\uff08FastAPI\uff09\uff0c\u8d1f\u8d23\u4ece Agent \u4ea4\u4e92\u4e2d\u91c7\u96c6\u8bad\u7ec3\u6570\u636e\uff1a</p> <ul> <li>\u7eaf\u4ee3\u7406\uff1a\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f\uff0c\u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u91c7\u96c6\u6570\u636e</li> <li>OpenAI \u517c\u5bb9\uff1a\u9ed1\u76d2 Agent \u901a\u8fc7 <code>base_url</code> \u900f\u660e\u63a5\u5165\uff0cGateway \u81ea\u52a8\u4ece\u5bf9\u8bdd\u4e2d\u6784\u5efa Step</li> 
<li>\u5ef6\u8fdf\u521d\u59cb\u5316\uff1aHTTP \u670d\u52a1\u7acb\u5373\u53ef\u7528\uff0ctokenizer \u5728\u540e\u53f0\u52a0\u8f7d</li> </ul> <p>Gateway \u652f\u6301\u4e24\u79cd\u6570\u636e\u91c7\u96c6\u6a21\u5f0f\uff1a</p> \u6a21\u5f0f \u7aef\u70b9 \u6570\u636e\u91c7\u96c6\u65b9\u5f0f \u767d\u76d2 <code>/generate</code>, <code>/submit_steps</code> Agent \u81ea\u884c\u6784\u5efa Step \u5e76\u63d0\u4ea4 \u9ed1\u76d2 <code>{base_url}/v1/chat/completions</code> Gateway \u81ea\u52a8 tokenize \u5e76\u6784\u5efa Step <p>\u8be6\u89c1 Gateway Server\u3002</p>"},{"location":"concepts/middleware-layer/#datapool","title":"DataPool\uff1a\u6570\u636e\u7ba1\u7406\u6838\u5fc3","text":"<p>DataPool \u662f\u4e00\u4e2a Ray Actor\uff0c\u4e0d\u4ec5\u662f trajectory \u7f13\u51b2\u533a\uff0c\u66f4\u662f Claw-R1 \u7684\u6570\u636e\u7ba1\u7406\u4e2d\u67a2\uff1a</p> \u80fd\u529b \u8bf4\u660e \u6570\u636e\u5b58\u50a8 \u4ee5 Step \u7c92\u5ea6\u5b58\u50a8\u4ea4\u4e92\u6570\u636e\uff0c\u652f\u6301\u591a\u7ef4\u7d22\u5f15 \u8d28\u91cf\u8ffd\u8e2a \u6bcf\u4e2a Step \u8bb0\u5f55 <code>policy_version</code>\uff0c\u652f\u6301\u65b0\u9c9c\u5ea6\u68c0\u6d4b Channel \u5206\u533a <code>\"train\"</code> \u548c <code>\"val\"</code> \u6570\u636e\u9694\u79bb\uff0c\u4e92\u4e0d\u5e72\u6270 GRPO \u5206\u7ec4 \u6309 <code>prompt_uid</code> \u5206\u7ec4\uff0c\u51d1\u9f50\u6240\u6709 rollout \u540e\u624d\u4f9b\u7ed9\u8bad\u7ec3 \u5bb9\u91cf\u7ba1\u7406 \u53ef\u914d\u7f6e <code>max_queue_size</code>\uff0c\u8d85\u9650\u81ea\u52a8\u4e22\u5f03\u6700\u65e7\u6570\u636e \u7edf\u8ba1\u76d1\u63a7 \u5b9e\u65f6\u63d0\u4f9b\u961f\u5217\u6df1\u5ea6\u3001produce/consume/drop \u901f\u7387\u7b49\u6307\u6807 <p>\u8be6\u89c1 DataPool\u3002</p>"},{"location":"concepts/middleware-layer/#step","title":"Step \u6570\u636e\u6a21\u578b","text":"<p>Step \u662f\u6570\u636e\u7ba1\u7406\u7684\u539f\u5b50\u5355\u4f4d\uff0c\u8bb0\u5f55\u4e86\u4e00\u6b21 Agent \u4ea4\u4e92\u7684\u5b8c\u6574\u4fe1\u606f\uff1a</p> <pre><code>@dataclass\nclass 
Step:\n    prompt_ids:     list[int]   # state: \u5b8c\u6574\u4e0a\u4e0b\u6587 token IDs\n    response_ids:   list[int]   # action: LLM \u751f\u6210\u7684 token IDs\n    reward:         float       # \u5373\u65f6 reward\uff08\u8d28\u91cf\u8bc4\u5206\uff09\n    trajectory_uid: str         # \u540c\u4e00\u5bf9\u8bdd\u7684 step \u5171\u4eab\u6b64 ID\n    prompt_uid:     str         # \u540c\u4e00 prompt \u7684 rollout \u5171\u4eab\u6b64 ID\n    step_index:     int         # trajectory \u5185\u7684\u4f4d\u7f6e\n    policy_version: int         # \u751f\u6210\u65f6\u7684\u7b56\u7565\u7248\u672c\uff08\u65b0\u9c9c\u5ea6\u8ffd\u8e2a\uff09\n    is_last:        bool        # \u662f\u5426\u4e3a\u6700\u540e\u4e00\u4e2a step\n    metadata:       dict        # \u8f85\u52a9\u6570\u636e\uff08\u6765\u6e90\u3001\u6570\u636e\u96c6\u5b57\u6bb5\u7b49\uff09\n</code></pre>"},{"location":"concepts/middleware-layer/#reward","title":"Reward \u6807\u6ce8\u4e0e\u6570\u636e\u8d28\u91cf\u8bc4\u4f30","text":"<p>Reward \u8ba1\u7b97\u4e0e Agent \u670d\u52a1\u89e3\u8026\uff0c\u786e\u4fdd\u6570\u636e\u8d28\u91cf\u8bc4\u4f30\u4e0d\u5f71\u54cd Agent \u670d\u52a1\u5ef6\u8fdf\uff1a</p> <ol> <li>Gateway \u91c7\u96c6 Step \u65f6 <code>reward=0.0</code>\uff08\u539f\u59cb\u6570\u636e\uff09</li> <li>DataPool \u5b58\u50a8\u539f\u59cb Step</li> <li>Trainer \u5728\u6d88\u8d39\u6570\u636e\u524d\u901a\u8fc7 <code>RewardLoopWorker</code> \u8bc4\u4f30\u6570\u636e\u8d28\u91cf\uff08\u8ba1\u7b97 reward\uff09</li> <li>\u8bc4\u4f30\u540e\u7684 reward \u7528\u4e8e advantage \u8ba1\u7b97\u548c\u6570\u636e\u7b5b\u9009</li> </ol> <p>\u8fd9\u79cd\u8bbe\u8ba1\u4f7f\u5f97\u5373\u4f7f\u662f\u6162\u901f\u7684 generative reward model \u6216\u4eba\u7c7b\u53cd\u9988\u7ba1\u7ebf\u4e5f\u4e0d\u4f1a\u5f71\u54cd Agent \u7684\u6b63\u5e38\u670d\u52a1\u3002</p>"},{"location":"concepts/production-scenario/","title":"Production Agent Scenario","text":""},{"location":"concepts/production-scenario/#agentic-rl","title":"Agentic RL 
\u4e2d\u7684\u9690\u542b\u5047\u8bbe","text":"<p>\u51e0\u4e4e\u6240\u6709 Agentic RL \u6846\u67b6\u90fd\u5efa\u7acb\u5728\u4e00\u4e2a\u9690\u542b\u5047\u8bbe\u4e0a\uff1a</p> <p>\u8bad\u7ec3\u9636\u6bb5 \u2260 \u90e8\u7f72\u9636\u6bb5</p> <p>\u6807\u51c6\u6d41\u7a0b\uff1a\u5728\u79bb\u7ebf/\u6a21\u62df\u6570\u636e\u4e0a\u8bad\u7ec3 \u2192 \u90e8\u7f72\u56fa\u5b9a\u6a21\u578b \u2192 \u5b9a\u671f\u91cd\u8bad\u3002</p> <p>\u8fd9\u5728\u7814\u7a76\u573a\u666f\u4e0b\u53ef\u884c\uff0c\u4f46\u5728\u751f\u4ea7\u73af\u5883\u4e2d\u9047\u5230\u6839\u672c\u6027\u969c\u788d\uff1a</p> \u95ee\u9898 \u8868\u73b0 \u5206\u5e03\u504f\u79fb \u8bad\u7ec3\u6570\u636e\u662f\u5408\u6210\u7684\uff1b\u771f\u5b9e\u7528\u6237\u8bf7\u6c42\u5206\u5e03\u4e0d\u540c \u2192 \u90e8\u7f72\u540e\u80fd\u529b\u9000\u5316 \u51b7\u542f\u52a8 \u65b0\u90e8\u7f72\u7684\u6a21\u578b\u5bf9\u7279\u5b9a\u7528\u6237\u7684\u4e60\u60ef\u3001\u5de5\u5177\u3001\u5de5\u4f5c\u6d41\u4e00\u65e0\u6240\u77e5 \u2192 \u6f2b\u957f\u7684\"\u9884\u70ed\"\u671f \u957f\u5c3e\u4efb\u52a1 Benchmark \u8986\u76d6\u5e38\u89c1\u4efb\u52a1\uff1b\u7528\u6237\u7684\u5c0f\u4f17\u9700\u6c42\u65e0\u6cd5\u88ab\u79bb\u7ebf\u8bad\u7ec3\u8986\u76d6 \u73af\u5883\u6f02\u79fb \u5de5\u5177 API \u66f4\u65b0\u3001\u7528\u6237\u884c\u4e3a\u53d8\u5316 \u2192 \u9759\u6001\u6a21\u578b\u65e0\u6cd5\u81ea\u9002\u5e94"},{"location":"concepts/production-scenario/#claw-r1-agent","title":"Claw-R1 \u7684\u6838\u5fc3\u573a\u666f\uff1a\u4e2a\u4eba Agent \u81ea\u6211\u8fdb\u5316","text":"<p>Claw-R1 \u7684\u9996\u4e2a\u9a8c\u8bc1\u573a\u666f\u662f OpenClaw \u4e2a\u4eba\u52a9\u624b\uff1a</p> <pre><code>\u8bbe\u7f6e\uff1a\n  \u7528\u6237\u5728 Mac Mini \u4e0a\u90e8\u7f72 OpenClaw\uff0c\u8fde\u63a5 Slack / \u5fae\u4fe1 / \u90ae\u4ef6\u3002\n  \u6bcf\u5929\u901a\u8fc7\u6d88\u606f\u4e0e OpenClaw \u4ea4\u4e92\uff1a\u65e5\u7a0b\u5b89\u6392\u3001\u4fe1\u606f\u68c0\u7d22\u3001\u4ee3\u7801\u8f85\u52a9\u7b49\u3002\n\n\u4f20\u7edf\u65b9\u6848\uff1a\n  OpenClaw 
\u4f7f\u7528\u56fa\u5b9a\u7684 GPT-4o / Claude 3.5\u3002\n  \u80fd\u529b\u4e0d\u4f1a\u968f\u4f7f\u7528\u800c\u589e\u957f\u3002\n\nClaw-R1 \u65b9\u6848\uff1a\n  1. \u7528\u6237\u6d88\u606f \u2192 OpenClaw \u2192 Gateway\uff08\u62e6\u622a LLM \u8c03\u7528\uff09\n  2. Gateway \u8bb0\u5f55\u6bcf\u6b21\u4ea4\u4e92 \u2192 DataPool\uff08\u672c\u5730\uff09\n  3. Reward Model \u5bf9\u6bcf\u6b21\u4ea4\u4e92\u8bc4\u5206\n  4. \u8fdc\u7a0b\u670d\u52a1\u5668\u4e0a\u7684\u8bad\u7ec3\u5f15\u64ce\u6301\u7eed\u6d88\u8d39 DataPool\uff0c\u66f4\u65b0\u6a21\u578b\u6743\u91cd\n  5. \u66f4\u65b0\u7684\u6743\u91cd\u63a8\u9001\u56de Gateway\uff1b\u4e0b\u6b21\u8c03\u7528\u4f7f\u7528\u6539\u8fdb\u540e\u7684\u6a21\u578b\n\n\u7ed3\u679c\uff1a\n  \u7528\u6237 Mac Mini \u4e0a\u7684 OpenClaw \u4f1a\u968f\u65f6\u95f4\u63a8\u79fb\u8d8a\u6765\u8d8a\u4e86\u89e3\u8be5\u7528\u6237\u3002\n</code></pre>"},{"location":"concepts/production-scenario/#rl","title":"\u4f20\u7edf RL \u6846\u67b6\u65e0\u6cd5\u6ee1\u8db3\u7684\u4e09\u4e2a\u9700\u6c42","text":""},{"location":"concepts/production-scenario/#1","title":"\u2460 \u670d\u52a1\u8fde\u7eed\u6027","text":"<p>\u6a21\u578b\u6743\u91cd\u66f4\u65b0\u4e0d\u80fd\u4e2d\u65ad Gateway \u7684\u8bf7\u6c42\u5904\u7406\u3002\u5728 Claw-R1 \u4e2d\uff1a</p> <ul> <li>Trainer \u76f4\u63a5\u7ba1\u7406 Rollout Engine \u548c Reward Model \u7684\u751f\u547d\u5468\u671f\uff08<code>wake_up</code> / <code>sleep</code> / \u6743\u91cd\u540c\u6b65\uff09</li> <li>Gateway \u662f\u7eaf HTTP \u4ee3\u7406 \u2014 \u53ea\u8f6c\u53d1\u8bf7\u6c42\u548c\u63d0\u4ea4 step\uff1b\u4e0d\u7ba1\u7406\u4efb\u4f55\u5f15\u64ce\u751f\u547d\u5468\u671f</li> <li>\u8fd9\u4fdd\u8bc1\u4e86\u5373\u4f7f\u5728\u6743\u91cd\u66f4\u65b0\u671f\u95f4\uff0c\u8bf7\u6c42\u8f6c\u53d1\u548c\u6570\u636e\u6536\u96c6\u4e5f\u80fd\u6301\u7eed\u8fdb\u884c</li> </ul>"},{"location":"concepts/production-scenario/#2","title":"\u2461 
\u65e0\u9884\u8bbe\u6570\u636e","text":"<p>\u4f20\u7edf\u6846\u67b6\u9700\u8981\u9884\u5148\u6536\u96c6\u7684\u6570\u636e\u96c6\u3002Claw-R1 \u7684\u8bad\u7ec3\u6570\u636e\u5b8c\u5168\u6765\u81ea\u5b9e\u65f6\u7528\u6237\u4ea4\u4e92\uff1a</p> <ul> <li>\u7528\u6237\u95ee\u4e86\u4ec0\u4e48\u3001Agent \u5982\u4f55\u56de\u7b54\u3001\u8c03\u7528\u4e86\u54ea\u4e9b\u5de5\u5177 \u2014 \u8fd9\u4e9b\u81ea\u52a8\u6210\u4e3a\u8bad\u7ec3\u6570\u636e</li> <li>\u96f6\u6570\u636e\u5de5\u7a0b\uff1b\u6570\u636e\u968f\u670d\u52a1\u8fd0\u884c\u81ea\u7136\u79ef\u7d2f</li> </ul>"},{"location":"concepts/production-scenario/#3-reward","title":"\u2462 \u771f\u5b9e\u73af\u5883\u7684 Reward \u4fe1\u53f7","text":"<p>\u4f20\u7edf RLVR \u7684 reward \u6765\u81ea\u53ef\u9a8c\u8bc1\u7684\u4efb\u52a1\u7ed3\u679c\u3002\u751f\u4ea7\u73af\u5883\u7684 reward \u66f4\u52a0\u5fae\u5999\uff1a</p> <ul> <li>\u7528\u6237\u7ee7\u7eed\u8ffd\u95ee \u2192 \u9690\u5f0f\u6b63\u4fe1\u53f7</li> <li>\u7528\u6237\u7ea0\u6b63 Agent \u2192 \u8d1f\u53cd\u9988</li> <li>\u4efb\u52a1\u5b8c\u6210\u540e\u65e0\u540e\u7eed \u2192 Reward Model \u4f30\u8ba1\u4e2d\u95f4\u6b65\u9aa4\u8d28\u91cf</li> </ul> <p>Claw-R1 \u4f7f\u7528 Reward Model \u5c06\u8fd9\u4e9b\u8f6f\u4fe1\u53f7\u8f6c\u6362\u4e3a\u53ef\u8bad\u7ec3\u7684 process reward\u3002</p>"},{"location":"concepts/production-scenario/#_1","title":"\u4e09\u79cd\u8fd0\u884c\u6a21\u5f0f","text":"\u6a21\u5f0f Agent \u7c7b\u578b \u6570\u636e\u6765\u6e90 \u8bf4\u660e \u767d\u76d2\u79bb\u7ebf AgentFlow (Python) \u5408\u6210\u6570\u636e\u96c6\u6216\u9884\u6536\u96c6\u7684 trajectory \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u63a8\u8350\u7528\u4e8e\u7814\u7a76 \u9ed1\u76d2\u79bb\u7ebf \u4efb\u4f55 HTTP Agent \u9884\u6536\u96c6\u7684\u6570\u636e\u96c6 \u5df2\u5b8c\u6574\u5b9e\u73b0\uff1b\u901a\u8fc7 <code>base_url</code> \u63a5\u5165 \u9ed1\u76d2\u5728\u7ebf \u4efb\u4f55 HTTP Agent \u5b9e\u65f6\u7528\u6237\u4ea4\u4e92 \u76ee\u6807\u751f\u4ea7\u6a21\u5f0f\uff1bGateway 
\u7aef\u70b9\u5df2\u5b9e\u73b0"},{"location":"concepts/production-scenario/#_2","title":"\u90e8\u7f72 = \u8bad\u7ec3","text":"<p>Claw-R1 \u5f15\u5165\u4e86\u4e00\u79cd\u65b0\u8303\u5f0f\uff1a</p> <pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         \u4f20\u7edf\uff1a\u8bad\u7ec3 \u2192 \u90e8\u7f72\uff08\u56fa\u5b9a\uff09                      \u2502\n\u2502                                                      \u2502\n\u2502  [\u5408\u6210\u6570\u636e] \u2192 [\u8bad\u7ec3] \u2192 [\u56fa\u5b9a\u6a21\u578b] \u2192 \u7528\u6237               \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n\n\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502         Claw-R1\uff1a\u90e8\u7f72 = \u8bad\u7ec3\uff08\u6301\u7eed\uff09                   \u2502\n\u2502                                                      \u2502\n\u2502  \u7528\u6237 \u2500\u2500\u25ba Agent \u2500\u2500\u25ba [\u5b9e\u65f6\u6570\u636e] \u2500\u2500\u25ba \u8bad\u7ec3 \u2500\u2500\u25ba Agent     \u2502\n\u2502           \u25b2___________________________________|      
\u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre> <p>\u5728\u8fd9\u79cd\u8303\u5f0f\u4e0b\uff1a</p> <ul> <li>\u6bcf\u6b21\u7528\u6237\u4ea4\u4e92\u90fd\u662f\u4e00\u4e2a\u8bad\u7ec3\u6837\u672c</li> <li>\u6bcf\u6b21\u6a21\u578b\u66f4\u65b0\u90fd\u6539\u5584 Agent \u7684\u771f\u5b9e\u4e16\u754c\u8868\u73b0</li> <li>Agent \u8fd0\u884c\u65f6\u95f4\u8d8a\u957f\uff0c\u5bf9\u5176\u7279\u5b9a\u7528\u6237\u548c\u73af\u5883\u7684\u8868\u73b0\u8d8a\u597d</li> </ul>"},{"location":"configuration/","title":"Configuration Reference","text":"<p>Claw-R1 \u4f7f\u7528 Hydra \u8fdb\u884c\u5c42\u6b21\u5316\u914d\u7f6e\u7ba1\u7406\u3002\u6240\u6709 YAML \u914d\u7f6e\u4f4d\u4e8e <code>claw_r1/config/</code>\u3002</p>"},{"location":"configuration/#_1","title":"\u914d\u7f6e\u6587\u4ef6","text":"\u6587\u4ef6 \u7528\u9014 <code>agent_ppo_trainer.yaml</code> \u57fa\u7840 PPO trainer \u914d\u7f6e\uff08\u7ee7\u627f veRL \u7684 ppo_trainer\uff09 <code>async_ppo_trainer.yaml</code> \u5f02\u6b65\u8bad\u7ec3\u4e13\u7528\u914d\u7f6e <code>overrides/rollout.yaml</code> Rollout worker \u8bbe\u7f6e\uff08\u5f02\u6b65\u6a21\u5f0f\u3001Agent Flow\uff09"},{"location":"configuration/#async_ppo_traineryaml","title":"<code>async_ppo_trainer.yaml</code>","text":"<p>\u5f02\u6b65\u8bad\u7ec3\u7684\u6838\u5fc3\u914d\u7f6e\u6587\u4ef6\uff1a</p> <pre><code>defaults:\n  - ppo_trainer\n  - /overrides/rollout@actor_rollout_ref.rollout\n  - _self_\n\n# -- \u5f02\u6b65\u8bad\u7ec3\u8bbe\u7f6e --\nasync_training:\n  staleness_threshold: 0.1           # off-policy \u5bb9\u5fcd\u5ea6\n  trigger_parameter_sync_step: 4     # \u6bcf N \u6b65\u540c\u6b65\u6743\u91cd\u5230 Rollouter\n  require_batches: 1                 # 
\u6bcf\u6b21\u4ece DataPool \u53d6\u7684 batch \u6570\n  partial_rollout: false             # \u540c\u6b65\u65f6\u662f\u5426\u4e2d\u65ad\u8fdb\u884c\u4e2d\u7684 rollout\n  use_rollout_log_probs: true        # \u4f7f\u7528 rollout \u65f6\u6536\u96c6\u7684 log_probs\n  max_queue_size: null               # DataPool \u961f\u5217\u5927\u5c0f\uff08null = \u65e0\u9650\uff09\n\n  checkpoint_engine:\n    enable: true\n    device_buffer_size_M: 4096\n\n# -- Training GPU Pool --\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 4\n\n# -- Rollout GPU Pool --\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 4\n  total_epochs: 10\n  test_freq: 1\n\n# -- Actor \u914d\u7f6e --\nactor_rollout_ref:\n  hybrid_engine: false\n  actor:\n    use_rollout_log_probs: ${oc.select:async_training.use_rollout_log_probs, true}\n  checkpoint_engine: ${oc.select:async_training.checkpoint_engine, null}\n</code></pre> <p>GPU \u5206\u914d</p> <p><code>trainer</code> \u548c <code>rollout</code> \u90fd\u5fc5\u987b\u5206\u914d GPU\u3002\u603b GPU \u6570 = <code>trainer.nnodes \u00d7 trainer.n_gpus_per_node + rollout.nnodes \u00d7 rollout.n_gpus_per_node</code>\u3002</p>"},{"location":"configuration/#overridesrolloutyaml","title":"<code>overrides/rollout.yaml</code>","text":"<p>Rollout worker \u7684\u914d\u7f6e\u8986\u76d6\uff1a</p> <pre><code>name: vllm\nmode: async\n\nagent:\n  default_agent_flow: single_step_single_turn_agent\n  agent_flow_config_path: null\n</code></pre>"},{"location":"configuration/#gateway","title":"Gateway \u914d\u7f6e","text":"<p>Gateway \u4f5c\u4e3a\u72ec\u7acb\u8fdb\u7a0b\u8fd0\u884c\uff0c\u901a\u8fc7 CLI \u53c2\u6570\u914d\u7f6e\uff08\u975e Hydra\uff09\uff1a</p> <pre><code>python -m claw_r1.gateway.gateway \\\n    --data-pool-name   data_pool \\\n    --vllm-addresses   host1:8001,host2:8001 \\\n    --tokenizer-path   /path/to/model \\\n    --prompt-length    4096 \\\n    --response-length  1024 \\\n    --reward-worker-name reward_loop_worker \\\n    --ray-address      auto \\\n    
--ray-namespace    default \\\n    --host             0.0.0.0 \\\n    --port             8100\n</code></pre> <p>Gateway \u542f\u52a8\u8d85\u65f6\u53ef\u901a\u8fc7 Hydra \u914d\u7f6e\uff1a</p> <pre><code>trainer:\n  gateway_startup_timeout: 300   # \u79d2\uff0c\u9ed8\u8ba4 300\n</code></pre>"},{"location":"configuration/#agent-flow","title":"Agent Flow \u914d\u7f6e","text":""},{"location":"configuration/#agent-flow_1","title":"\u767d\u76d2 Agent Flow","text":"<p>\u5728 <code>overrides/rollout.yaml</code> \u4e2d\u6307\u5b9a\uff1a</p> <pre><code>agent:\n  default_agent_flow: single_step_single_turn_agent\n</code></pre>"},{"location":"configuration/#agent-flow_2","title":"\u9ed1\u76d2 Agent Flow","text":"<p>\u901a\u8fc7\u5916\u90e8 YAML \u6587\u4ef6\u6ce8\u518c\uff1a</p> <pre><code># claw_r1/blackbox_agent/agent_flow_config.yaml\n- name: blackbox_gsm8k_agent\n  _target_: claw_r1.blackbox_agent.gsm8k_agent_flow.BlackBoxGSM8KAgentFlow\n</code></pre> <p>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u5f15\u7528\uff1a</p> <pre><code>actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n</code></pre>"},{"location":"configuration/#gpu","title":"\u591a GPU \u914d\u7f6e","text":"<pre><code># \u72ec\u7acb\u7684 GPU \u6c60\ntrainer:\n  nnodes: 1\n  n_gpus_per_node: 2    # 2 GPU \u7528\u4e8e\u8bad\u7ec3\uff08Actor + Critic\uff09\n\nrollout:\n  nnodes: 1\n  n_gpus_per_node: 1    # 1 GPU \u7528\u4e8e\u63a8\u7406\uff08vLLM\uff09\n</code></pre> <p>\u8d44\u6e90\u6c60\u9694\u79bb</p> <p>Claw-R1 \u4f7f\u7528 Ray \u7684\u8d44\u6e90\u7ec4\u673a\u5236\u786e\u4fdd Trainer \u548c Rollouter \u7684 GPU \u4e0d\u91cd\u53e0\u3002\u4f7f\u7528 <code>async_ppo_trainer.yaml</code> \u65f6\u81ea\u52a8\u914d\u7f6e\u3002\u8be6\u89c1 Async Training\u3002</p>"},{"location":"configuration/#_2","title":"\u5b8c\u6574\u8bad\u7ec3\u811a\u672c\u793a\u4f8b","text":"<pre><code>python3 -m 
claw_r1.async_main \\\n    algorithm.adv_estimator=grpo \\\n    data.train_files=$TRAIN_FILE \\\n    data.val_files=$VAL_FILE \\\n    data.train_batch_size=128 \\\n    data.max_prompt_length=512 \\\n    data.max_response_length=1024 \\\n    data.return_raw_chat=True \\\n    actor_rollout_ref.model.path=$MODEL \\\n    actor_rollout_ref.rollout.tensor_model_parallel_size=1 \\\n    actor_rollout_ref.rollout.name=vllm \\\n    actor_rollout_ref.rollout.gpu_memory_utilization=0.8 \\\n    actor_rollout_ref.rollout.n=5 \\\n    actor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent \\\n    actor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml \\\n    trainer.n_gpus_per_node=2 \\\n    trainer.nnodes=1 \\\n    rollout.n_gpus_per_node=1 \\\n    rollout.nnodes=1 \\\n    async_training.trigger_parameter_sync_step=1 \\\n    async_training.use_rollout_log_probs=true\n</code></pre> <p>\u66f4\u591a\u793a\u4f8b\u89c1 <code>example/</code> \u76ee\u5f55\u3002</p>"},{"location":"getting-started/","title":"Getting Started","text":"<ul> <li> <p> Installation</p> <p>\u73af\u5883\u914d\u7f6e\u3001\u4f9d\u8d56\u5b89\u88c5\u548c\u9a8c\u8bc1\u3002</p> <p> Installation</p> </li> <li> <p> Quick Start</p> <p>5 \u5206\u949f\u5185\u8fd0\u884c\u4f60\u7684\u7b2c\u4e00\u4e2a\u5f02\u6b65\u8bad\u7ec3\u5b9e\u9a8c\u3002</p> <p> Quick Start</p> </li> </ul>"},{"location":"getting-started/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"\u4f9d\u8d56 \u6700\u4f4e\u7248\u672c Python 3.10+ PyTorch 2.0+ CUDA 12.1+ Ray 2.10+ GPU 3 \u5f20\uff082 \u8bad\u7ec3 + 1 \u63a8\u7406\uff09"},{"location":"getting-started/#_2","title":"\u67b6\u6784\u4e00\u89c8","text":"<pre><code>\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510     
\u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n\u2502   Agent     \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Gateway  \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 DataPool \u2502\u2500\u2500\u2500\u2500\u25ba\u2502 Trainer  \u2502\n\u2502 (\u9ed1\u76d2/\u767d\u76d2) \u2502\u25c4\u2500\u2500\u2500\u2500\u2502 (:8100)  \u2502     \u2502          \u2502     \u2502          \u2502\n\u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518     \u2514\u2500\u2500\u2500\u2500\u252c\u2500\u2500\u2500\u2500\u2500\u2518\n                                                           \u2502 \u6743\u91cd\u540c\u6b65\n                                                           \u25bc\n                                                     \u250c\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2510\n                                                     \u2502  vLLM    \u2502\n                                                     \u2514\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2518\n</code></pre>"},{"location":"getting-started/installation/","title":"Installation Guide","text":"<p>Claw-R1 uses the same environment setup as <code>verl</code>.</p>"},{"location":"getting-started/installation/#base-environment","title":"Base Environment","text":"<p>Follow the official <code>verl</code> installation guide, but make sure the environment ends up with <code>verl==0.7.0</code>.</p> <p>If you want a broader overview of the base training workflow, the <code>verl</code> quickstart is also useful.</p>"},{"location":"getting-started/installation/#what-this-means-for-claw-r1","title":"What This Means for Claw-R1","text":"<p>Once the <code>verl</code> environment is working, Claw-R1 should run in the same environment. 
In practice, that means you can:</p> <ul> <li>prepare a Python environment with <code>verl==0.7.0</code></li> <li>clone this repository</li> <li>run Claw-R1 commands directly from the repository root</li> </ul> <p>You do not need to install Claw-R1 as a separate package.</p> <p>The documentation in this repository intentionally does not duplicate a separate environment guide, so that the infrastructure setup stays aligned with <code>verl</code>.</p>"},{"location":"getting-started/quickstart/","title":"Quick Start","text":"<p>\u672c\u6307\u5357\u5c55\u793a\u5982\u4f55\u5feb\u901f\u8fd0\u884c Claw-R1 \u7684\u5f02\u6b65\u8bad\u7ec3\u3002</p>"},{"location":"getting-started/quickstart/#_1","title":"\u524d\u7f6e\u6761\u4ef6","text":"<ul> <li>\u5df2\u5b8c\u6210 \u5b89\u88c5</li> <li>\u81f3\u5c11 3 \u5f20 GPU\uff082 \u5f20\u8bad\u7ec3 + 1 \u5f20\u63a8\u7406\uff09</li> <li>\u8bad\u7ec3\u6570\u636e\uff08parquet \u683c\u5f0f\uff09</li> </ul>"},{"location":"getting-started/quickstart/#black-box","title":"Black-box \u6a21\u5f0f\uff08\u63a8\u8350\u5165\u95e8\uff09","text":"<p>\u9ed1\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u4f7f\u7528\u6807\u51c6 OpenAI API \u4e0e Gateway \u4ea4\u4e92\uff0c\u65e0\u9700\u4fee\u6539 Agent \u4ee3\u7801\u3002\u4ee5 GSM8K \u6570\u5b66\u9898\u4e3a\u4f8b\uff1a</p>"},{"location":"getting-started/quickstart/#1","title":"1. \u51c6\u5907\u6570\u636e","text":"<pre><code># \u4e0b\u8f7d GSM8K \u6570\u636e\u96c6\uff08parquet \u683c\u5f0f\uff09\n# \u786e\u4fdd train.parquet \u548c test.parquet \u5728 ~/data/gsm8k/ \u4e0b\n</code></pre>"},{"location":"getting-started/quickstart/#2","title":"2. 
\u8fd0\u884c\u8bad\u7ec3","text":"<pre><code>export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async_blackbox.sh\n</code></pre> <p>\u8be5\u811a\u672c\u4f1a\uff1a</p> <ol> <li>\u542f\u52a8 Ray \u96c6\u7fa4</li> <li>\u521b\u5efa DataPool\uff08Ray Actor\uff09</li> <li>\u5728 GPU 0-1 \u4e0a\u90e8\u7f72 Actor + Critic\uff08\u8bad\u7ec3\uff09</li> <li>\u5728 GPU 2 \u4e0a\u90e8\u7f72 vLLM\uff08\u63a8\u7406\uff09</li> <li>\u542f\u52a8 Gateway\uff08\u7aef\u53e3 8100\uff09</li> <li>\u8fd0\u884c <code>BlackBoxGSM8KAgentFlow</code>\uff1a<ul> <li>\u4e3a\u6bcf\u4e2a\u6837\u672c\u8c03\u7528 <code>init_trajectory</code> \u83b7\u53d6 <code>base_url</code></li> <li>\u521b\u5efa <code>GSM8KAgent</code>\uff0c\u4f7f\u7528 <code>base_url</code> \u4f5c\u4e3a OpenAI API \u7684 endpoint</li> <li>Agent \u901a\u8fc7\u591a\u8f6e tool calling \u89e3\u9898</li> <li>Gateway \u81ea\u52a8\u6536\u96c6\u6bcf\u8f6e\u5bf9\u8bdd\u4e3a Step \u5e76\u63d0\u4ea4\u5230 DataPool</li> </ul> </li> <li>AsyncTrainer \u4ece DataPool \u62c9\u53d6 batch \u8fdb\u884c PPO \u8bad\u7ec3</li> <li>\u5b9a\u671f\u540c\u6b65\u6743\u91cd\u5230 vLLM</li> </ol>"},{"location":"getting-started/quickstart/#3","title":"3. 
\u5173\u952e\u914d\u7f6e\u53c2\u6570","text":"<pre><code># GPU \u5206\u914d\ntrainer.n_gpus_per_node=2        # \u8bad\u7ec3\u7528 2 \u5f20 GPU\nrollout.n_gpus_per_node=1        # \u63a8\u7406\u7528 1 \u5f20 GPU\n\n# Agent Flow\nactor_rollout_ref.rollout.agent.default_agent_flow=blackbox_gsm8k_agent\nactor_rollout_ref.rollout.agent.agent_flow_config_path=claw_r1/blackbox_agent/agent_flow_config.yaml\n\n# \u5f02\u6b65\u8bad\u7ec3\nasync_training.trigger_parameter_sync_step=1   # \u6bcf\u6b65\u540c\u6b65\u6743\u91cd\nactor_rollout_ref.rollout.n=5                  # \u6bcf\u4e2a prompt \u751f\u6210 5 \u6761 trajectory\n</code></pre>"},{"location":"getting-started/quickstart/#white-box","title":"White-box \u6a21\u5f0f","text":"<p>\u767d\u76d2\u6a21\u5f0f\u4e0b\uff0cAgent \u903b\u8f91\u7528 Python \u7f16\u5199\uff0c\u76f4\u63a5\u901a\u8fc7 Gateway \u7684 <code>/generate</code> \u548c <code>/submit_steps</code> \u7aef\u70b9\u4ea4\u4e92\u3002</p> <pre><code>export CUDA_VISIBLE_DEVICES=0,1,2\n\nsh example/test_async.sh\n</code></pre> <p>\u767d\u76d2\u6a21\u5f0f\u4f7f\u7528 <code>MultiStepAgentFlow</code> \u6216 <code>SingleStepSingleTurnAgentFlow</code>\uff0cAgent \u81ea\u884c\u7ba1\u7406 tokenize \u548c Step \u6784\u5efa\u3002</p>"},{"location":"getting-started/quickstart/#agent","title":"\u81ea\u5b9a\u4e49 Agent","text":""},{"location":"getting-started/quickstart/#agent_1","title":"\u6dfb\u52a0\u9ed1\u76d2 Agent","text":"<ol> <li>\u5b9e\u73b0 Agent \u7c7b\uff08\u53ea\u9700 <code>base_url</code> \u548c OpenAI API\uff09</li> <li>\u5b9e\u73b0 <code>BlackBoxAgentFlowBase</code> \u5b50\u7c7b</li> <li>\u5728 <code>agent_flow_config.yaml</code> \u4e2d\u6ce8\u518c</li> <li>\u5728\u8bad\u7ec3\u811a\u672c\u4e2d\u6307\u5b9a</li> </ol> <p>\u8be6\u7ec6\u6b65\u9aa4\u89c1 Black-box Agent\u3002</p>"},{"location":"getting-started/quickstart/#agent_2","title":"\u6dfb\u52a0\u767d\u76d2 Agent","text":"<ol> <li>\u7ee7\u627f <code>AgentFlowBase</code>\uff08\u6216 
<code>MultiStepAgentFlow</code>\uff09</li> <li>\u5b9e\u73b0 <code>run()</code> \u65b9\u6cd5</li> <li>\u4f7f\u7528 <code>@register(\"name\")</code> \u6ce8\u518c</li> </ol> <p>\u8be6\u7ec6\u6b65\u9aa4\u89c1 Agent Flow\u3002</p>"},{"location":"getting-started/quickstart/#_2","title":"\u76d1\u63a7\u8bad\u7ec3","text":"<p>\u8bad\u7ec3\u65e5\u5fd7\u9ed8\u8ba4\u8f93\u51fa\u5230\u63a7\u5236\u53f0\u3002\u53ef\u914d\u7f6e SwanLab \u7b49\u65e5\u5fd7\u540e\u7aef\uff1a</p> <pre><code>trainer.logger='[\"console\",\"swanlab\"]'\ntrainer.project_name='my_project'\ntrainer.experiment_name='my_experiment'\n</code></pre>"},{"location":"getting-started/quickstart/#_3","title":"\u4e0b\u4e00\u6b65","text":"<ul> <li>Components \u2014 \u4e86\u89e3\u5404\u7ec4\u4ef6\u7684\u8be6\u7ec6\u8bbe\u8ba1</li> <li>Configuration \u2014 \u5b8c\u6574\u914d\u7f6e\u53c2\u8003</li> <li>Gateway API \u2014 HTTP \u7aef\u70b9\u6587\u6863</li> </ul>"}]}
\ No newline at end of file