In [1]:
from pathlib import Path

# Absolute path of the paper PDF, resolved against the current working directory.
fp = Path.cwd().joinpath("data", "pdf", "DeepSeek_R1.pdf")

In [2]:
from lionagi import Branch, iModel, BaseModel, Field
from lionagi.tools.types import ReaderTool

In [3]:
# One cited reference: a human-readable title plus where it can be found.
# `url` is typed as a plain `str`; the recorded run also stores a local file path
# in it, so do not assume an http(s) URL downstream.
# NOTE(review): documented with `#` comments on purpose -- if BaseModel is pydantic's
# (re-exported by lionagi; confirm), a class docstring would end up in the schema.
class Source(BaseModel):
    title: str
    url: str


# Structured final answer for the research task. It is handed to `a.ReAct` as
# `response_format` and later rendered by `display_report`.
class ResearchReport(BaseModel):
    # Optional headline; `display_report` falls back to 'Research Findings' when empty.
    title: str | None = None
    # The only required field: the report body itself.
    content: str = Field(
        description="A detailed factual well argued report on the research and findings."
    )
    # Optional list of cited references (see `Source`); may be None.
    source: list[Source] | None = None

In [4]:
# Chat model handle for DeepSeek-R1, addressed through the "openrouter" provider.
# It is passed to `Branch` as `chat_model` in the next cell.
r1 = iModel(
    provider="openrouter",
    model="deepseek/deepseek-r1",
    max_tokens=8000,  # presumably the per-response generation cap -- confirm in iModel docs
    invoke_with_endpoint=False,  # NOTE(review): semantics not visible in this notebook; confirm in lionagi docs
    temperature=0.65,
    top_p=0.9,
)

In [5]:
# Branch `a` is the object every later cell works through: it runs the ReAct loop
# and exposes the message history via `a.to_df()` / `a.messages`.
# ReaderTool is passed as a tool; the run output shows it invoked as `reader_tool`.
a = Branch(chat_model=r1, tools=ReaderTool)
# Register two external search services under the names `search_exa` and
# `search_perplexity`; only `search_exa` appears as invoked in the output shown here.
# NOTE(review): `queue_capacity` / `capacity_refresh_time` look like rate-limit
# settings (5 per 1 vs 100 per 60); units and exact meaning are not shown -- confirm.
a.connect(
    name="search_exa",
    provider="exa",
    endpoint="search",
    queue_capacity=5,
    capacity_refresh_time=1,
    description="Search the exa database for relevant information",
)
# No `endpoint` is given for perplexity, unlike exa -- presumably a provider default
# applies; verify against the lionagi docs.
a.connect(
    name="search_perplexity",
    provider="perplexity",
    queue_capacity=100,
    capacity_refresh_time=60,
    description="Search the perplexity database for relevant information",
)

In [6]:
# Run the ReAct loop on branch `a` and keep the structured answer in `result`.
# Top-level `await` relies on the notebook's IPython kernel; in a plain script this
# call has to live inside a coroutine.
result = await a.ReAct(
    instruct={
        "instruction": "explain to me what the paper is about in detail, compare with other recent papers on same discipline and provide a comparison of the results",
        # Despite the key name, the value is the local filesystem path built in cell 1.
        "context": {"paper_url": str(fp)},
    },
    # NOTE(review): the three `interpret*` options presumably control a prompt
    # rewriting step before the loop starts -- confirm in lionagi docs.
    interpret=True,
    interpret_domain="AI",
    interpret_style="exhaustive",
    extension_allowed=True,
    max_extensions=5,  # presumably caps the number of additional reasoning rounds -- confirm
    verbose=True,  # presumably what prints the action-status lines and per-round analysis in the output
    response_format=ResearchReport,  # `result` is later passed to display_report, which reads .title/.content/.source
)

Action reader_tool invoked, status: completed.
Action reader_tool invoked, status: completed.
Action search_exa invoked, status: completed.
ReAct Round #1 Analysis:
 {
  "analysis": "To address the user's request, the immediate priority is to parse and analyze the DeepSeek_R1.pdf paper using the reader_tool. After extracting its content, a structured summary will be generated. Concurrently, recent papers (post-2022) must be retrieved via external search tools to enable comparative analysis. The first step involves opening and reading the target PDF, followed by searches for relevant recent works.",
  "planned_actions": [
    {
      "action_type": "reader_tool",
      "description": "Open and read the entire DeepSeek_R1.pdf to extract text for analysis."
    },
    {
      "action_type": "search_exa",
      "description": "Search for recent LLM papers (post-2022) focusing on training efficiency, architecture innovations, and benchmark results."
    }
  ],
  "extension_needed": true,
  

In [7]:
# Tabular view of branch `a`'s message history (one row per message: role, content,
# sender, recipient, ...). Left as the cell's last expression so the notebook renders it.
a.to_df()

Unnamed: 0,created_at,role,content,id,sender,recipient,metadata
0,2025-01-20 21:03:14.209670,user,{'context': [{'paper_url': '/Users/lion/lionag...,fdc3bf95-20f1-4ef6-9e30-3000f8b7557f,user,2d394afd-2800-4303-9561-9a19a3f55ab8,{'lion_class': 'lionagi.protocols.messages.ins...
1,2025-01-20 21:03:32.636672,assistant,"{'assistant_response': '```json {  ""analysis""...",8eb368aa-ee95-44cb-8f1a-5520c647db58,2d394afd-2800-4303-9561-9a19a3f55ab8,user,{'model_response': {'id': 'gen-1737424994-WMdZ...
2,2025-01-20 21:03:32.642525,action,"{'action_request': {'function': 'reader_tool',...",00c328f6-747b-4a8d-917d-0510a4983959,2d394afd-2800-4303-9561-9a19a3f55ab8,7b1144b6-125c-462f-9df5-8c8b0b7f8d85,{'lion_class': 'lionagi.protocols.messages.act...
3,2025-01-20 21:03:32.642601,action,{'action_request_id': '00c328f6-747b-4a8d-917d...,cc8c5e58-66d0-408f-bed9-727058251187,7b1144b6-125c-462f-9df5-8c8b0b7f8d85,2d394afd-2800-4303-9561-9a19a3f55ab8,{'lion_class': 'lionagi.protocols.messages.act...
4,2025-01-20 21:03:52.265885,action,"{'action_request': {'function': 'reader_tool',...",463507f3-80ff-44d4-94ac-bd572b1667a5,2d394afd-2800-4303-9561-9a19a3f55ab8,7b1144b6-125c-462f-9df5-8c8b0b7f8d85,{'lion_class': 'lionagi.protocols.messages.act...
5,2025-01-20 21:03:52.265967,action,{'action_request_id': '463507f3-80ff-44d4-94ac...,b33463c3-588b-40e6-96a9-5c5243c7f280,7b1144b6-125c-462f-9df5-8c8b0b7f8d85,2d394afd-2800-4303-9561-9a19a3f55ab8,{'lion_class': 'lionagi.protocols.messages.act...
6,2025-01-20 21:03:55.253324,action,"{'action_request': {'function': 'search_exa', ...",baefe8b5-8c25-4223-9d92-dd67264e7a60,2d394afd-2800-4303-9561-9a19a3f55ab8,6aca17b6-fad8-4cf2-b2cf-b98afa921dae,{'lion_class': 'lionagi.protocols.messages.act...
7,2025-01-20 21:03:55.253463,action,{'action_request_id': 'baefe8b5-8c25-4223-9d92...,2c94453d-8c64-4f6d-a4d1-0ddc696b5baa,6aca17b6-fad8-4cf2-b2cf-b98afa921dae,2d394afd-2800-4303-9561-9a19a3f55ab8,{'lion_class': 'lionagi.protocols.messages.act...
8,2025-01-20 21:03:55.266244,user,{'context': [{'action_request_id': '00c328f6-7...,a6e70d36-68c8-4c49-bcdb-517f30674f3e,user,2d394afd-2800-4303-9561-9a19a3f55ab8,{'lion_class': 'lionagi.protocols.messages.ins...
9,2025-01-20 21:04:10.677831,assistant,"{'assistant_response': '```json {  ""analysis""...",255acbfe-91dc-4f9f-a984-47105814879c,2d394afd-2800-4303-9561-9a19a3f55ab8,user,{'model_response': {'id': 'gen-1737425035-sP0p...


In [8]:
from IPython.display import Markdown


def display_report(report: ResearchReport):
    """Render a research report as a Markdown display object.

    The output is an H1 heading (the report title, or 'Research Findings'
    when the title is empty or None), then the report content, then one
    "**Source**" link line per entry in ``report.source``. Each section is
    followed by a blank line.

    Args:
        report: Object exposing ``title``, ``content`` and ``source``;
            each item of ``source`` must expose ``title`` and ``url``.

    Returns:
        An ``IPython.display.Markdown`` wrapping the assembled text.
    """
    # Collect the sections first and join once instead of growing a string.
    parts = [
        f"# {report.title or 'Research Findings'}\n\n",
        f"{report.content or ''}\n\n",
    ]
    # A missing or empty source list simply contributes no lines.
    parts.extend(
        f"**Source**: [{src.title}]({src.url})\n\n" for src in report.source or ()
    )
    return Markdown("".join(parts))


# Render the structured answer returned by the ReAct run; as the cell's last
# expression, the returned Markdown object is what the notebook displays.
display_report(result)

# DeepSeek-R1: Advancing LLM Reasoning Through Reinforcement Learning

### Structured Summary
**Key Themes**:
- Pure reinforcement learning (RL) for reasoning capability emergence without supervised fine-tuning (SFT)
- Self-evolution of chain-of-thought (CoT) behaviors through Group Relative Policy Optimization (GRPO)
- Multi-stage training pipeline with cold-start data integration
- Scalable knowledge distillation to smaller models (1.5B-70B parameters)

**Methodologies**:
1. GRPO algorithm with group-based advantage estimation
2. Rule-based reward system (accuracy + format constraints)
3. Two-phase RL: reasoning-focused then general alignment
4. Distillation using 800K curated samples from R1 outputs

**Contributions**:
- First open-source demonstration of SFT-free RL reasoning (71% → 86.7% AIME with voting)
- Performance parity with OpenAI-o1-1217 (79.8% vs 79.2% AIME pass@1)
- 14B distilled model outperforms QwQ-32B by +19.7% on MATH-500
- Open-source release of 6 distilled models across Qwen/Llama architectures

### Comparative Analysis
**Recent Works (2023-2024)**:
1. *Training 1.7B LLaMa From Scratch* (arXiv:2412.13335):
   - Similar: Emphasis on data quality
   - Different: Requires SFT initialization vs DeepSeek's pure RL
   - Metrics: 20B tokens trained vs DeepSeek's RL-first approach

2. *Computational Bottlenecks of SLMs* (arXiv:2410.19456):
   - Similar: Focus on training efficiency
   - Different: Analyzes hardware constraints vs capability emergence
   - Metrics: 2B param focus vs DeepSeek's 70B distillation

3. *BabyHGRN: RNN Efficiency* (arXiv:2412.15978):
   - Similar: Compute efficiency goals
   - Different: RNN architecture vs transformer-based RL
   - Metrics: 55.5% BLiMP vs DeepSeek's 97.3% MATH-500

**Results Comparison**:
| Model               | AIME Pass@1 | MATH-500 | Training Efficiency |
|---------------------|-------------|----------|---------------------|
| DeepSeek-R1         | 79.8%       | 97.3%    | 37B active params   |
| OpenAI-o1-1217      | 79.2%       | 96.4%    | Closed-source       |
| BabyHGRN (100M)     | N/A         | 83.9%    | RNN-based           |
| QwQ-32B-Preview     | 50.0%       | 90.6%    | SFT-focused         |

### Critical Evaluation
**Novelty**:
- First successful pure RL reasoning capability emergence
- Cold-start pipeline resolving RL instability
- 94.3% MATH-500 performance at 32B scale via distillation

**Limitations**:
- Language mixing in non-Chinese/English queries
- Sensitivity to prompt engineering (requires zero-shot)
- Limited SWE benchmark improvement (49.2% vs base 42%)

**Impact**: Provides blueprint for RL-first training while maintaining 87.6% AlpacaEval performance

**Source**: [DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning](/Users/lion/lionagi/notebooks/data/pdf/DeepSeek_R1.pdf)

**Source**: [Experience of Training a 1.7B-Parameter LLaMa Model From Scratch](https://arxiv.org/abs/2412.13335)

**Source**: [Computational Bottlenecks of Training Small-scale Large Language Models](https://arxiv.org/abs/2410.19456)

**Source**: [BabyHGRN: Exploring RNNs for Sample-Efficient Training of Language Models](https://arxiv.org/abs/2412.15978)



In [9]:
from IPython.display import Markdown, display

# Walk the branch history in order and render only the assistant turns.
for message in a.messages:
    # Same membership test as before: skip anything whose role lacks "assistant".
    if "assistant" not in message.role:
        continue
    display(Markdown(message.rendered))

# Assistant Response

**Response**:
```json
{
  "analysis": "To address the user's request, the immediate priority is to parse and analyze the DeepSeek_R1.pdf paper using the reader_tool. After extracting its content, a structured summary will be generated. Concurrently, recent papers (post-2022) must be retrieved via external search tools to enable comparative analysis. The first step involves opening and reading the target PDF, followed by searches for relevant recent works.",
  "planned_actions": [
    {
      "action_type": "reader_tool",
      "description": "Open and read the entire DeepSeek_R1.pdf to extract text for analysis."
    },
    {
      "action_type": "search_exa",
      "description": "Search for recent LLM papers (post-2022) focusing on training efficiency, architecture innovations, and benchmark results."
    }
  ],
  "extension_needed": true,
  "milestone": "Extract DeepSeek paper content and identify 3-5 relevant recent papers for comparison.",
  "action_strategy": "sequential",
  "action_batch_size": null,
  "reason": {
    "title": "Initial Action Plan",
    "content": "The DeepSeek paper must be parsed first to identify its core contributions. Parallel searches for recent works will ensure up-to-date comparisons. Sequential execution avoids partial data analysis.",
    "confidence_score": 0.95
  },
  "action_requests": [
    {
      "function": "reader_tool",
      "arguments": {
        "action": "open",
        "path_or_url": "/Users/lion/lionagi/notebooks/data/pdf/DeepSeek_R1.pdf"
      }
    },
    {
      "function": "reader_tool",
      "arguments": {
        "action": "read",
        "doc_id": "DeepSeek_R1.pdf",
        "start_offset": 0,
        "end_offset": null
      }
    },
    {
      "function": "search_exa",
      "arguments": {
        "query": "large language model training efficiency innovations post-2022",
        "category": "research paper",
        "numResults": 5,
        "startPublishedDate": "2023-01-01T00:00:00.000Z",
        "contents": {
          "text": {
            "includeHtmlTags": false,
            "maxCharacters": 5000
          }
        }
      }
    }
  ],
  "action_required": true
}
```

# Assistant Response

**Response**:
```json
{
  "analysis": "The initial attempt to read 'DeepSeek_R1.pdf' failed due to incorrect doc_id reference. The document is now successfully opened (doc_id: DOC_151075259435909708). Recent comparative papers (2023-2024) have been retrieved through search_exa, focusing on LLM training efficiency, architecture innovations, and computational optimization. Immediate next step is to read the full DeepSeek paper using the confirmed doc_id to enable structured analysis.",
  "planned_actions": [
    {
      "action_type": "reader_tool",
      "description": "Read full DeepSeek_R1 content using valid doc_id for analysis."
    }
  ],
  "extension_needed": true,
  "milestone": "Obtain full text of DeepSeek paper to begin thematic analysis",
  "action_strategy": "sequential",
  "action_batch_size": null,
  "reason": {
    "title": "Prerequisite Data Collection",
    "content": "The paper's content must be fully extracted before meaningful comparison can be made with the 5 retrieved recent works (2023-2024). The search results show relevant focus areas including training efficiency (2410.19456), architecture alternatives (2412.15978), and parameter tuning (2411.08610) that will enable targeted comparison once DeepSeek's methodology is known.",
    "confidence_score": 0.92
  },
  "action_requests": [
    {
      "function": "reader_tool",
      "arguments": {
        "action": "read",
        "doc_id": "DOC_151075259435909708",
        "start_offset": 0,
        "end_offset": null
      }
    }
  ],
  "action_required": true
}
```

# Assistant Response

**Response**:
```json
{
  "analysis": "DeepSeek-R1 presents a novel RL-centric approach to enhance LLM reasoning without SFT, validated by competitive benchmarks. Key innovations include cold-start data integration and scalable distillation. Comparison with 2023-2024 works reveals distinct methodological differences in training efficiency and architectural choices.",
  "planned_actions": [],
  "extension_needed": false,
  "milestone": null,
  "action_strategy": "sequential",
  "action_batch_size": null,
  "reason": {
    "title": "Final Analysis Ready",
    "content": "Full paper content and comparative papers have been processed. Structured summary and critical evaluation can now be formulated.",
    "confidence_score": 0.97
  },
  "action_requests": [],
  "action_required": false,
  "structured_summary": {
    "title": "DeepSeek-R1: Incentivizing Reasoning via RL",
    "key_themes": [
      "Pure RL-driven reasoning without SFT initialization",
      "Self-evolution of CoT capabilities through GRPO algorithm",
      "Multi-stage training with cold-start data for human alignment",
      "Knowledge distillation to smaller models (1.5B-70B parameters)"
    ],
    "methodologies": [
      "Group Relative Policy Optimization (GRPO) with group-based advantage estimation",
      "Rule-based reward system (accuracy + format constraints)",
      "Two-phase RL: reasoning-focused then general alignment",
      "Distillation using 800K curated samples from R1 outputs"
    ],
    "contributions": [
      "First demonstration of SFT-free RL reasoning capability emergence",
      "79.8% pass@1 on AIME 2024 (vs OpenAI-o1-1217's 79.2%)",
      "14B distilled model outperforms QwQ-32B-Preview by +19.7% on MATH-500",
      "Open-source release of 6 distilled models across architectures"
    ]
  },
  "comparative_analysis": {
    "recent_works": [
      {
        "paper": "Experience of Training 1.7B LLaMa (2412.13335)",
        "similarities": "Emphasis on training stability and data quality",
        "differences": "Relies on SFT initialization vs DeepSeek's pure RL approach",
        "metrics": "20B tokens trained vs DeepSeek's RL-focused pipeline"
      },
      {
        "paper": "Computational Bottlenecks of SLMs (2410.19456)",
        "similarities": "Focus on resource-efficient training",
        "differences": "Analyzes hardware constraints rather than capability emergence",
        "metrics": "2B param focus vs DeepSeek's 70B distillation"
      },
      {
        "paper": "BabyHGRN: RNN Efficiency (2412.15978)",
        "similarities": "Pursuit of compute efficiency",
        "differences": "RNN architecture vs DeepSeek's transformer-based RL",
        "metrics": "55.5% BLiMP vs DeepSeek's 97.3% MATH-500"
      }
    ],
    "results_comparison": {
      "table": [
        ["Model", "AIME Pass@1", "MATH-500", "Training Efficiency"],
        ["DeepSeek-R1", "79.8%", "97.3%", "37B active params"],
        ["OpenAI-o1-1217", "79.2%", "96.4%", "Closed-source"],
        ["BabyHGRN (100M)", "N/A", "83.9%", "RNN-based"],
        ["QwQ-32B-Preview", "50.0%", "90.6%", "SFT-focused"]
      ]
    }
  },
  "critical_evaluation": {
    "novelty": [
      "First open-source demonstration of pure RL reasoning emergence",
      "Cold-start data pipeline resolving RL instability",
      "Scalable distillation maintaining 94.3% MATH-500 at 32B scale"
    ],
    "limitations": [
      "Language mixing in non-Chinese/English queries",
      "Sensitivity to prompt engineering (zero-shot required)",
      "Limited SWE benchmark improvement vs base model"
    ],
    "industry_impact": "Provides blueprint for RL-first training pipelines while maintaining 87.6% AlpacaEval 2.0 performance",
    "research_gaps": "Integration with process reward models and MCTS remains unexplored despite attempts"
  }
}
```

# Assistant Response

**Response**:
```json
{
  "title": "DeepSeek-R1: Advancing LLM Reasoning Through Reinforcement Learning",
  "content": "### Structured Summary\n**Key Themes**:\n- Pure reinforcement learning (RL) for reasoning capability emergence without supervised fine-tuning (SFT)\n- Self-evolution of chain-of-thought (CoT) behaviors through Group Relative Policy Optimization (GRPO)\n- Multi-stage training pipeline with cold-start data integration\n- Scalable knowledge distillation to smaller models (1.5B-70B parameters)\n\n**Methodologies**:\n1. GRPO algorithm with group-based advantage estimation\n2. Rule-based reward system (accuracy + format constraints)\n3. Two-phase RL: reasoning-focused then general alignment\n4. Distillation using 800K curated samples from R1 outputs\n\n**Contributions**:\n- First open-source demonstration of SFT-free RL reasoning (71% → 86.7% AIME with voting)\n- Performance parity with OpenAI-o1-1217 (79.8% vs 79.2% AIME pass@1)\n- 14B distilled model outperforms QwQ-32B by +19.7% on MATH-500\n- Open-source release of 6 distilled models across Qwen/Llama architectures\n\n### Comparative Analysis\n**Recent Works (2023-2024)**:\n1. *Training 1.7B LLaMa From Scratch* (arXiv:2412.13335):\n   - Similar: Emphasis on data quality\n   - Different: Requires SFT initialization vs DeepSeek's pure RL\n   - Metrics: 20B tokens trained vs DeepSeek's RL-first approach\n\n2. *Computational Bottlenecks of SLMs* (arXiv:2410.19456):\n   - Similar: Focus on training efficiency\n   - Different: Analyzes hardware constraints vs capability emergence\n   - Metrics: 2B param focus vs DeepSeek's 70B distillation\n\n3. 
*BabyHGRN: RNN Efficiency* (arXiv:2412.15978):\n   - Similar: Compute efficiency goals\n   - Different: RNN architecture vs transformer-based RL\n   - Metrics: 55.5% BLiMP vs DeepSeek's 97.3% MATH-500\n\n**Results Comparison**:\n| Model               | AIME Pass@1 | MATH-500 | Training Efficiency |\n|---------------------|-------------|----------|---------------------|\n| DeepSeek-R1         | 79.8%       | 97.3%    | 37B active params   |\n| OpenAI-o1-1217      | 79.2%       | 96.4%    | Closed-source       |\n| BabyHGRN (100M)     | N/A         | 83.9%    | RNN-based           |\n| QwQ-32B-Preview     | 50.0%       | 90.6%    | SFT-focused         |\n\n### Critical Evaluation\n**Novelty**:\n- First successful pure RL reasoning capability emergence\n- Cold-start pipeline resolving RL instability\n- 94.3% MATH-500 performance at 32B scale via distillation\n\n**Limitations**:\n- Language mixing in non-Chinese/English queries\n- Sensitivity to prompt engineering (requires zero-shot)\n- Limited SWE benchmark improvement (49.2% vs base 42%)\n\n**Impact**: Provides blueprint for RL-first training while maintaining 87.6% AlpacaEval performance",
  "source": [
    {
      "title": "DeepSeek-R1: Incentivizing Reasoning Capability in LLMs via Reinforcement Learning",
      "url": "/Users/lion/lionagi/notebooks/data/pdf/DeepSeek_R1.pdf"
    },
    {
      "title": "Experience of Training a 1.7B-Parameter LLaMa Model From Scratch",
      "url": "https://arxiv.org/abs/2412.13335"
    },
    {
      "title": "Computational Bottlenecks of Training Small-scale Large Language Models",
      "url": "https://arxiv.org/abs/2410.19456"
    },
    {
      "title": "BabyHGRN: Exploring RNNs for Sample-Efficient Training of Language Models",
      "url": "https://arxiv.org/abs/2412.15978"
    }
  ]
}
```