# 1. Import Libraries

In [12]:
import crewai
import crewai_tools
import arxiv

ValueError: The tokenizers python package is not installed. Please install it with `pip install tokenizers`

In [13]:
from dotenv import load_dotenv
import os

# 2. Configure Environment Variables

In [14]:
# Load the .env file so the API keys read later with os.getenv(...)
# (OPENROUTER_API_KEY, SERPER_API_KEY, GOOGLE_API_KEY) can be supplied from it.
load_dotenv()

True

In [15]:
from langchain_openai import ChatOpenAI
# OpenRouter API key, taken from the environment (None when the variable is unset)
openrouter_api_key = os.environ.get("OPENROUTER_API_KEY")

In [16]:
# Test OpenRouter
from openai import OpenAI

# Smoke-test the OpenRouter connection with a single chat completion.
# The stock OpenAI client is used, pointed at the OpenRouter endpoint via base_url.
client = OpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=openrouter_api_key,
)

# The optional "HTTP-Referer" / "X-Title" ranking headers are intentionally not
# sent: with the template placeholders left in place they would be transmitted
# verbatim. Pass extra_headers={...} with real values to opt in to the
# openrouter.ai rankings.
completion = client.chat.completions.create(
    model="openai/gpt-oss-20b:free",
    messages=[
        {"role": "system", "content": "You are to imitate Simon Sinek."},
        {"role": "user", "content": "Hello, tell me what makes good leaders?"},
    ],
)

# print() is kept on purpose: the reply is Markdown text and reads better
# unescaped than as a string repr.
print(completion.choices[0].message.content)

**Good leaders don’t just run elections or manage spreadsheets; they *inspire and create* a shared sense of purpose.**

Here’s what makes a good leader, distilled through the lens of why we do what we do:

| Why | What it looks like in action | Why it matters |
|-----|------------------------------|----------------|
| **Clear purpose** | A leader stands before the team and says, “We’re not here to simply make a profit; we’re here to help people live better lives.” | Purpose aligns every action, decision, and strategy with a larger, human‑centric goal. It clarifies the *why* behind every *what*. |
| **Empathy = Influence** | Rather than dictating, leaders ask: “What do you need? What’s stopping you?” They listen, learn, and then respond. | Empathy builds trust, and trust is a currency far more valuable than any budget line on a balance sheet. |
| **Servant the default** | Leadership is first and foremost a service to others—helping team members grow, giving them autonomy, removing obsta

# 3. Build a Custom ArXiv Search Tool

In [None]:
# Importing crewAI tools
from crewai_tools import (
    DirectoryReadTool,
    FileReadTool,
    SerperDevTool,
    WebsiteSearchTool
)

# serper.dev API key, taken from the environment (None when the variable is unset)
serper_api_key = os.environ.get("SERPER_API_KEY")

2025-08-16 16:15:09,652 - 26524 - telemetry.py-telemetry:51 - ERROR: HTTPSConnectionPool(host='telemetry.crewai.com', port=4319): Read timed out. (read timeout=30.0)


In [19]:
# Instantiate tools

# Reads documents from the 'documents' directory
docs_tool = DirectoryReadTool(directory="documents")

# Reads individual files
file_tool = FileReadTool()

# Performs web searches using the Serper API
search_tool = SerperDevTool()

In [20]:
# WebsiteSearchTool() without a config defaults to the OpenAI embedding model:
# web_rag_tool = WebsiteSearchTool()  # Tool for searching and extracting information from websites

# Configure Google models for both summarization (llm) and embeddings instead.
# NOTE(review): the Google provider is presumed to read GOOGLE_API_KEY from the
# environment - confirm. A missing key is flagged up front rather than looking
# the value up and silently discarding it.
if not os.getenv("GOOGLE_API_KEY"):
    import warnings

    warnings.warn(
        "GOOGLE_API_KEY is not set; the Google-backed WebsiteSearchTool may fail to authenticate.",
        stacklevel=2,
    )

web_rag_tool = WebsiteSearchTool(
    config=dict(
        llm=dict(
            provider="google",  # or openai, anthropic, llama2, ...
            config=dict(
                model="gemini-2.5-flash",
                # temperature=0.5,
                # top_p=1,
                # stream=true,
            ),
        ),
        embedder=dict(
            provider="google",  # or openai, ollama, ...
            config=dict(
                model="models/embedding-001",
                task_type="retrieval_document",
                # title="Embeddings",
            ),
        ),
    )
)

  from .autonotebook import tqdm as notebook_tqdm
  util.warn_deprecated(


In [21]:
from typing import Type, List
from pydantic import BaseModel, Field 
from crewai.tools import BaseTool 

import arxiv
import time
import datetime

class FetchArxivPapersInput(BaseModel):
    """Argument schema for FetchArxivPapersTool: the single day to query."""

    target_date: datetime.date = Field(
        default=...,  # Ellipsis as the default marks the field as required
        description="Target date to fetch papers for.",
    )

class FetchArxivPapersTool(BaseTool):
    """CrewAI tool that lists ArXiv papers submitted on a given day.

    The categories searched are fixed inside `_run` (currently only "cs.CL").
    """

    name: str = "fetch_arxiv_papers"
    description: str = "Fetches all ArXiv papers from selected categories submitted on the target date."
    args_schema: Type[BaseModel] = FetchArxivPapersInput

    def _run(self, target_date: datetime.date) -> List[dict]:
        """Fetch every paper in the configured categories for `target_date`.

        Args:
            target_date: Day to query. An ISO "YYYY-MM-DD" string is also
                accepted and converted, in case the caller passes the date as text.

        Returns:
            One dict per paper with the keys 'title', 'authors' (list of
            names), 'summary', 'published' and 'url'.

        Raises:
            ValueError: If `target_date` is a string that is not a valid ISO date.
        """
        if isinstance(target_date, str):
            target_date = datetime.date.fromisoformat(target_date)

        # List of AI-related categories.
        # You can also include ["cs.AI", "cs.LG", "cs.CV", "cs.MA", "cs.RO"]
        AI_CATEGORIES = ["cs.CL"]

        # Query window: from the target date up to the following day,
        # formatted as YYYYMMDDHHMM for the submittedDate range filter.
        start_date = target_date.strftime('%Y%m%d%H%M')
        end_date = (target_date + datetime.timedelta(days=1)).strftime('%Y%m%d%H%M')

        # The client paginates and waits `delay_seconds` between page requests,
        # which is what keeps us inside the ArXiv rate limit.
        client = arxiv.Client(
            page_size=100,  # Fetch 100 results per page
            delay_seconds=3  # Delay between requests to respect rate limits
        )

        all_papers = []

        for category in AI_CATEGORIES:
            print(f"Fetching papers for category: {category}")

            search_query = f"cat:{category} AND submittedDate:[{start_date} TO {end_date}]"

            search = arxiv.Search(
                query=search_query,
                sort_by=arxiv.SortCriterion.SubmittedDate,
                max_results=None  # Fetch all results
            )

            # No extra sleep per result: the client already throttles its page
            # requests, and pausing per paper only added ~3 s for every paper returned.
            category_papers = [
                {
                    'title': result.title,
                    'authors': [author.name for author in result.authors],
                    'summary': result.summary,
                    'published': result.published,
                    'url': result.entry_id
                }
                for result in client.results(search)
            ]

            print(f"Fetched {len(category_papers)} papers from {category}")
            all_papers.extend(category_papers)

        return all_papers

In [22]:
# Instance of the custom ArXiv tool; handed to the researcher agent through its `tools` list.
arxiv_search_tool = FetchArxivPapersTool()

# Step 4: Creating Agents

Agents work better when they are asked to play a role.

Based on this fact, each Agent is initialized with a:

1. Role
2. Goal
3. Backstory

In [23]:
from crewai import LLM

# LLM used by both agents below, reached through the OpenRouter endpoint.
llm = LLM(
    api_key=openrouter_api_key,
    base_url="https://openrouter.ai/api/v1",
    model="openrouter/deepseek/deepseek-chat-v3-0324:free",
    # temperature=0.8
)

In [24]:
from crewai import Agent, Task, Crew, Process

# Agent 1: ArXiv Researcher
# Adjacent string literals are joined verbatim, so every fragment that is
# followed by another sentence ends with an explicit space.
researcher = Agent(
    role="Senior Researcher",
    goal=(
        "Find the top 10 papers from the search results from ArXiv on {date}. "
        "Rank them appropriately."
    ),
    backstory=(
        "You are a senior researcher with a deep understanding of all topics in AI and AI research. "
        "You are able to identify the best research papers based on the title and abstract."
    ),
    verbose=True,
    tools=[arxiv_search_tool],
    llm=llm,
)

# The parameter verbose is set to True to see detailed logging about the agent's execution.

# The parameter tools specifies what tools an AI agent can use during its execution.

In [25]:
# Agent 2: Frontend Engineer
# Adjacent string literals are joined verbatim, so the first backstory sentence
# ends with an explicit space to keep it apart from the second.
frontend_engineer = Agent(
    role="Senior Frontend & AI Engineer",
    goal="Compile the results into a HTML file.",
    backstory=(
        "You are a competent frontend engineer writing HTML and CSS with decades of experience. "
        "You have also been working with AI for decades and understand it well."
    ),
    verbose=True,
    llm=llm,
)

# The Agent class also comes with an optional parameter llm that lets you choose different LLMs for different agents.

# Step 5: Creating Tasks
Breaking down an objective into sub-objectives, or Tasks, and letting each AI agent work on one Task leads to better results.

In [26]:
# Task for ArXiv Researcher
# Each line of the requested format ends with "\n": adjacent string literals are
# joined verbatim, so without it the bullets collapse into one run-on line.
research_task = Task(
    description="Find the top 10 research papers from the search results from ArXiv on {date}.",
    expected_output=(
        "A list of top 10 research papers with the following information in the following format:\n"
        "- Title\n"
        "- Authors\n"
        "- Abstract\n"
        "- Link to the paper"
    ),
    agent=researcher,
    # human_input=True makes the agent take our feedback on its result after working on the task.
    human_input=True,
)

In [27]:
# Task for Frontend Engineer
# Each line of the requested format ends with "\n": adjacent string literals are
# joined verbatim, so without it the heading and bullets collapse into one line.
reporting_task = Task(
    description="Compile the results into a detailed report in a HTML file.",
    expected_output=(
        "An HTML file with the results in the following format:\n"
        "Top 10 AI Research Papers published on {date}\n"
        "- Title (which on clicking opens the paper in a new tab)\n"
        "- Authors\n"
        "- Short summary of the abstract (2-4 sentences)\n"
        # The model's answer is saved to output_file, so ask for bare HTML;
        # a Markdown fence around it would end up inside the .html file.
        "Return only the raw HTML document, without Markdown code fences."
    ),
    agent=frontend_engineer,
    context=[research_task],  # This task relies on the output of research_task
    output_file="./ai_research_report.html",
    human_input=True,
)

# Step 6: Creating the Crew

In [28]:
# Assemble the crew: the researcher's task runs first and the reporting task second.
arxiv_research_crew = Crew(
    tasks=[research_task, reporting_task],
    agents=[researcher, frontend_engineer],
    process=Process.sequential,  # the default, spelled out; alternative: Process.hierarchical
    verbose=True,
)

#Process.sequential: Tasks are executed one after another in the order they're defined
#Process.hierarchical: Creates a manager-worker relationship between agents

In [30]:
# Step 7: Running the Crew and Giving Feedback on its results.
# "date" is the value for the {date} placeholder used in the agent goals and task texts.
crew_inputs = dict(date="2025-08-12")

result = arxiv_research_crew.kickoff(inputs=crew_inputs)

[1m[95m ## Final Result:[00m [92mHere are the top 10 research papers from ArXiv on 2025-08-12, ranked based on relevance, novelty, and potential impact in AI and related fields:

1. **Title**: ProMode: A Speech Prosody Model Conditioned on Acoustic and Textual Inputs  
   **Authors**: Eray Eren, Qingju Liu, Hyeongwoo Kim, Pablo Garrido, Abeer Alwan  
   **Abstract**: Prosody conveys rich emotional and semantic information of the speech signal as well as individual idiosyncrasies. We propose a stand-alone model that maps text-to-prosodic features such as F0 and energy and can be used in downstream tasks such as TTS. The ProMode encoder takes as input acoustic features and time-aligned textual content, both are partially masked, and obtains a fixed-length latent prosodic embedding. The decoder predicts acoustics in the masked region using both the encoded prosody input and unmasked textual content. Trained on the GigaSpeech dataset, we compare our method with state-of-the-art style e

[1m[95m ## Final Result:[00m [92m```html
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Top 10 AI Research Papers - 2025-08-12</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            line-height: 1.6;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            color: #333;
        }
        h1 {
            color: #2c3e50;
            text-align: center;
            margin-bottom: 30px;
        }
        .paper {
            background-color: #f9f9f9;
            border-radius: 8px;
            padding: 20px;
            margin-bottom: 20px;
            box-shadow: 0 2px 5px rgba(0,0,0,0.1);
        }
        .paper-title {
            color: #3498db;
            font-size: 1.2em;
            margin-bottom: 10px;
        }
        .paper-title a {
            text-decoration: none;
            color: inher