In [1]:
# !pip install "mcp[cli]" requests urllib3 validators

In [2]:
import json
import os
import urllib.parse
import requests
import validators
from typing import Dict, Any, List
from mcp.server.fastmcp import FastMCP

mcp = FastMCP("SearchTheArxiv")

print("MCP Server created: SearchTheArxiv")

MCP Server created: SearchTheArxiv


In [3]:
class Config:
    """Configuration for searchthearxiv MCP server"""
    def __init__(self):
        self.search_url = "https://searchthearxiv.com/search"
        self.max_results = 10
        self.headers = {
            "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
            "x-requested-with": "XMLHttpRequest",
        }

config = Config()

In [4]:
def validate_query(query: str) -> bool:
    """Validate search query (max 200 chars as per searchthearxiv rules)"""
    return len(query) <= 200


In [5]:
def format_paper_result(paper: Dict[str, Any]) -> Dict[str, Any]:
    """Format paper result for consistent output"""
    return {
        "id": paper.get("id", ""),
        "title": paper.get("title", "").strip(),
        "authors": paper.get("authors", ""),
        "abstract": paper.get("abstract", "").strip(),
        "year": paper.get("year"),
        "month": paper.get("month"),
        "published_date": f"{paper.get('month', '')}-{paper.get('year', '')}" if paper.get("year") else "",
        "arxiv_url": f"https://arxiv.org/abs/{paper.get('id', '')}" if paper.get("id") else "",
        "pdf_url": f"https://arxiv.org/pdf/{paper.get('id', '')}" if paper.get("id") else "",
        "similarity_score": paper.get("similarity", 0.0)
    }

In [6]:
def make_web_request(query: str, max_results: int) -> Dict[str, Any]:
    """Make web request to searchthearxiv API"""
    try:
        # Handle arXiv URL queries
        if validators.url(query):
            arxiv_id = query.split("/")[-1]
            search_query = arxiv_id
        else:
            search_query = query
            if not validate_query(search_query):
                return {
                    "success": False,
                    "error": "Invalid search query"
                }
        encoded_query = urllib.parse.quote(search_query)
        params = {"query": encoded_query}
        
        response = requests.get(
            config.search_url,
            params=params,
            headers=config.headers,
            timeout=30
        )
        response.raise_for_status()
        
        result = response.json()
        papers = result.get("papers", [])
        
        if not papers:
            return {
                "success": True,
                "query": query,
                "total_found": 0,
                "papers": []
            }
        
        # Format and limit results
        formatted_papers = []
        for paper in papers[:max_results]:
            formatted_papers.append(format_paper_result(paper))
        
        return {
            "success": True,
            "query": query,
            "total_found": len(formatted_papers),
            "papers": formatted_papers
        }
        
    except Exception as e:
        return {
            "success": False,
            "error": f"Search failed: {str(e)}"
        }

In [7]:
@mcp.tool()
def search_arxiv_papers(query: str, max_results: int = 10) -> str:
    """
    Search for ML papers on arXiv using semantic search via searchthearxiv.com
    
    Args:
        query: Search query (max 200 chars) or arXiv URL  
        max_results: Maximum number of results (default: 10, max: 50)
    
    Returns:
        JSON string with search results
    """
    if not query:
        return json.dumps({"success": False, "error": "Query is required"})
    
    if not validate_query(query):
        return json.dumps({"success": False, "error": "Query too long (max 200 characters)"})
    
    if max_results > 50:
        max_results = 50
    elif max_results < 1:
        max_results = 1
    
    result = make_web_request(query, max_results)
    return json.dumps(result, indent=2)

In [8]:
@mcp.tool()
def get_paper_details(arxiv_id: str) -> str:
    """
    Get detailed information about a specific arXiv paper
    
    Args:
        arxiv_id: arXiv paper ID (e.g., '2301.12345' or '1706.03762')
    
    Returns:
        JSON string with paper details
    """
    if not arxiv_id:
        return json.dumps({"success": False, "error": "arXiv ID is required"})
    
    # Clean the arxiv_id 
    clean_id = arxiv_id.replace("https://arxiv.org/abs/", "").replace("https://arxiv.org/pdf/", "")
    
    result = make_web_request(clean_id, 1)
    
    if result.get("success") and result.get("papers"):
        paper = result["papers"][0]
        return json.dumps({
            "success": True,
            "paper": paper
        }, indent=2)
    else:
        return json.dumps({
            "success": False,
            "error": f"Paper {clean_id} not found"
            
        })


In [9]:
@mcp.tool()
def search_by_category(category: str, query: str = "", max_results: int = 10) -> str:
    """
    Search papers in specific ML categories
    
    Args:
        category: Category (cs.CV, cs.LG, cs.CL, cs.AI, cs.NE, cs.RO)
        query: Optional search query within category
        max_results: Maximum number of results (default: 10)
    
    Returns:
        JSON string with search results
    """
    valid_categories = ["cs.CV", "cs.LG", "cs.CL", "cs.AI", "cs.NE", "cs.RO"]
    
    if category not in valid_categories:
        return json.dumps({
            "success": False,
            "error": f"Invalid category. Must be one of: {', '.join(valid_categories)}"
        })
    
    # Combine category with query
    search_query = f"{category} {query}".strip()
    
    result = make_web_request(search_query, max_results)
    return json.dumps(result, indent=2)


In [10]:
@mcp.tool()
def get_trending_papers(days: int = 7, max_results: int = 10) -> str:
    """
    Get trending/popular papers (simulated by searching common ML terms)
    
    Args:
        days: Time period in days (ignored - API limitation)
        max_results: Maximum number of results
    
    Returns:
        JSON string with trending papers
    """
    # Simulate trending by searching popular ML terms
    trending_queries = [
        "transformer neural networks",
        "large language models", 
        "computer vision deep learning",
        "reinforcement learning",
        "diffusion models"
    ]
    
    import random
    selected_query = random.choice(trending_queries)
    
    result = make_web_request(selected_query, max_results)
    
    if result.get("success"):
        result["note"] = f"Trending papers simulation using query: '{selected_query}'"
    
    return json.dumps(result, indent=2)

In [11]:
@mcp.resource("arxiv://categories")
def get_categories() -> str:
    """List of supported arXiv categories"""
    categories = {
        "cs.CV": "Computer Vision and Pattern Recognition",
        "cs.LG": "Machine Learning", 
        "cs.CL": "Computation and Language",
        "cs.AI": "Artificial Intelligence",
        "cs.NE": "Neural and Evolutionary Computing",
        "cs.RO": "Robotics"
    }
    return json.dumps(categories, indent=2)



In [12]:
@mcp.resource("arxiv://stats")
def get_stats() -> str:
    """Server statistics and information"""
    stats = {
        "server_name": "SearchTheArxiv MCP Server",
        "total_papers": "300,000+",
        "categories_supported": 6,
        "embedding_model": "text-embedding-ada-002",
        "search_engine": "Pinecone + OpenAI",
        "data_source": "arXiv + Cornell University",
        "update_frequency": "Weekly"
    }
    return json.dumps(stats, indent=2)


In [13]:
@mcp.resource("arxiv://help")
def get_help() -> str:
    """Help and usage information"""
    help_info = {
        "available_tools": [
            {
                "name": "search_arxiv_papers",
                "description": "Search for ML papers using natural language or arXiv URL",
                "example": "search_arxiv_papers('transformer attention mechanism', 5)"
            },
            {
                "name": "get_paper_details", 
                "description": "Get details for a specific paper by arXiv ID",
                "example": "get_paper_details('1706.03762')"
            },
            {
                "name": "search_by_category",
                "description": "Search within specific ML categories",
                "example": "search_by_category('cs.CV', 'object detection', 10)"
            },
            {
                "name": "get_trending_papers",
                "description": "Get trending/popular papers",
                "example": "get_trending_papers(7, 10)"
            }
        ],
        "tips": [
            "Queries are limited to 200 characters",
            "Use arXiv URLs for specific paper searches",
            "Supported categories: cs.CV, cs.LG, cs.CL, cs.AI, cs.NE, cs.RO",
            "Results are ranked by semantic similarity"
        ]
    }
    return json.dumps(help_info, indent=2)

In [14]:
def test_server():
    """Test server tools locally"""
    print("Testing MCP server tools...")
    
    # Test 1: Basic search
    print("\n1. Testing search_arxiv_papers:")
    result1 = search_arxiv_papers("transformer attention", 3)
    data1 = json.loads(result1)
    print(f"Success: {data1.get('success')}")
    if data1.get('success'):
        print(f"Found {data1.get('total_found')} papers")
        for paper in data1.get('papers', []):
            print(f"- {paper.get('title', '')[:60]}...")
    
    # Test 2: Paper details
    print(f"\n2. Testing get_paper_details:")
    result2 = get_paper_details("1706.03762")  # Attention is All You Need
    data2 = json.loads(result2)
    print(f"Success: {data2.get('success')}")
    if data2.get('success'):
        paper = data2.get('paper', {})
        print(f"Title: {paper.get('title', '')}")
    
    # Test 3: Category search
    print(f"\n3. Testing search_by_category:")
    result3 = search_by_category("cs.CV", "object detection", 2)
    data3 = json.loads(result3)
    print(f"Success: {data3.get('success')}")
    if data3.get('success'):
        print(f"Found {data3.get('total_found')} papers in cs.CV")
    
    # Test 4: Resources
    print(f"\n4. Testing resources:")
    categories = get_categories()
    print(f"Categories: {json.loads(categories)}")
    
    print("\nAll tests completed!")

# Run tests
test_server()



Testing MCP server tools...

1. Testing search_arxiv_papers:


Success: True
Found 3 papers
- AttentionViz: A Global View of Transformer Attention...
- Attention Is All You Need...
- Analyzing the Structure of Attention in a Transformer Langua...

2. Testing get_paper_details:
Success: True
Title: Six-Degree-of-Freedom Motion Emulation for Data-Driven Modeling of Underwater Vehicles

3. Testing search_by_category:
Success: True
Found 2 papers in cs.CV

4. Testing resources:
Categories: {'cs.CV': 'Computer Vision and Pattern Recognition', 'cs.LG': 'Machine Learning', 'cs.CL': 'Computation and Language', 'cs.AI': 'Artificial Intelligence', 'cs.NE': 'Neural and Evolutionary Computing', 'cs.RO': 'Robotics'}

All tests completed!


In [15]:
print("\n" + "="*60)
print("SEARCHTHEARXIV MCP SERVER")
print("="*60)
print("📚 Semantic search over 300,000+ ML papers")
print("🔍 Categories: cs.CV, cs.LG, cs.CL, cs.AI, cs.NE, cs.RO") 
print("🤖 Powered by OpenAI + Pinecone")
print("📖 Data from arXiv + Cornell University")

print(f"\n🛠️  Available Tools:")
print("- search_arxiv_papers(query, max_results)")
print("- get_paper_details(arxiv_id)")
print("- search_by_category(category, query, max_results)")
print("- get_trending_papers(days, max_results)")

print(f"\n📋 Available Resources:")
print("- arxiv://categories - List supported categories")
print("- arxiv://stats - Server statistics")  
print("- arxiv://help - Help and usage info")

print('result = await session.call_tool("search_arxiv_papers", {"query": "transformer", "max_results": 5})')

if __name__ == "__main__":
    # This allows the server to be run directly
    print("Starting SearchTheArxiv MCP Server...")


    print("\nServer configuration ready!")
    print(f"Tools registered: {len(mcp._tool_manager._tools)}")
    print(f"Resources registered: {len(mcp._resource_manager._resources)}")


SEARCHTHEARXIV MCP SERVER
📚 Semantic search over 300,000+ ML papers
🔍 Categories: cs.CV, cs.LG, cs.CL, cs.AI, cs.NE, cs.RO
🤖 Powered by OpenAI + Pinecone
📖 Data from arXiv + Cornell University

🛠️  Available Tools:
- search_arxiv_papers(query, max_results)
- get_paper_details(arxiv_id)
- search_by_category(category, query, max_results)
- get_trending_papers(days, max_results)

📋 Available Resources:
- arxiv://categories - List supported categories
- arxiv://stats - Server statistics
- arxiv://help - Help and usage info
result = await session.call_tool("search_arxiv_papers", {"query": "transformer", "max_results": 5})
Starting SearchTheArxiv MCP Server...

Server configuration ready!
Tools registered: 4
Resources registered: 3
