# **AI Research Assistant**

An AI assistant that discovers, filters, and analyzes web content using Crawl4AI's URL Seeder to:

* Discover all available URLs without crawling them first.
* Score and rank them by relevance using AI.
* Crawl only the most relevant content.
* Generate research insights with proper citations.

**About the research assistant** :

A smart research assistant that:

1. Takes any research query (e.g. "knowledge graphs")
2. Discovers relevant articles from news sites
3. Ranks them by relevance using BM25 scoring
4. Crawls only the top-ranked articles
5. Synthesizes findings into a comprehensive report
   

## Pipeline Overview

User Query -> Query Enhancement -> URL Discovery -> Relevance Scoring -> Smart Crawling -> AI Synthesis -> Research Report

In [28]:
import asyncio
import json
import os
from typing import List, Dict, Optional, Tuple
from dataclasses import dataclass, asdict
from datetime import datetime
from pathlib import Path

# Rich for beautiful console output
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from rich.progress import Progress, SpinnerColumn, TextColumn

# Crawl4AI imports for intelligent crawling
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    AsyncUrlSeeder,
    SeedingConfig,
    AsyncLogger,
    PruningContentFilter,
    DefaultMarkdownGenerator
)


# LiteLLM for AI capabilities
import litellm

# Shared Rich console; the cells below print through this single instance
console = Console()

print("Environment ready :) All dependencies loaded successfully.")

Environment ready :) All dependencies loaded successfully.


## Step 1: Configuration and Data Classes

Here we define the research pipeline configuration. These dataclasses act as our control center, allowing us to fine-tune every aspect of the research process. Think of them as the settings panel for the research assistant, from discovery limits to AI model choices.


In [29]:
@dataclass
class ResearchConfig:
    """
    Tunable settings for the research pipeline.

    One instance is handed to every pipeline stage. It groups:
    - discovery and crawl limits
    - relevance scoring options
    - LLM usage
    - crawler behaviour and output preferences
    """

    # --- Core settings ---
    domain: str = "www.bbc.com/sport"        # Site handed to the URL seeder
    max_urls_discovery: int = 500            # Passed to the seeder as its URL cap
    max_urls_to_crawl: int = 10              # Cap applied before crawling
    top_k_urls: int = 10                     # Discovery results kept after ranking

    # --- Scoring and filtering ---
    score_threshold: float = 0.3             # Minimum relevance score for the seeder
    scoring_method: str = "bm25"             # Scoring method name given to the seeder

    # --- AI and processing ---
    use_llm_enhancement: bool = True         # Gate for the LLM query-enhancement step
    llm_model: str = "openai/gpt-4o-mini"    # Model identifier given to LiteLLM

    # --- URL discovery options ---
    extract_head_metada: bool = False        # Forwarded as the seeder's `extract_head`
    live_check: bool = True                  # Forwarded as the seeder's `live_check`
    force_refresh: bool = True               # Forwarded as the seeder's `force`

    # --- Crawler settings ---
    max_concurrent_crawls: int = 5           # Forwarded to `arun_many(max_concurrent=...)`
    timeout: int = 30000                     # Presumably milliseconds (30 s); not read by the visible cells
    headless: bool = True                    # Forwarded to BrowserConfig

    # --- Output settings ---
    output_dir: Path = Path("research_results")
    verbose: bool = True                     # Forwarded to the seeder logger and BrowserConfig

@dataclass
class ResearchQuery:
    """
    Container for a research query and the metadata derived from it.

    Only ``original_query`` is required. The remaining fields default to
    ``None`` and are filled in by the query-processing step.
    """
    original_query: str                          # Query exactly as the user typed it
    enhanced_query: Optional[str] = None         # Reworded query, when one was produced
    search_patterns: Optional[List[str]] = None  # Glob-style patterns such as "*term*"
    timestamp: Optional[str] = None              # ISO-8601 creation time
    
@dataclass
class ResearchResult:
    """
    Everything a single pipeline run produces, bundled for reporting.

    All fields are required and positional order matters to callers.
    """
    query: ResearchQuery            # The processed query that drove the run
    discovered_urls: List[Dict]     # Ranked URL records from the discovery step
    crawled_content: List[Dict]     # One record per successfully crawled page
    synthesis: str                  # Report text (LLM output or fallback summary)
    citations: List[Dict]           # Sources the synthesis refers to
    metadata: Dict                  # Run statistics such as duration and counts

# Build the default configuration and show its headline values in a panel
config = ResearchConfig()
console.print(
    Panel(
        "\n".join(
            (
                "[bold cyan]Research Configuration[/bold cyan]",
                f" Domain: {config.domain}",
                f" Max Discovery: {config.max_urls_discovery}",
                f" Max Crawl: {config.max_urls_to_crawl}",
                f" AI Model: {config.llm_model}",
            )
        ),
        title="Settings",
    )
)

    

## Step 2: Query Enhancement with AI

Not all search queries are created equal. Here we use AI to transform simple queries into comprehensive search strategies. The LLM analyzes your query, extracts key concepts, and generates related terms - turning "football news" into a rich set of search patterns.

In [30]:
import os
from dotenv import load_dotenv

# Merge variables from a local .env file (if any) into the process environment
load_dotenv()

# load_dotenv() already exports OPENAI_API_KEY when the .env file defines it, so
# copying it back into os.environ is redundant -- and assigning the None that
# os.getenv() returns for a missing key raises TypeError. Warn instead: the
# LLM-backed steps below catch their own failures and fall back.
if not os.getenv("OPENAI_API_KEY"):
    print("Warning: OPENAI_API_KEY is not set; LLM calls will fail and fallbacks will be used.")

In [31]:
def _word_patterns(query: str) -> List[str]:
    """Return ``*word*`` patterns for every word of *query* longer than two characters."""
    return [f"*{word}*" for word in query.lower().split() if len(word) > 2]


async def enhance_query_with_llm(query: str, config: ResearchConfig) -> ResearchQuery:
    """
    Transform a simple query into a richer search strategy.

    Why enhance queries?
    - Users often use simple terms ("football news")
    - But relevant content might use varied terminology
    - AI helps capture the relevant variations

    Args:
        query: The user's raw research query.
        config: Pipeline settings; only ``llm_model`` is read here.

    Returns:
        A ResearchQuery. On success ``enhanced_query`` is the LLM's rewording
        (or the original query if the LLM omitted it) and ``search_patterns``
        holds at most 10 ``*term*`` patterns. If the LLM call or JSON parsing
        fails, the original query is reused and the patterns are derived from
        its words.
    """

    console.print(f"\n[cyan] Enhancing query: '{query}'...[/cyan]")
    try:
        # Ask AI to analyze and expand the query
        response = await litellm.acompletion(
            model=config.llm_model,
            messages=[{
                "role": "user",
                "content": f"""Given this research query: "{query}"
                Extract:
                1. Key terms and concepts (as a list)
                2. Related search terms
                3. A more specific/enhanced version of the query
                
                Return as JSON:
                {{
                    "key_terms":["term1","term2"],
                    "related_terms": ["related1","related2"],
                    "enhanced_query": "enhanced version of query"
                }}
               """
            }],
            temperature=0.3,  # Low temperature for consistency
            response_format={"type": "json_object"}
        )

        data = json.loads(response.choices[0].message.content)

        # Tolerate a partially filled answer: a missing key should not throw
        # away the parts of the response that are usable.
        key_terms = [str(term) for term in (data.get("key_terms") or [])]
        related_terms = [str(term) for term in (data.get("related_terms") or [])]

        # Search patterns must be a list of strings (see ResearchQuery); fall
        # back to the query's own words when the LLM supplied no terms.
        patterns = [f"*{term.lower()}*" for term in key_terms + related_terms]
        if not patterns:
            patterns = _word_patterns(query)

        result = ResearchQuery(
            original_query=query,
            enhanced_query=data.get("enhanced_query") or query,
            search_patterns=patterns[:10],  # Limit to 10 patterns
            timestamp=datetime.now().isoformat()
        )

        # Show the enhancement
        console.print(Panel(
            f"[green] Enhanced Query:[/green] {result.enhanced_query}\n"
            f"[dim] Key terms: {', '.join(key_terms)}[/dim]",
            title="Query Enhancement"
        ))

        return result

    except Exception as e:
        # Deliberate best-effort boundary: any LLM, network or parsing problem
        # degrades to the un-enhanced query instead of aborting the pipeline.
        console.print(f"[yellow] Enhancement failed, using original query: {e}[/yellow]")

        return ResearchQuery(
            original_query=query,
            enhanced_query=query,
            search_patterns=_word_patterns(query),
            timestamp=datetime.now().isoformat()
        )
        
# Example usage
test_query= "fPremier League news"
enhanced = await enhance_query_with_llm(test_query,config)

## Step 3: Smart URL Discovery with AsyncUrlSeeder

This is where the magic begins! Instead of crawling pages to find links, AsyncUrlSeeder discovers URLs from sitemaps and Common Crawl data. It's like having a map of the entire website before you start exploring. We'll discover hundreds of URLs in seconds, complete with metadata.

In [35]:
def _truncate_cell(text: str, limit: int) -> str:
    """Clip *text* to *limit* characters, appending "..." only when something was cut."""
    return text if len(text) <= limit else text[:limit] + "..."


async def discover_urls(
    domain: str,
    query: ResearchQuery,
    config: ResearchConfig
) -> List[Dict]:
    """
    Discover and rank URLs for *domain* without crawling them.

    An AsyncUrlSeeder is configured from *config* and asked for URLs, using
    the enhanced query (or the original one when no enhancement exists) for
    relevance scoring. This tells us which pages are worth crawling BEFORE
    we spend time crawling them.

    Args:
        domain: Site to discover URLs for.
        query: Processed research query used for scoring.
        config: Pipeline settings (limits, scoring, cache and logging flags).

    Returns:
        The first ``config.top_k_urls`` records returned by the seeder, or an
        empty list when discovery raises.
    """
    console.print(f"\n[cyan] Discovering URLs from {domain}...[/cyan]")

    # Use context manager for automatic cleanup
    async with AsyncUrlSeeder(logger=AsyncLogger(verbose=config.verbose)) as seeder:
        # Configure the discovery process
        seeding_config = SeedingConfig(
            # Data source: sitemap only.
            # NOTE(review): an earlier comment said "sitemap AND Common Crawl";
            # confirm whether "sitemap+cc" was the intended source.
            source="sitemap",

            # Metadata extraction
            extract_head=config.extract_head_metada,

            # Relevance scoring
            query=query.enhanced_query or query.original_query,
            scoring_method=config.scoring_method,
            score_threshold=config.score_threshold,

            # Limits and performance
            max_urls=config.max_urls_discovery,
            live_check=config.live_check,
            force=config.force_refresh,  # Bypass cache if needed
        )
        try:
            urls = await seeder.urls(domain, seeding_config)

            # The seeder is expected to return results ordered by relevance,
            # so the head of the list is taken as the top-k.
            top_urls = urls[:config.top_k_urls]

            console.print(f"[green] Discovered {len(urls)} URLs, selected top {len(top_urls)}[/green]")

            # Display a sample of what we found
            if top_urls:
                table = Table(title="Top Discovered URLs")
                table.add_column("Score", style="cyan")
                table.add_column("Title", style="green")
                table.add_column("URL", style="dim")

                for entry in top_urls[:5]:
                    score = f"{entry.get('relevance_score', 0):.3f}"
                    head = entry.get('head_data') or {}
                    # Only add an ellipsis when the text was actually shortened
                    title = _truncate_cell(head['title'], 50) if head.get('title') else "N/A"
                    url_str = _truncate_cell(entry['url'], 60)
                    table.add_row(score, title, url_str)
                console.print(table)
            return top_urls

        except Exception as e:
            # Best-effort: a failed discovery yields no URLs rather than a crash
            console.print(f"[red] URL discovery failed: {e}[/red]")
            return []
      
# Example discovery
discovered = await discover_urls(config.domain, enhanced, config) 
                
       

## Step 4: Intelligent Content Crawling

Now we crawl only the most relevant URLs. This is where our smart filtering pays off: instead of crawling hundreds of pages, we focus on the top 10-20 most relevant ones. We use content filtering to extract only the meaningful text, removing ads and navigation.

In [36]:
async def crawl_selected_urls(
    urls: List[Dict],
    query: ResearchQuery,
    config: ResearchConfig
) -> List[Dict]:
    """
    Crawl the selected URLs and return their filtered markdown.

    Key points:
    1. At most ``config.max_urls_to_crawl`` URLs are crawled
    2. They are submitted together through ``arun_many``
    3. A pruning content filter and excluded tags strip navigation/boilerplate
    4. Each successful page becomes a dict ready for synthesis

    Args:
        urls: Discovery records; entries without a ``'url'`` key are skipped.
        query: Not read by this step; kept so all stages share a signature.
        config: Pipeline settings (crawl cap, concurrency, browser flags).

    Returns:
        One dict per successfully crawled page with the keys ``url``,
        ``title``, ``markdown`` (always a string) and ``metadata``.
    """

    # Extract URLs from discovery results
    url_list = [u['url'] for u in urls if 'url' in u][:config.max_urls_to_crawl]
    if not url_list:
        console.print("[red] No URLs to crawl [/red]")
        return []

    console.print(f"\n [cyan] Crawling {len(url_list)} URLs...[/cyan]")

    # Configure content extraction: prune low-value blocks before markdown output
    md_generator = DefaultMarkdownGenerator(
        content_filter=PruningContentFilter(
            threshold=0.48,
            threshold_type="dynamic",
            min_word_threshold=10
        ),
    )
    # Configure the crawler
    crawler_config = CrawlerRunConfig(
        markdown_generator=md_generator,
        exclude_external_links=True,
        excluded_tags=['nav', 'header', 'footer', 'aside'],  # Skip UI elements
    )

    # Create crawler with browser config
    async with AsyncWebCrawler(
        config=BrowserConfig(
            headless=config.headless,
            verbose=config.verbose
        )
    ) as crawler:
        # NOTE(review): newer Crawl4AI releases steer concurrency through a
        # dispatcher; confirm `max_concurrent` is still honoured.
        results = await crawler.arun_many(
            url_list,
            config=crawler_config,
            max_concurrent=config.max_concurrent_crawls
        )

        crawled_content = []
        # Take the URL from each result instead of zipping with url_list:
        # results are not guaranteed to come back in submission order, and a
        # mismatch would attribute content (and citations) to the wrong page.
        for result in results:
            url = result.url
            if result.success:
                metadata = result.metadata or {}
                markdown = result.markdown
                text = (
                    getattr(markdown, 'fit_markdown', None)
                    or getattr(markdown, 'raw_markdown', None)
                    or ""
                )
                crawled_content.append({
                    'url': url,
                    'title': metadata.get('title', 'No title'),
                    'markdown': text,
                    'metadata': metadata
                })
                console.print(f"[green] check [/green] Crawled: {url[:60]}...")
            else:
                # CrawlResult exposes the failure text as `error_message`;
                # `error` is kept as a fallback for older result objects.
                reason = (
                    getattr(result, 'error_message', None)
                    or getattr(result, 'error', None)
                    or "unknown error"
                )
                console.print(f"[red] No check [/red] Failed: {url[:50]}... - {reason}")
        console.print(f"[green] Successfully crawled {len(crawled_content)} pages[/green]")
        return crawled_content
    

# Example crawling
crawled = await crawl_selected_urls(discovered[:5], enhanced, config)

python(70992) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


python(71181) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71189) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71190) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71194) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


python(71198) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(71199) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


python(71225) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


## Step 5: AI-Powered Research Synthesis
This is where we transform raw content into insights. The AI analyzes all crawled articles, identifies key themes, and generates a comprehensive synthesis with proper citations. It's like having a research assistant read everything and write you a summary.

In [38]:
async def generate_research_synthesis(
    query: ResearchQuery,
    crawled_content: List[Dict],
    config: ResearchConfig
) -> Tuple[str, List[Dict]]:
    """
    Use AI to synthesize findings from multiple sources.

    The synthesis process:
    1. Builds a numbered preview (first 1500 characters) of every page
    2. Asks the LLM for a summary, key findings, analysis and implications
    3. Records which numbered sources the answer refers to

    Args:
        query: Processed research query; ``original_query`` goes in the prompt.
        crawled_content: Page dicts with ``title``, ``url`` and ``markdown``.
        config: Pipeline settings; only ``llm_model`` is read here.

    Returns:
        ``(synthesis, citations)``. Each citation is a dict with ``source_id``,
        ``title`` and ``url``. With no content a fixed message and an empty
        list are returned; if the LLM call fails, a plain listing of up to
        three articles is returned with an empty citation list.
    """
    import re  # local import keeps this cell self-contained

    if not crawled_content:
        return "No content available for synthesis.", []
    console.print("\n[cyan] Generating research synthesis... [/cyan]")

    # Prepare content for the AI
    # We include source info for proper citations
    content_sections = []
    for i, content in enumerate(crawled_content, 1):
        # A page may carry no markdown at all; treat that as an empty preview
        preview = (content.get('markdown') or "")[:1500]
        section = f"""
SOURCE {i}:
Title: {content['title']}
URL: {content['url']}
Content Preview:
<content>
{preview}...
</content>
"""
        content_sections.append(section)
    combined_content = "\n --- \n".join(content_sections)

    try:
        # Generate comprehensive synthesis
        response = await litellm.acompletion(
            model=config.llm_model,
            messages=[{
                "role": "user",
                "content": f"""Research Query: "{query.original_query}"
Based on the following resources, provide a comprehensive research synthesis.

<resources>
{combined_content}
</resources>

Please provide:
1. An executive summary (2-3 sentences)
2. Key finding (3-5 bullet points)
3. Detailed analysis (2-3 paragraphs)
4. Future implications or trends

Format your response with clear sections and cite sources using [Source N] notation.
Keep the total response under 800 words."""
            }],
            temperature=0.7  # Some creativity for synthesis
        )

        synthesis = response.choices[0].message.content

        # Collect the source numbers actually mentioned. Matching whole
        # numbers matters: a plain substring test for "Source 1" would also
        # fire on "Source 10".
        cited_ids = {int(num) for num in re.findall(r"\bSource\s+(\d+)\b", synthesis)}
        citations = [
            {
                'source_id': i,
                'title': content['title'],
                'url': content['url']
            }
            for i, content in enumerate(crawled_content, 1)
            if i in cited_ids
        ]
        return synthesis, citations

    except Exception as e:
        # Best-effort boundary: fall back to a simple listing of the articles
        console.print(f"[red] Synthesis generation failed:  {e}[/red]")
        summary = f"Research on '{query.original_query}' found {len(crawled_content)} relevant articles:\n\n"
        for content in crawled_content[:3]:
            summary += f"- {content['title']}\n {content['url']}\n\n"
        return summary, []

# Example synthesis
synthesis, citations = await generate_research_synthesis(enhanced, crawled, config)
console.print(Panel(synthesis[:500]+"...",title="Research Synthesis Preview"))

## Step 6: Complete Research Pipeline
 
Now let's put it all together! This orchestrator function manages the entire research pipeline from query to final report. It coordinates all the components we've built, handling errors gracefully and providing progress updates.

In [42]:
async def research_pipeline(
    query: str,
    config: Optional[ResearchConfig] = None
) -> ResearchResult:
    """
    Main research pipeline orchestrator.

    This brings together all components:
    1. Query enhancement (AI-powered, optional)
    2. URL discovery (AsyncUrlSeeder)
    3. Smart crawling (AsyncWebCrawler)
    4. AI synthesis (LiteLLM)

    Args:
        query: The user's research query.
        config: Pipeline settings; a default ResearchConfig is used when None.

    Returns:
        A ResearchResult. Its ``metadata`` always contains ``duration``,
        ``domain``, ``timestamp``, ``total_discovered``, ``total_crawled`` and
        ``total_cited`` -- including when no URLs were discovered, so the
        result can always be passed to the report formatter.
    """

    if config is None:
        config = ResearchConfig()
    start_time = datetime.now()

    def _build_metadata(discovered: List[Dict], crawled: List[Dict], cited: List[Dict]) -> Dict:
        """Assemble run statistics; shared by the normal and the empty result."""
        return {
            'duration': str(datetime.now() - start_time),
            'domain': config.domain,
            'timestamp': datetime.now().isoformat(),
            'total_discovered': len(discovered),
            'total_crawled': len(crawled),
            'total_cited': len(cited)
        }

    # Display pipeline header
    console.print(Panel(
        f"[bold cyan]Research Pipeline [/bold cyan]\n\n"
        f"[dim]Query:[/dim]{query}\n"
        f"[dim]Domain:[/dim]{config.domain}",
        title="Starting Research",
        border_style="cyan"
    ))

    # Step 1: Enhance query
    console.print("\n[bold cyan] Step 1: Query Processing [/bold cyan]")
    if config.use_llm_enhancement:
        research_query = await enhance_query_with_llm(query, config)
    else:
        # Simple fallback without AI
        research_query = ResearchQuery(
            original_query=query,
            enhanced_query=query,
            search_patterns=[f"*{word}*" for word in query.lower().split()],
            timestamp=datetime.now().isoformat()
        )

    # Step 2: Discover URLs
    console.print("\n[bold cyan] Step 2: URL Discovery [/bold cyan]")
    discovered_urls = await discover_urls(
        domain=config.domain,
        query=research_query,
        config=config
    )

    if not discovered_urls:
        # No URLs found - return an empty result that still carries the full
        # metadata schema, so the report formatter does not hit missing keys.
        return ResearchResult(
            query=research_query,
            discovered_urls=[],
            crawled_content=[],
            synthesis="No relevant URLs found for the given query.",
            citations=[],
            metadata=_build_metadata([], [], [])
        )

    # Step 3: Crawl Selected URLs
    console.print("\n[bold cyan] Step 3: Content Crawling [/bold cyan]")
    crawled_content = await crawl_selected_urls(
        urls=discovered_urls,
        query=research_query,
        config=config
    )

    # Step 4: Generate synthesis
    console.print("\n [bold cyan] Step 4: Synthesis Generation [/bold cyan]")
    synthesis, citations = await generate_research_synthesis(
        query=research_query,
        crawled_content=crawled_content,
        config=config
    )

    # Create final result
    result = ResearchResult(
        query=research_query,
        discovered_urls=discovered_urls,
        crawled_content=crawled_content,
        synthesis=synthesis,
        citations=citations,
        metadata=_build_metadata(discovered_urls, crawled_content, citations)
    )

    # Display summary (reuses the duration stored in the result)
    console.print(Panel(
        f"[bold green] Research completed in {result.metadata['duration']} [/bold green]\n"
        f" Discovered: {len(discovered_urls)} URLs\n"
        f"Crawled: {len(crawled_content)} pages\n"
        f"Citations: {len(citations)} sources",
        title=" Pipeline Complete",
        border_style="green"
    ))
    return result

#Example: Run complete pipeline
result = await research_pipeline ("Champions League latest result", config)

python(76935) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


python(76950) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76953) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76954) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76964) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76965) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76966) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76969) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76970) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(76972) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


python(76973) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


python(76974) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x142defe30>


python(76975) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


## Step 7: Beautiful Output Formatting

A good research report needs clear presentation. Here we format our results into a professional report with executive summary, key findings, and proper citations. This makes the research actionable and easy to share.

In [43]:
def format_research_output(result: ResearchResult) -> None:
    """
    Print a formatted research report to the shared console.

    Sections, in order: header, query (plus the enhanced query when it
    differs), statistics, synthesis, cited sources (when any) and a table of
    the first five discovered URLs.

    Args:
        result: Pipeline output. Statistics missing from ``result.metadata``
            are shown as 0 / "N/A" instead of raising, so a result produced
            by an early pipeline exit can still be rendered.
    """

    def _clip(text: str, limit: int) -> str:
        """Shorten *text* to *limit* characters, adding "..." only when cut."""
        return text if len(text) <= limit else text[:limit] + "..."

    meta = result.metadata or {}

    # Header
    console.print("\n" + "=" * 60)
    console.print("[bold cyan] RESEARCH REPORT [/bold cyan]")
    console.print("=" * 60)

    # Query information
    console.print(f"\n[bold]Query: [/bold] {result.query.original_query}")
    if result.query.enhanced_query != result.query.original_query:
        console.print(f"[dim]Enhanced: {result.query.enhanced_query} [/dim]")

    # Statistics
    stats_table = Table(show_header=False, box=None)
    stats_table.add_column(style="cyan")
    stats_table.add_column()

    stats_table.add_row("URLs Discovered", str(meta.get('total_discovered', 0)))
    stats_table.add_row("Pages Crawled", str(meta.get('total_crawled', 0)))
    stats_table.add_row("Sources Cited", str(meta.get('total_cited', 0)))
    stats_table.add_row("Processing Time", str(meta.get('duration', 'N/A')))

    console.print("\n[bold] Statistics: [/bold]")
    console.print(stats_table)

    # Synthesis
    console.print("\n[bold] SYNTHESIS [/bold]")
    console.print("-" * 60)
    # The synthesis is free-form LLM text full of square brackets; print it
    # verbatim so nothing in it is interpreted as Rich markup.
    console.print(result.synthesis, markup=False)

    # Citations
    if result.citations:
        console.print("\n[bold] SOURCES [/bold]")
        console.print("-" * 60)
        for citation in result.citations:
            console.print(f"\n[{citation['source_id']}] [cyan]{citation['title']}[/cyan]")
            console.print(f"   [dim]{citation['url']}[/dim]")

    # Top discovered URLs
    console.print("\n[bold] TOP DISCOVERED URLS [/bold]")
    console.print("-" * 60)

    urls_table = Table()
    urls_table.add_column("Score", style="cyan")
    urls_table.add_column("Title")
    urls_table.add_column("URL", style="dim")

    for url_data in result.discovered_urls[:5]:
        score = f"{url_data.get('relevance_score', 0):.3f}"
        head = url_data.get('head_data') or {}
        title = _clip(head['title'], 40) if head.get('title') else "N/A"
        url = _clip(url_data['url'], 50)

        urls_table.add_row(score, title, url)

    console.print(urls_table)
    
# Render the report for the pipeline run above
format_research_output(result=result)