In [1]:
from agno.agent import Agent
from agno.models.google import Gemini
from agno.tools.crawl4ai import Crawl4aiTools
from pydantic import BaseModel
from typing import Optional

# Structured result for a single extracted article. Passed to the agent as
# `response_model`, and read back field-by-field from `result.content`.
# (Documented with comments rather than a docstring so the model's generated
# schema is left exactly as it was.)
class ArticleInfo(BaseModel):
    title: str  # article headline
    url: Optional[str]  # full article URL; annotated to allow None, no default declared
    topic_category: str  # category label; the prompt's example uses "Technology"

# Landing page the agent is told to crawl; also interpolated into the prompt below.
bbc_news_url = "https://www.bbc.com/news/technology"  # Change as needed

# System instructions for the single-article agent. This is an f-string: the
# `{bbc_news_url}` placeholder is substituted, while the doubled braces `{{ }}`
# around the JSON example are emitted as literal `{` / `}`.
# NOTE(review): the prompt asks the model to match raw HTML attributes
# (`data-testid`, `href`) and to "verify the URL is accessible" — confirm the
# crawl tool actually exposes that HTML and that the agent has a way to check links.
instructions = f"""
Your goal is to crawl the BBC news page at '{bbc_news_url}' and extract a recent, relevant article.  
Specifically, look for HTML anchor `<a>` tags where:
- the href attribute starts with `/news/articles/` or `/future/article/`
- the `data-testid` attribute equals `internal-link`

Extract the href value and construct the full article URL by prefixing `https://www.bbc.com`.  
Verify the URL is accessible and not broken.  
Return strictly a JSON object with:  
{{"title": "<Article Title>", "url": "<Full Article URL>", "topic_category": "Technology"}}  
Return only one valid and relevant article.  
Avoid homepage URLs or category landing pages.
"""

# Assemble the single-article crawler. Arguments are grouped by concern:
# identity and prompt, then model and tools, then output shape, then run-time switches.
crawl_agent = Agent(
    # Identity and prompt
    name="BBCNewsCrawlerAgent",
    instructions=instructions,
    # Model and the crawling tool it may call
    model=Gemini(id="gemini-2.0-flash"),
    tools=[Crawl4aiTools()],
    # Output shape: reply is requested as an ArticleInfo
    response_model=ArticleInfo,
    markdown=False,  # Keep output plain JSON for easier parsing
    # Run-time switches
    add_history_to_messages=False,
    show_tool_calls=True,
    debug_mode=True,
)

# Run the agent
result = crawl_agent.run(f"Crawl and extract a Technology article URL from '{bbc_news_url}'.")

# The prints below read ArticleInfo attributes. If the run did not produce the
# structured model (for example, the reply could not be parsed into it), stop
# with a message that shows what actually came back instead of failing with an
# opaque AttributeError on `.title`.
if not isinstance(result.content, ArticleInfo):
    raise TypeError(
        "Agent did not return an ArticleInfo; "
        f"got {type(result.content).__name__}: {result.content!r}"
    )

# Output
print("Extracted Article Title:", result.content.title)
print("Extracted Article URL:", result.content.url)
print("Topic Category:", result.content.topic_category)

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


Extracted Article Title: The AI that can write better jokes than humans
Extracted Article URL: https://www.bbc.com/future/article/20240503-the-ai-that-can-write-better-jokes-than-humans
Topic Category: Technology


In [4]:
from agno.agent import Agent
from agno.models.google import Gemini
from agno.tools.crawl4ai import Crawl4aiTools
from pydantic import BaseModel
from typing import List, Optional

# One extracted article card. Field names mirror the keys of the JSON example
# given in the prompt. (Comments are used instead of a docstring so the model's
# generated schema is left exactly as it was.)
class ArticleLink(BaseModel):
    h2_text: str  # headline text taken from the card's <h2>
    href_link: str  # href value as found on the <a> tag (may be relative)
    full_url: str  # href with 'https://www.bbc.com' prepended when it starts with '/'

# Top-level container used as the agent's `response_model`; wraps every
# ArticleLink returned for the page.
class ArticlesList(BaseModel):
    articles: List[ArticleLink]  # iterated and counted by the printing code after the run

# Landing page the agent is told to crawl; also interpolated into the prompt below.
bbc_news_url = "https://www.bbc.com/news/technology"

# System instructions for the multi-article extractor. This is an f-string:
# `{bbc_news_url}` is substituted, and the doubled braces `{{ }}` in the JSON
# example are emitted as literal braces.
# NOTE(review): the prompt depends on raw HTML attributes (`data-testid`, <h2>
# inside <a>). The recorded output lists hrefs such as `/news/technology-68973938`,
# which do not match the `/news/articles/...` shape in the example — confirm the
# crawl tool hands the model HTML rather than a converted text rendering.
instructions = f"""
Your goal is to crawl the BBC news page at '{bbc_news_url}' and extract article information from the specific HTML structure.

Look for this pattern in the HTML:
1. Find <div> elements with data-testid="anchor-inner-wrapper"
2. Within each of these divs, find <a> tags with data-testid="internal-link"
3. Extract the href attribute from these <a> tags
4. Find the associated <h2> tag within the same structure
5. Extract the text content from the <h2> tag

The HTML structure looks like:
<div data-testid="anchor-inner-wrapper">
  <a href="/news/articles/c0mlr09xeewo" data-testid="internal-link">
    <div ...>
      <h2 data-testid="card-headline">Article Title Here</h2>
    </div>
  </a>
</div>

For each article found:
- Extract the h2 text content (article headline)
- Extract the href from the <a> tag with data-testid="internal-link"
- If href starts with '/', prepend 'https://www.bbc.com' to create full URL

Return a JSON object:
{{
  "articles": [
    {{"h2_text": "Nasa astronaut Butch Wilmore retires after nine months in space", "href_link": "/news/articles/c0mlr09xeewo", "full_url": "https://www.bbc.com/news/articles/c0mlr09xeewo"}}
  ]
}}

Extract ALL articles with this structure from the page.
"""

# Assemble the multi-article extractor. Arguments are grouped by concern:
# identity and prompt, then model and tools, then output shape, then run-time switches.
crawl_agent = Agent(
    # Identity and prompt
    name="BBCStructuredExtractorAgent",
    instructions=instructions,
    # Model and the crawling tool it may call
    model=Gemini(id="gemini-2.0-flash"),
    tools=[Crawl4aiTools()],
    # Output shape: reply is requested as an ArticlesList, returned as plain text
    response_model=ArticlesList,
    markdown=False,
    # Run-time switches
    add_history_to_messages=False,
    show_tool_calls=True,
    debug_mode=True,
)

result = crawl_agent.run(f"Extract all articles with the specified HTML structure from '{bbc_news_url}'.")

# Everything below reads `result.content.articles`. If the run did not produce
# the structured model (for example, the reply could not be parsed into it),
# stop with a message that shows what actually came back instead of failing
# with an opaque AttributeError inside the loop.
if not isinstance(result.content, ArticlesList):
    raise TypeError(
        "Agent did not return an ArticlesList; "
        f"got {type(result.content).__name__}: {result.content!r}"
    )

print("=== EXTRACTED ARTICLES ===")
# Number the articles from 1 for display.
for i, article in enumerate(result.content.articles, 1):
    print(f"{i}. Headline: {article.h2_text}")
    print(f"   Relative URL: {article.href_link}")
    print(f"   Full URL: {article.full_url}")
    print("-" * 70)

print(f"\nTotal articles found: {len(result.content.articles)}")

# Test the first few URLs to see if they work
print("\n=== TESTING FIRST FEW URLs ===")
import requests
for i, article in enumerate(result.content.articles[:3], 1):
    try:
        # requests.head() does not follow redirects unless asked to, so a live
        # article that answers 301/302 would be reported as an error. Follow the
        # redirect chain and judge the final response instead.
        response = requests.head(article.full_url, timeout=5, allow_redirects=True)
        status = "✅ WORKING" if response.status_code == 200 else f"❌ ERROR ({response.status_code})"
        print(f"{i}. {status} - {article.full_url}")
    except requests.RequestException as e:
        # Network-level failures (timeout, DNS, malformed URL, ...) are reported
        # per URL so one bad link does not abort the remaining checks.
        print(f"{i}. ❌ ERROR - {article.full_url} - {str(e)}")

Both GOOGLE_API_KEY and GEMINI_API_KEY are set. Using GOOGLE_API_KEY.


=== EXTRACTED ARTICLES ===
1. Headline: Apple apologises after iPad ad sparks creativity debate
   Relative URL: /news/technology-68973938
   Full URL: https://www.bbc.com/news/technology-68973938
----------------------------------------------------------------------
2. Headline: Inside China's online influencer factories
   Relative URL: /news/technology-68784981
   Full URL: https://www.bbc.com/news/technology-68784981
----------------------------------------------------------------------
3. Headline: The risky business of deep sea mining
   Relative URL: /news/business-68679108
   Full URL: https://www.bbc.com/news/business-68679108
----------------------------------------------------------------------
4. Headline: Tech firms accused of profiting from child abuse images
   Relative URL: /news/technology-68889401
   Full URL: https://www.bbc.com/news/technology-68889401
----------------------------------------------------------------------
5. Headline: Inside a UK gigafactory race
  