In [25]:
from langchain_core.tools import tool

import requests
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
import json


@tool
def scrape_dawn_articles(start_date=None, end_date=None):
    """
    Scrapes Dawn articles between given dates, extracts title and content,
    and prints them in JSON format. if no date is given, defaults to today.
    This function fetches the top 3 articles from the editorial section of Dawn
    newspaper for each day in the specified date range.
    It extracts the title and content of each article and returns them as a list of dictionaries.
    
    Args:
        start_date (str, optional): Format "YYYY-MM-DD". Defaults to today.
        end_date (str, optional): Format "YYYY-MM-DD". Defaults to today.
    """
    # Default to today if no dates are given
    if start_date is None:
        start_date = datetime.today().strftime("%Y-%m-%d")
    if end_date is None:
        end_date = datetime.today().strftime("%Y-%m-%d")

    def extract_article_text(url):
        try:
            response = requests.get(url)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to fetch the URL: {e}")
            return {"title": "", "content": "", "url": url}

        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.select_one("h1.story__title, h2.story__title")
        title = title_tag.get_text(strip=True) if title_tag else "No title found"
        content = "\n".join(p.get_text(strip=True) for p in soup.select(".story__content p"))
        return {"title": title, "content": content.strip(), "url": url}

    def get_top_urls(start_date, end_date):
        all_urls = []
        start = datetime.strptime(start_date, "%Y-%m-%d")
        end = datetime.strptime(end_date, "%Y-%m-%d")
        current = start

        while current <= end:
            date_str = current.strftime("%Y-%m-%d")
            page_url = f"https://www.dawn.com/newspaper/editorial/{date_str}"
            try:
                response = requests.get(page_url)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')
                count = 0
                for a_tag in soup.select("article.story a.story__link[href]"):
                    href = a_tag.get("href")
                    if href.startswith("https://www.dawn.com/news/") and href not in all_urls:
                        all_urls.append(href)
                        count += 1
                    if count == 3:
                        break
            except requests.RequestException as e:
                print(f"[{date_str}] Failed to fetch: {e}")
            current += timedelta(days=1)
        return all_urls

    # Fetch article URLs
    urls = get_top_urls(start_date, end_date)

    # Extract article data
    articles = [extract_article_text(url) for url in urls]

    # Print results as JSON
    # print(json.dumps(articles, indent=2, ensure_ascii=False))

    return articles

# Example usage:
# scrape_dawn_articles("2025-06-08", "2025-06-09")
# or just:

# scrape_dawn_articles()


In [54]:
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain.agents import create_tool_calling_agent, AgentExecutor


llm = ChatGoogleGenerativeAI(model="models/gemini-2.0-flash")

tools = [scrape_dawn_articles]

from langchain_core.prompts import ChatPromptTemplate

from langchain_core.prompts import ChatPromptTemplate

from langchain_core.prompts import ChatPromptTemplate

prompt = ChatPromptTemplate.from_messages([
    ("system", """You are a helpful assistant specialized in scraping and analyzing news articles from the Dawn newspaper editorial section.
Your primary function is to process user requests related to news articles and their content.

**Conditional Behavior for Article Processing:**
- **Default Action (when only dates are provided):** If the user's request consists *only* of a date or a date range (e.g., "articles for June 10", "news from 2025-06-05 to 2025-06-07") **AND contains NO other specific output instructions** (e.g., "summarize", "provide urls", "list headlines", "find key points", etc.), you MUST scrape the articles. After successfully scraping, for EACH scraped article, read its content and provide two distinct lists based on it, strictly following the format below:
    1. Words: List all good vocabulary words from the article, along with a concise meaning for each. Format each entry as "word: meaning", and place each complete entry on a new line. Do not use any bullet points, dashes, or other special formatting. 
    2. Phrases and Idioms: List different phrases and idioms found in the article, along with a concise meaning for each. Format each entry as "phrase: meaning", and place each complete entry on a new line. Do not use any bullet points, dashes, or other special formatting.

- **Specific Request Priority:** If the user provides *any* specific instruction or requests a particular type of information about the articles (e.g., "I just want the URLs", "summarize the articles", "find key points", "list headlines", "show topics"), you MUST prioritize and fulfill *only* that specific request. In these cases, the vocabulary and idiom lists should NOT be provided under any circumstances.

**Topic Restriction:**
- You can only answer questions or fulfill requests that are directly related to news articles or the news domain, specifically from the Dawn newspaper.
- For any unrelated topic or query, you MUST respond by stating that you can only assist with news-related queries from the Dawn newspaper.
"""),
    ("human", "{input}"),
    ("ai", "Use the tools to answer the question."),
    ("placeholder", "{agent_scratchpad}")
])

In [55]:
agent = create_tool_calling_agent(llm, tools, prompt)

In [56]:
agent_exec = AgentExecutor(agent=agent, tools=tools, verbose=True)

In [57]:
agent_exec.invoke({"input": "give me urls of articles from 4 to 6 june"})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `scrape_dawn_articles` with `{'end_date': '2024-06-06', 'start_date': '2024-06-04'}`
responded: 


[0m[36;1m[1;3m[{'title': 'Cipher acquittal', 'content': 'YESTERDAY afternoon, Imran Khan and Shah Mahmood Qureshistood acquittedof the charge of compromising state secrets.\n‘Traitors’ for a grand total of about four months and one week, the two have been vindicated by the Islamabad High Court of charges brought against them in the infamous ‘cipher case’, as it came to be known.\n\nWhile the detailed verdict is awaited, it is worth recalling that this was the most serious of the three convictions former prime minister Imran Khan had been slapped with inquick successionin the days leading up to the Feb 8 general election. Both Mr Khan and ex-foreign minister Mr Qureshi had been handed10 years’ rigorous punishmenteach for, in the eyes of the presiding judge, ‘leaking’ asecret cableand thereby compromising the safety,

{'input': 'give me urls of articles from 4 to 6 june',
 'output': 'Here are the URLs for the articles published in Dawn newspaper between June 4th and June 6th, 2024:\n\n*   https://www.dawn.com/news/1837680/cipher-acquittal\n*   https://www.dawn.com/news/1837679/china-sojourn\n*   https://www.dawn.com/news/1837678/measles-resurgence\n*   https://www.dawn.com/news/1837900/lower-inflation\n*   https://www.dawn.com/news/1837901/out-of-time\n*   https://www.dawn.com/news/1837902/a-sobering-election\n*   https://www.dawn.com/news/1838113/real-powers\n*   https://www.dawn.com/news/1838112/airline-safety\n*   https://www.dawn.com/news/1838111/violent-crime-wave\n'}