# All things extracted except date and time code

# In [None]:
import asyncio
import json
from typing import List, Dict
from crawl4ai import AsyncWebCrawler, CrawlResult
from bs4 import BeautifulSoup
from datetime import datetime
import re

# --------- CONFIG --------- #
INPUT_FILENAME = "filtered_news_articles.json"
OUTPUT_FILENAME = "scraped_article_details.json"

# --- CORE LOGIC --- #

def load_articles_from_json(filename: str) -> List[Dict]:
    """Load the list of article records from a JSON file.

    Args:
        filename: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The article dictionaries found in the file. An empty list is
        returned, after printing an explanation, when the file is missing,
        is not valid JSON, or does not hold a JSON array at the top level.
        Array entries that are not JSON objects are skipped with a warning.
    """
    try:
        with open(filename, 'r', encoding='utf-8') as f:
            data = json.load(f)
    except FileNotFoundError:
        print(f"[!] Error: Input file '{filename}' not found.")
        print("Please run the first script to generate it.")
        return []
    except json.JSONDecodeError:
        print(f"[!] Error: Could not read '{filename}'. Make sure it's a valid JSON file.")
        return []

    # Valid JSON is not necessarily a list of objects; callers iterate the
    # result and call .get() on each entry, so enforce that shape here.
    if not isinstance(data, list):
        print(
            f"[!] Error: Expected a JSON list of articles in '{filename}', "
            f"got {type(data).__name__}."
        )
        return []

    articles = [entry for entry in data if isinstance(entry, dict)]
    skipped = len(data) - len(articles)
    if skipped:
        print(f"[!] Warning: Skipped {skipped} non-object entries in '{filename}'.")
    return articles

# Matches an ISO-8601-style timestamp such as 2024-05-17T08:30:00.
# Compiled once at import time instead of on every call.
_ISO_TIMESTAMP_RE = re.compile(r'\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}')

# Whole-page text at or below this length is treated as "no meaningful content".
_MIN_FALLBACK_CONTENT_CHARS = 200


def parse_content_and_datetime(soup: BeautifulSoup) -> Dict:
    """Extract the publication timestamp and main text from a parsed page.

    Args:
        soup: Parsed HTML document of the article page.

    Returns:
        A dict with two string entries:

        * ``"scraped_datetime"`` - the non-empty ``datetime`` attribute of the
          first ``<time>`` tag that has one; otherwise the first ISO-style
          timestamp (``YYYY-MM-DDTHH:MM:SS``) found in the page text;
          otherwise ``"Not found"``.
        * ``"content"`` - the text of the first ``<article>`` element, one
          text fragment per line; if there is no such element or it is empty,
          the whole page text provided it is longer than 200 characters;
          otherwise ``"Content not found"``.
    """
    # 1. Find date and time.
    # Preferred source: a <time> tag carrying a 'datetime' attribute.
    datetime_str = "Not found"
    time_tag = soup.find('time', attrs={'datetime': True})
    attr_value = (time_tag.get('datetime') or '').strip() if time_tag else ''
    if attr_value:
        datetime_str = attr_value
    else:
        # Fallback (also used when the attribute is present but blank):
        # look for a full ISO timestamp anywhere in the page text.
        match = _ISO_TIMESTAMP_RE.search(soup.get_text())
        if match:
            datetime_str = match.group(0)

    # 2. Find main article content.
    # Preferred source: a standard <article> element.
    content = "Content not found"
    article_body = soup.find('article')
    article_text = (
        article_body.get_text(separator='\n', strip=True) if article_body else ''
    )
    if article_text:
        content = article_text
    else:
        # Fallback: all text on the page (can be noisy). Very short pages are
        # rejected because they are unlikely to contain a real article.
        all_text = soup.get_text(separator='\n', strip=True)
        if len(all_text) > _MIN_FALLBACK_CONTENT_CHARS:
            content = all_text

    return {
        "scraped_datetime": datetime_str,
        "content": content
    }


async def scrape_single_article(article_info: Dict, crawler: AsyncWebCrawler) -> Dict:
    """Scrape one article URL and merge the extracted details into its record.

    Args:
        article_info: Article record; its ``"url"`` entry is the page to crawl.
        crawler: An already-started crawler used to fetch the page.

    Returns:
        ``None`` when the record has no usable ``"url"``. Otherwise a new dict
        holding every entry of ``article_info`` plus ``"scraped_datetime"``
        and ``"content"``. Both are set to ``"Crawl Failed"`` when the crawl
        raises, reports failure, or yields no cleaned HTML.
    """
    url = article_info.get("url")
    if not url:
        return None

    failure_record = {**article_info, "scraped_datetime": "Crawl Failed", "content": "Crawl Failed"}

    print(f"-> Scraping {url}")
    try:
        result: CrawlResult = await crawler.arun(url)
    except Exception as exc:
        # Per-article boundary: an unexpected crawler error must not abort
        # the sibling tasks that the caller is gathering alongside this one.
        print(f"[!] Failed to crawl {url}: {exc!r}")
        return failure_record

    if not result.success or not result.cleaned_html:
        print(f"[!] Failed to crawl {url}")
        return failure_record

    soup = BeautifulSoup(result.cleaned_html, "html.parser")

    # Get the date, time, and content
    extracted_data = parse_content_and_datetime(soup)

    # Combine the original info with the newly scraped data
    return {**article_info, **extracted_data}


# --- MAIN FUNCTION --- #

async def main():
    """Orchestrate the scraping run.

    Loads the article list from ``INPUT_FILENAME``, scrapes every article
    concurrently with one shared crawler, and writes the collected records to
    ``OUTPUT_FILENAME`` as JSON. Returns without writing anything when no
    articles could be loaded.
    """
    articles_to_scrape = load_articles_from_json(INPUT_FILENAME)

    if not articles_to_scrape:
        return # Stop if there's nothing to do

    print(f"✅ Found {len(articles_to_scrape)} articles to scrape from '{INPUT_FILENAME}'.\n")

    # Use the crawler within a context manager
    async with AsyncWebCrawler() as crawler:
        tasks = [scrape_single_article(article, crawler) for article in articles_to_scrape]
        # return_exceptions=True: one task raising must not discard the
        # results of all the others.
        all_results = await asyncio.gather(*tasks, return_exceptions=True)

    # Keep every record that was produced. Records of failed crawls are kept
    # (marked "Crawl Failed"); only articles that were skipped (None, e.g. no
    # URL) are dropped.
    collected_results = []
    for article, res in zip(articles_to_scrape, all_results):
        if isinstance(res, BaseException):
            is_record = isinstance(article, dict)
            label = article.get("url") if is_record else article
            print(f"[!] Failed to crawl {label}: {res!r}")
            if is_record:
                # Same failure shape that scrape_single_article produces.
                collected_results.append(
                    {**article, "scraped_datetime": "Crawl Failed", "content": "Crawl Failed"}
                )
            continue
        if res is not None:
            collected_results.append(res)

    # Save the detailed results to a new JSON file
    with open(OUTPUT_FILENAME, 'w', encoding='utf-8') as f:
        json.dump(collected_results, f, indent=4, ensure_ascii=False)

    print(f"\n✅ Done! Saved details for {len(collected_results)} articles to '{OUTPUT_FILENAME}'.")


# --- RUNNER --- #
if __name__ == "__main__":
    # Build the top-level coroutine, then drive it to completion on a fresh
    # event loop.
    # NOTE(review): asyncio.run() raises RuntimeError when an event loop is
    # already running (e.g. inside a notebook cell); use `await main()` there.
    scrape_job = main()
    asyncio.run(scrape_job)