In [None]:
# First, install required packages
!pip install fastapi uvicorn beautifulsoup4 requests playwright

# Install Playwright browsers
!playwright install

# Import necessary libraries
from fastapi import FastAPI
import uvicorn
import json
from threading import Thread
import requests
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio

# Enable nested asyncio for Colab
nest_asyncio.apply()

print("Web processing environment setup complete!")

# Create a simple FastAPI server with dynamic content
app = FastAPI()

@app.get("/")
async def read_root():
    """
    Serve the demo page whose content is filled in by JavaScript.

    The page initially shows "Loading..." in ``#content``; an inline script
    replaces that text with "Loaded Content" after a 2 second timer.

    Returns:
        An ``HTMLResponse`` so the browser receives ``text/html`` and actually
        executes the script. Returning a plain dict would make FastAPI
        serialise it as JSON, and the page would never render.
    """
    # Local import: keeps this fix self-contained within the function.
    from fastapi.responses import HTMLResponse

    return HTMLResponse(content="""
        <!DOCTYPE html>
        <html>
        <body>
            <div id="content">Loading...</div>
            <script>
                setTimeout(() => {
                    document.getElementById('content').innerText = 'Loaded Content';
                }, 2000);
            </script>
        </body>
        </html>
        """)

# Start server in a separate thread
def run_server():
    """Run the module-level ``app`` under uvicorn on 127.0.0.1:8000 (used as a thread target)."""
    bind_address = {"host": "127.0.0.1", "port": 8000}
    uvicorn.run(app, **bind_address)

server_thread = Thread(target=run_server, daemon=True)
server_thread.start()

class DynamicWebLoader:
    """
    Handles web pages that load content dynamically through JavaScript.

    Attributes:
        url: Address of the page to load.
        wait_time: Seconds to wait after navigation before reading the DOM.
    """
    def __init__(self, url: str, wait_time: int = 5):
        self.url = url
        self.wait_time = wait_time

    async def load_dynamic_content(self) -> str:
        """
        Load a webpage and wait for dynamic content to render.
        Uses Playwright to handle JavaScript execution.

        Returns:
            The page's HTML as reported by ``page.content()`` after waiting
            ``wait_time`` seconds.

        Raises:
            Whatever Playwright raises for launch/navigation failures; the
            browser is closed before the exception propagates.
        """
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            try:
                page = await browser.new_page()

                # Load the page, then give its scripts a fixed window to run
                await page.goto(self.url)
                await page.wait_for_timeout(self.wait_time * 1000)

                # Extract the rendered content
                return await page.content()
            finally:
                # Always release the browser, even if navigation failed
                await browser.close()

# Test function using asyncio
async def test_dynamic_loader():
    """
    Test the dynamic content loader.

    Loads the local demo page and reports whether the ``#content`` element
    ended up with the text the page's script writes into it. The element's
    text is checked rather than the raw HTML, because the phrase
    'Loaded Content' also appears inside the page's inline script source and
    would therefore match even if the script never ran.
    """
    loader = DynamicWebLoader("http://127.0.0.1:8000")
    content = await loader.load_dynamic_content()

    target = BeautifulSoup(content, 'html.parser').find(id='content')
    loaded = target is not None and target.get_text(strip=True) == 'Loaded Content'
    print("Dynamic content loaded:", loaded)

# Run the test
await test_dynamic_loader()

In [None]:
class StructuredHTMLProcessor:
    """
    Processes HTML documents while preserving their semantic structure.

    The markup is parsed once in the constructor with BeautifulSoup's
    ``html.parser``; every ``extract_*`` method reads from that parsed tree.
    """
    # Heading tag names, searched together so results keep document order.
    _HEADING_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']

    def __init__(self, html_content: str):
        self.soup = BeautifulSoup(html_content, 'html.parser')

    @staticmethod
    def _clean_text(node) -> str:
        """
        Return the node's text with whitespace runs collapsed to one space.

        ``get_text(strip=True)`` strips each text fragment and joins them with
        nothing in between, so "<h1>Main</h1> <p>Intro</p>" becomes
        "MainIntro". Splitting and re-joining keeps word boundaries.
        """
        return ' '.join(node.get_text().split())

    def extract_main_article(self) -> dict:
        """
        Extract the main content from article tags or main content area.

        Returns:
            ``{}`` when the document has neither ``<article>`` nor ``<main>``;
            otherwise a dict with ``content`` (whitespace-normalised text),
            ``has_article_tag`` and ``word_count``.
        """
        article = self.soup.find('article') or self.soup.find('main')
        if article is None:
            return {}

        words = article.get_text().split()
        return {
            'content': ' '.join(words),
            # An <article> always wins the lookup above, so the tag name tells
            # us whether one exists without searching the tree again.
            'has_article_tag': article.name == 'article',
            'word_count': len(words)
        }

    def extract_heading_hierarchy(self) -> list:
        """
        Extract headings while preserving their hierarchical structure.

        Headings are returned in document order (not grouped by level), so a
        consumer can rebuild the outline from the ``level`` values.
        """
        return [
            {
                'level': int(heading.name[1]),
                'text': self._clean_text(heading),
                'id': heading.get('id', ''),
                'has_links': heading.find('a') is not None
            }
            for heading in self.soup.find_all(self._HEADING_TAGS)
        ]

    def extract_lists(self) -> dict:
        """
        Extract ordered, unordered and definition lists.

        Returns:
            A dict with keys ``ordered``, ``unordered`` and ``definition``.
            The first two hold one list of item texts per ``<ol>``/``<ul>``;
            ``definition`` holds one list of ``{'term', 'definition'}`` dicts
            per ``<dl>``. Items are collected from all descendants, so the
            items of a nested list also appear in its parent list's entry.
        """
        lists = {
            'ordered': [],
            'unordered': [],
            'definition': []
        }

        # Process ordered lists
        for ol in self.soup.find_all('ol'):
            lists['ordered'].append(
                [self._clean_text(item) for item in ol.find_all('li')]
            )

        # Process unordered lists
        for ul in self.soup.find_all('ul'):
            lists['unordered'].append(
                [self._clean_text(item) for item in ul.find_all('li')]
            )

        # Process definition lists; terms and definitions are paired by position
        for dl in self.soup.find_all('dl'):
            lists['definition'].append([
                {
                    'term': self._clean_text(dt),
                    'definition': self._clean_text(dd)
                }
                for dt, dd in zip(dl.find_all('dt'), dl.find_all('dd'))
            ])

        return lists

    def extract_metadata(self) -> dict:
        """
        Extract metadata from meta tags and other sources.

        Returns:
            A dict with ``title`` (always a string, '' when absent or empty),
            ``meta`` (name/property -> content) and ``links`` (one dict per
            ``<a>`` with its text, ``href`` and ``title``).
        """
        title_tag = self.soup.title
        metadata = {
            # .string is None for an empty or multi-node <title>; get_text()
            # always yields a string.
            'title': title_tag.get_text() if title_tag is not None else '',
            'meta': {},
            'links': []
        }

        # Extract meta tags
        for meta in self.soup.find_all('meta'):
            name = meta.get('name', meta.get('property', ''))
            if name:
                metadata['meta'][name] = meta.get('content', '')

        # Extract important links
        for link in self.soup.find_all('a'):
            metadata['links'].append({
                'text': self._clean_text(link),
                'href': link.get('href', ''),
                'title': link.get('title', '')
            })

        return metadata

# Let's test our HTML processor with a sample document
def test_html_processor():
    """Run every StructuredHTMLProcessor extractor on a sample page and print the results as JSON."""

    document = """
    <!DOCTYPE html>
    <html>
    <head>
        <title>Sample Document</title>
        <meta name="description" content="A test document">
        <meta name="keywords" content="test, sample, document">
    </head>
    <body>
        <article>
            <h1>Main Title</h1>
            <p>This is the introduction.</p>

            <h2>First Section</h2>
            <ul>
                <li>First point</li>
                <li>Second point</li>
            </ul>

            <h2>Second Section</h2>
            <ol>
                <li>Step one</li>
                <li>Step two</li>
            </ol>

            <dl>
                <dt>Term 1</dt>
                <dd>Definition 1</dd>
                <dt>Term 2</dt>
                <dd>Definition 2</dd>
            </dl>
        </article>
    </body>
    </html>
    """

    parser = StructuredHTMLProcessor(document)

    # (heading, extractor) pairs, listed in the order the report is printed.
    report_sections = (
        ("Article Content:", parser.extract_main_article),
        ("\nHeading Hierarchy:", parser.extract_heading_hierarchy),
        ("\nLists:", parser.extract_lists),
        ("\nMetadata:", parser.extract_metadata),
    )
    for heading, extract in report_sections:
        print(heading)
        print(json.dumps(extract(), indent=2))

# Run the test
test_html_processor()

In [None]:
from typing import List, Dict, Any
import json
from bs4 import BeautifulSoup

class WebEmbeddedJSONProcessor:
    """
    Handles JSON data embedded in or loaded by web pages.

    The constructor parses the HTML once; ``extract_json_ld`` reads from that
    tree, while the ``*_api_*`` methods work on JSON text/objects passed in.
    """
    def __init__(self, html_content: str):
        self.soup = BeautifulSoup(html_content, 'html.parser')

    def extract_json_ld(self) -> List[dict]:
        """
        Extract JSON-LD metadata from HTML content.

        Returns:
            The decoded payload of every ``<script type="application/ld+json">``
            tag, in document order. Tags that are empty or contain invalid
            JSON are skipped.
        """
        results = []

        for tag in self.soup.find_all('script', type='application/ld+json'):
            raw = tag.string
            if raw is None:
                # No single text child (e.g. an empty tag): json.loads(None)
                # would raise TypeError, which the handler below does not catch.
                continue
            try:
                results.append(json.loads(raw))
            except json.JSONDecodeError:
                continue

        return results

    def process_api_response(self, response_text: str) -> dict:
        """
        Process JSON from API responses.
        Handles JSON that's dynamically loaded into the page.

        Returns:
            ``{'data': ..., 'metadata': ...}`` on success, or
            ``{'error': 'Invalid JSON in API response'}`` when the text is not
            valid JSON or is not text at all (e.g. ``None``).
        """
        try:
            data = json.loads(response_text)
        except (json.JSONDecodeError, TypeError):
            return {'error': 'Invalid JSON in API response'}

        return {
            'data': self.normalize_api_data(data),
            'metadata': self.extract_api_metadata(data)
        }

    def normalize_api_data(self, data: Any) -> Any:
        """
        Normalize API response data for consistent processing.

        Recursively rebuilds dicts and lists, returning every other value
        unchanged; the result is a structural copy of the input.
        """
        if isinstance(data, dict):
            return {
                key: self.normalize_api_data(value)
                for key, value in data.items()
            }
        if isinstance(data, list):
            return [self.normalize_api_data(item) for item in data]
        return data

    def extract_api_metadata(self, data: dict) -> dict:
        """
        Extract metadata from API response.

        Each field is read from the top level of the response first; if it is
        absent there, a nested ``metadata`` object is consulted. Responses
        that are not dicts yield ``{}``.

        Returns:
            A dict with ``total_items``, ``page``, ``has_more`` (default
            ``False``) and ``timestamp``; missing values are ``None``.
        """
        if not isinstance(data, dict):
            return {}

        nested = data.get('metadata')
        if not isinstance(nested, dict):
            nested = {}

        def lookup(key: str, default: Any = None) -> Any:
            # Top-level keys win so flat responses behave exactly as before.
            if key in data:
                return data[key]
            return nested.get(key, default)

        # Extract common metadata fields
        return {
            'total_items': lookup('total'),
            'page': lookup('page'),
            'has_more': lookup('has_more', False),
            'timestamp': lookup('timestamp')
        }

def test_json_processor():
    """Exercise WebEmbeddedJSONProcessor on an embedded JSON-LD block and a sample API payload."""

    # Page carrying a single JSON-LD script in its <head>
    page_markup = """
    <html>
    <head>
        <script type="application/ld+json">
        {
            "@context": "https://schema.org",
            "@type": "Article",
            "headline": "Understanding RAG Systems",
            "author": {
                "@type": "Person",
                "name": "John Doe"
            }
        }
        </script>
    </head>
    <body>
        <div id="content">Main content here</div>
    </body>
    </html>
    """

    json_processor = WebEmbeddedJSONProcessor(page_markup)

    # Part 1: JSON-LD extraction
    print("Extracted JSON-LD:")
    print(json.dumps(json_processor.extract_json_ld(), indent=2))

    # Part 2: API response processing (payload is serialised before being handed over)
    product_rows = [
        {"id": 1, "name": "Product A", "price": 29.99},
        {"id": 2, "name": "Product B", "price": 39.99}
    ]
    paging_info = {"total": 2, "page": 1}
    payload_text = json.dumps({"products": product_rows, "metadata": paging_info})

    processed = json_processor.process_api_response(payload_text)
    print("\nProcessed API Response:")
    print(json.dumps(processed, indent=2))

# Run the test
test_json_processor()

**Bringing It All Together**

In [8]:
class ComprehensiveWebProcessor:
    """
    A unified processor that handles dynamic content, HTML structure,
    and embedded JSON in web pages.

    Attributes:
        url: Page to process.
        wait_time: Seconds the loader waits for scripts to run after navigation.
        raw_html: Rendered HTML from the most recent ``process_page`` call
            (``None`` until then).
        processed_content: Result of the most recent ``process_page`` call.
    """
    def __init__(self, url: str, wait_time: int = 5):
        self.url = url
        self.wait_time = wait_time
        self.raw_html = None
        self.processed_content = {}

    async def process_page(self) -> dict:
        """
        Process a webpage combining all our processing capabilities.

        Returns:
            A dict with ``structural_content`` (main content, headings, lists,
            metadata), ``embedded_json`` (JSON-LD payloads) and ``metadata``
            (url, configured wait time, and whether main content was found).
            The same dict is stored on ``self.processed_content``.
        """
        # 1. Load dynamic content, honouring the configured wait time
        #    (previously the loader silently fell back to its own default)
        dynamic_loader = DynamicWebLoader(self.url, wait_time=self.wait_time)
        self.raw_html = await dynamic_loader.load_dynamic_content()

        # 2. Process HTML structure
        html_processor = StructuredHTMLProcessor(self.raw_html)
        html_structure = {
            'main_content': html_processor.extract_main_article(),
            'headings': html_processor.extract_heading_hierarchy(),
            'lists': html_processor.extract_lists(),
            'metadata': html_processor.extract_metadata()
        }

        # 3. Handle embedded JSON
        json_processor = WebEmbeddedJSONProcessor(self.raw_html)
        json_ld = json_processor.extract_json_ld()

        # 4. Combine all information
        self.processed_content = {
            'structural_content': html_structure,
            'embedded_json': json_ld,
            'metadata': {
                'url': self.url,
                # Reports the configured wait, not a measured duration
                'processing_time': self.wait_time,
                # True when an <article>/<main> region was found in the page
                'has_dynamic_content': bool(html_structure.get('main_content'))
            }
        }

        return self.processed_content

In [None]:
async def test_comprehensive_processor():
    """
    Test the comprehensive web content processor.

    Starts a second local server with a richer page (JSON-LD, an article,
    a list and a script-filled ``#dynamic-content`` element), runs
    ``ComprehensiveWebProcessor`` against it and prints each result section.

    The server listens on port 8001 because port 8000 is already held by the
    server started earlier in this file; a second bind to 8000 fails inside
    the thread and the test would silently hit the old page instead. The page
    is returned as an ``HTMLResponse`` so the browser renders it rather than
    receiving a JSON document.
    """
    # Local import: keeps this fix self-contained within the function.
    from fastapi.responses import HTMLResponse

    test_port = 8001

    # Start our test server with a complex page
    app = FastAPI()

    @app.get("/")
    async def read_root():
        return HTMLResponse(content="""
            <!DOCTYPE html>
            <html>
            <head>
                <title>Test Complex Page</title>
                <script type="application/ld+json">
                {
                    "@context": "https://schema.org",
                    "@type": "Article",
                    "headline": "Test Article"
                }
                </script>
            </head>
            <body>
                <article>
                    <h1>Main Content</h1>
                    <p>Static content here</p>
                    <div id="dynamic-content">Loading...</div>
                    <script>
                        setTimeout(() => {
                            document.getElementById('dynamic-content').innerText =
                                'Dynamically Loaded Content';
                        }, 1000);
                    </script>
                    <ul>
                        <li>First item</li>
                        <li>Second item</li>
                    </ul>
                </article>
            </body>
            </html>
            """)

    # Start server in a daemon thread so it does not block interpreter exit
    server_thread = Thread(
        target=lambda: uvicorn.run(app, host="127.0.0.1", port=test_port),
        daemon=True,
    )
    server_thread.start()

    # Allow server to start
    await asyncio.sleep(1)

    # Test the processor
    processor = ComprehensiveWebProcessor(f"http://127.0.0.1:{test_port}")
    content = await processor.process_page()

    print("Comprehensive Processing Results:")
    print("\n1. Structural Content:")
    print(json.dumps(content['structural_content'], indent=2))

    print("\n2. Embedded JSON:")
    print(json.dumps(content['embedded_json'], indent=2))

    print("\n3. Metadata:")
    print(json.dumps(content['metadata'], indent=2))

# Run the comprehensive test
await test_comprehensive_processor()

**Best Practices and Common Challenges**

In [16]:
import time
import asyncio
from typing import Dict, Any

class DynamicContentBestPractices:
    """Demonstrate best practices for dynamic content timing"""

    def __init__(self, url: str):
        self.url = url

    async def wait_for_content(self, selector: str, timeout: int = 30) -> bool:
        """
        Wait for specific content to appear using smart timing.

        Navigation and the selector wait share one time budget, so the call
        as a whole is bounded by ``timeout`` instead of up to twice that.

        Args:
            selector: Selector of the element to wait for.
            timeout: Total budget in seconds. ``0`` is passed through to
                Playwright unchanged, which treats it as "no timeout".

        Returns:
            True if the selector matched within the budget; False on a
            Playwright timeout or any other Playwright error (a message is
            printed in both cases).
        """
        # Local import: the rest of the file only imports async_playwright.
        from playwright.async_api import Error as PlaywrightError
        from playwright.async_api import TimeoutError as PlaywrightTimeoutError

        budget_ms = timeout * 1000

        async with async_playwright() as p:
            browser = await p.chromium.launch()

            try:
                page = await browser.new_page()

                # Navigate with timeout
                started = time.monotonic()
                await page.goto(self.url, timeout=budget_ms)

                if budget_ms > 0:
                    # Give the selector wait only what navigation left over.
                    remaining_ms = budget_ms - (time.monotonic() - started) * 1000
                    if remaining_ms <= 0:
                        print("Timeout waiting for content: navigation used the whole time budget")
                        return False
                else:
                    remaining_ms = budget_ms

                # Wait for specific content
                await page.wait_for_selector(selector, timeout=remaining_ms)
                return True

            except PlaywrightTimeoutError as e:
                print(f"Timeout waiting for content: {str(e)}")
                return False

            except PlaywrightError as e:
                # Navigation/connection failures are not timeouts; say so.
                print(f"Error loading content: {str(e)}")
                return False

            finally:
                await browser.close()

In [17]:
class WebContentErrorHandler:
    """Handle common web content processing errors"""

    @staticmethod
    async def process_with_retry(processor: "ComprehensiveWebProcessor", max_retries: int = 3):
        """
        Process content with automatic retry on failure.

        Args:
            processor: Object exposing an awaitable ``process_page()``.
            max_retries: Total number of attempts; must be at least 1.

        Returns:
            Whatever ``processor.process_page()`` returns on the first
            successful attempt.

        Raises:
            ValueError: If ``max_retries`` is less than 1 (otherwise the loop
                would never run and the call would silently return ``None``).
            Exception: The error from the final attempt, re-raised unchanged.
        """
        if max_retries < 1:
            raise ValueError("max_retries must be at least 1")

        for attempt in range(max_retries):
            try:
                return await processor.process_page()
            except Exception:
                if attempt == max_retries - 1:
                    # Bare raise keeps the original traceback intact
                    raise
                print(f"Attempt {attempt + 1} failed, retrying...")
                await asyncio.sleep(2 ** attempt)  # Exponential backoff

In [18]:
class ContentValidator:
    """Validate processed web content"""

    @staticmethod
    def validate_processed_content(content: dict) -> dict:
        """
        Inspect a processed-content dict and summarise what is missing.

        Missing main content or JSON-LD only produce warnings; a missing URL
        in the metadata is an issue and marks the content invalid.

        Returns:
            A dict with ``is_valid``, ``issues`` and ``warnings``.
        """
        problems = []
        notices = []

        # Structural content
        structure = content.get('structural_content', {})
        if not structure.get('main_content'):
            notices.append('No main content found')

        # JSON-LD
        if not content.get('embedded_json'):
            notices.append('No JSON-LD data found')

        # Metadata
        page_meta = content.get('metadata', {})
        if not page_meta.get('url'):
            problems.append('Missing URL in metadata')

        # Only issues (never warnings) invalidate the content
        return {
            'is_valid': not problems,
            'issues': problems,
            'warnings': notices
        }

In [19]:
class PerformanceMonitor:
    """
    Monitor web content processing performance.

    Attributes:
        metrics: One dict per ``measure_processing_time`` call, in call order.
    """

    def __init__(self):
        self.metrics = []

    async def measure_processing_time(self, processor: "ComprehensiveWebProcessor") -> dict:
        """
        Time one ``processor.process_page()`` call and record the result.

        Args:
            processor: Object exposing an awaitable ``process_page()``.

        Returns:
            A dict with ``total_processing_time`` (seconds), ``content_size``
            (length of ``str(content)``) and ``timestamp`` (epoch seconds at
            which the measurement finished). The dict is also appended to
            ``self.metrics``.
        """
        # perf_counter is monotonic, so the duration cannot be skewed (or go
        # negative) if the system clock is adjusted mid-measurement.
        started = time.perf_counter()
        content = await processor.process_page()
        elapsed = time.perf_counter() - started

        metrics = {
            'total_processing_time': elapsed,
            'content_size': len(str(content)),
            # Wall-clock time is still the right thing for a timestamp
            'timestamp': time.time(),
        }

        self.metrics.append(metrics)
        return metrics

In [None]:
async def demonstrate_best_practices(
    url: str = "http://127.0.0.1:8000",
    selector: str = "#dynamic-content",
):
    """
    Show how to implement best practices.

    Runs four steps against one page and prints the outcome of each:
    waiting for a selector, processing with retry, validating the processed
    content, and measuring processing time.

    Args:
        url: Page to run the demonstration against. Defaults to the address
            that was previously hard-coded here.
        selector: Element to wait for in step 1. Defaults to the selector
            that was previously hard-coded here.
    """
    # 1. Test dynamic content timing
    timing = DynamicContentBestPractices(url)
    content_loaded = await timing.wait_for_content(selector)
    print(f"Dynamic content loaded: {content_loaded}")

    # 2. Test error handling
    processor = ComprehensiveWebProcessor(url)
    error_handler = WebContentErrorHandler()
    content = await error_handler.process_with_retry(processor)

    # 3. Validate content
    validator = ContentValidator()
    validation_results = validator.validate_processed_content(content)
    print("\nValidation Results:")
    print(json.dumps(validation_results, indent=2))

    # 4. Monitor performance (this processes the page a second time)
    monitor = PerformanceMonitor()
    metrics = await monitor.measure_processing_time(processor)
    print("\nPerformance Metrics:")
    print(json.dumps(metrics, indent=2))

# Run the demonstration
await demonstrate_best_practices()