In [None]:
# -*- coding: utf-8 -*-
"""
# Web Content Processing in RAG Systems - Part 1: Setup and Examples

This notebook is Part 1 of our web content processing series, accompanying Chapter 4
of "Mastering Retrieval Augmented Generation". We'll establish our development
environment and create example web content that demonstrates various scenarios
you'll encounter in real-world applications.

## What's in this Series
1. Part 1 (Current): Setup and Web Content Examples
2. Part 2: Processing Static HTML Content
3. Part 3: Handling Dynamic Web Content
4. Part 4: Working with Web APIs and JSON
5. Part 5: Performance Optimization and Error Handling

Let's begin by setting up our environment with the necessary tools for web content processing.
"""

# First, let's install all necessary packages
!pip install beautifulsoup4 requests html5lib selenium webdriver_manager
!pip install playwright
!playwright install chromium

import os
import json
import requests
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional
import logging
from pathlib import Path
from datetime import datetime
import asyncio
from playwright.async_api import async_playwright
import re

# Set up logging for better visibility into our operations
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

"""## Understanding Our Tools

Before we create sample content, let's understand the key libraries we'll be using:

1. BeautifulSoup4: For parsing and navigating HTML/XML content
2. Playwright: For handling dynamic web content and JavaScript
3. Requests: For making HTTP requests and handling static content
4. html5lib: For robust HTML parsing

Each tool serves a specific purpose in our web content processing toolkit.
"""

# Create a directory for our sample content
!mkdir -p rag_web_samples

"""## Creating Sample Web Content

We'll create various types of web content that demonstrate different scenarios
you'll encounter in real-world RAG systems. This includes static HTML, dynamic
content, and different structural patterns.
"""

def create_static_html_sample():
    """
    Creates a static HTML file demonstrating common web content patterns:
    - Article content with metadata
    - Navigation structures
    - Lists and tables
    - Semantic HTML elements

    The file is written to rag_web_samples/static_article.html using UTF-8 so
    the bytes on disk agree with the page's <meta charset="UTF-8"> declaration
    regardless of the platform's default encoding. The output directory is
    created when it does not exist yet. The copyright sign in the footer is
    emitted as the &copy; entity so the markup stays pure ASCII.
    """
    static_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="description" content="Sample article about AI technology">
    <meta name="keywords" content="AI, Machine Learning, Technology">
    <title>Understanding AI Technologies</title>
</head>
<body>
    <header>
        <nav>
            <ul>
                <li><a href="#intro">Introduction</a></li>
                <li><a href="#main">Main Content</a></li>
                <li><a href="#conclusion">Conclusion</a></li>
            </ul>
        </nav>
    </header>

    <main>
        <article>
            <h1>Understanding AI Technologies</h1>
            <div class="metadata">
                <p>Author: Jane Smith</p>
                <p>Published: 2025-02-08</p>
                <p>Category: Technology</p>
            </div>

            <section id="intro">
                <h2>Introduction</h2>
                <p>Artificial Intelligence has transformed various industries...</p>
            </section>

            <section id="main">
                <h2>Key AI Technologies</h2>
                <table>
                    <thead>
                        <tr>
                            <th>Technology</th>
                            <th>Description</th>
                            <th>Use Cases</th>
                        </tr>
                    </thead>
                    <tbody>
                        <tr>
                            <td>Machine Learning</td>
                            <td>Systems that learn from data</td>
                            <td>Prediction, Classification</td>
                        </tr>
                        <tr>
                            <td>Natural Language Processing</td>
                            <td>Processing human language</td>
                            <td>Translation, Chatbots</td>
                        </tr>
                    </tbody>
                </table>
            </section>

            <section id="conclusion">
                <h2>Conclusion</h2>
                <p>The future of AI technology looks promising...</p>
            </section>
        </article>
    </main>

    <footer>
        <p>&copy; 2025 AI Technology Review</p>
    </footer>
</body>
</html>
""".strip()

    output_path = Path('rag_web_samples') / 'static_article.html'
    # Do not rely on a separate shell step having created the directory
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(static_html)
    logger.info("Created static HTML sample")

def create_dynamic_html_sample():
    """
    Creates an HTML file with dynamic content loaded via JavaScript:
    - Async data loading
    - Interactive elements
    - Dynamic updates

    The file is written to rag_web_samples/dynamic_dashboard.html using UTF-8
    so the bytes on disk agree with the page's <meta charset="UTF-8">
    declaration. The output directory is created when it does not exist yet.
    """
    dynamic_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Dynamic Content Example</title>
</head>
<body>
    <div id="app">
        <h1>Real-time Data Dashboard</h1>

        <!-- Dynamically loaded content -->
        <div id="data-container">
            Loading data...
        </div>

        <!-- Interactive elements -->
        <div class="controls">
            <button onclick="loadData()">Refresh Data</button>
        </div>
    </div>

    <script>
        // Simulate dynamic data loading
        async function loadData() {
            const container = document.getElementById('data-container');
            container.innerHTML = 'Loading...';

            // Simulate API call
            const data = {
                timestamp: new Date().toISOString(),
                metrics: {
                    users: Math.floor(Math.random() * 1000),
                    transactions: Math.floor(Math.random() * 500),
                    revenue: Math.floor(Math.random() * 10000)
                }
            };

            // Update display
            container.innerHTML = `
                <div class="metrics">
                    <p>Last Updated: ${data.timestamp}</p>
                    <ul>
                        <li>Active Users: ${data.metrics.users}</li>
                        <li>Transactions: ${data.metrics.transactions}</li>
                        <li>Revenue: $${data.metrics.revenue}</li>
                    </ul>
                </div>
            `;
        }

        // Initial load
        document.addEventListener('DOMContentLoaded', loadData);
    </script>
</body>
</html>
""".strip()

    output_path = Path('rag_web_samples') / 'dynamic_dashboard.html'
    # Do not rely on a separate shell step having created the directory
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(dynamic_html)
    logger.info("Created dynamic HTML sample")

def create_embedded_json_sample():
    """
    Creates an HTML file with embedded JSON data:
    - JSON-LD structured data
    - Application state
    - Configuration data

    The file is written to rag_web_samples/embedded_json.html using UTF-8 so
    the bytes on disk agree with the page's <meta charset="UTF-8">
    declaration. The output directory is created when it does not exist yet.
    """
    embedded_json_html = """
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Product Catalog</title>

    <!-- JSON-LD structured data -->
    <script type="application/ld+json">
    {
        "@context": "https://schema.org",
        "@type": "Product",
        "name": "Smart Home Hub",
        "description": "Central control for your smart home devices",
        "brand": {
            "@type": "Brand",
            "name": "TechHome"
        },
        "offers": {
            "@type": "Offer",
            "price": "199.99",
            "priceCurrency": "USD"
        }
    }
    </script>
</head>
<body>
    <div id="product-catalog">
        <h1>Product Catalog</h1>

        <!-- Product data will be loaded here -->
        <div id="products"></div>
    </div>

    <!-- Embedded application data -->
    <script>
        const appConfig = {
            apiEndpoint: "/api/products",
            updateInterval: 300,
            features: {
                realTimePricing: true,
                inventoryTracking: true
            }
        };

        const initialState = {
            products: [
                {
                    id: "SHH-001",
                    name: "Smart Home Hub",
                    price: 199.99,
                    stock: 45
                },
                {
                    id: "SSB-002",
                    name: "Smart Security Bundle",
                    price: 299.99,
                    stock: 30
                }
            ],
            lastUpdate: "2025-02-08T10:00:00Z"
        };
    </script>
</body>
</html>
""".strip()

    output_path = Path('rag_web_samples') / 'embedded_json.html'
    # Do not rely on a separate shell step having created the directory
    output_path.parent.mkdir(parents=True, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(embedded_json_html)
    logger.info("Created embedded JSON sample")

# Create all our sample files
create_static_html_sample()
create_dynamic_html_sample()
create_embedded_json_sample()

"""## Understanding Our Sample Content

Let's examine the sample files we've created and understand their characteristics:

1. static_article.html:
   - Demonstrates semantic HTML structure
   - Contains metadata in various forms
   - Includes tables and lists
   - Uses proper HTML5 sectioning

2. dynamic_dashboard.html:
   - Shows client-side data loading
   - Includes interactive elements
   - Demonstrates state management
   - Uses asynchronous operations

3. embedded_json.html:
   - Contains structured JSON-LD data
   - Demonstrates application configuration
   - Shows state management patterns
   - Includes multiple JSON formats

These samples will help us explore different aspects of web content processing
in the following notebooks.

## Verifying Our Samples

Let's verify our sample files and examine their characteristics:
"""

def analyze_samples():
    """Analyze and display information about our sample files.

    Parses every ``*.html`` file found in ``rag_web_samples`` with html5lib
    and prints, per file: its size in bytes, the number of <script> tags, the
    number of JSON-LD script blocks and the text of the first <h1> element
    (or 'None' when the document has no <h1>).
    """
    sample_dir = Path('rag_web_samples')

    print("Sample Files Analysis:")
    print("-" * 50)

    # glob() order depends on the filesystem; sort for a reproducible report
    for file_path in sorted(sample_dir.glob('*.html')):
        size = file_path.stat().st_size

        # The samples declare charset UTF-8, so decode them explicitly instead
        # of relying on the platform's default encoding
        content = file_path.read_text(encoding='utf-8')
        soup = BeautifulSoup(content, 'html5lib')

        # Analyze content
        scripts = len(soup.find_all('script'))
        json_ld = len(soup.find_all('script', {'type': 'application/ld+json'}))
        main_heading = soup.find('h1')  # looked up once, reused below

        print(f"\nFile: {file_path.name}")
        print(f"Size: {size} bytes")
        print(f"Scripts: {scripts}")
        print(f"JSON-LD blocks: {json_ld}")
        print(f"Main heading: {main_heading.text if main_heading else 'None'}")

# Run the analysis
analyze_samples()

"""## What's Next?

In Part 2, we'll begin implementing processors for static HTML content, building
upon these samples to create robust content extraction capabilities. We'll focus on:

1. HTML parsing and navigation
2. Content extraction strategies
3. Metadata handling
4. Structure preservation

The sample files we've created here will serve as our test cases throughout
the series, helping us validate our implementations against real-world scenarios."""

In [None]:
# -*- coding: utf-8 -*-
"""
# Web Content Processing in RAG Systems - Part 2: Static HTML Processing

This notebook focuses on processing static HTML content effectively in RAG systems.
We'll build a robust HTML processor that can extract meaningful content while
preserving important structural relationships and metadata.

Make sure you've run Part 1 first to create the sample files we'll use here.
"""

# First, let's ensure we have all necessary packages
!pip install beautifulsoup4 html5lib lxml pandas

import os
import json
from bs4 import BeautifulSoup
from typing import List, Dict, Any, Optional
import logging
from pathlib import Path
from datetime import datetime
import pandas as pd
import re

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

"""## Understanding HTML Processing Challenges

Before we dive into implementation, let's understand the key challenges in
processing HTML content for RAG systems:

1. Structure Preservation: We need to maintain the hierarchical relationships
   between different parts of the content.

2. Content Classification: Different parts of an HTML page serve different
   purposes (navigation, main content, sidebars, etc.).

3. Metadata Extraction: HTML pages often contain rich metadata in various
   forms (meta tags, JSON-LD, Open Graph tags).

4. Text Cleaning: HTML content often needs cleaning to remove boilerplate,
   advertisements, and irrelevant content.

Our implementation will address each of these challenges systematically.
"""

class HTMLProcessor:
    """
    A comprehensive HTML processor designed for RAG systems.
    Extracts and structures content while preserving important relationships
    and metadata.

    Typical use is ``HTMLProcessor(html).process()``; the individual steps
    (``extract_metadata``, ``extract_main_content``, ``clean_content``) can
    also be called separately.
    """

    # Phrases stripped from text blocks by clean_content()
    BOILERPLATE_PHRASES = (
        'Share this article',
        'Follow us',
        'Advertisement',
        'Subscribe to our newsletter',
    )

    # Meta keys that carry publication / modification dates. They are looked
    # up in both the `name` and the `property` attribute of <meta> tags.
    PUBLISHED_KEYS = ('published_time', 'article:published_time')
    MODIFIED_KEYS = ('modified_time', 'article:modified_time')

    HEADING_TAGS = ('h1', 'h2', 'h3', 'h4', 'h5', 'h6')

    def __init__(self, html_content: str):
        """
        Initialize the HTML processor with content.

        Args:
            html_content: Raw HTML content to process
        """
        # Parse with html5lib for maximum compatibility
        self.soup = BeautifulSoup(html_content, 'html5lib')
        self.metadata = {}
        self.content_blocks = []

    def extract_metadata(self) -> Dict[str, Any]:
        """
        Extract comprehensive metadata from the HTML document.
        Handles various metadata formats including meta tags, JSON-LD,
        and Open Graph tags.

        The result is also stored on ``self.metadata``.

        Returns:
            Dictionary containing extracted metadata
        """
        metadata = {
            'title': None,
            'description': None,
            'keywords': None,
            'author': None,
            'published_date': None,
            'modified_date': None,
            'structured_data': [],
            'open_graph': {}
        }

        # Extract basic metadata
        title_tag = self.soup.find('title')
        metadata['title'] = title_tag.text.strip() if title_tag else None

        # Process meta tags
        for meta in self.soup.find_all('meta'):
            name = meta.get('name', '').lower()
            prop = meta.get('property', '').lower()
            content = meta.get('content', '')

            if name == 'description':
                metadata['description'] = content
            elif name == 'keywords':
                # Drop empty entries produced by blank content or stray commas
                metadata['keywords'] = [
                    k.strip() for k in content.split(',') if k.strip()
                ]
            elif name == 'author':
                metadata['author'] = content
            elif name in self.PUBLISHED_KEYS or prop in self.PUBLISHED_KEYS:
                # article:* dates are usually exposed via `property`, not `name`
                metadata['published_date'] = content
            elif name in self.MODIFIED_KEYS or prop in self.MODIFIED_KEYS:
                metadata['modified_date'] = content
            elif prop.startswith('og:'):  # Open Graph tags
                metadata['open_graph'][prop[3:]] = content

        # Extract JSON-LD structured data
        for script in self.soup.find_all('script', type='application/ld+json'):
            try:
                # script.string is None for an empty tag -> TypeError below
                json_data = json.loads(script.string)
                metadata['structured_data'].append(json_data)
            except (json.JSONDecodeError, TypeError) as e:
                logger.warning(f"Error parsing JSON-LD: {str(e)}")

        self.metadata = metadata
        return metadata

    @staticmethod
    def _extract_table(table) -> Dict[str, Any]:
        """
        Convert a <table> element into a 'table' content block.

        The first row supplies the column headers; every following row that
        contains at least one <td> becomes one record. Rows shorter than the
        header are padded with None, and cells beyond the last header are kept
        under generated ``column_N`` keys so ragged tables never raise.

        Returns:
            The table block, or an empty dict when the table has no headers
            or no data rows.
        """
        rows = table.find_all('tr')
        if not rows:
            return {}

        headers = [cell.get_text(strip=True) for cell in rows[0].find_all(['th', 'td'])]

        records = []
        for row in rows[1:]:  # Skip header row
            if not row.find('td'):
                continue  # empty row or an additional header-only row
            # Include <th> row headers so values stay aligned with the columns
            cells = [cell.get_text(strip=True) for cell in row.find_all(['th', 'td'])]

            record = {}
            for index, header in enumerate(headers):
                record[header] = cells[index] if index < len(cells) else None
            for index in range(len(headers), len(cells)):
                record[f'column_{index + 1}'] = cells[index]
            records.append(record)

        if not (headers and records):
            return {}

        return {
            'type': 'table',
            'headers': headers,
            'data': records
        }

    def extract_main_content(self) -> List[Dict[str, Any]]:
        """
        Extract the main content from the HTML document.
        Uses heuristics to identify and extract meaningful content blocks.

        The result is also stored on ``self.content_blocks``.

        Returns:
            List of dictionaries containing content blocks with their metadata
        """
        content_blocks = []

        # Find the main content area
        main_content = self.soup.find(['main', 'article']) or self.soup.find(
            ['div', 'section'],
            class_=re.compile(r'(content|article|post)'))

        if not main_content:
            logger.warning("No main content area found, processing entire body")
            main_content = self.soup.body

        # Process content blocks
        for block in main_content.find_all(['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                                          'ul', 'ol', 'table']):
            block_type = block.name

            if block_type in self.HEADING_TAGS:
                block_content = {
                    'type': 'heading',
                    'level': int(block_type[1]),
                    'text': block.get_text(strip=True),
                    'id': block.get('id', ''),
                }

            elif block_type in ['ul', 'ol']:
                items = [li.get_text(strip=True) for li in block.find_all('li')]
                block_content = {
                    'type': 'list',
                    'list_type': block_type,
                    'items': items
                }

            elif block_type == 'table':
                block_content = self._extract_table(block)

            else:  # Paragraphs and other text blocks
                block_content = {
                    'type': 'text',
                    'text': block.get_text(strip=True)
                }

            if block_content:  # Only add non-empty blocks
                content_blocks.append(block_content)

        self.content_blocks = content_blocks
        return content_blocks

    @classmethod
    def _clean_text(cls, text: str) -> str:
        """Collapse whitespace in *text* and strip known boilerplate phrases."""
        # Normalize first so phrases still match across irregular spacing
        text = re.sub(r'\s+', ' ', text).strip()
        for phrase in cls.BOILERPLATE_PHRASES:
            text = text.replace(phrase, '')
        # Removing a phrase can leave doubled or leading/trailing spaces behind
        return re.sub(r'\s+', ' ', text).strip()

    def clean_content(self):
        """
        Clean and normalize extracted content.
        Removes boilerplate, normalizes whitespace, and handles special characters.

        Operates in place on ``self.content_blocks``: blocks with a 'text'
        key get whitespace collapsed and boilerplate phrases removed; list
        blocks get whitespace collapsed in every item.
        """
        for block in self.content_blocks:
            if 'text' in block:
                block['text'] = self._clean_text(block['text'])

            elif block['type'] == 'list':
                block['items'] = [
                    re.sub(r'\s+', ' ', item).strip()
                    for item in block['items']
                ]

    def process(self) -> Dict[str, Any]:
        """
        Process the HTML document completely.
        Extracts metadata and content, then returns the structured result.

        Returns:
            Dictionary containing processed content and metadata

        Raises:
            Exception: Any error raised by the extraction steps is logged
                and re-raised unchanged.
        """
        try:
            metadata = self.extract_metadata()
            content_blocks = self.extract_main_content()
            # Cleans the same block dictionaries that are returned below
            self.clean_content()

            return {
                'metadata': metadata,
                'content': content_blocks,
                'stats': {
                    'total_blocks': len(content_blocks),
                    'processed_at': datetime.now().isoformat()
                }
            }

        except Exception as e:
            logger.error(f"Error processing HTML: {str(e)}")
            raise

"""## Testing Our Implementation

Let's test our HTML processor with the static article sample we created in Part 1.
This will demonstrate how it handles different types of content and metadata.
"""

def test_html_processor():
    """Test the HTML processor with our sample file.

    Reads rag_web_samples/static_article.html, runs it through HTMLProcessor
    and prints the extracted metadata, every content block and the processing
    statistics. Any error is caught and printed rather than raised so the
    notebook keeps running.
    """
    try:
        # Read the sample file; it declares charset UTF-8, so decode it
        # explicitly instead of relying on the platform default encoding
        with open('rag_web_samples/static_article.html', 'r', encoding='utf-8') as f:
            html_content = f.read()

        # Process the content
        processor = HTMLProcessor(html_content)
        result = processor.process()

        # Display results
        print("Extracted Metadata:")
        print("-" * 50)
        print(json.dumps(result['metadata'], indent=2))

        print("\nContent Blocks:")
        print("-" * 50)
        for block in result['content']:
            print(f"\nType: {block['type']}")
            if block['type'] == 'heading':
                print(f"Level: {block['level']}")
                print(f"Text: {block['text']}")
            elif block['type'] == 'table':
                print("Table Headers:", block['headers'])
                print("First Row:", block['data'][0])
            else:
                # Text blocks carry 'text', list blocks carry 'items'
                print(f"Content: {block.get('text', block.get('items', []))}")

        print("\nProcessing Statistics:")
        print("-" * 50)
        print(json.dumps(result['stats'], indent=2))

    except Exception as e:
        print(f"Error during testing: {str(e)}")

# Run the test
test_html_processor()

"""## Important Concepts to Note

Our HTML processor demonstrates several important concepts for RAG systems:

1. Hierarchical Processing: We maintain the document's structure by preserving
   heading levels and content relationships.

2. Content Classification: We distinguish between different types of content
   (headings, paragraphs, lists, tables) and process each appropriately.

3. Rich Metadata Extraction: We handle multiple metadata formats and preserve
   structured data that might be valuable for retrieval.

4. Clean and Normalized Output: Our processor produces consistent, clean output
   suitable for further processing in a RAG pipeline.

## What's Next?

In Part 3, we'll explore handling dynamic web content, including:
- JavaScript-rendered content
- Interactive elements
- Real-time updates
- Single Page Applications (SPAs)

We'll build upon our static HTML processing capabilities while adding support
for dynamic content extraction."""

In [None]:
# -*- coding: utf-8 -*-
"""
# Web Content Processing in RAG Systems - Part 3: Dynamic Web Content

This notebook focuses on processing dynamic web content in RAG systems. While static
HTML processing gives us a foundation, modern web applications often rely heavily
on JavaScript to render content, handle user interactions, and manage state. We'll
explore how to handle these dynamic elements effectively.

Make sure you've completed Parts 1 and 2 before starting this section, as we'll
build upon concepts and code established there.
"""

# First, let's install necessary packages
!pip install playwright pandas beautifulsoup4 asyncio
!playwright install chromium

import asyncio
from playwright.async_api import async_playwright
from bs4 import BeautifulSoup
import json
import logging
from typing import List, Dict, Any, Optional
from datetime import datetime
import time
from pathlib import Path

# Set up logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

"""## Understanding Dynamic Web Content

Modern web applications present several challenges beyond static HTML processing:

1. JavaScript Rendering: Content is often generated and modified by JavaScript
   after the initial page load.

2. Asynchronous Loading: Data might be fetched from APIs and rendered gradually
   rather than being available immediately.

3. State Management: The page's content can change based on user interactions
   and application state.

4. Single Page Applications (SPAs): The entire application might run in the
   browser, with content updates happening without full page reloads.

Let's build a processor that can handle these challenges.
"""

class DynamicContentProcessor:
    """
    Processes dynamic web content with support for JavaScript rendering,
    asynchronous loading, and state changes.

    A fresh headless Chromium instance is started for every ``process_url``
    call and fully torn down afterwards, so one processor object can be used
    for several URLs in sequence.
    """

    def __init__(self, wait_time: int = 5000):
        """
        Initialize the dynamic content processor.

        Args:
            wait_time: Time to wait for dynamic content to load (milliseconds)
        """
        self.wait_time = wait_time
        self.page = None
        self.context = None
        self.browser = None
        # Playwright driver handle, kept so it can be stopped during cleanup
        self._playwright = None
        # JSON responses captured while the current page is loading
        self._network_responses: List[Dict[str, Any]] = []

    async def _initialize_browser(self):
        """
        Initialize the browser with appropriate settings for content extraction.
        Configures browser behavior to handle modern web applications.
        """
        self._playwright = await async_playwright().start()
        self.browser = await self._playwright.chromium.launch(
            headless=True,  # Run without visible browser window
        )

        # Create a context with specific settings
        self.context = await self.browser.new_context(
            viewport={'width': 1920, 'height': 1080},
            user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        )

        # Create a page with event handlers
        self.page = await self.context.new_page()

        # Monitor network activity
        self.page.on('request', lambda req: logger.debug(f'Request: {req.url}'))
        self.page.on('response', lambda res: logger.debug(f'Response: {res.url}'))

        # Must be attached before navigation: responses that arrive while the
        # page loads would otherwise never be seen
        self.page.on('response', self._capture_json_response)

    async def _close_browser(self):
        """Close the browser and stop the Playwright driver, resetting state."""
        browser, playwright = self.browser, self._playwright
        # Reset first so the next process_url() call starts a fresh browser
        # even if closing raises
        self.page = None
        self.context = None
        self.browser = None
        self._playwright = None
        try:
            if browser:
                await browser.close()
        finally:
            if playwright:
                await playwright.stop()

    async def _capture_json_response(self, response):
        """Record the body of a successful JSON response (event handler)."""
        try:
            if response.status != 200:
                return
            content_type = response.headers.get('content-type', '')
            if 'application/json' not in content_type:
                return
            self._network_responses.append({
                'url': response.url,
                'type': 'json',
                # json() is a coroutine in the async API and must be awaited
                'content': await response.json()
            })
        except Exception as e:
            # Best effort: a body that cannot be read or parsed is skipped
            logger.warning(f"Error processing response: {str(e)}")

    async def _wait_for_dynamic_content(self):
        """
        Wait for dynamic content to load and stabilize.
        Uses multiple strategies to ensure content is fully rendered.
        """
        # Wait for initial network activity to settle
        await self.page.wait_for_load_state('networkidle')

        # Wait for any animations to complete
        await self.page.wait_for_timeout(self.wait_time)

        # Scroll to load lazy content
        await self.page.evaluate("""
            window.scrollTo({
                top: document.body.scrollHeight,
                behavior: 'smooth'
            });
        """)

        # Wait for any new content to load
        await self.page.wait_for_timeout(1000)

    async def _extract_dynamic_state(self) -> Dict[str, Any]:
        """
        Extract application state and dynamic data.
        Looks up a fixed set of well-known global state variables on the
        page's ``window`` object.

        Returns:
            Dictionary mapping 'window.<name>' to the value of every state
            variable that exists and is truthy
        """
        # Read the globals by property access rather than eval(): eval is
        # rejected by pages whose Content-Security-Policy forbids it, which
        # would silently hide the state
        state = await self.page.evaluate("""() => {
            const state = {};

            // Common state variable names
            const stateVars = [
                '__INITIAL_STATE__',
                '__PRELOADED_STATE__',
                '__APOLLO_STATE__',
                '__NUXT__'
            ];

            // Try to capture known state variables
            for (const varName of stateVars) {
                try {
                    const value = window[varName];
                    if (value) {
                        state['window.' + varName] = value;
                    }
                } catch (e) {
                    // Ignore errors for missing variables
                }
            }

            return state;
        }""")

        return state

    async def _extract_network_data(self) -> List[Dict[str, Any]]:
        """
        Extract data from network requests.
        Returns the JSON API responses captured since the current page load
        started.

        Returns:
            List of captured network responses (a copy, safe to modify)
        """
        return list(self._network_responses)

    async def process_url(self, url: str) -> Dict[str, Any]:
        """
        Process a URL containing dynamic content.
        Handles page loading, content extraction, and cleanup.

        Args:
            url: URL to process

        Returns:
            Dictionary containing processed content and metadata;
            ``metadata['processing_time']`` is the elapsed time in seconds

        Raises:
            Exception: Any browser or navigation error is logged and
                re-raised; the browser is closed in either case.
        """
        started = time.perf_counter()
        self._network_responses = []

        try:
            # Initialize browser if needed
            if not self.browser:
                await self._initialize_browser()

            logger.info(f"Processing URL: {url}")

            # Navigate to the page
            await self.page.goto(url, wait_until='networkidle')

            # Wait for dynamic content
            await self._wait_for_dynamic_content()

            # Extract dynamic state
            state = await self._extract_dynamic_state()

            # Get the rendered HTML
            content = await self.page.content()

            # Parse with BeautifulSoup for content extraction
            soup = BeautifulSoup(content, 'html5lib')

            # Extract network data
            network_data = await self._extract_network_data()

            # Combine all extracted information
            result = {
                'url': url,
                'timestamp': datetime.now().isoformat(),
                'rendered_content': {
                    'title': soup.title.text if soup.title else None,
                    'body': soup.body.get_text(strip=True) if soup.body else None,
                },
                'dynamic_state': state,
                'network_data': network_data,
                'metadata': {
                    'processing_time': time.perf_counter() - started,
                    'renderer': 'playwright',
                    'wait_time': self.wait_time
                }
            }

            return result

        except Exception as e:
            logger.error(f"Error processing URL: {str(e)}")
            raise

        finally:
            await self._close_browser()

    @staticmethod
    def clean_extracted_content(content: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean and normalize extracted content.
        Collapses runs of whitespace in every string found anywhere in the
        structure; all other values are returned unchanged.

        Args:
            content: Raw extracted content

        Returns:
            Cleaned and normalized content (a new dictionary)
        """
        # Function to recursively clean dictionary values
        def clean_value(value):
            if isinstance(value, str):
                # Remove excessive whitespace
                return ' '.join(value.split())
            elif isinstance(value, list):
                return [clean_value(v) for v in value]
            elif isinstance(value, dict):
                return {k: clean_value(v) for k, v in value.items()}
            return value

        # Clean all content recursively
        cleaned = {k: clean_value(v) for k, v in content.items()}

        return cleaned

"""## Testing Dynamic Content Processing

Let's test our dynamic content processor with the sample dynamic dashboard
we created in Part 1. This will demonstrate how it handles JavaScript-rendered
content and state changes.
"""

async def test_dynamic_processor():
    """
    Run DynamicContentProcessor against the sample dashboard over local HTTP.

    A ``SimpleHTTPRequestHandler`` server is bound to an OS-assigned port and
    served from a daemon thread. The processor then loads
    ``rag_web_samples/dynamic_dashboard.html`` through it, the result is passed
    through ``clean_extracted_content`` and printed as indented JSON.

    Any exception (including a failure to bind the server) is reported with
    ``print`` rather than propagated. The server is always shut down and its
    socket closed afterwards, so the cell can be re-run without hitting
    "address already in use".
    """
    import http.server
    import socketserver
    import threading

    handler = http.server.SimpleHTTPRequestHandler
    httpd = None
    server_started = False

    try:
        # Bind in the calling coroutine (not inside the thread) so bind errors
        # reach the except clause below. Port 0 lets the OS pick a free port.
        httpd = socketserver.TCPServer(("", 0), handler)
        port = httpd.server_address[1]
        print(f"Serving at port {port}")

        # Serve requests in a background thread while the processor runs
        server_thread = threading.Thread(target=httpd.serve_forever, daemon=True)
        server_thread.start()
        server_started = True

        # Process the dynamic content
        processor = DynamicContentProcessor(wait_time=5000)
        result = await processor.process_url(
            f"http://localhost:{port}/rag_web_samples/dynamic_dashboard.html"
        )

        # Clean the results
        cleaned_result = processor.clean_extracted_content(result)

        # Display results
        print("Processed Dynamic Content:")
        print("-" * 50)
        print(json.dumps(cleaned_result, indent=2))

    except Exception as e:
        print(f"Error during testing: {str(e)}")

    finally:
        if httpd is not None:
            # shutdown() blocks until serve_forever() returns, so it must only
            # be called when the serving thread was actually started.
            if server_started:
                httpd.shutdown()
                server_thread.join()
            httpd.server_close()

# Run the test. This top-level `await` relies on the notebook's already-running
# event loop; in a plain Python script use
# asyncio.run(test_dynamic_processor()) instead.
await test_dynamic_processor()

"""## Handling Different Types of Dynamic Content

Our processor demonstrates several important capabilities for handling
dynamic web content:

1. JavaScript Rendering:
   - Uses Playwright to fully render JavaScript content
   - Waits for dynamic updates to complete
   - Handles modern web frameworks and libraries

2. State Management:
   - Captures application state variables
   - Monitors network requests for data updates
   - Tracks dynamic content changes

3. Content Extraction:
   - Processes both initially loaded and dynamically added content
   - Handles lazy-loaded and infinite-scroll content
   - Preserves relationships between dynamic elements

4. Error Handling:
   - Manages timeouts and loading failures
   - Handles partial content loads
   - Provides detailed error information

## What's Next?

In Part 4, we'll explore working with web APIs and JSON data, including:
- RESTful API interaction
- GraphQL queries
- Streaming data handling
- Rate limiting and error handling

We'll build upon our dynamic content processing capabilities while adding
sophisticated API interaction features."""

In [None]:
# -*- coding: utf-8 -*-
"""
# Web Content Processing in RAG Systems - Part 4: Web APIs and JSON Processing

This notebook demonstrates API integration in RAG systems with robust error handling
and port management. We'll use dynamic port allocation to avoid conflicts.
"""

# Let's modify our test to use a dynamic port and better cleanup
async def test_api_integration():
    """
    Exercise APIClient and DataProcessor against a throwaway local server.

    A mock ``/api/products`` endpoint is served by aiohttp on an OS-assigned
    localhost port. The endpoint is documented on the client, a processor for
    product payloads is registered, the same GET request is issued twice, and
    the first processed result plus the client's cache contents are printed.
    The client and the server runner are released in the ``finally`` clause
    whether or not the run succeeds.

    Raises:
        Exception: any failure is logged and then re-raised to the caller.
    """
    from aiohttp import web
    import socket

    divider = "-" * 50

    # Sample data
    catalog = [
        {"id": 1, "name": "Product A", "price": 99.99},
        {"id": 2, "name": "Product B", "price": 149.99}
    ]

    # Schema describing the payload returned by the mock endpoint
    item_schema = {
        'type': 'object',
        'properties': {
            'id': {'type': 'integer'},
            'name': {'type': 'string'},
            'price': {'type': 'number'}
        }
    }
    payload_schema = {
        'type': 'object',
        'properties': {
            'data': {'type': 'array', 'items': item_schema}
        }
    }

    def find_free_port():
        """Ask the OS for an unused TCP port and return its number."""
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        with probe:
            probe.bind(('', 0))
            probe.listen(1)
            _, assigned = probe.getsockname()
        return assigned

    async def get_products(request):
        """Mock endpoint that returns products."""
        return web.json_response({"data": catalog})

    port = find_free_port()
    logger.info(f"Starting test server on port {port}")

    # Create the API server
    app = web.Application()
    app.router.add_get('/api/products', get_products)

    runner = web.AppRunner(app)
    client = None

    try:
        # Bring the server up on the chosen port
        await runner.setup()
        tcp_site = web.TCPSite(runner, 'localhost', port)
        await tcp_site.start()

        # Point our API client at the mock server
        client = APIClient(f'http://localhost:{port}/api')

        # Document the endpoint
        client.document_endpoint(
            'products',
            description='Get all products',
            response_schema=payload_schema
        )

        # Create a data processor
        processor = DataProcessor()

        def process_products(data: Dict) -> List[Dict]:
            """Process product data for RAG system."""
            shaped = []
            for p in data.get('data', []):
                shaped.append({
                    'product_id': p['id'],
                    'name': p['name'],
                    'price_usd': p['price'],
                    'price_formatted': f"${p['price']:.2f}"
                })
            return shaped

        processor.register_processor('products', process_products)

        # First request: nothing is cached yet
        logger.info("Making first request (no cache)...")
        first_response = await client.request('GET', 'products')
        first_processed = processor.process_response(first_response, 'products')

        # Second, identical request
        logger.info("Making second request (should use cache)...")
        second_response = await client.request('GET', 'products')
        processor.process_response(second_response, 'products')

        # Display results
        print("\nFirst Request Results:")
        print(divider)
        print(json.dumps(first_processed, indent=2))

        print("\nCache Statistics:")
        print(divider)
        print(f"Cache size: {len(client.cache)}")
        print(f"Cache items: {list(client.cache.keys())}")

    except Exception as exc:
        logger.error(f"Test failed: {str(exc)}")
        raise

    finally:
        logger.info("Cleaning up resources...")
        if client:
            await client.close()
        if runner:
            await runner.cleanup()

# Run the test. The top-level `await` relies on the notebook's running event
# loop. test_api_integration() logs and re-raises its failures, so they are
# caught here and turned into a short printed message instead of a traceback.
try:
    await test_api_integration()
except Exception as e:
    print(f"Test failed with error: {str(e)}")
    print("Please ensure no other tests are running and try again.")
"""

This fixed version includes:
1. Dynamic port allocation to avoid conflicts
2. Proper resource cleanup in the finally block
3. Better error handling and logging
4. Clear separation of setup, test, and cleanup phases

To use this example:
1. Make sure you have the necessary imports from earlier in the notebook
2. Run this test after defining the APIClient and DataProcessor classes
"""

In [None]:
# -*- coding: utf-8 -*-
"""
# Web Content Processing in RAG Systems - Part 5: Performance Monitoring Foundation

This notebook begins our exploration of performance optimization and testing for web
content processing in RAG systems. We'll establish a robust foundation for
monitoring and measuring performance, which will guide our optimization efforts.

Our performance monitoring system needs to track multiple aspects of processing:
- Execution time for different operations
- Memory usage patterns
- Resource utilization
- Error rates and types
- Processing throughput
"""

# Install required packages for monitoring and profiling
!pip install memory_profiler psutil pytest-benchmark

import time
import psutil
import logging
from typing import Dict, List, Any, Optional
from datetime import datetime
import json
from dataclasses import dataclass, asdict
import statistics
from contextlib import contextmanager
import gc

# Set up logging with detailed formatting: INFO level and above, each record
# rendered as "<timestamp> - <level> - <message>".
# NOTE(review): the same basicConfig call appears earlier in this file;
# presumably this one has no effect when both cells run in one session - confirm.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
# Logger used by the code in this part of the notebook.
logger = logging.getLogger(__name__)

@dataclass
class OperationMetrics:
    """
    Performance record for one monitored operation.

    Stores the raw measurements taken around an operation and derives
    duration, throughput and memory growth from them on demand.
    """
    # Label used to group records belonging to the same kind of operation.
    operation_name: str
    # Wall-clock timestamps; end_time stays None until the operation finishes.
    start_time: datetime
    end_time: Optional[datetime] = None
    # Memory readings (megabytes when filled in by PerformanceMonitor).
    memory_start: float = 0.0
    memory_peak: float = 0.0
    memory_end: float = 0.0
    # CPU utilisation percentage recorded for the operation.
    cpu_percent: float = 0.0
    # Volume counters and number of errors attributed to the operation.
    items_processed: int = 0
    bytes_processed: int = 0
    error_count: int = 0

    @property
    def duration_seconds(self) -> float:
        """Elapsed seconds between start and end, or 0 while unfinished."""
        if self.end_time:
            elapsed = self.end_time - self.start_time
            return elapsed.total_seconds()
        return 0

    @property
    def processing_rate(self) -> float:
        """Items processed per second, or 0 when no time has elapsed."""
        elapsed = self.duration_seconds
        if elapsed == 0:
            return 0
        return self.items_processed / elapsed

    @property
    def memory_delta(self) -> float:
        """Difference between the final and initial memory readings."""
        return self.memory_end - self.memory_start

    def to_dict(self) -> Dict:
        """Return all fields plus the derived values as a plain dictionary."""
        return {
            **asdict(self),
            'duration_seconds': self.duration_seconds,
            'processing_rate': self.processing_rate,
            'memory_delta': self.memory_delta,
        }

class PerformanceMonitor:
    """
    Comprehensive performance monitoring system for web content processing.

    Wrap a piece of work in ``monitor_operation`` to record its wall-clock
    duration, process memory before and after, and process CPU percentage.
    Finished records accumulate in ``metrics_history``;
    ``get_operation_statistics`` and ``generate_report`` summarize them per
    operation name.
    """

    def __init__(self):
        """Initialize the performance monitor with an empty history."""
        # Completed operations, in the order they finished.
        self.metrics_history: List[OperationMetrics] = []
        # Operations currently inside a monitor_operation block, keyed by name.
        self.active_operations: Dict[str, OperationMetrics] = {}
        # psutil handle used for the memory and CPU readings.
        self.process = psutil.Process()

    def _get_memory_usage(self) -> float:
        """Get current memory usage (resident set size) in megabytes."""
        return self.process.memory_info().rss / 1024 / 1024

    @contextmanager
    def monitor_operation(self, operation_name: str, items_count: int = 0, bytes_count: int = 0):
        """
        Context manager for monitoring an operation's performance.

        Args:
            operation_name: Name of the operation to monitor
            items_count: Number of items being processed
            bytes_count: Size of data being processed in bytes

        Yields:
            The OperationMetrics record for this run. The caller may update
            its counters (for example ``items_processed`` or ``error_count``)
            while the operation is in progress.

        Raises:
            Exception: anything raised inside the ``with`` body is counted in
                ``error_count`` and then re-raised unchanged.
        """
        # Force garbage collection before measuring
        gc.collect()

        # Initialize metrics
        metrics = OperationMetrics(
            operation_name=operation_name,
            start_time=datetime.now(),
            memory_start=self._get_memory_usage(),
            items_processed=items_count,
            bytes_processed=bytes_count
        )

        self.active_operations[operation_name] = metrics

        # cpu_percent() reports usage since the previous call and returns a
        # meaningless 0.0 on its first call, so take a throwaway reading now.
        # The reading in `finally` then covers this operation. Nested monitors
        # share the same counter, so an outer reading only covers the time
        # since the most recent inner reading.
        self.process.cpu_percent()

        try:
            yield metrics

        except Exception:
            # Count the failure so error_rate reflects operations that raised.
            metrics.error_count += 1
            raise

        finally:
            # Record final measurements
            metrics.end_time = datetime.now()
            metrics.memory_end = self._get_memory_usage()
            metrics.cpu_percent = self.process.cpu_percent()

            # Memory is only sampled at the boundaries, so the best available
            # peak is the larger of the two readings (or a higher value the
            # caller stored on the record). This is a lower bound on the true
            # peak.
            metrics.memory_peak = max(
                metrics.memory_peak, metrics.memory_start, metrics.memory_end
            )

            # Store metrics
            self.metrics_history.append(metrics)
            # pop() instead of del: overlapping monitors with the same name
            # share one key, and the second exit must not raise KeyError.
            self.active_operations.pop(operation_name, None)

    def get_operation_statistics(self, operation_name: str) -> Dict[str, Any]:
        """
        Calculate detailed statistics for an operation type.

        Args:
            operation_name: Name of the operation to analyze

        Returns:
            Dictionary containing comprehensive performance statistics, or an
            empty dictionary when no record with that name exists.
            ``error_rate`` is the mean number of errors per recorded sample.
        """
        relevant_metrics = [
            m for m in self.metrics_history
            if m.operation_name == operation_name
        ]

        if not relevant_metrics:
            return {}

        durations = [m.duration_seconds for m in relevant_metrics]
        memory_deltas = [m.memory_delta for m in relevant_metrics]
        processing_rates = [m.processing_rate for m in relevant_metrics]

        return {
            'samples_count': len(relevant_metrics),
            'duration': {
                'mean': statistics.mean(durations),
                'median': statistics.median(durations),
                'min': min(durations),
                'max': max(durations),
                # stdev() needs at least two samples
                'std_dev': statistics.stdev(durations) if len(durations) > 1 else 0
            },
            'memory': {
                'mean_delta': statistics.mean(memory_deltas),
                'max_delta': max(memory_deltas),
                'peak_usage': max(m.memory_peak for m in relevant_metrics)
            },
            'processing_rate': {
                'mean': statistics.mean(processing_rates),
                'median': statistics.median(processing_rates)
            },
            'error_rate': sum(m.error_count for m in relevant_metrics) / len(relevant_metrics),
            'total_processed': {
                'items': sum(m.items_processed for m in relevant_metrics),
                'bytes': sum(m.bytes_processed for m in relevant_metrics)
            }
        }

    def generate_report(self, operation_name: Optional[str] = None) -> Dict[str, Any]:
        """
        Generate a comprehensive performance report.

        Args:
            operation_name: Optional name to filter specific operations

        Returns:
            Dictionary containing detailed performance analysis. Without a
            filter, operations are listed in the order they first appear in
            the history.
        """
        if operation_name:
            operations = [operation_name]
        else:
            # dict.fromkeys de-duplicates while keeping first-seen order, so
            # the report layout is the same on every run (a set is not).
            operations = list(
                dict.fromkeys(m.operation_name for m in self.metrics_history)
            )

        report = {
            'timestamp': datetime.now().isoformat(),
            'total_operations': len(self.metrics_history),
            'operations': {}
        }

        for op in operations:
            report['operations'][op] = self.get_operation_statistics(op)

        return report

"""## Testing Our Performance Monitoring

Let's create some example operations to test our monitoring system and
demonstrate how it tracks different performance aspects.
"""

def test_performance_monitor():
    """Drive PerformanceMonitor through two sample workloads and print its report."""
    divider = "-" * 50

    def cpu_intensive_task():
        # Pure arithmetic loop: sum of squares below one million.
        acc = 0
        for n in range(1000000):
            acc += n * n
        return acc

    def memory_intensive_task():
        # Materialize a million-element list before reducing it.
        numbers = [*range(1000000)]
        return sum(numbers)

    monitor = PerformanceMonitor()

    # Workload 1: CPU-bound
    with monitor.monitor_operation('cpu_task', items_count=1000000):
        cpu_intensive_task()

    # Workload 2: allocation-heavy
    with monitor.monitor_operation('memory_task', items_count=1000000):
        memory_intensive_task()

    # Summarize everything recorded above
    print("\nPerformance Report:")
    print(divider)
    print(json.dumps(monitor.generate_report(), indent=2))

# Run the test as soon as this cell executes; it prints the generated
# performance report as indented JSON.
test_performance_monitor()

"""## Key Concepts in Performance Monitoring

Our performance monitoring system demonstrates several important concepts:

1. Comprehensive Metrics Collection:
   - Execution time measurement
   - Memory usage tracking
   - CPU utilization monitoring
   - Processing rate calculation

2. Statistical Analysis:
   - Mean and median calculations
   - Standard deviation for variance analysis
   - Peak value tracking
   - Error rate monitoring

3. Resource Management:
   - Proper cleanup with context managers
   - Garbage collection integration
   - Memory leak detection capabilities

4. Flexible Reporting:
   - Detailed per-operation statistics
   - Aggregated performance metrics
   - Customizable report generation

"""