# Scraper Optimization: Function vs Class-Based Approach

**Problem:** The original implementation fetches and parses the same page twice — each function issues its own HTTP request and builds its own parse tree (2 requests, 2 parses).

**Approach:** Refactor into a class that fetches and parses the page once, then exposes the title, content, and links as lazily computed, cached properties.


In [None]:
# Original implementation (from scraper.py)
from bs4 import BeautifulSoup
import requests
import time

# Request headers passed to every `requests.get` call in this notebook.
# NOTE(review): the browser-like User-Agent is presumably here because some
# sites reject the default `python-requests` agent — confirm.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

def fetch_website_contents(url):
    """Fetch a page and return its title followed by its visible body text.

    Issues one HTTP GET for ``url`` using the module-level ``headers`` and
    parses the response bytes with BeautifulSoup's ``html.parser``.

    Args:
        url: Address of the page to download.

    Returns:
        The title, a blank line, and the body text, truncated to the first
        2,000 characters. The title is ``"No title found"`` when the page has
        no ``<title>`` or the tag has no single string; the body text is empty
        when the page has no ``<body>``.
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    # `.string` is None for an empty <title> or one containing nested markup;
    # fall back to the placeholder so the concatenation below cannot raise
    # TypeError on None.
    title = (soup.title.string if soup.title else None) or "No title found"
    if soup.body:
        # Drop elements that contribute no readable text before extracting it.
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]

def fetch_website_links(url):
    """Download ``url`` and return the non-empty ``href`` of every ``<a>`` tag.

    Performs its own HTTP request and its own HTML parse, so calling it after
    ``fetch_website_contents`` repeats both steps (inefficient).
    """
    page = requests.get(url, headers=headers)
    anchors = BeautifulSoup(page.content, "html.parser").find_all("a")
    # filter(None, ...) discards anchors whose href is missing or empty.
    return list(filter(None, (anchor.get("href") for anchor in anchors)))


In [None]:
# Optimized class-based implementation
class WebsiteScraper:
    """Fetch and parse a page once, then expose derived data as cached properties.

    The HTTP request and the BeautifulSoup parse both run eagerly in
    ``__init__``. ``title``, ``content`` and ``links`` are computed on first
    access and stored in ``_title``, ``_content`` and ``_links``; ``None`` in
    one of those attributes means "not computed yet".
    """

    def __init__(self, url):
        """Download ``url`` with the module-level ``headers`` and parse it.

        Args:
            url: Address of the page to scrape.
        """
        self.url = url
        self.response = requests.get(url, headers=headers)
        self.soup = BeautifulSoup(self.response.content, "html.parser")
        # None marks a value that has not been computed yet.
        self._title = None
        self._content = None
        self._links = None

    @property
    def title(self):
        """Page title, or ``"No title found"`` when no usable title exists."""
        if self._title is None:
            title_tag = self.soup.title
            # `.string` is None for an empty <title> or one wrapping nested
            # tags. Falling back keeps the result a str, so `content` can
            # concatenate it and the cache is not left at its None sentinel.
            self._title = (title_tag.string if title_tag else None) or "No title found"
        return self._title

    @property
    def content(self):
        """Title, a blank line, and the visible body text (max 2,000 chars).

        Side effect: the first access removes every script, style, img and
        input element from ``self.soup``.
        """
        if self._content is None:
            body = self.soup.body
            if body:
                # Drop elements that contribute no readable text.
                for irrelevant in body(["script", "style", "img", "input"]):
                    irrelevant.decompose()
                text = body.get_text(separator="\n", strip=True)
            else:
                text = ""
            self._content = (self.title + "\n\n" + text)[:2_000]
        return self._content

    @property
    def links(self):
        """Non-empty ``href`` values of all ``<a>`` tags, in document order."""
        if self._links is None:
            hrefs = [anchor.get("href") for anchor in self.soup.find_all("a")]
            self._links = [href for href in hrefs if href]
        return self._links


## Performance Comparison


In [None]:
test_url = "https://example.com"

# Original: 2 HTTP requests, 2 parsing operations.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals; time.time() follows the wall clock, which can be adjusted
# while the measurement is running.
start = time.perf_counter()
content1 = fetch_website_contents(test_url)
links1 = fetch_website_links(test_url)
time_original = time.perf_counter() - start

print(f"Original: {time_original:.4f}s | 2 requests | 2 parses")


In [None]:
# Optimized: 1 HTTP request, 1 parse, lazily computed properties.
# perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed intervals, unlike the adjustable wall clock behind time.time().
start = time.perf_counter()
scraper = WebsiteScraper(test_url)
content2 = scraper.content
links2 = scraper.links
time_optimized = time.perf_counter() - start

print(f"Optimized: {time_optimized:.4f}s | 1 request | 1 parse")
# `time_original` comes from the previous cell, which must have been run first.
print(f"Speedup: {time_original / time_optimized:.2f}x faster")


## Notes

**Use class-based when:**
- Multiple operations on same expensive resource (network, parsing)
- Need caching/lazy evaluation
- Shared state across operations

**Use functions when:**
- Single, independent operation
- No shared state needed

**Tradeoff:** More code complexity vs. better performance and reusability


## Implementation Notes

**`@property` pattern:** Makes methods accessible as attributes (`scraper.content` vs `scraper.get_content()`)

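**Lazy evaluation:** Initialize each cache attribute as `None`, compute the value on first access, and return the cached value thereafter, so only the properties you actually use are computed. Note that the HTTP request and the HTML parse themselves still run eagerly in `__init__`.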


In [None]:
# Verify lazy evaluation: only accessed properties are computed.
# Each printed flag is `<cache attribute> is None`, so True means
# "not computed yet".
scraper = WebsiteScraper("https://example.com")
print(f"After init - title: {scraper._title is None}, content: {scraper._content is None}, links: {scraper._links is None}")

# Reading `title` fills only `_title`; the other two caches are untouched.
_ = scraper.title
print(f"After title access - content: {scraper._content is None}, links: {scraper._links is None}")

# Reading `content` fills `_content` (it reads `title` internally); `_links`
# is still untouched.
_ = scraper.content
print(f"After content access - links: {scraper._links is None}")


---

## JavaScript-Ready Scraper: Playwright Implementation

**Problem:** `requests` only gets static HTML; fails on JavaScript-rendered sites (React, Vue, etc.)

**Solution:** Use Playwright to run a real browser, execute JavaScript, then extract rendered HTML


In [None]:
"""
JavaScript-Ready Website Scraper using Playwright

Why This Works:
- requests: Gets static HTML only (fast, lightweight, but fails on JS-rendered sites)
- Playwright: Runs a real browser, executes JavaScript, then extracts rendered HTML
  (handles React, Vue, Angular, and other JS frameworks)

Tradeoffs:
- Playwright: Handles JavaScript, but slower and more resource-intensive
- requests: Fast and lightweight, but only works for static sites

Use Case - Summarization:
This enables scraping modern JavaScript-heavy websites (e.g., openai.com, react apps)
for LLM summarization. The rendered content is then passed to the LLM for analysis,
enabling summarization of sites that would otherwise return empty or incomplete content.
"""

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

class Website:
    """A web page loaded through a headless Chromium browser via Playwright.

    ``title`` and ``text`` stay ``None`` until ``fetch()`` has completed.
    """

    def __init__(self, url):
        """Remember ``url``; nothing is downloaded until ``fetch()`` is called."""
        self.url = url
        self.title = None  # filled by fetch() from the browser's page title
        self.text = None  # filled by fetch() with the visible body text

    def fetch(self):
        """Fetch website using Playwright (handles JavaScript).

        Launches headless Chromium, navigates to ``self.url``, and stores the
        page title in ``self.title`` and the visible body text of the rendered
        HTML in ``self.text`` (empty string when there is no ``<body>``).

        Any exception raised while navigating or reading the page propagates
        to the caller; the browser is closed first.
        """
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                page = browser.new_page()

                # Wait until network activity has gone quiet so client-side
                # rendering has had a chance to finish.
                page.goto(self.url, wait_until="networkidle")

                # Get rendered HTML
                html = page.content()
                self.title = page.title()
            finally:
                # Close the browser even when navigation or extraction fails.
                browser.close()

        # Parse with BeautifulSoup (same as the requests-based scraper)
        soup = BeautifulSoup(html, "html.parser")
        if soup.body:
            # Drop elements that contribute no readable text.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""


## Notes

**When to use Playwright:**
- JavaScript-rendered sites (React, Vue, Angular, SPA)
- Dynamic content that loads after page load
- Sites that require browser execution

**When `requests` is sufficient:**
- Static HTML sites
- Server-side rendered content
- Performance-critical scenarios

**Tradeoff:** Browser overhead (slower, more resources) vs. compatibility (handles modern web apps)
