WEB SCRAPING

I attempted BeautifulSoup, confirmed it's not sufficient, and adapted to use Selenium — still fully open-source and commonly used for scraping dynamic sites.”

In [330]:
!pip install requests



In [331]:
!pip install selenium webdriver-manager beautifulsoup4 lxml




In [332]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import time

In [333]:
import os
import sys

def suppress_stderr():
    """Redirect stderr to /dev/null to suppress unwanted warnings or messages."""
    devnull = os.open(os.devnull, os.O_RDWR)
    os.dup2(devnull, 2)  # 2 is the file descriptor for stderr

# Suppress stderr before importing modules that might trigger the message
suppress_stderr()


In [334]:
BASE_URL = "https://www.aarp.org/health/"

def get_driver():
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    return webdriver.Chrome(options=options)


In [335]:
def get_rendered_soup(url, max_scroll_attempts=30, scroll_pause=3,verbose=False):
    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-gpu")
    options.add_argument("--window-size=1920,1080")
    driver = webdriver.Chrome(options=options)
    driver.get(url)

    last_count = 0
    for attempt in range(max_scroll_attempts):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(scroll_pause)

        soup = BeautifulSoup(driver.page_source, "lxml")
        new_count = len(soup.find_all("a", href=True))
        if new_count == last_count:
            print(f"✅ Scrolling complete after {attempt + 1} attempts.")
            break
        last_count = new_count

    final_html = driver.page_source
    driver.quit()

    return BeautifulSoup(final_html, "lxml")

In [336]:
known_selectors = [
    "div.article-body",
    "article",
    "main div.content",
    "section.article",
    "div.aarpe-text-image.text-image-overlay",  # Found in FAQ-style pages
    "div.textimage.parbase.section",            
    "div.text-content",
    "div.body-container"
]

In [337]:
import re
from urllib.parse import urljoin
ALLOWED_PREFIX = "/health/"  # Restrict to only health-related content


def is_probably_article_link(href, text):
    if not href:
        return False
    if any(bad in href for bad in [
        "login", "register", "games", "membership", "mailto:", "#",
        "facebook.com", "twitter.com", "instagram.com", "pinterest.com"
    ]):
        return False
    if len(text.strip()) < 15:  # Ignore very short or non-descriptive links
        return False
    return True
    
# Extract filtered article links from base page 

def get_valid_article_links(soup):
    links = soup.find_all("a", href=True)
    filtered_links = set()

    junk_patterns = [
        "calculator", "games", "login", "register",
        "video", "gallery", "#", "mailto:"
    ]
    category_like_patterns = [
    re.compile(r"^https://www\.aarp\.org/health/conditions-treatments/?$"),
    re.compile(r"^https://www\.aarp\.org/health/?$"),
    ]

    for tag in links:
        href = tag["href"]
        if href.startswith(ALLOWED_PREFIX):
            #full_url = "https://www.aarp.org" + href if href.startswith("/")
            full_url = urljoin("https://www.aarp.org", href)

            if any(pattern in full_url for pattern in junk_patterns):
                continue
                
            if any(regex.fullmatch(full_url) for regex in category_like_patterns):
                continue
    
            if BASE_URL in full_url and full_url.count("/") >= 5:
                filtered_links.add(full_url)
                
        # Add special case: "Features and Resources" section links (even outside /health/)
    fallback_links = [
        "/coronavirus/"
    ]
    for href in fallback_links:
        tag = soup.find("a", href=href)
        if tag:
            full_url = urljoin("https://www.aarp.org", href)
            filtered_links.add(full_url)

            
    return list(filtered_links)


def extract_main_content(soup):
    """
    Extracts main content from an AARP article or FAQ-style page using common HTML structures and fallbacks.
    """
    # Step 1: Try common static article containers
    selectors = [
        "div.article-body",
        "article",
        "main div.content",
        "section.article",
        "div.aarpe-text-image.text-image-overlay",  # Medicare FAQ intro
        "div.textimage.parbase.section",
        "div.text-content",
        "div.body-container"
    ]
    
    for selector in selectors:
        container = soup.select_one(selector)
        if container:
            paragraphs = container.find_all("p")
            headings = container.find_all(["h2", "h3"])
            texts = [p.get_text(strip=True) for p in paragraphs if p.get_text(strip=True)]
            combined_text = "\n".join(texts)
            if len(texts) >= 3 or len(combined_text) > 250:
                return combined_text
            elif len(texts) >= 2 and len(headings) >= 1 and len(text) > 200:
                return text

    return None

    
def extract_landing_page_links(soup):
    previews = []
    for a in soup.find_all("a", href=True):
        text = a.get_text(strip=True)
        href = a["href"]
        if text and "/health/" in href and len(text) > 30:
            full_url = urljoin("https://www.aarp.org", href)
            previews.append((text, full_url))
    return previews

# === 4. Fallback content from h1, h2, p ===
def extract_fallback_text(soup):
    fallback_tags = ['h1', 'h2', 'p']
    junk_keywords = [
        "login", "sign up", "facebook", "twitter", "linkedin", "register",
        "crossword", "games", "menu", "policy", "advertisement", "cookie",
        "terms", "contact", "member offers", "en español", "back to top"
    ]

    content = []
    for tag in soup.find_all(fallback_tags):
        text = tag.get_text(strip=True)
        if not text or len(text) < 40:
            continue
        if any(kw in text.lower() for kw in junk_keywords):
            continue
        content.append(text)

    clean = "\n".join(sorted(set(content), key=content.index))
    return clean if len(clean) > 200 else None


# === 5. Title extractor ===
def extract_title(soup):
   # Priority 1: <h1 id="article-title"> — true article pages
    h1 = soup.find("h1", {"id": "article-title"})
    if h1:
        return h1.get_text(strip=True)

    # Priority 2: general <h1> tag (like AARP Hearing Center)
    h1_generic = soup.find("h1")
    if h1_generic:
        return h1_generic.get_text(strip=True)

    # Fallback: <title>
    if soup.title and "javascript must be enabled" not in soup.title.text.lower():
        return soup.title.text.strip()

    return "Untitled Page"


# === 6. Full article text extractor ===

In [338]:
def is_promotional_or_junk_content(text):
    junk_phrases = [
        "Javascript must be enabled to use this site",
        "Please enable Javascript",
        "AARP membership", "free crossword", "play game",
        "sale ends", "join now", "subscribe", "login", "renew",
        "Sign Up", "Register", "Help", "Discounts", "Member-only"
    ]
    count = sum(1 for phrase in junk_phrases if phrase.lower() in text.lower())
    return count >= 3  # tweak threshold as needed

In [339]:
#Just Test # WORKS
# === 6. Full article text extractor ===
def extract_article_text(url):
    try:
        soup = get_rendered_soup(url)
        title = extract_title(soup)
        # Try to extract content from common article containers
        main = extract_main_content(soup)
        if main:
            print(f"{title}")
            return {"type": "article", "title": title, "content": main}


        # Try article content first
        article = soup.find("article")
        if article:
            paragraphs = article.find_all("p")
            content = "\n".join([
                p.get_text(strip=True)
                for p in paragraphs
                if len(p.get_text(strip=True)) > 40
            ])
            if len(content.split()) > 100:
                return {"type": "article", "title": title, "content": content}
                

        # Try fallback extraction
        fallback = extract_fallback_text(soup)

        # Try finding valid links if it's a landing page
        links = extract_landing_page_links(soup)
        is_landing_page = "/center" in url or "resource-center" in url or title.lower().endswith("center") or bool(links)

        # For landing page: include content only if it's not junk
        if is_landing_page:
            if fallback and not is_promotional_or_junk_content(fallback):
                return {
                    "type": "landing_page",
                    "title": title,
                    "content": fallback,
                    "links": links
                }
            else:
                return {
                    "type": "landing_page",
                    "title": title,
                    "content": None,
                    "links": links
                }

        # Not landing, not article, try fallback if it exists
        if fallback and not is_promotional_or_junk_content(fallback):
            return {"type": "article", "title": title, "content": fallback}

        return {"type": "unknown", "title": title, "content": None}

    except Exception as e:
        print(f"❌ Failed to scrape {url}: {e}")
        return {"type": "error", "title": None, "content": None}

In [340]:
# RUN
import json
    
# === 7. Main runner ===
def run_scraper():
    soup = get_rendered_soup(BASE_URL)
    article_links = get_valid_article_links(soup)
    print(f"✅ Found {len(article_links)} article links \n")

    results = []

    for i, url in enumerate(article_links, 1):
        print(f"[{i}] Scraping: {url}")
        result = extract_article_text(url)

        content = result.get("content", "")
        if result["type"] == "landing_page" and not result.get("content"):
            result["content"] = (
                "This page serves as a catalog or directory of articles and does not contain standalone "
                "narrative content. It primarily links to other resources within the health section."
            )

        entry = {
            "url": url,
            "type": result.get("type"),
            "title": result.get("title"),
            "content": result.get("content", ""),
            "links": result.get("links")
        }

        results.append(entry)
        
        if result["type"] == "article" and result["content"]:
            print(f"📝 Article: {result['title']}\n{result['content'][:300]}...\n")

        elif result["type"] == "landing_page":
            print(f"\n📚 Landing Page: {result['title']}\n" + "-" * 60)

            if result["content"]:
                preview = result["content"].strip().split("\n")[0]
                print(f"📝 Intro: {preview[:300]}...\n")

            if result.get("links"):
                print("🔗 Related Articles / Sections:")
                for i, (text, link) in enumerate(result["links"][:5], 1):
                        print(f"  {i}. {text}\n     ↳ {link}")
            else:
                print("⚠️ No article found on this landing page.\n")
                
            print("-" * 60 + "\n")

        else:
            print(f"⚠️ Skipped or unknown: {result['title'] or 'No title'}\n")

    return results

def save_results_to_json(results, filename="scraped_articles.json"):
    with open(filename, "w") as f:
        json.dump(results, f, indent=2)
    print(f"✅ Results saved to {filename}")

if __name__ == "__main__":
    results = run_scraper()
    save_results_to_json(results)

✅ Scrolling complete after 2 attempts.
✅ Found 25 article links 

[1] Scraping: https://www.aarp.org/health/drugs-supplements/elderberry-for-cold-and-flu/
✅ Scrolling complete after 2 attempts.
Can Elderberry Cure Your Cold Symptoms?
📝 Article: Can Elderberry Cure Your Cold Symptoms?
You’re laid low by afever, chills, runny nose, congestion. Can elderberry deliver relief?
Thanks to what are believed to be the herbal supplement’s immune-boosting superpowers (the tiny purple berry is loaded with vitamins and antioxidants), elderberry is having something of a moment.
Subscribe to A...

[2] Scraping: https://www.aarp.org/health/conditions-treatments/fast-constipation-relief/
✅ Scrolling complete after 2 attempts.
8 Ways to Get Fast Relief from Constipation
📝 Article: 8 Ways to Get Fast Relief from Constipation
Millions of older people suffer from constipation, sometimes for years. Long-term constipation is not only uncomfortable, it's also linked to hemorrhoids and other health problems, s