In [3]:
#!/usr/bin/env python3
"""
sitemap_product_course_filter.py

Usage:
    python sitemap_product_course_filter.py
    (nothing is prompted for: the target site URL and keywords are set in main())

Requirements:
    pip install requests beautifulsoup4
"""

import time
import requests
import urllib.robotparser
import xml.etree.ElementTree as ET
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from typing import List, Set
import json

# --- Config / defaults ---
# Sent as the User-Agent header on every HTTP request and also passed to
# RobotFileParser.can_fetch() when evaluating robots.txt rules.
# NOTE(review): urllib.robotparser appears to match only the token before the
# first "/" (here "mozilla") against robots.txt groups — confirm this is the
# identity the crawler is meant to be governed by.
USER_AGENT = "Mozilla/5.0 (compatible; sitemap-filter-bot/1.0)"
# Passed as ``timeout=`` to requests.get(), i.e. seconds per request.
REQUEST_TIMEOUT = 10
# Passed to time.sleep() between requests, i.e. seconds.
SLEEP_BETWEEN_REQUESTS = 0.5  # polite pacing
# Keywords used when the caller does not supply any.  They are matched as
# plain substrings against lower-cased URLs and page HTML, so short entries
# such as "cert" also match longer words that contain them.
DEFAULT_KEYWORDS = [
    "product", "products",
    "course", "courses",
    "cert", "certificate", "certification",
    "training", "certified"
]


# --- helpers ---
def fetch_text(url: str) -> str:
    """Download ``url`` and return the response body decoded as text.

    The request is a GET carrying the configured ``USER_AGENT`` header and
    bounded by ``REQUEST_TIMEOUT``.  Any failure (connection error, timeout,
    HTTP error status, ...) is reported on stdout and turned into an empty
    string, so callers can treat "" as "nothing usable was fetched".
    """
    request_options = {
        "headers": {"User-Agent": USER_AGENT},
        "timeout": REQUEST_TIMEOUT,
    }
    try:
        response = requests.get(url, **request_options)
        # Treat 4xx/5xx answers as failures rather than returning error pages.
        response.raise_for_status()
        return response.text
    except Exception as e:
        print(f"[!] Failed to fetch {url}: {e}")
    return ""


def get_robots_txt(base_url: str) -> str:
    """Return the contents of ``<base_url>/robots.txt``, or "" on failure."""
    # Ensure a trailing slash so urljoin appends "robots.txt" to the base
    # instead of replacing its last path segment.
    root = base_url if base_url.endswith("/") else base_url + "/"
    location = urljoin(root, "robots.txt")
    try:
        return fetch_text(location)
    except Exception:
        return ""


def get_sitemaps_from_robots(robots_txt: str) -> List[str]:
    """Extract the sitemap URLs declared in a robots.txt document.

    Args:
        robots_txt: Full text of a robots.txt file (may be empty).

    Returns:
        The values of all ``Sitemap:`` lines, in order of first appearance.
        The field name is matched case-insensitively, whitespace around the
        colon is tolerated, trailing ``# comments`` are removed, and empty or
        repeated entries are dropped.
    """
    sitemaps: List[str] = []
    already_listed = set()
    for raw_line in robots_txt.splitlines():
        # "#" starts a comment in robots.txt; it is not part of the value.
        line = raw_line.split("#", 1)[0]
        field, colon, value = line.partition(":")
        if not colon or field.strip().lower() != "sitemap":
            continue
        value = value.strip()
        # Skip "Sitemap:" lines without a URL and repeated declarations so
        # callers never request "" or download the same sitemap twice.
        if value and value not in already_listed:
            already_listed.add(value)
            sitemaps.append(value)
    return sitemaps


def parse_sitemap(sitemap_url: str, seen=None) -> List[str]:
    """Download a sitemap and return every page URL it (transitively) lists.

    Handles both plain sitemaps (``<urlset>``) and sitemap indexes
    (``<sitemapindex>``); nested sitemaps are followed recursively.

    Args:
        sitemap_url: Absolute URL of the sitemap or sitemap index.
        seen: Set of sitemap URLs that were already processed.  It is
            updated in place and guards against cycles and repeated
            downloads; pass the same set to several calls to share that
            protection.  A fresh set is created when omitted.

    Returns:
        The ``<loc>`` values of all ``<url>`` entries, followed by the URLs
        found in nested sitemaps.  Returns an empty list when the sitemap
        was already seen or could not be downloaded/parsed (the failure is
        printed, not raised).
    """
    import gzip  # local import: only needed for compressed sitemaps

    if seen is None:
        seen = set()
    if sitemap_url in seen:
        return []
    seen.add(sitemap_url)

    try:
        headers = {"User-Agent": USER_AGENT}
        resp = requests.get(sitemap_url, headers=headers, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        payload = resp.content
        # "*.xml.gz" sitemaps are served as raw gzip files, which requests
        # does not decompress (it only undoes Content-Encoding).  Detect
        # them via the gzip magic number.
        if payload[:2] == b"\x1f\x8b":
            payload = gzip.decompress(payload)
        # NOTE: xml.etree is not hardened against maliciously constructed
        # XML; only point this at sitemaps you trust.
        root = ET.fromstring(payload)
    except Exception as e:
        print(f"[!] Failed to download/parse sitemap {sitemap_url}: {e}")
        return []

    def local_name(tag) -> str:
        # Drop the "{namespace}" prefix so sitemaps that use a different
        # (or no) namespace are still understood.
        return tag.rsplit("}", 1)[-1] if isinstance(tag, str) else ""

    page_urls: List[str] = []
    nested_sitemaps: List[str] = []
    for entry in root.iter():
        kind = local_name(entry.tag)
        if kind == "url":
            bucket = page_urls
        elif kind == "sitemap":
            bucket = nested_sitemaps
        else:
            continue
        # Only direct <loc> children count; extension elements such as
        # image entries carry their own nested <loc>.
        for child in entry:
            if local_name(child.tag) == "loc":
                location = (child.text or "").strip()
                if location:
                    bucket.append(location)

    urls = list(page_urls)
    for nested in nested_sitemaps:
        urls.extend(parse_sitemap(nested, seen=seen))
    return urls


def build_robot_parser_from_text(robots_txt: str, base_url: str) -> urllib.robotparser.RobotFileParser:
    """Create a ``RobotFileParser`` from robots.txt text that is already in hand.

    The rules are loaded through ``parse()`` so no extra HTTP request is
    made.  The parser's URL is additionally pointed at
    ``<base_url>/robots.txt``; a failure to do so is ignored.
    """
    parser = urllib.robotparser.RobotFileParser()
    rule_lines = robots_txt.splitlines()
    parser.parse(rule_lines)
    try:
        root = base_url if base_url.endswith("/") else base_url + "/"
        parser.set_url(urljoin(root, "robots.txt"))
    except Exception:
        # Best effort only: the rules above are already loaded, the URL is
        # informational for this use.
        pass
    return parser


def filter_allowed_urls_by_robots(
    urls: List[str],
    rp: urllib.robotparser.RobotFileParser,
    user_agent: str = None,
) -> List[str]:
    """Return the subset of ``urls`` that robots.txt permits this crawler to fetch.

    Args:
        urls: Candidate URLs, kept in their original order.
        rp: Parser already loaded with the site's robots.txt rules.
        user_agent: Agent string to evaluate the rules for.  Defaults to the
            module-level ``USER_AGENT``.

    Returns:
        URLs for which ``rp.can_fetch(user_agent, url)`` is true.  URLs the
        parser cannot evaluate are skipped.
    """
    agent = USER_AGENT if user_agent is None else user_agent
    allowed = []
    for u in urls:
        try:
            # Ask only for our own agent: can_fetch() already falls back to
            # the "*" group when no agent-specific group applies.  OR-ing in
            # a separate "*" check would let a generic allow override a
            # disallow aimed specifically at this crawler.
            if rp.can_fetch(agent, u):
                allowed.append(u)
        except Exception:
            # If parser fails for a URL, be conservative and skip it
            continue
    return allowed


def extract_links_from_html(html: str, base_url: str) -> Set[str]:
    """Collect same-host absolute links from the ``<a href>`` tags in ``html``.

    Each href is resolved against ``base_url``; only results whose network
    location equals that of ``base_url`` are kept.  Pure fragment links and
    ``mailto:`` / ``javascript:`` hrefs are ignored.
    """
    document = BeautifulSoup(html, "html.parser")
    site_host = urlparse(base_url).netloc
    ignored_schemes = ("mailto:", "javascript:")

    found = set()
    for anchor in document.find_all("a", href=True):
        target = anchor["href"].strip()
        if target.startswith("#"):
            continue
        if target.lower().startswith(ignored_schemes):
            continue
        resolved = urljoin(base_url, target)
        if urlparse(resolved).netloc != site_host:
            continue
        found.add(resolved)
    return found


def url_matches_keywords(url: str, keywords: List[str]) -> bool:
    """Tell whether any keyword occurs in the path/query part of ``url``.

    The scheme and host are deliberately excluded: otherwise a site whose
    domain contains a keyword (e.g. "coursera.org" and "course") would match
    on every single page.  Matching is a case-insensitive substring test;
    empty keywords are ignored.

    Args:
        url: URL to inspect.
        keywords: Keywords to look for.

    Returns:
        True if at least one keyword is found, False otherwise (including
        for an empty keyword list).
    """
    try:
        parts = urlparse(url)
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): fall back to the raw text.
        haystack = url.lower()
    else:
        # Everything after the host: path, params, query and fragment.
        haystack = parts._replace(scheme="", netloc="").geturl().lower()
    return any(kw and kw.lower() in haystack for kw in keywords)


def page_matches_keywords(html_text: str, keywords: List[str]) -> bool:
    """Report whether ``html_text`` contains at least one of ``keywords``.

    The text is lower-cased before a plain substring search, so keywords are
    expected in lower case.  The raw markup is searched, not only the
    visible text.
    """
    haystack = html_text.lower()
    for keyword in keywords:
        if keyword in haystack:
            return True
    return False


# --- main flow with relevance filter ---
def collect_crawlable_relevant_links(
    site_url: str,
    keywords: List[str] = None,
    do_page_content_check: bool = True,
) -> List[str]:
    """Find robots.txt-permitted sitemap URLs of a site that look relevant.

    Flow:
      1. Fetch robots.txt and read its ``Sitemap:`` entries (abort if either
         is missing).
      2. Parse all sitemaps (including nested ones) into candidate URLs.
      3. Drop candidates that robots.txt disallows.
      4. Stage 1: keep URLs that contain a keyword.
      5. Stage 2 (optional): download the remaining pages; keep a page if
         its HTML contains a keyword, otherwise keep any same-host link on
         it whose URL contains a keyword and which robots.txt allows.

    Args:
        site_url: Site address; ``https://`` is assumed when no scheme is
            given.  Only scheme and host are used.
        keywords: Keywords to look for (case-insensitive).  Defaults to
            ``DEFAULT_KEYWORDS``.
        do_page_content_check: Whether to run stage 2, which issues one
            request per non-matching page and can take a long time.

    Returns:
        Sorted list of matching URLs; empty when robots.txt or sitemap
        entries are unavailable.
    """
    if keywords is None:
        keywords = DEFAULT_KEYWORDS
    # URLs and page text are lower-cased before matching, so the keywords
    # have to be lower case as well to ever match.
    keywords = [kw.strip().lower() for kw in keywords if kw and kw.strip()]

    # Normalize site base
    if not urlparse(site_url).scheme:
        site_url = "https://" + site_url
    parsed_site = urlparse(site_url)
    base_url = f"{parsed_site.scheme}://{parsed_site.netloc}"

    print(f"Base site: {base_url}")

    robots_txt = get_robots_txt(base_url)
    if not robots_txt:
        print("❌ No robots.txt found or failed to fetch. Aborting.")
        return []

    sitemaps = get_sitemaps_from_robots(robots_txt)
    if not sitemaps:
        print("❌ No sitemap entries found in robots.txt. Aborting.")
        return []

    print(f"Found {len(sitemaps)} sitemap(s) in robots.txt.")

    # One shared "seen" set for all sitemaps: robots.txt often lists both a
    # sitemap index and the sitemaps that index points to, which would
    # otherwise all be downloaded twice.
    seen_sitemaps = set()
    candidate_urls = []
    for sm in sitemaps:
        if sm in seen_sitemaps:
            continue  # already processed via an earlier sitemap index
        print(f"Parsing sitemap: {sm}")
        candidate_urls.extend(parse_sitemap(sm, seen=seen_sitemaps))
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    candidate_urls = list(dict.fromkeys(candidate_urls))  # deduplicate while preserving order
    print(f"Total URLs found in sitemaps: {len(candidate_urls)}")

    # Build robot parser
    rp = build_robot_parser_from_text(robots_txt, base_url)
    allowed_urls = filter_allowed_urls_by_robots(candidate_urls, rp)
    print(f"Allowed by robots.txt: {len(allowed_urls)}")

    # Stage 1 filter: match keywords in the URL itself
    fast_matched = [u for u in allowed_urls if url_matches_keywords(u, keywords)]
    print(f"Fast URL-match results: {len(fast_matched)}")

    thorough_matched = set(fast_matched)
    if not do_page_content_check:
        return sorted(thorough_matched)

    # Stage 2: inspect the pages whose URL did not match.  Membership is
    # tested against a set; a list lookup here is quadratic on big sitemaps.
    already_matched = frozenset(fast_matched)
    to_check = [u for u in allowed_urls if u not in already_matched]
    print(f"Will perform HTML content check for {len(to_check)} pages (this may take longer).")
    for page_url in to_check:
        # politeness
        time.sleep(SLEEP_BETWEEN_REQUESTS)
        html = fetch_text(page_url)
        if not html:
            continue
        # 1) Does page content contain keywords?
        if page_matches_keywords(html, keywords):
            thorough_matched.add(page_url)
            print(f"  + content-match: {page_url}")
            continue
        # 2) One-level expansion: internal links with keywords in their URL.
        #    Links are resolved against the page they appear on, so relative
        #    hrefs point at the right location (not at the site root).
        for link in extract_links_from_html(html, page_url):
            if url_matches_keywords(link, keywords) and rp.can_fetch(USER_AGENT, link):
                thorough_matched.add(link)
        # The linked pages themselves are not downloaded, to avoid heavy crawling.

    # Final result (sorted)
    return sorted(thorough_matched)


def main(site: str = "https://www.netcomlearning.com/", keywords: List[str] = None) -> None:
    """Run the sitemap filter for one site and store the result as JSON.

    Nothing is read from stdin; the target and keywords come from the
    arguments (an earlier version printed an "Enter keywords" prompt without
    ever reading an answer).

    Args:
        site: Site to analyse.  Defaults to the site this script was written
            for.
        keywords: Keywords to filter by.  Defaults to ``DEFAULT_KEYWORDS``.

    Side effects:
        Prints progress and the matching links to stdout.  When at least one
        link is found, writes the list to ``relevant_links.json`` in the
        current working directory (overwriting an existing file).
    """
    site = site.strip()
    if keywords is None:
        keywords = DEFAULT_KEYWORDS
    cleaned = [k.strip().lower() for k in keywords if k and k.strip()]

    print(f"Site: {site}")
    print(f"Keywords: {', '.join(cleaned) if cleaned else '(defaults)'}")
    print("\nRunning... (respecting robots.txt)\n")
    results = collect_crawlable_relevant_links(
        site,
        # None makes the collector fall back to its own defaults.
        keywords=cleaned or None,
        do_page_content_check=True,
    )

    print("\n=== Relevant crawlable links ===")
    if not results:
        print("No relevant links found using the provided filters.")
        return

    with open("relevant_links.json", "w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)
    for r in results:
        print(r)
    print(f"\nStored {len(results)} links in relevant_links.json")


# Script entry point: run the crawl only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()


Enter keywords separated by comma (or press enter to use defaults)

Running... (respecting robots.txt)

Base site: https://www.netcomlearning.com
Found 18 sitemap(s) in robots.txt.
Parsing sitemap: https://www.netcomlearning.com/sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/courses-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/certifications-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/blogs-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/case-study-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/press-releases-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/vendors-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/solutions-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/industry-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/skilling-page-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/products-sitemap.xml
Parsing sitemap: https://www.netcomlearning.com/webina

KeyboardInterrupt: 

NameError: name 'main' is not defined