In [11]:
################################################################
# Elastic Security Labs Scraper                                #
# Pulls Security Research blog posts from the site!            #
################################################################

import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Substrings that disqualify a link: parse_links() skips any href that
# contains one of these, compared case-insensitively.
# NOTE(review): several entries (e.g. "bhis", "wildwesthackinfest",
# "www.blackhillsinfosec.com/blog/") look carried over from a scraper for a
# different site -- confirm they are still wanted here. Broad entries such as
# "community" or "/blog/" will also drop any post whose URL contains them.
EXCLUDE_WORDS = ["author", "wp-admin", "category", "#comments", "#mobile-menu", "/feed/", "/services/", "/tag/", "/contact-us/", "/about/", "sign-up", "who-we-are", "bhis", "myshopify", "/mailfail/", "linkedin", "youtube", "/tools/", "/blog/", "/free-tools", "/company/", "/podcast/", "bsky.app", "cheatsheet", "twitter", "wildwesthackinfest", "#header", "community", "/events", "free_cybersecurity_webcasts", "free_cybersecurity_tools", "free-cybersecurity-tools", "www.blackhillsinfosec.com/blog/", "prompt-zine", "rekcah", "/copy-for/", "/location", "/contact", "/rss"]
# Exact absolute URLs removed from the final result in the __main__ block
# (membership test against the fully joined URL, so the match is exact and
# case-sensitive). Stored as a set for constant-time lookups.
EXCLUDED_URLS = {"https://www.elastic.co/search-labs", "https://www.elastic.co", "https://www.elastic.co/observability-labs", "https://www.elastic.co/security-labs", "https://www.elastic.co/security-labs/about", "https://search.elastic.co/?location%5B0%5D=Security%20Labs&referrer=https://www.elastic.co/security-labs/topics/security-research", "https://www.elastic.co/security-labs/rss/feed.xml", "https://cloud.elastic.co/registration?cta=cloud-registration&tech=trial&plcmt=navigation&pg=security-labs", "https://www.elastic.co/contact", "https://www.elastic.co/security-labs/rss/topics/security-research.xml", "https://elastic.co?utm_source=elastic-search-labs&utm_medium=referral&utm_campaign=search-labs&utm_content=footer", "https://www.elastic.co/security-labs/sitemap.xml", "https://search.elastic.co/?location%5B0%5D=Security%20Labs&referrer=https://www.elastic.co/security-labs/topics/malware-analysis", "https://search.elastic.co/?location%5B0%5D=Security%20Labs&referrer=https://www.elastic.co/security-labs/topics/perspectives"}

def check_for_new_pages(url):
    """Yield the links found on each successive listing page of *url*.

    Page 1 is fetched from ``url`` itself; later pages are fetched from
    ``{url}/page/{n}``. Iteration stops at the first page that produces no
    links, or that produces only links already seen on earlier pages. A
    non-200 response also ends the loop, because check_status() then returns
    an error string that contains no anchors.

    Args:
        url: Listing URL to paginate through.

    Yields:
        The list of links returned by parse_links() for each page.
    """
    # Links collected so far, used to detect a site that keeps answering
    # out-of-range page numbers with an already-seen page.
    seen = set()
    page = 1

    while True:
        # Use the caller's URL; the original ignored the argument and read
        # the global ``base_url`` instead.
        page_url = f"{url}/page/{page}" if page > 1 else url
        print(f"Scraping page: {page}")

        html = check_status(page_url)
        links = parse_links(html)

        # Stop on an empty page, or on a page that adds nothing new (which
        # would otherwise make this loop run forever).
        if not links or seen.issuperset(links):
            break

        seen.update(links)
        yield links
        page += 1

def check_status(url):
    """Fetch *url* and return its body, or an error marker string.

    Args:
        url: Absolute URL to request with HTTP GET.

    Returns:
        The response text when the status code is 200, otherwise the string
        ``"Error: <status code>"``.

    Raises:
        requests.RequestException: Propagated unchanged on connection
            failures or when the request exceeds the timeout.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
        # "application/html" is not a registered media type; HTML documents
        # are served as "text/html".
        "Accept": "text/html",
    }

    # requests has no default timeout, so a stalled server would block the
    # scraper forever without this.
    r = requests.get(url, headers=headers, timeout=30)

    if r.status_code == 200:
        return r.text
    return f"Error: {r.status_code}"

def parse_links(html, base=None):
    """Extract the unique, non-excluded links from an HTML document.

    Every ``<a>`` tag with an ``href`` is considered. Hrefs containing any
    EXCLUDE_WORDS entry (case-insensitive substring match) are skipped; the
    rest are resolved to absolute URLs and de-duplicated, keeping first-seen
    order.

    Args:
        html: HTML text to parse. A falsy value yields an empty list.
        base: URL that relative hrefs are resolved against. When omitted,
            the module-level ``base_url`` is used, as before.

    Returns:
        A list of absolute URL strings in document order, without duplicates.
    """
    if not html:
        return []

    if base is None:
        # Backward-compatible fallback: the script sets ``base_url`` at
        # module level before scraping.
        base = base_url

    soup = BeautifulSoup(html, 'html.parser')

    # Lower-case the exclusion list once rather than once per link.
    blocked = [word.lower() for word in EXCLUDE_WORDS]

    links = []
    seen = set()

    for link in soup.find_all('a', href=True):
        href = link['href']
        href_lower = href.lower()

        if any(word in href_lower for word in blocked):
            continue

        # urljoin leaves absolute hrefs untouched and resolves relative ones.
        full_url = urljoin(base, href)

        if full_url not in seen:
            seen.add(full_url)
            links.append(full_url)

    return links

if __name__ == "__main__":

    # Listing page to scrape. This must stay a module-level name:
    # parse_links() resolves relative hrefs against the global ``base_url``.
    base_url = "https://www.elastic.co/security-labs/topics/security-research"
    all_posts = []

    for page_links in check_for_new_pages(base_url):
        all_posts.extend(page_links)

    print(f"Scraped: {base_url} all found pages")

    # Links are only de-duplicated within a single page, so shared
    # navigation/footer links (and any repeated posts) can appear once per
    # page. dict.fromkeys drops the repeats while keeping first-seen order.
    filtered_posts = [
        post for post in dict.fromkeys(all_posts) if post not in EXCLUDED_URLS
    ]
    print(f"Found {len(filtered_posts)} blogs after filtering")
    print(filtered_posts)

Scraping page: 1
Scraping page: 2
Scraped: https://www.elastic.co/security-labs/topics/security-research all found pages
Found 63 blogs after filtering
['https://www.elastic.co/security-labs/taking-shellter', 'https://www.elastic.co/security-labs/entra-id-oauth-phishing-detection', 'https://www.elastic.co/security-labs/call-stacks-no-more-free-passes-for-malware', 'https://www.elastic.co/security-labs/misbehaving-modalities', 'https://www.elastic.co/security-labs/aws-sns-abuse', 'https://www.elastic.co/security-labs/detecting-hotkey-based-keyloggers', 'https://www.elastic.co/security-labs/the-grand-finale-on-linux-persistence', 'https://www.elastic.co/security-labs/emulating-aws-s3-sse-c', 'https://www.elastic.co/security-labs/approaching-the-summit-on-persistence', 'https://www.elastic.co/security-labs/detecting-hotkey-based-keyloggers-jp', 'https://www.elastic.co/security-labs/behavior-rule-bug-bounty', 'https://www.elastic.co/security-labs/detonating-beacons-to-illuminate-detection-