In [2]:
################################################################
# NVISO Labs Scraper                                           #
# Pulls Awareness blogs from the site!                         #
################################################################

import re
import requests
from bs4 import BeautifulSoup

EXCLUDE_WORDS = ["author", "wp-admin", "category", "#comments"]

def check_for_new_pages(url):
    """Yield the links found on each page of a paginated listing.

    Page 1 is ``url`` itself; later pages are requested as
    ``<url>/page/<n>``. Crawling stops at the first page for which
    ``parse_links`` returns no links.

    Args:
        url: Address of the first page of the listing.

    Yields:
        The list of links returned by ``parse_links`` for each page, in
        page order.
    """
    # Build follow-up pages from the argument (not a module-level global),
    # and strip the trailing slash so the joined path has no "//".
    listing_root = url.rstrip("/")

    page = 1
    while True:
        page_url = url if page == 1 else f"{listing_root}/page/{page}"
        print(f"Scraping page: {page}")

        html = check_status(page_url)
        links = parse_links(html)

        # A page that produces no links ends the crawl.
        if not links:
            break

        yield links
        page += 1

def check_status(url):
    """Fetch ``url`` and return the response body when the status is 200.

    Args:
        url: Address to request.

    Returns:
        The response text on HTTP 200. On any other status code, or when
        the request itself fails, an empty string is returned so that the
        result is never an error message mistaken for page content.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
        # "text/html" is the registered media type for HTML documents.
        "Accept": "text/html",
    }

    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        r = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print(f"Error: {exc} ({url})")
        return ""

    if r.status_code == 200:
        return r.text

    # Report the failure instead of returning it in place of the HTML.
    print(f"Error: {r.status_code} ({url})")
    return ""

def parse_links(html):
    """Collect the unique blog.nviso.eu links from an HTML document.

    Args:
        html: HTML markup to scan. A falsy value (``None`` or ``""``)
            yields an empty list.

    Returns:
        A list of ``href`` values, in document order and without
        duplicates, that contain ``blog.nviso.eu``, contain none of the
        ``EXCLUDE_WORDS`` (case-insensitive), and are not the blog's
        home page.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Dots are escaped so they match a literal "." rather than any character.
    pattern = re.compile(r"blog\.nviso\.eu")
    # Lower-case the exclusion words once instead of once per link.
    excluded = [word.lower() for word in EXCLUDE_WORDS]

    links = []
    seen = set()

    for link in soup.find_all('a', href=True):
        href = link['href']

        if not pattern.search(href):
            continue

        href_lower = href.lower()
        if any(word in href_lower for word in excluded):
            continue

        # Skip links that point at the blog's home page.
        if href.rstrip("/") == "https://blog.nviso.eu":
            continue

        if href not in seen:
            links.append(href)
            seen.add(href)

    return links

if __name__ == "__main__":
    # Script entry point: crawl the Awareness category and print every
    # link that was collected.
    base_url = "https://blog.nviso.eu/category/awareness/"

    # Flatten the per-page link lists into a single list, in crawl order.
    all_posts = [
        post_link
        for page_links in check_for_new_pages(base_url)
        for post_link in page_links
    ]

    print(f"Scraped: {base_url} all found pages")
    print(f"\nFound {len(all_posts)} blog posts")

    print(all_posts)

    # Single-page example, left disabled inside a string literal.
    '''
    url = "https://blog.nviso.eu/category/blue-team/"
    html = check_status(url)
    get_links = parse_links(html)
    print(get_links)
    '''

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraped: https://blog.nviso.eu/category/awareness/ all found pages

Found 10 blog posts
['https://blog.nviso.eu/2025/07/29/refinery-raid/', 'https://blog.nviso.eu/2024/07/18/hunting-for-remote-management-tools/', 'https://blog.nviso.eu/2024/07/02/the-end-of-passwords-embrace-the-future-with-passkeys/', 'https://blog.nviso.eu/2024/05/23/format-string-exploitation-a-hands-on-exploration-for-linux/', 'https://blog.nviso.eu/2024/01/22/is-the-google-search-bar-enough-to-hack-belgium-companies/', 'https://blog.nviso.eu/2023/11/08/ai-in-cybersecurity-bridging-the-gap-between-imagination-and-reality/', 'https://blog.nviso.eu/2023/04/04/an-innocent-picture-how-the-rise-of-ai-makes-it-easier-to-abuse-photos-online/', 'https://blog.nviso.eu/2021/11/09/another-spin-to-gamification-how-we-used-gather-town-to-build-a-great-cyber-security-game/', 'https://blog.nviso.eu/2020/05/25/a-checklist-to-populate-your-acceptable-use-policy/', 'https://blog.nvi