In [15]:
################################################################
# NVISO Labs Scraper                                           #
# Pulls Azure reports from the site!                           #
################################################################

import re
import requests
from bs4 import BeautifulSoup

EXCLUDE_WORDS = ["author", "wp-admin", "category", "#comments",]

def check_for_new_pages(url):
    """Yield the post links found on each listing page, starting at ``url``.

    Page 1 is ``url`` itself; every later page is requested from
    ``<url>/page/<n>``. Iteration stops at the first page for which
    ``parse_links`` returns no links.

    Args:
        url: Address of the first listing page.

    Yields:
        The list returned by ``parse_links`` for each page, in page order.
    """
    # Derive every page address from the argument, so the function does not
    # depend on a module-level ``base_url`` that only exists when the file is
    # run as a script, and so the category path is not appended to a URL that
    # already contains it.
    listing_root = url.rstrip("/")

    page = 1
    while True:
        page_url = f"{listing_root}/page/{page}" if page > 1 else url
        print(f"Scraping page: {page}")

        html = check_status(page_url)
        links = parse_links(html)

        # An empty page (or a failed request) marks the end of the listing.
        if not links:
            break

        yield links
        page += 1

def check_status(url, timeout=10):
    """Fetch ``url`` with a GET request and return the response body.

    Args:
        url: Address to request.
        timeout: Seconds handed to ``requests.get`` as its ``timeout``
            argument, so a stalled server cannot block the scraper forever.

    Returns:
        The response text when the status code is 200, otherwise ``None``.
        ``None`` (rather than an error string) lets callers tell a failure
        apart from page content; ``parse_links`` treats it as "no links".

    Raises:
        requests.RequestException: Propagated unchanged from ``requests.get``
            (connection failure, timeout, ...).
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/140.0.0.0 Safari/537.36",
        # NOTE(review): "application/html" is kept as-is; "text/html" is the
        # usual media type for HTML - confirm which one is intended.
        "Accept": "application/html"
    }

    r = requests.get(url, headers=headers, timeout=timeout)

    if r.status_code != 200:
        return None

    return r.text

def parse_links(html):
    """Extract the unique blog links from an HTML document.

    A link is kept when its ``href`` contains ``blog.nviso.eu``, contains
    none of the ``EXCLUDE_WORDS`` (compared case-insensitively), and is not
    the blog's root address itself.

    Args:
        html: HTML markup to parse. A falsy value (``None`` or ``""``)
            yields an empty list.

    Returns:
        The matching ``href`` values in document order, without duplicates.
    """
    if not html:
        return []

    soup = BeautifulSoup(html, 'html.parser')

    # Dots are escaped so they match a literal "." instead of any character.
    pattern = re.compile(r"blog\.nviso\.eu")

    # Lower-case the exclusion list once rather than once per link.
    excluded = [word.lower() for word in EXCLUDE_WORDS]

    links = []
    seen = set()

    for link in soup.find_all('a', href=True):
        href = link['href']

        if not pattern.search(href):
            continue

        href_lower = href.lower()
        if any(word in href_lower for word in excluded):
            continue

        # Skip the link back to the blog's front page.
        if href.rstrip("/") == "https://blog.nviso.eu":
            continue

        if href not in seen:
            links.append(href)
            seen.add(href)

    return links

if __name__ == "__main__":

    # First listing page handed to the page walker.
    base_url = "https://blog.nviso.eu/category/cloud-security/azure"

    # Accumulate the links of every page into one flat list, in page order.
    all_posts = []
    for page_links in check_for_new_pages(base_url):
        all_posts += page_links

    # Summary: two lines separated by the blank line embedded in the second.
    print(
        f"Scraped: {base_url} all found pages",
        f"\nFound {len(all_posts)} blog posts",
        sep="\n",
    )

    print(all_posts)

    # The string literal below is a no-op expression holding a disabled
    # single-page check against another category.
    '''
    url = "https://blog.nviso.eu/category/blue-team/"
    html = check_status(url)
    get_links = parse_links(html)
    print(get_links)
    '''

Scraping page: 1
Scraping page: 2
Scraping page: 3
Scraping page: 4
Scraped: https://blog.nviso.eu/category/cloud-security/azure all found pages

Found 13 blog posts
['https://blog.nviso.eu/2025/09/25/securing-microsoft-entra-id-lessons-from-the-field-part-1/', 'https://blog.nviso.eu/2025/03/21/how-to-hunt-defend-against-business-email-compromise-bec/', 'https://blog.nviso.eu/2024/09/17/emergency-accounts-last-call/', 'https://blog.nviso.eu/2024/09/05/validate-your-windows-audit-policy-configuration-with-kql/', 'https://blog.nviso.eu/2024/03/18/top-things-that-you-might-not-be-doing-yet-in-entra-conditional-access-advanced-edition/', 'https://blog.nviso.eu/2024/02/27/top-things-that-you-might-not-be-doing-yet-in-entra-conditional-access/', 'https://blog.nviso.eu/2023/05/05/implementing-business-continuity-on-azure/', 'https://blog.nviso.eu/2022/05/18/detecting-preventing-rogue-azure-subscriptions/', 'https://blog.nviso.eu/2020/09/17/sentinel-query-detect-zerologon-cve-2020-1472/', 'htt