In [None]:
from bs4 import BeautifulSoup
import requests
from urllib.parse import urljoin
import time
import json

BASE_URL = "https://developer.bitmovin.com"
DOC_TYPE = ["/streams/docs", "/playback/docs", "/encoding/docs"]

def get_sidebar_links():
    for doc_type_link in DOC_TYPE:
        response = requests.get(BASE_URL + doc_type_link)
        soup = BeautifulSoup(response.text, "html.parser")

        # Extract all <a> tags in the sidebar that have hrefs pointing to docs
        sidebar_links = soup.select("a.rm-Sidebar-link[href^='/']")
        
        urls = []
        for link in sidebar_links:
            href = link.get("href")
            if href and "/docs/" in href:
                full_url = urljoin(BASE_URL, href)
                urls.append(full_url)
    
    return list(set(urls))  # remove duplicates

links = get_sidebar_links()

In [23]:
def extract_article(url):
    res = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(res.text, "html.parser")

    # Target the 'dehydrated' content blocks inside the page
    dehydrated_blocks = soup.find_all("div", attrs={"dehydrated": lambda v: v is not None})
    if not dehydrated_blocks:
        return None

    # Try to find the main title
    title_tag = soup.find("h1")
    title = title_tag.get_text(strip=True) if title_tag else "Untitled"

    # Extract all text from the dehydrated blocks
    text_parts = []
    for div in dehydrated_blocks:
        text_parts.append(div.get_text(separator="\n", strip=True))

    full_text = "\n\n".join(text_parts)

    # Optional: Extract media links
    imgs = [img["src"] for img in soup.find_all("img") if img.get("src")]
    videos = [vid["src"] for vid in soup.find_all("video") if vid.get("src")]

    return {
        "url": url,
        "title": title,
        "text": full_text,
        "images": imgs,
        "videos": videos
    }



parsed_docs = []
for link in links:
    print(f"⏳ Fetching: {link}")
    try:
        article_data = extract_article(link)
        if article_data:
            parsed_docs.append(article_data)
    except Exception as e:
        print(f"⚠️ Error: {e}")
    time.sleep(0.5)  # Be respectful to the server

# Save results as JSON
with open("parsed_docs.json", "w", encoding="utf-8") as f:
    json.dump(parsed_docs, f, indent=2, ensure_ascii=False)


⏳ Fetching: https://developer.bitmovin.com/encoding/docs/supported-output-options-for-per-title-encoding
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/understanding-the-bitmovin-encoding-object-model
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/muting-and-unmuting-webhooks
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/zixi-inputs
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/contribution-devices
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/bitmovin-approved-contribution-devices
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/managing-your-payment-billing-details
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/live-to-vod-workflows
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/using-per-title-encoding-with-ssai
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/creating-access-and-secret-keys-for-google-cloud-storage
⏳ Fetching: https://developer.bitmovin.com/encoding/docs/changes-to-fmp4-outputs