In [None]:
# pip install beautifulsoup4
# pip install playwright
# pip install nest_asyncio
# pip install selenium   (only needed for the Selenium comparison cell)
# After pip-installing playwright, run "playwright install" once to download the browser binaries it drives.

In [38]:
# This script uses Playwright to render a dynamic webpage and BeautifulSoup to parse it.
# It recursively converts the HTML structure into a hierarchical JSON format containing:
# - tag names
# - attributes (classes are part of the attributes)
# - text content
# - child elements
import asyncio
import json
import random
import time

import nest_asyncio
from bs4 import BeautifulSoup, Comment, NavigableString
from playwright.async_api import async_playwright

In [14]:
# Patch asyncio so an already-running event loop can be re-entered; presumably
# required because the notebook kernel runs its own loop — TODO confirm.
nest_asyncio.apply()
import os
# DEBUG is set before any browser is launched. NOTE(review): the value looks like
# a list of Playwright log channels ("pw:*") — verify that each name is a real channel.
os.environ["DEBUG"] = "pw:browser, pw:page, pw:launcher"

In [54]:
async def human_like_interaction(page):
    """Perform a short burst of randomised pointer activity on ``page``.

    The sequence is: five mouse moves to random points inside a 1280x720 area
    (each followed by a 0.5-1.5 s pause), one downward wheel scroll of
    100-500 units, and finally a click on one randomly chosen element matching
    ``a, button, .clickable`` followed by a 1-2 s pause. If no element matches
    the selector, the click and the final pause are skipped.

    Args:
        page: Object exposing ``mouse.move``, ``mouse.wheel`` and
            ``query_selector_all`` as awaitables (a Playwright page).

    Returns:
        None.
    """
    # Pointer wandering: x is drawn before y, then a pause, five times over.
    moves_left = 5
    while moves_left > 0:
        await page.mouse.move(random.randint(0, 1280), random.randint(0, 720))
        pause = random.uniform(0.5, 1.5)
        await asyncio.sleep(pause)
        moves_left -= 1

    # One vertical scroll of random length.
    scroll_by = random.randint(100, 500)
    await page.mouse.wheel(0, scroll_by)

    # Click a single random candidate, if the page offers any.
    candidates = await page.query_selector_all('a, button, .clickable')
    if not candidates:
        return

    chosen = random.choice(candidates)
    await chosen.click()
    await asyncio.sleep(random.uniform(1, 2))

In [64]:
# Disabled headful (visible browser window) variant of render_page; not executed.
# async def headful_render_page(url):
#     async with async_playwright() as p:
#         browser = await p.chromium.launch(headless=False)  # Show the browser window
#         page = await browser.new_page()
#         # Mimic a normal browser's User-Agent and Accept-Language headers
#         await page.set_extra_http_headers({
#             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
#             'Accept-Language': 'en-US,en;q=0.9'
#         })
#         await page.goto(url, timeout=90000)
        
#         # Wait for body or another key element
#         await page.wait_for_selector("body", timeout=90000)
#         await asyncio.sleep(5)  # Allow for additional content to load
        
#         html = await page.content()
#         await browser.close()
#     return html

async def render_page(url, timeout_ms=90000, settle_seconds=5, preview_chars=1000):
    """Render ``url`` in headless Chromium and return the resulting HTML.

    Args:
        url: Address of the page to load.
        timeout_ms: Timeout in milliseconds applied to the navigation and to
            the wait for the ``body`` element.
        settle_seconds: Fixed pause after ``body`` is present, giving
            client-side JavaScript time to populate the page.
        preview_chars: Number of leading characters of the HTML printed to
            stdout as a quick sanity check.

    Returns:
        The serialised page HTML, as produced by ``page.content()``.

    Raises:
        Whatever Playwright raises on navigation or selector timeouts; the
        browser is closed before the exception propagates.
    """
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        try:
            page = await browser.new_page()
            print("Navigating to the page...")
            # Mimic a normal browser's User-Agent and Accept-Language headers
            await page.set_extra_http_headers({
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
                'Accept-Language': 'en-US,en;q=0.9'
            })

            await page.goto(url, timeout=timeout_ms)
            # await human_like_interaction(page)  # Simulate human-like interactions
            print("Waiting for body...")
            await page.wait_for_selector("body", timeout=timeout_ms)  # More general selector
            print("Content loading...")
            await asyncio.sleep(settle_seconds)  # Wait for JS content

            # Snapshot the DOM once so the printed preview and the returned
            # HTML are guaranteed to be the same document.
            html = await page.content()
            print(html[:preview_chars])  # leading slice only, to verify the load
        finally:
            # Release the browser even when navigation or the wait fails.
            await browser.close()
    return html

# --- Step 2: Recursively convert to JSON ---
def element_to_json(element):
    """Convert a BeautifulSoup node into a JSON-serialisable structure.

    Args:
        element: A bs4 tag (or whole parsed document) or a string node taken
            from a parsed tree.

    Returns:
        For a string node: ``{"text": <stripped text>}``, or ``None`` when the
        string is empty after stripping or is an HTML comment.
        For a tag: a dict with the keys ``"tag"`` (tag name), ``"attrs"``
        (copy of the attribute mapping), ``"text"`` (result of
        ``get_text(strip=True)`` on the tag) and ``"children"`` (converted
        child nodes in document order, with ``None`` results left out).
    """
    # Comment is a NavigableString subclass, so it must be ruled out first;
    # otherwise markup comments would be emitted as if they were page text.
    if isinstance(element, Comment):
        return None

    if isinstance(element, NavigableString):
        text = element.strip()
        return {"text": text} if text else None

    children = []
    for child in element.children:
        child_json = element_to_json(child)
        if child_json is not None:
            children.append(child_json)

    return {
        "tag": element.name,
        "attrs": dict(element.attrs),
        # get_text() already yields "" for text-less tags; call it only once.
        "text": element.get_text(strip=True),
        "children": children,
    }

# --- Step 3: Parse and convert full page to JSON ---
def html_to_json(html):
    """Parse an HTML string and return its JSON-style tree.

    The markup is parsed with BeautifulSoup's ``html.parser``. Conversion
    starts at the ``body`` element when the parsed document has one, and at
    the document root otherwise.

    Args:
        html: HTML markup to parse.

    Returns:
        The value produced by ``element_to_json`` for the chosen root node.
    """
    document = BeautifulSoup(html, "html.parser")
    root = document.body
    if not root:
        # No usable <body>: fall back to converting the whole document.
        root = document
    return element_to_json(root)

In [66]:
async def convert_website_to_json(url):
    """Render ``url`` with ``render_page`` and convert the HTML via ``html_to_json``.

    Args:
        url: Address of the page to render.

    Returns:
        Whatever ``html_to_json`` returns for the rendered markup.
    """
    return html_to_json(await render_page(url))
await asyncio.sleep(5) 
# 🟢 Now run this cell to execute:
url = "https://www.straighttalk.com/all-plans"
json_result = await convert_website_to_json(url)
print(json.dumps(json_result, indent=2))

Navigating to the page...
Waiting for body...
Content loading...
<html style="height:100%"><head><meta name="ROBOTS" content="NOINDEX, NOFOLLOW"><meta name="format-detection" content="telephone=no"><meta name="viewport" content="initial-scale=1.0"><meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"><script type="text/javascript" src="/_Incapsula_Resource?SWJIYLWA=719d34d31c8e3a6e6fffd425f7e032f3"></script><script src="/u-first-Dame-to-should-Wassage-All-bed-this-not-" async=""></script></head><body style="margin:0px;height:100%"><iframe id="main-iframe" src="/_Incapsula_Resource?SWUDNSAI=31&amp;xinfo=58-168583938-0%200CNN%20RT%281746702111386%20656%29%20q%280%20-1%20-1%20-1%29%20r%280%20-1%29%20B12%2814%2c0%2c0%29&amp;incident_id=49000330555978062-776278698389211258&amp;edet=12&amp;cinfo=0e000000&amp;rpinfo=0&amp;cts=yt0jNVh9tYLcMBVHQIzjRTcKK7p5moqAUboIUKrJxRI7nkc9S2ofZG6WeYI4jjtu&amp;mth=GET" frameborder="0" width="100%" height="100%" marginheight="0px" marginwidth="0px">Req

In [71]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Fetch the same URL with Selenium-driven headless Chrome and print the page source.
options = Options()
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

try:
    driver.get("https://www.straighttalk.com/all-plans")
    html = driver.page_source
    print(html)
finally:
    # Always end the browser session, even if the navigation above raises,
    # so no Chrome process is left running behind the notebook.
    driver.quit()

<html><head>
        <noscript>
            <title>Pardon Our Interruption</title>
        </noscript>

        <meta name="viewport" content="width=1000">
        <meta name="robots" content="noindex, nofollow">
        <meta http-equiv="cache-control" content="no-cache, no-store, must-revalidate">
        <meta http-equiv="pragma" content="no-cache">
        <meta http-equiv="expires" content="0">

        <style>
            .container { max-width: 800px; margin: auto; font-family: 'Helvetica Neue',Helvetica,Arial,sans-serif; color: #7a838c; }
            h1 { color: #2a2d30; font-weight: 500; }
            li { margin: 0 0 10px; }
            a { color: #428bca; }
            a:hover, a:focus { color: #2a6496; }
        </style>

        <script>
          var isSpa = new URLSearchParams(window.location.search).get('X-SPA') === '1' || window.isImpervaSpaSupport;
        </script>

        <!-- This head template should be placed before the following script tag that loads the challe