In [2]:
# One-time setup: install the Chromium browser that the Playwright cell below launches.
!playwright install chromium

In [9]:
import re
from pprint import pprint
from typing import List, Optional

import html2text
import nest_asyncio
import pandas as pd

from langchain_groq import ChatGroq

from playwright.async_api import async_playwright
from pydantic import BaseModel, Field
from tqdm import tqdm

nest_asyncio.apply() # fixes asyncio issue with Jupyter notebooks
tqdm.pandas() # registers tqdm's progress_* methods (e.g. progress_apply) on pandas objects

# debugging
import traceback

#### Fetch Web content as Markdown

In [29]:
playwright = None
browser = None
USER_AGENT ="Mozilla/5.0 \
            (Windows NT 10.0; Win64; x64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/120.0.0 Safari/537.36"
try:
    playwright = await async_playwright().start() # start Playwright
    # Launch the browser
    browser = await playwright.chromium.launch(
        headless=True,
        # args=["--no-sandbox", "--disable-setuid-sandbox"], # only for containerized environments
        args=[ "--disable-gpu", "--disable-software-rasterizer"], # for local environments
    )
    # Create a new browser context
    context = await browser.new_context(
        user_agent=USER_AGENT,
        viewport={"width": 1280, "height": 800},
    )

    # Create a new page within the context
    page = await context.new_page()
    print(f"Using user-agent: {await page.evaluate('navigator.userAgent')}") # check user agent
    page.set_default_timeout(60000) # set default timeout to 60 seconds
    page.set_default_navigation_timeout(60000) # set default navigation timeout to 60 seconds

    await page.goto("https://www.groq.com/") # go to Groq website

    content = await page.content() # get the content of the page
except Exception as e:
    print(f"Error: {e}")
    traceback.print_exc() 
finally:
    # Close the browser and Playwright resources
    if browser:
        await browser.close()
    if playwright:
        await playwright.stop()
    print("Playwright resources closed.")

print(content) # print the content of the page

Using user-agent: Mozilla/5.0             (Windows NT 10.0; Win64; x64)             AppleWebKit/537.36 (KHTML, like Gecko)             Chrome/120.0.0 Safari/537.36
Playwright resources closed.
<!DOCTYPE html><html lang="en-US" prefix="og: https://ogp.me/ns#"><head><style data-hubspot-styled-components=""></style>
	<meta charset="UTF-8">
		<meta name="viewport" content="width=device-width, initial-scale=1">
	<style data-tippy-stylesheet="">.tippy-box[data-animation=fade][data-state=hidden]{opacity:0}[data-tippy-root]{max-width:calc(100vw - 10px)}.tippy-box{position:relative;background-color:#333;color:#fff;border-radius:4px;font-size:14px;line-height:1.4;white-space:normal;outline:0;transition-property:transform,visibility,opacity}.tippy-box[data-placement^=top]>.tippy-arrow{bottom:0}.tippy-box[data-placement^=top]>.tippy-arrow:before{bottom:-7px;left:0;border-width:8px 8px 0;border-top-color:initial;transform-origin:center top}.tippy-box[data-placement^=bottom]>.tippy-arrow{top:0}.tipp

In [None]:
# Convert HTML to plain text to strip unnecessary markup (fewer tokens to process)

def html_to_text(html: str) -> str:
    """
    Render an HTML string as text with html2text.

    A fresh ``html2text.HTML2Text`` converter is created on every call and
    its ``ignore_links``, ``ignore_images``, ``ignore_emphasis`` and
    ``ignore_tables`` options are all set to True before conversion.

    Args:
        html: The HTML markup to convert.

    Returns:
        Whatever the converter's ``handle`` method produces for ``html``.
    """
    converter = html2text.HTML2Text()
    # Switch on every "ignore" option in one place.
    for option in ("ignore_links", "ignore_images", "ignore_emphasis", "ignore_tables"):
        setattr(converter, option, True)
    text = converter.handle(html)
    return text

# Show the text rendering of the page HTML held in `content`.
print(html_to_text(content))

Now in Preview: Groq’s First Compound AI System.

Try Now &

Learn More.

The Official Llama API, accelerated by Groq. The fastest, lowest-cost way to
run Llama.

Learn More

Llama 4 is live on GroqCloud™.

Learn more &

try now.

  * Products 

Close Products Open Products

    * Pricing
    * Products Overview
    * Enterprise Access
    * GroqCloud™ Platform
    * GroqRack™ Cluster

  * Developers 

Close Developers Open Developers

    * Free API Key  
    * Start Building  
    * Discord  
    * Groq Libraries  
    * Community
    * Showcase

  * Insights 

Close Insights Open Insights

    * Inference
    * Customer Use Cases
    * Blog
    * Papers
    * Events
    * Press Releases
    * Videos

  * About 

Close About Open About

    * About Groq
    * In the News
    * Team
    * Careers

Dev Console

# Groq is Fast AI Inference

Use full chat

REDUCE CHAT

Use full chat

250 K

1 M+

Developers using GroqCloud™  
since Feb ‘24 launch

FREE API KEY

### Instant Intelligence

