In [None]:
# One-time setup: uncomment and run once to download the Chromium build that Playwright drives
# !playwright install chromium

In [20]:
from typing import List, Optional

import html2text
import nest_asyncio
import pandas as pd

from langchain_groq import ChatGroq

from playwright.async_api import async_playwright
from pydantic import BaseModel, Field
from tqdm import tqdm

nest_asyncio.apply() # fixes asyncio issue with Jupyter notebooks
tqdm.pandas() # fixes tqdm progress bar in Jupyter notebooks

import json

# debugging
import traceback

### Fetch Web content as Markdown

In [33]:
playwright = None
browser = None
USER_AGENT ="Mozilla/5.0 \
            (Windows NT 10.0; Win64; x64) \
            AppleWebKit/537.36 (KHTML, like Gecko) \
            Chrome/120.0.0 Safari/537.36"
try:
    playwright = await async_playwright().start() # start Playwright
    # Launch the browser
    browser = await playwright.chromium.launch(
        headless=True,
        # args=["--no-sandbox", "--disable-setuid-sandbox"], # only for containerized environments
        args=[ "--disable-gpu", "--disable-software-rasterizer"], # for local environments
    )
    # Create a new browser context
    context = await browser.new_context(
        user_agent=USER_AGENT,
        viewport={"width": 1280, "height": 800},
    )

    # Create a new page within the context
    page = await context.new_page()
    print(f"Using user-agent: {await page.evaluate('navigator.userAgent')}") # check user agent
    page.set_default_timeout(60000) # set default timeout to 60 seconds
    page.set_default_navigation_timeout(60000) # set default navigation timeout to 60 seconds

    await page.goto("https://www.lego.com/en-us/categories/price-50-75-dollars")
    # await page.goto("https://reliefweb.int/report/myanmar/initial-environmental-action-plan")

    content = await page.content() # get the content of the page
except Exception as e:
    print(f"Error: {e}")
    traceback.print_exc() 
finally:
    # Close the browser and Playwright resources
    if browser:
        await browser.close()
    if playwright:
        await playwright.stop()
    print("Playwright resources closed.")

# print(content) # print the content of the page

Using user-agent: Mozilla/5.0             (Windows NT 10.0; Win64; x64)             AppleWebKit/537.36 (KHTML, like Gecko)             Chrome/120.0.0 Safari/537.36
Playwright resources closed.


In [34]:
# Reduce the page to plain text so the LLM has fewer tokens to process
def html_to_text(html: str) -> str:
    """
    Render an HTML document as text with html2text.

    The converter's ignore_links, ignore_images, ignore_emphasis and
    ignore_tables options are all enabled before the HTML is handled.
    """
    converter = html2text.HTML2Text()
    for option in ("ignore_links", "ignore_images", "ignore_emphasis", "ignore_tables"):
        setattr(converter, option, True)
    return converter.handle(html)

# Show only the head of the converted text as a sanity check; printing the
# whole page floods the cell output and bloats the saved notebook.
PREVIEW_CHARS = 2000
print(html_to_text(content)[:PREVIEW_CHARS])

Skip to main content

Report an accessibility issue

Go to our accessibility page

Play Zone

Previous

Explore all current special offers and promotions now.Learn more

FREE Shipping with orders over $35!*Learn more

Next

  * Shop
  * Discover
  * Help
  * All Sets

  * Sets by theme

  * Age

  * Price ranges

  * Shop all occasions

  * LEGO® merchandise

  * LEGO® decor

  * Interests

  * Bricks and pieces

  * Exclusives

  * New

  * Bestsellers

  * Offers & sale

  * Gift cards

  * Coming soon

  * Retiring soon

SEE ALL THEMES

LEGO® Animal Crossing™

Architecture

LEGO® Art

LEGO® Avatar

Batman™

LEGO® BlueyNew

Botanical Collection

LEGO® Braille Bricks

BrickHeadz

City

Classic

Creator 3in1

DC

Despicable Me 4

Disney

DOTS

LEGO® DREAMZzz™

LEGO® DUPLO®

LEGO® DUPLO® Peppa Pig

LEGO® Education

Friends

LEGO® Fortnite®

LEGO® Gabby's Dollhouse

Harry Potter™

LEGO® Icons

Ideas

Jurassic World

Lord of the Rings™

Marvel

Minecraft®

Minifigures

Monkie Kid™

NINJAG

### Setup LLM

In [13]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [42]:
from groq import Groq

# Create the Groq client from the key found in the environment
api_key = os.getenv("GROQ_API_KEY")
client = Groq(api_key=api_key)

# Smoke test: one short, non-streamed chat completion confirms that the key
# is accepted and the client can reach the service
completion = client.chat.completions.create(
    messages=[
        dict(role="system", content="You are a helpful assistant."),
        dict(role="user", content="Say hello!"),
    ],
    model="llama-3.1-8b-instant",
    max_completion_tokens=128,
    temperature=0.7,
    top_p=1,
    stop=None,
    stream=False,
)
print(completion.choices[0].message.content) # text of the first (only) choice

Hello. How can I assist you today?


In [79]:
# System message used for the extraction request (it is placed first in the
# `messages` list built later in this notebook). It instructs the model to
# return only the extracted data, inside a code block labeled json or csv.
SYSTEM_PROMPT = """
You are an expert data assistant with strong knowledge of Markdown, website structures, HTML, CSS, JavaScript, and natural language processing.

Task:
Extract structured and relevant data from the provided raw text.

Requirements:

Output the data only in the specified structured format (JSON or CSV), as instructed.
Do not include any explanations or extra text—only output the data in a code block labeled as either json or csv.
When extracting data:
Preserve the data exactly as found; do not modify or interpret content.
Structure it strictly into JSON or CSV format as specified in the instruction.
Leave blank cells/fields for any information not present in the input.
Instructions Example:

If output should be JSON:
Output a code block starting with ```json
If output should be CSV:
Output a code block starting with ```csv
Your response must always follow these rules.
"""
# The text may contain various types of information, including but not limited to:

def create_scrape_prompt(page_content: str, output_format: str = "csv") -> str:
    """
    Create a prompt for the Groq model to extract structured data from the page content.

    Args:
        page_content: Raw text of the page; it is inserted after "Raw text:".
        output_format: Format requested from the model, "csv" (default) or
            "json". Letter case and surrounding whitespace are ignored.

    Returns:
        The prompt text with leading and trailing whitespace stripped. With
        the default format the result is identical to the earlier csv-only
        version of this function.

    Raises:
        ValueError: If output_format is neither "csv" nor "json".

    NOTE(review): the schema text below names the last field `franchise`,
    while the `Lego_Price` model defined in this notebook names it
    `item_type` - confirm which name is intended.
    """
    fmt = output_format.strip().lower()
    if fmt not in ("csv", "json"):
        raise ValueError(f"output_format must be 'csv' or 'json', got {output_format!r}")

    return f"""
    The following is the raw text extracted from a webpage. 
    Please extract the relevant data and output it in the below format:
        class Lego_Price(BaseModel):
            item_name: str = Field(..., description="Name of the product e.g., LEGO® Harry Potter™ Hogwarts™ Castle")
            item_price: str = Field(..., description="Price of the product e.g., $69.99")
            item_order_type: str = Field(..., description="Order type of the product e.g., Pre-order or Coming soon")
            franchise: str = Field(..., description="Type of the product e.g., from which franchise it is from")
    Raw text:
    {page_content}
    Output format: {fmt}
    """.strip()

In [80]:
# scrape landing pages

# Record schema for a project: name, description and URL are required;
# category, benefits and location default to None.
class ProjectInformation(BaseModel):
    """
    Model to hold project information.
    """
    project_name: str = Field(
        default=...,
        description="Name of the project e.g., Widlife Conservation",
    )
    project_description: str = Field(
        default=...,
        description="What is the project about? e.g., This project is about...",
    )
    project_url: str = Field(
        default=...,
        description="URL of the project",
    )
    project_category: Optional[str] = Field(
        default=None,
        description="Category of the project",
    )
    project_benefits: Optional[str] = Field(
        default=None,
        description="Benefits of the project e.g., This project will benefit...",
    )
    project_location: Optional[str] = Field(
        default=None,
        description="Location of the project e.g., Dubai, UAE",
    )
    # project_image_url: Optional[str] = Field(None, description="Image URL of the project")

# Record schema for one product listing; all four fields are required strings.
class Lego_Price(BaseModel):
    """
    Model to hold Lego price information.
    """
    item_name: str = Field(
        default=...,
        description="Name of the product e.g., LEGO® Harry Potter™ Hogwarts™ Castle",
    )
    item_price: str = Field(
        default=...,
        description="Price of the product e.g., $69.99",
    )
    item_order_type: str = Field(
        default=...,
        description="Order type of the product e.g., Pre-order or Coming soon",
    )
    item_type: str = Field(
        default=...,
        description="Type of the product e.g., from which franchise it is from",
    )

In [81]:
# Build the chat: fixed system instructions plus the page-specific request
messages = [
    {"role": "system", "content": SYSTEM_PROMPT},
    {"role": "user", "content": create_scrape_prompt(html_to_text(content))},
]

response = client.chat.completions.create(
    model = "llama-3.1-8b-instant",
    messages = messages,
    temperature = 0,  # deterministic decoding for extraction
    max_completion_tokens = 512,
)

# NOTE(review): the SDK appears to type `content` as optional - normalise a
# missing reply to an empty string so the string handling below cannot fail.
resp_text = (response.choices[0].message.content or "").strip()

# Keep only what sits inside the first ``` fence. Checking for the fence
# anywhere (not just at position 0) also handles replies that put a sentence
# or blank line before the code block, and replies cut off before the closing
# fence. The format label (json/csv) right after the opening fence is kept at
# the start of resp_text on purpose: parse_llm_response dispatches on it.
if "```" in resp_text:
    resp_text = resp_text.split("```", 2)[1].strip()

In [90]:
from io import StringIO  # Add this import at the top with other imports

def parse_llm_response(resp_text: str) -> Optional[pd.DataFrame]:
    """
    Turn a labelled LLM reply into a DataFrame.

    The reply is expected to begin with a format label, ``json`` or ``csv``
    (what remains at the front once the surrounding code fence has been
    removed), followed by the payload itself.

    Args:
        resp_text: Reply text starting with the format label. Leading and
            trailing whitespace and the label's letter case are ignored.

    Returns:
        A DataFrame built from the payload (a JSON array gives one row per
        element, any other JSON value gives a single row), or None when the
        payload cannot be parsed; the parse error is printed in that case.

    Raises:
        ValueError: If the reply does not start with a ``json`` or ``csv`` label.
    """
    text = (resp_text or "").strip()
    lowered = text.lower()

    fmt = next((label for label in ("json", "csv") if lowered.startswith(label)), None)
    if fmt is None:
        raise ValueError("Response is neither JSON nor CSV format")

    # Remove the label from the front only. Replacing every "json\n" / "csv\n"
    # occurrence would also delete line endings inside the data whenever a
    # row happens to end with one of those words.
    payload = text[len(fmt):].strip()

    try:
        if fmt == "json":
            data = json.loads(payload)
            records = data if isinstance(data, list) else [data]
            return pd.DataFrame(records)
        return pd.read_csv(StringIO(payload))
    except (json.JSONDecodeError, pd.errors.ParserError, pd.errors.EmptyDataError) as e:
        # EmptyDataError covers a label with nothing after it
        print(f"Error parsing response: {e}")
        print(f"Response text: {resp_text}")
        return None
    

# Parse the model's reply; the bare name on the last line renders the result
df = parse_llm_response(resp_text)

df

Unnamed: 0,item_name,item_price,item_order_type,franchise
0,Iron Spider-Man Bust,$59.99,Pre-order,Marvel
1,Angel,$64.99,Pre-order,Marvel
2,Ford Bronco® SUV,$64.99,Pre-order,DC
3,Ferrari FXX K,$64.99,Pre-order,DC
4,Rebel U-Wing Starfighter™,$69.99,Pre-order,Star Wars
5,Thestral Family,$69.99,Pre-order,Harry Potter
6,Spider-Man vs. Doc Ock Subway Train Scene,$54.99,Pre-order,Marvel
7,LEGO Ideas ǀ Disney Pixar Luxo Jr.,$69.99,Pre-order,Disney
8,Iron Man MK4 Bust,$59.99,Coming Soon,Marvel
9,The Fauna Collection - Tiger,$64.99,Coming Soon,LEGO
