In [2]:
import os
import re
import json
import ast
from typing import Optional
from datetime import date

from dotenv import load_dotenv

from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# **Approach 2:**

# 1. “Given the following HTML: `html`, write a parser using playwright that pulls out all sections involving clothing”
# 2. Validate and check the generated script for security
# 3. Execute the scripts against the HTML
# 4. “Given the following HTML snippet, extract `ClothingItem` from the data”

In [4]:
# Load variables from a local .env file into the process environment.
# The ChatOpenAI cell below reads OPENAI_API_KEY via os.getenv, so this must run first.
load_dotenv()

True

In [5]:
# Notebook-wide configuration constants.
# TODO: Consider an HTML-aware text splitter to not cut off HTML tags
SEARCH_MAX_RESULTS = 1  # max_results passed to TavilySearchResults in get_search_results
CHUNK_SIZE = 5000  # chunk_size for the text splitter; measured with len(), i.e. in characters
CHUNK_OVERLAP = 100  # chunk_overlap for the text splitter, in the same unit as CHUNK_SIZE
MAX_RETRIES = 3  # number of structured-output attempts made by result_extractor
SEARCH_QUERY = "men's blue jeans"  # query for the (currently commented-out) Tavily search

In [6]:
# Prompt template used by get_parsing_script. `{html}` is the only placeholder;
# the reply is expected to be a markdown code block defining an async
# `parse_clothing_sections(html)` function.
generate_scraping_script_prompt = """
Given the following HTML:
{html}

Write a parser using playwright and BeautifulSoup that pulls out all sections involving clothing.
Make sure to use the async playwright API and async code.
Return the function to extract in a function called `parse_clothing_sections`.
Remember you already have the HTML locally, so you don't need to call `.goto()`.
Put all your python code in a code block in markdown format i.e ```code ...```
Return only the python code, nothing else.

The function should look like this:
```python
async def parse_clothing_sections(html: str):
    ...
```

Parsing Script Python:
"""

In [7]:
# Prompt template with two placeholders: `{results}` and `{clothing_item_class}`.
# NOTE(review): no cell visible in this notebook references this template;
# result_extractor sends the raw results to the structured-output LLM directly.
# Confirm whether it is still needed.
parse_results_prompt = """
Given the following clothing search results:

Results:
{results}

Extract a list of ClothingItem objects from the results.
Here is the python class definition for ClothingItem:
{clothing_item_class}

Extract only a List[ClothingItem] and nothing else.

clothing_item_list = 
"""

In [8]:
# Shared chat model used by get_parsing_script (script generation) and
# result_extractor (structured extraction). temperature=0.0 is set explicitly.
# TODO: Consider fine-tuning a smaller model on GPT-4o's output so that it
# parses clothing web pages consistently. That fine-tuning step would be the
# standout feature for demonstrating ML/AI engineering skill.
llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    streaming=True,
)

In [9]:
class ClothingItem(BaseModel):
    """
    A clothing item extracted from the internet.
    Includes a variety of possible metadata fields.
    """

    # Every field is Optional with a default of None, so an instance can be
    # built from whatever subset of metadata a page exposes.
    # NOTE(review): the class docstring and the Field descriptions are
    # presumably part of the schema handed to the LLM by
    # with_structured_output, so rewording them may change extraction results.
    # Confirm before editing.

    # --- Identity and classification ---
    id: Optional[str] = Field(
        None, description="Unique identifier for the clothing item"
    )
    name: Optional[str] = Field(
        None, description="Descriptive name of the clothing item"
    )
    brand: Optional[str] = Field(
        None, description="Manufacturer or designer of the clothing item"
    )
    category: Optional[str] = Field(
        None, description="General category of the item, e.g., shirts, pants, dresses"
    )
    subcategory: Optional[str] = Field(
        None,
        description="More specific classification, e.g., t-shirt, jeans, cocktail dress",
    )
    # --- Pricing and media ---
    price: Optional[float] = Field(None, description="Current price of the item")
    original_price: Optional[float] = Field(
        None, description="Original price if the item is on sale"
    )
    image_url: Optional[str] = Field(None, description="URL of the item's image")
    # --- Physical and style attributes ---
    color: Optional[str] = Field(None, description="Primary color of the item")
    sizes: Optional[list[str]] = Field(
        None, description="Available sizes or size range"
    )
    material: Optional[str] = Field(
        None, description="Main fabric or material composition"
    )
    gender: Optional[str] = Field(
        None, description="Target gender if applicable (men's, women's, unisex)"
    )
    season: Optional[str] = Field(
        None,
        description="Appropriate season for the item, e.g., summer, winter, all-season",
    )
    style: Optional[str] = Field(
        None, description="Style of the item, e.g., casual, formal, sporty"
    )
    # --- Free-text details ---
    description: Optional[str] = Field(
        None, description="Detailed text description of the item"
    )
    care_instructions: Optional[str] = Field(
        None, description="Instructions for washing and maintaining the item"
    )
    # --- Stock and reviews ---
    availability: Optional[str] = Field(
        None, description="Availability status, e.g., in stock, out of stock, pre-order"
    )
    average_rating: Optional[float] = Field(
        None, description="Average customer rating of the item"
    )
    num_reviews: Optional[int] = Field(None, description="Number of customer reviews")
    # --- Miscellaneous metadata ---
    tags: Optional[list[str]] = Field(
        None,
        description="Keywords associated with the item for searching and categorization",
    )
    dimensions: Optional[dict] = Field(
        None, description="Measurements of the item, e.g., length, width, sleeve length"
    )
    weight: Optional[float] = Field(
        None, description="Weight of the item, useful for shipping calculations"
    )
    release_date: Optional[date] = Field(
        None, description="Date when the item was first available or added to inventory"
    )
    sustainability_info: Optional[str] = Field(
        None, description="Information about eco-friendly or ethical production"
    )
    
# Wrapper model holding a list of ClothingItem. result_extractor passes it to
# llm.with_structured_output and validates the LLM response against it.
# (Documented with a comment rather than a docstring so the model itself is unchanged.)
class ClothingItemList(BaseModel):
    clothing_items: list[ClothingItem]

In [10]:
def get_search_results(search_query: str) -> dict:
    """Run `search_query` through the Tavily search tool and return its response.

    The tool is limited to SEARCH_MAX_RESULTS results. The raw response is
    echoed to stdout before being returned.
    """
    # NOTE(review): the `dict` annotation is carried over unchanged; the actual
    # shape is whatever TavilySearchResults.invoke returns. Confirm.
    response = TavilySearchResults(max_results=SEARCH_MAX_RESULTS).invoke(
        {"query": search_query}
    )
    print(f"Search results: {response}")
    return response

In [11]:
def get_test_html(path: str = "test_web_page.html") -> str:
    """Read a saved HTML page from disk and return its text.

    Args:
        path: Location of the HTML file. Defaults to the fixture
            "test_web_page.html" in the current working directory, so existing
            zero-argument calls behave as before.

    Returns:
        The full file contents as a string.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8: without `encoding`, open() uses the platform
    # locale, which garbles non-ASCII page content on some systems.
    with open(path, "r", encoding="utf-8") as html_file:
        return html_file.read()

In [12]:
def split_html(html: str) -> list[str]:
    """Break `html` into overlapping text chunks.

    Chunks are produced by RecursiveCharacterTextSplitter configured with
    CHUNK_SIZE and CHUNK_OVERLAP, both measured with len(). The splitter works
    on plain text, so a chunk boundary may fall inside an HTML tag.
    """
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(html)

In [13]:
def extract_code_from_markdown(markdown: str) -> str:
    """Pull the source code out of a markdown reply containing fenced blocks.

    Selection order:
      1. the first fenced block tagged as Python (`python`, `py`, `python3`,
         matched case-insensitively);
      2. otherwise the first fenced block of any kind;
      3. otherwise, for a single-line fence with no newline after the opening
         backticks, the text between the first pair of fences;
      4. otherwise the empty string.

    The fence's info string (language tag) is never part of the returned code,
    and a block whose closing fence is missing is returned up to end of input.

    Args:
        markdown: Raw text, typically an LLM response.

    Returns:
        The code inside the selected block, or "" when there is no fence.
    """
    # Opening fence, optional language tag, newline, then the body up to the
    # closing fence (or end of text when the reply was cut off).
    fence_pattern = re.compile(
        r"```[ \t]*([\w+.#-]*)[ \t]*\r?\n(.*?)(?:```|\Z)", re.DOTALL
    )
    blocks = fence_pattern.findall(markdown)

    for language, body in blocks:
        if language.lower() in ("python", "py", "python3"):
            return body
    if blocks:
        return blocks[0][1]

    # Single-line form such as ```x = 1``` has no newline after the fence.
    if "```" in markdown:
        return markdown.split("```")[1]
    return ""

In [14]:
def get_parsing_script(search_results: dict) -> str:
    """Ask the LLM to write a parser for the HTML in `search_results["content"]`.

    The HTML is substituted into generate_scraping_script_prompt, the prompt is
    sent to the shared `llm`, and the code found in the markdown reply is
    returned (an empty string when the reply has no code fence).
    """
    template = PromptTemplate(
        template=generate_scraping_script_prompt,
        input_variables=["html"],
    )
    llm_reply = llm.invoke(template.format(html=search_results["content"]))
    reply_text = AIMessage.model_validate(llm_reply).content
    return extract_code_from_markdown(reply_text)

In [15]:
def validate_parsing_script(script: str) -> bool:
    """
    Validate the generated parsing script for security concerns.

    Two checks are applied:
      1. a case-insensitive substring scan for forbidden operations;
      2. an AST walk that rejects any import, anywhere in the script, whose
         module is not on the allow-list.

    Imports are read from the syntax tree rather than from raw lines, so
    multi-module imports (`import json as j, socket`), imports after a
    semicolon or inside a function body, and relative imports are all seen.
    Text that merely looks like an import (e.g. `important = 1`, or a docstring
    line starting with "from") is not mistaken for one.

    This is a heuristic filter, not a sandbox: passing it does not prove the
    script is safe to execute.

    Args:
        script: Python source code to inspect.

    Returns:
        True if script passes security checks, False otherwise (including
        when the script is not parseable Python).
    """
    # List of forbidden operations/keywords that could be dangerous
    forbidden = [
        "import os",
        "import sys",
        "subprocess",
        "eval(",
        "exec(",
        "__import__",
        "open(",
        "write",
        "delete",
        "remove",
    ]

    # Check for forbidden operations
    lowered_script = script.lower()
    for item in forbidden:
        if item in lowered_script:
            print(f"Security violation: Found forbidden operation '{item}'")
            return False

    # Only allow importing safe parsing libraries
    allowed_imports = {
        "json",
        "asyncio",
        "playwright",
        "beautifulsoup4",
        "bs4",
        "lxml",
        "html.parser",
        "re",
    }

    try:
        tree = ast.parse(script)
    except (SyntaxError, ValueError) as e:
        # Code that does not parse cannot be inspected, nor executed later.
        print(f"Security violation: Script is not valid Python ({e})")
        return False

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            modules = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            if node.level:
                # Relative imports have no place in a standalone script.
                modules = ["." * node.level + (node.module or "")]
            else:
                modules = [node.module]
        else:
            continue

        for module in modules:
            # Accept an exact dotted match (e.g. "html.parser") or an allowed
            # top-level package (e.g. "playwright.async_api" -> "playwright").
            is_allowed = not module.startswith(".") and (
                module in allowed_imports
                or module.split(".")[0] in allowed_imports
            )
            if not is_allowed:
                print(f"Security violation: Unauthorized import '{module}'")
                return False

    return True

In [16]:
class FunctionExtractor(ast.NodeVisitor):
    """AST visitor that records every function definition it encounters.

    Each record is a dict with the function's 'name', its 'type' ('regular'
    for `def`, 'async' for `async def`) and its 'body' (the whole definition
    rendered back to source with ast.unparse). Nested definitions are recorded
    too, because traversal continues into each function.
    """

    def __init__(self):
        self.functions = []

    def _record(self, node, kind):
        # Store the definition, then keep descending into its children.
        entry = {
            'name': node.name,
            'type': kind,
            'body': ast.unparse(node),
        }
        self.functions.append(entry)
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        # Plain `def` definitions.
        self._record(node, 'regular')

    def visit_AsyncFunctionDef(self, node):
        # `async def` definitions.
        self._record(node, 'async')

def extract_functions_from_code(code):
    """Parse `code` and return the records collected by FunctionExtractor.

    Returns a list of dicts with 'name', 'type' and 'body' keys, one per
    function definition found (nested ones included). A SyntaxError from
    ast.parse propagates to the caller.
    """
    collector = FunctionExtractor()
    collector.visit(ast.parse(code))
    return collector.functions

In [17]:
async def execute_parsing_script(script: str, html: str) -> list:
    """Execute a generated script and run its `parse_clothing_sections` on `html`.

    SECURITY: this calls exec() on LLM-generated code. Only pass scripts that
    have already been accepted by validate_parsing_script, and treat that check
    as a filter, not a sandbox.

    The script runs in a single namespace that starts as a copy of this
    module's globals. A single namespace (rather than separate globals/locals
    dicts) is required so that imports and helpers defined at the top level of
    the script are visible inside the function the script defines. Working on
    a copy keeps the script's definitions out of the notebook namespace.

    Args:
        script: Python source expected to define an async
            `parse_clothing_sections(html)` function.
        html: The HTML text handed to that function.

    Returns:
        Whatever `parse_clothing_sections` returns, or an empty list when the
        script fails, does not define the function, or the function returns
        None. Errors are printed, not raised (best-effort behaviour).
    """
    namespace = dict(globals())
    # Drop any pre-existing binding so only a function defined by *this*
    # script can be picked up below.
    namespace.pop("parse_clothing_sections", None)
    try:
        exec(script, namespace)
        parse_clothing_sections = namespace.get("parse_clothing_sections")
        if not parse_clothing_sections:
            raise ValueError("parse_clothing_sections function not found")
        results = await parse_clothing_sections(html)
    except Exception as e:
        print(f"Error executing parsing script: {str(e)}")
        # Callers take len() of the result, so never hand back None.
        return []
    return results if results is not None else []

In [18]:
def result_extractor(raw_results: str) -> ClothingItemList:
    """Convert raw scraped results into a validated ClothingItemList.

    `raw_results` is sent to the LLM bound to the ClothingItemList schema. The
    call is attempted up to MAX_RETRIES times: every failure except the last
    is printed and retried, and the last one is re-raised to the caller.
    """
    structured_llm = llm.with_structured_output(ClothingItemList)
    max_attempts = MAX_RETRIES
    final_attempt = max_attempts - 1

    attempt = 0
    while attempt < max_attempts:
        try:
            response = structured_llm.invoke(raw_results)
            print(f"Raw res: {response}")
            return ClothingItemList.model_validate(response)
        except Exception as e:
            # Out of retries: surface the original exception unchanged.
            if attempt == final_attempt:
                raise
            print(f"Attempt {attempt + 1} failed: {str(e)}. Retrying...")
        attempt += 1

In [19]:
# raw_res = """
# [{'position': 1, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/denim-for-men/denim-tops-for-men/lasered-denim-jacket-p-789129XDC0F4447', 'name': 'Lasered denim jacket', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1720542647/789129_XDC0F_4447_001_100_0000_Light-Lasered-denim-jacket.jpg'}, {'position': 3, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/pants-and-shorts-for-men/long-pants-for-men/tapered-denim-pant-with-web-p-789232XDC244447', 'name': 'Tapered denim pant with Web', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1720542652/789232_XDC24_4447_001_100_0000_Light-Tapered-denim-pant-with-Web.jpg'}, {'position': 4, 'url': 'https://www.gucci.com/us/en/pr/men/accessories-for-men/hats-and-gloves-for-men/beanies-for-men/rib-knit-wool-hat-p-8113444G2004000', 'name': 'Rib knit wool hat', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1727434842/811344_4G200_4000_001_100_0000_Light-Rib-knit-wool-hat.jpg'}, {'position': 7, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/knitwear-for-men/crewnecks-for-men/wool-sweater-with-gucci-intarsia-p-795834XKD7K4594', 'name': 'Wool sweater with Gucci intarsia', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1711388799/795834_XKD7K_4594_001_100_0000_Light-Wool-sweater-with-Gucci-intarsia.jpg'}, {'position': 8, 'url': 'https://www.gucci.com/us/en/pr/men/shoes-for-men/loafers-for-men/mens-horsebit-1953-loafer-p-307929BLM001000', 'name': "Men's Horsebit 1953 loafer", 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1659723303/307929_BLM00_1000_001_100_0000_Light-Mens-Horsebit-1953-loafer.jpg'}, {'position': 9, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/pants-and-shorts-for-men/long-pants-for-men/polyester-gg-jacquard-pant-with-web-p-791166ZAORH4036', 'name': 'Polyester GG jacquard pant with Web', 'image': 
'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1713457876/791166_ZAORH_4036_001_100_0000_Light-Polyester-GG-jacquard-pant-with-Web.jpg'}]
# """
# raw_res = """"
# [{'position': 1, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/denim-for-men/denim-tops-for-men/lasered-denim-jacket-p-789129XDC0F4447', 'name': 'Lasered denim jacket', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1720542647/789129_XDC0F_4447_001_100_0000_Light-Lasered-denim-jacket.jpg'}, {'position': 3, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/pants-and-shorts-for-men/long-pants-for-men/tapered-denim-pant-with-web-p-789232XDC244447', 'name': 'Tapered denim pant with Web', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1720542652/789232_XDC24_4447_001_100_0000_Light-Tapered-denim-pant-with-Web.jpg'}, {'position': 4, 'url': 'https://www.gucci.com/us/en/pr/men/accessories-for-men/hats-and-gloves-for-men/beanies-for-men/rib-knit-wool-hat-p-8113444G2004000', 'name': 'Rib knit wool hat', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1727434842/811344_4G200_4000_001_100_0000_Light-Rib-knit-wool-hat.jpg'}, {'position': 7, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/knitwear-for-men/crewnecks-for-men/wool-sweater-with-gucci-intarsia-p-795834XKD7K4594', 'name': 'Wool sweater with Gucci intarsia', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1711388799/795834_XKD7K_4594_001_100_0000_Light-Wool-sweater-with-Gucci-intarsia.jpg'}, {'position': 9, 'url': 'https://www.gucci.com/us/en/pr/men/ready-to-wear-for-men/pants-and-shorts-for-men/long-pants-for-men/polyester-gg-jacquard-pant-with-web-p-791166ZAORH4036', 'name': 'Polyester GG jacquard pant with Web', 'image': 'https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1713457876/791166_ZAORH_4036_001_100_0000_Light-Polyester-GG-jacquard-pant-with-Web.jpg'}]
# """
# print(result_extractor(raw_res))

Raw res: clothing_items=[ClothingItem(id=None, name='Lasered denim jacket', brand=None, category=None, subcategory=None, price=None, original_price=None, image_url='https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1720542647/789129_XDC0F_4447_001_100_0000_Light-Lasered-denim-jacket.jpg', color=None, sizes=None, material=None, gender=None, season=None, style=None, description=None, care_instructions=None, availability=None, average_rating=None, num_reviews=None, tags=None, dimensions=None, weight=None, release_date=None, sustainability_info=None), ClothingItem(id=None, name='Tapered denim pant with Web', brand=None, category=None, subcategory=None, price=None, original_price=None, image_url='https://media.gucci.com/style/DarkGray_Center_0_0_600x314/1720542652/789232_XDC24_4447_001_100_0000_Light-Tapered-denim-pant-with-Web.jpg', color=None, sizes=None, material=None, gender=None, season=None, style=None, description=None, care_instructions=None, availability=None, average_ratin

In [20]:
# search_results = get_search_results(SEARCH_QUERY)
search_results = split_html(get_test_html())
for search_result in search_results:
    print(f"Search result: {search_result}")
    parsing_script = get_parsing_script({"content": search_result})
    if validate_parsing_script(parsing_script):
        # print(f"Raw Parsing script: {parsing_script}")
        extracted_funcs = extract_functions_from_code(parsing_script)
        # print(f"Extracted functions: {extracted_funcs}")
        if len(extracted_funcs) != 1:
            print(f"Expected 1 function, got {len(extracted_funcs)}")
            continue

        parse_clothing_sections_func = extracted_funcs[0]["body"]
        print(f"Parsing script:\n{parse_clothing_sections_func}")
        results_safe = validate_parsing_script(parse_clothing_sections_func)
        if not results_safe:
            print("Parse clothing sections function failed security validation")
            continue

        raw_results = await execute_parsing_script(
            parse_clothing_sections_func, search_result
        )
        print(f"Results: {raw_results}")
        if len(raw_results) > 0:
            try:
                parsed_results = result_extractor(str(raw_results))
                print(f"Parsed results: {parsed_results}")
            except Exception as e:
                print(f"Error parsing results: {str(e)}")
                continue
    else:
        print("Generated script failed security validation")

Search result: <!DOCTYPE html>
<html class="no-js t-hasfindinstore" lang="en" dir="ltr" xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema">
  <head>
	<script type="text/javascript" src="https://www.gucci.com/static/0ff02ae18594f05c60ad3b5f0ef2bf8454a63794300564" async ></script><script>
    function logError(e, errorMessage) {
        try {
            console.error(errorMessage, e);
            window.dataLayer = window.dataLayer || [];
            window.dataLayer.push({
                "event": "csrfError",
                "error": errorMessage + ": " + e
            });
        } catch(e) {
            console.error(e);
        }
    }

    try {
        window.hybris =  window.hybris ?  window.hybris : {};
        window.hybris.CSRFTokenUpdate = {};

        window.hybris.CSRFTokenUpdate.update = function(token) {
            function updateDom() {
                var csrfInputs = document.querySelectorAll("input[name*='CSRFToken']");
     