In [103]:
import os
import re
import json
import io
import requests
import ast
from typing import Optional
from datetime import date

from dotenv import load_dotenv

from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.partition.html import partition_html

In [104]:
# **Approach 2:**

# 1. “Given the following HTML: `html`, write a parser using BeautifulSoup that pulls out all sections involving clothing”
# 2. Validate and check the generated script for security
# 3. Execute the scripts against the HTML
# 4. “Given the following HTML snippet, extract `ClothingItem` from the data”

In [105]:
# Load environment variables from a local .env file. OPENAI_API_KEY is read from
# the environment via os.getenv when the chat model is constructed.
load_dotenv()

True

In [106]:
# TODO: Consider an HTML-aware text splitter to not cut off HTML tags
# Tunable settings for the search -> chunk -> LLM pipeline in this notebook.
SEARCH_MAX_RESULTS = 1  # passed as max_results to the Tavily search tool
CHUNK_SIZE = 10000  # chunk_size given to the text splitter (measured with len)
MIN_CHUNK_SIZE = 1000  # NOTE(review): only appears in a commented-out splitter argument
CHUNK_OVERLAP = 100  # chunk_overlap given to the text splitter
MAX_RETRIES = 3  # number of attempts result_extractor makes before re-raising
SEARCH_QUERY = "men's blue jeans"  # query the driver cell passes to get_search_results

In [107]:
# Prompt template with a single `{html}` placeholder, rendered by
# get_parsing_script. It asks the model to reply with one fenced code block that
# defines `parse_clothing_sections`; the requested signature is `async def`,
# which is why the result of calling the generated function is awaited later.
generate_scraping_script_prompt = """
Given the following HTML:
{html}

Write a parser using BeautifulSoup that pulls out all sections involving clothing.
Return the function to extract in a function called `parse_clothing_sections`.
Put all your python code in a code block in markdown format i.e ```code ...```
Return only the python code, nothing else.

The function should look like this:
```python
async def parse_clothing_sections(html: str):
    ...
```

Parsing Script Python:
"""

In [108]:
# Prompt template with two placeholders: `{results}` and `{clothing_item_class}`.
# NOTE(review): no cell visible in this notebook references this template;
# result_extractor sends the raw results straight to the structured-output model.
parse_results_prompt = """
Given the following clothing search results:

Results:
{results}

Extract a list of ClothingItem objects from the results.
Here is the python class definition for ClothingItem:
{clothing_item_class}

Extract only a List[ClothingItem] and nothing else.

clothing_item_list = 
"""

In [109]:
# TODO: Consider fine-tuning a smaller model on GPT-4o's output to train it to
# parse clothing web pages consistently.
# That fine-tuning would be the standout feature showing real ML/AI engineering understanding.
#
# Shared chat model used by the LLM calls in this notebook (script generation
# and structured extraction). The API key is read from the environment, and
# temperature is pinned to 0.0.
llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    streaming=True,
)

In [110]:
class ClothingItem(BaseModel):
    """
    A clothing item extracted from the internet.
    Includes a variety of possible metadata fields.
    """

    # Every field is optional and defaults to None, so a partially described
    # item still validates. Field order and description text are left untouched.

    # --- Identity and classification ---
    id: Optional[str] = Field(
        default=None,
        description="Unique identifier for the clothing item",
    )
    name: Optional[str] = Field(
        default=None,
        description="Descriptive name of the clothing item",
    )
    brand: Optional[str] = Field(
        default=None,
        description="Manufacturer or designer of the clothing item",
    )
    category: Optional[str] = Field(
        default=None,
        description="General category of the item, e.g., shirts, pants, dresses",
    )
    subcategory: Optional[str] = Field(
        default=None,
        description="More specific classification, e.g., t-shirt, jeans, cocktail dress",
    )

    # --- Pricing and presentation ---
    price: Optional[float] = Field(
        default=None,
        description="Current price of the item",
    )
    original_price: Optional[float] = Field(
        default=None,
        description="Original price if the item is on sale",
    )
    image_url: Optional[str] = Field(
        default=None,
        description="URL of the item's image",
    )

    # --- Physical attributes and styling ---
    color: Optional[str] = Field(
        default=None,
        description="Primary color of the item",
    )
    sizes: Optional[list[str]] = Field(
        default=None,
        description="Available sizes or size range",
    )
    material: Optional[str] = Field(
        default=None,
        description="Main fabric or material composition",
    )
    gender: Optional[str] = Field(
        default=None,
        description="Target gender if applicable (men's, women's, unisex)",
    )
    season: Optional[str] = Field(
        default=None,
        description="Appropriate season for the item, e.g., summer, winter, all-season",
    )
    style: Optional[str] = Field(
        default=None,
        description="Style of the item, e.g., casual, formal, sporty",
    )

    # --- Descriptive text and stock status ---
    description: Optional[str] = Field(
        default=None,
        description="Detailed text description of the item",
    )
    care_instructions: Optional[str] = Field(
        default=None,
        description="Instructions for washing and maintaining the item",
    )
    availability: Optional[str] = Field(
        default=None,
        description="Availability status, e.g., in stock, out of stock, pre-order",
    )

    # --- Reviews and search metadata ---
    average_rating: Optional[float] = Field(
        default=None,
        description="Average customer rating of the item",
    )
    num_reviews: Optional[int] = Field(
        default=None,
        description="Number of customer reviews",
    )
    tags: Optional[list[str]] = Field(
        default=None,
        description="Keywords associated with the item for searching and categorization",
    )

    # --- Logistics and provenance ---
    dimensions: Optional[dict] = Field(
        default=None,
        description="Measurements of the item, e.g., length, width, sleeve length",
    )
    weight: Optional[float] = Field(
        default=None,
        description="Weight of the item, useful for shipping calculations",
    )
    release_date: Optional[date] = Field(
        default=None,
        description="Date when the item was first available or added to inventory",
    )
    sustainability_info: Optional[str] = Field(
        default=None,
        description="Information about eco-friendly or ethical production",
    )
    
# Container model wrapping a list of ClothingItem objects. result_extractor
# passes it to llm.with_structured_output as the target output schema.
class ClothingItemList(BaseModel):
    clothing_items: list[ClothingItem]

In [111]:
def split_html(html: str) -> list[str]:
    """Split a raw HTML string into chunks with RecursiveCharacterTextSplitter.

    Chunk length is measured with ``len``; the size and overlap come from the
    module-level CHUNK_SIZE and CHUNK_OVERLAP settings. No minimum chunk size
    is applied.

    Args:
        html: The full HTML document as text.

    Returns:
        The list of chunks produced by the splitter's ``split_text``.
    """
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
        "is_separator_regex": False,
    }
    chunker = RecursiveCharacterTextSplitter(**splitter_options)
    return chunker.split_text(html)

In [112]:
def get_search_results(search_query: str, timeout: float = 10.0) -> list[str]:
    """Search the web for ``search_query`` and return the hits' HTML as chunks.

    Each search hit's URL is downloaded and its HTML is split with
    ``split_html``. Fetching is best-effort: a hit that cannot be downloaded
    is logged and skipped.

    Args:
        search_query: Free-text query sent to the Tavily search tool.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests.get`` can block indefinitely on an
            unresponsive host.

    Returns:
        A flat list of HTML chunks from every page that was fetched
        successfully; empty if nothing could be fetched.
    """
    tavily_search = TavilySearchResults(
        max_results=SEARCH_MAX_RESULTS, include_raw_content=True
    )
    search_results = tavily_search.invoke({"query": search_query})

    # NOTE(review): the tool may hand back something other than a list of hits
    # (e.g. an error message string); iterating that would fail on res["url"].
    if not isinstance(search_results, list):
        print(f"Unexpected search response: {search_results}")
        return []

    extracted_htmls = []
    for res in search_results:
        print(f"Search result: {res}")
        url = res.get("url") if isinstance(res, dict) else None
        if not url:
            print(f"Skipping search result without a URL: {res}")
            continue
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            extracted_htmls.extend(split_html(response.text))
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole search.
            print(f"Error fetching HTML from {url}: {e}")
            continue
    return extracted_htmls

In [113]:
def get_test_html() -> str:
    """Return the contents of the local ``test_web_page.html`` fixture.

    The file is decoded explicitly as UTF-8 so the result does not depend on
    the platform's default encoding.

    Returns:
        The full text of ``test_web_page.html`` from the working directory.

    Raises:
        OSError: If the file is missing or unreadable.
    """
    with open("test_web_page.html", "r", encoding="utf-8") as f:
        return f.read()

In [114]:
def extract_code_from_markdown(markdown: str) -> str:
    """Pull the source code out of a markdown reply containing fenced blocks.

    The first fence tagged as Python (``python``, ``python3`` or ``py``) wins;
    otherwise the first fenced block of any kind is used. The fence's info
    string (the language tag on the opening line) is never part of the
    returned code, so replies fenced as ```py or ```code yield runnable
    source. An unterminated final fence is read through to the end of the text.

    Args:
        markdown: The raw text reply, possibly containing fenced code blocks.

    Returns:
        The code inside the selected fence, or ``""`` when there is no fence.
    """
    if not markdown or "```" not in markdown:
        return ""

    # Splitting on the fence marker leaves the fence interiors at the odd
    # indices; an unclosed trailing fence still contributes its interior.
    interiors = markdown.split("```")[1::2]

    blocks = []
    for interior in interiors:
        first_line, newline, remainder = interior.partition("\n")
        tag = first_line.strip().lower()
        # Treat the opening line as an info string only if it looks like a
        # bare language tag; otherwise it is code written right after the fence.
        if newline and re.fullmatch(r"[a-z0-9_+#.-]*", tag):
            blocks.append((tag, remainder))
        else:
            blocks.append(("", interior))

    for tag, code in blocks:
        if tag in ("python", "python3", "py"):
            return code
    return blocks[0][1]

In [115]:
def get_parsing_script(search_results: dict) -> str:
    """Ask the LLM to write a parsing script for one chunk of HTML.

    Args:
        search_results: Mapping whose ``"content"`` entry holds the HTML to
            embed in the script-generation prompt.

    Returns:
        The code extracted from the model's markdown reply by
        ``extract_code_from_markdown``.
    """
    template = PromptTemplate(
        template=generate_scraping_script_prompt,
        input_variables=["html"],
    )
    rendered_prompt = template.format(html=search_results["content"])
    llm_reply = llm.invoke(rendered_prompt)
    reply_text = AIMessage.model_validate(llm_reply).content
    return extract_code_from_markdown(reply_text)

In [116]:
def validate_parsing_script(script: str) -> bool:
    """
    Validate the generated parsing script for security concerns.
    Returns True if script passes security checks, False otherwise.

    Three checks run in order:

    1. A case-insensitive substring denylist over the raw source.
    2. The script must parse as Python; code that cannot be parsed is rejected.
    3. Every import found anywhere in the syntax tree must target an
       allow-listed module, and the names eval, exec, open and __import__
       must not be referenced at all.

    Imports are read from the syntax tree rather than by scanning lines, so
    ordinary identifiers that merely start with "import"/"from" are not
    mistaken for imports, and several imports on one line are all inspected.

    NOTE: this is a heuristic filter, not a sandbox. Passing it does not make
    untrusted code safe to execute.

    Args:
        script: Python source text to inspect.

    Returns:
        True when no check fails, False otherwise. The reason for a rejection
        is printed.
    """
    # List of forbidden operations/keywords that could be dangerous
    forbidden = [
        "import os",
        "import sys",
        "subprocess",
        "eval(",
        "exec(",
        "__import__",
        "open(",
        "write",
        "delete",
        "remove",
    ]

    # Check for forbidden operations (lower-case the source once, not per item)
    lowered_script = script.lower()
    for item in forbidden:
        if item in lowered_script:
            print(f"Security violation: Found forbidden operation '{item}'")
            return False

    try:
        tree = ast.parse(script)
    except (SyntaxError, ValueError) as e:
        print(f"Security violation: Script is not valid Python ({e})")
        return False

    # Only allow importing safe parsing libraries
    allowed_imports = {
        "json",
        "asyncio",
        "playwright",
        "beautifulsoup4",
        "bs4",
        "lxml",
        "html.parser",
        "re",
    }

    # Names that must not appear even when written as `eval (...)` or aliased,
    # both of which slip past the substring denylist above.
    forbidden_names = {"eval", "exec", "open", "__import__"}

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            modules = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            # Relative imports keep their leading dots and so never match.
            modules = ["." * node.level + (node.module or "")]
        elif isinstance(node, ast.Name) and node.id in forbidden_names:
            print(f"Security violation: Found forbidden operation '{node.id}'")
            return False
        else:
            continue

        for module in modules:
            parts = module.split(".")
            # A module is allowed when it, or any parent package of it, is
            # allow-listed: "bs4.element" via "bs4", "html.parser" by itself.
            prefixes = {".".join(parts[:i]) for i in range(1, len(parts) + 1)}
            if not prefixes & allowed_imports:
                print(f"Security violation: Unauthorized import '{module}'")
                return False

    return True

In [117]:
class FunctionExtractor(ast.NodeVisitor):
    """AST visitor that records every function definition it encounters.

    Each hit is appended to ``self.functions`` as a dict with the keys
    ``'name'``, ``'type'`` (``'regular'`` or ``'async'``) and ``'body'`` (the
    definition's source regenerated with ``ast.unparse``). Definitions nested
    inside other functions are recorded as well.
    """

    def __init__(self):
        self.functions = []

    def _record(self, node, kind):
        # Store the definition, then keep walking so nested defs are seen too.
        entry = dict(name=node.name, type=kind, body=ast.unparse(node))
        self.functions.append(entry)
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        # Plain `def` definitions
        self._record(node, 'regular')

    def visit_AsyncFunctionDef(self, node):
        # `async def` definitions
        self._record(node, 'async')

def extract_functions_from_code(code):
    """Parse ``code`` and return the function records found in it.

    Args:
        code: Python source text; ``ast.parse`` raises if it is not valid.

    Returns:
        The ``functions`` list collected by a ``FunctionExtractor`` after
        visiting the parsed tree.
    """
    collector = FunctionExtractor()
    collector.visit(ast.parse(code))
    return collector.functions

In [118]:
async def execute_parsing_script(script: str, html: str) -> list:
    """Run a generated parsing script and call its ``parse_clothing_sections``.

    The script is executed with this module's globals and a fresh local
    namespace, then the ``parse_clothing_sections`` it defines is called with
    ``html``. Both ``async def`` and plain ``def`` implementations are
    supported: the call result is awaited only when it is awaitable.

    SECURITY: ``exec`` runs the script in-process with full access to this
    module's globals. The script must be vetted before it reaches this
    function, and should ideally run in an isolated process or sandbox.

    Args:
        script: Python source expected to define ``parse_clothing_sections``.
        html: The HTML text handed to the generated function.

    Returns:
        Whatever the generated function returns, or ``[]`` when the script
        fails to execute, does not define the function, or the function raises.
        The error is printed in those cases.
    """
    import inspect  # local import keeps this cell self-contained

    local_namespace = {}
    try:
        # Execute the script so its definitions land in the local namespace
        exec(script, globals(), local_namespace)
        parse_clothing_sections = local_namespace.get("parse_clothing_sections")
        if not parse_clothing_sections:
            raise ValueError("parse_clothing_sections function not found")
        result = parse_clothing_sections(html)
        # A synchronous function returns its value directly; awaiting that
        # would raise TypeError and discard an otherwise valid result.
        if inspect.isawaitable(result):
            result = await result
        return result
    except Exception as e:
        print(f"Error executing parsing script: {str(e)}")
        # Honour the declared return type instead of implicitly returning None.
        return []

In [119]:
def result_extractor(raw_results: str) -> ClothingItemList:
    """Turn raw scraped text into a validated ``ClothingItemList`` via the LLM.

    The text is sent to a structured-output variant of the shared model. Up to
    MAX_RETRIES attempts are made; a failure on any attempt but the last is
    printed and retried, and the last failure is re-raised.

    Args:
        raw_results: Stringified output of the generated parsing function.

    Returns:
        The model's reply validated as a ``ClothingItemList``.

    Raises:
        Exception: Whatever the final attempt raised (model call or validation).
    """
    extractor_llm = llm.with_structured_output(ClothingItemList)
    final_attempt = MAX_RETRIES - 1

    for attempt in range(MAX_RETRIES):
        is_last_attempt = attempt == final_attempt
        try:
            raw_res = extractor_llm.invoke(raw_results)
            print(f"Raw res: {raw_res}")
            return ClothingItemList.model_validate(raw_res)
        except Exception as e:
            if not is_last_attempt:
                print(f"Attempt {attempt + 1} failed: {str(e)}. Retrying...")
                continue
            # Out of retries: surface the original error to the caller
            raise

In [120]:
# search_results = split_html(get_test_html())
# print(f"len(search_results): {len(search_results)}")
# print(f"Search results: {search_results[100:110]}")
# print([res_len for res_len in [len(res) for res in search_results]])
# print(
#     f"Average chunk length: {sum(len(chunk) for chunk in search_results) / len(search_results)}"
# )

len(search_results): 180
Search results: ['</article>\n<!-- productListerGridItem4Cols.tag -->\n<article class="instock-true product-tiles-grid-item product-tiles-grid-item-medium product-tiles-grid-item-small hover-link ">\n\t<a class="product-tiles-grid-item-link js-ga-track"\n        href="/us/en/pr/men/bags-for-men/messengers-crossbody-bags-for-men/super-mini-shoulder-bag-p-800313AADYK1060"\n        data-style-id="800313AADYK1060"\n        dt-dtname="Product Details Page"\n        id="800313AADYK1060"\n        data-position="24" data-list-name="ProductGrid" data-category-path="Whats-New/New-In/This-Week-Men"\n        aria-label="Super mini shoulder bag">\n        <div class="product-tiles-grid-item-image-wrapper">\n            <div class="product-tiles-grid-item-image 24">\n                <picture data-image-type="picture">\n                    <source data-image-size="standard-retina" data-srcset="//media.gucci.com/style/White_South_0_160_540x540/1717002928/800313_AADYK_1060_001_

In [121]:
search_results = get_search_results(SEARCH_QUERY)
# search_results = split_html(get_test_html())
for search_result in search_results:
    print(f"Search result: {search_result}")
    try:
        parsing_script = get_parsing_script({"content": search_result})
    except Exception as e:
        print(f"Error getting parsing script: {str(e)}")
        continue
    if validate_parsing_script(parsing_script):
        # print(f"Raw Parsing script: {parsing_script}")
        extracted_funcs = extract_functions_from_code(parsing_script)
        # print(f"Extracted functions: {extracted_funcs}")
        if len(extracted_funcs) != 1:
            print(f"Expected 1 function, got {len(extracted_funcs)}")
            continue

        parse_clothing_sections_func = extracted_funcs[0]["body"]
        print(f"Parsing script:\n{parse_clothing_sections_func}")
        results_safe = validate_parsing_script(parse_clothing_sections_func)
        if not results_safe:
            print("Parse clothing sections function failed security validation")
            continue

        raw_results = await execute_parsing_script(
            parse_clothing_sections_func, search_result
        )
        print(f"Results: {raw_results}")
        if raw_results and len(raw_results) > 0:
            try:
                parsed_results = result_extractor(str(raw_results))
                print(f"Parsed results: {parsed_results}")
            except Exception as e:
                print(f"Error parsing results: {str(e)}")
                continue
    else:
        print("Generated script failed security validation")

In [122]:
# TODO: Consider using partition_html from unstructured to grab the HTML content for each page
# Seems to be more consistent than the custom parsers the AI is writing
# TODO: Compare partition_html vs. writing custom parsers