In [1]:
import os
import re
import json
import io
import requests
import ast
from typing import Optional
from datetime import date
from collections import deque


from dotenv import load_dotenv

from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from playwright.async_api import async_playwright
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.partition.html import partition_html
from bs4 import BeautifulSoup
from urllib.parse import urljoin

In [2]:
# TODO: Implement a web page parser driven by an LLM prompt such as:
# "Write a Playwright script to click on all the product links on the web page and go to those URLs"

In [3]:
# Load environment variables through python-dotenv before any client is
# configured; later cells read OPENAI_API_KEY from the environment via os.getenv.
load_dotenv()

True

In [4]:
# Notebook-wide tuning knobs.
# TODO: Consider an HTML-aware text splitter to not cut off HTML tags

# Passed as `max_results` to TavilySearchResults in get_search_results.
SEARCH_MAX_RESULTS = 2
# NOTE(review): not referenced by any visible cell; SearchResult.depth is set but never checked against it.
MAX_SEARCH_DEPTH = 2
# Chunk size handed to RecursiveCharacterTextSplitter in split_html; measured with len(), i.e. in characters.
CHUNK_SIZE = 20000
# NOTE(review): only appears in a commented-out splitter argument in split_html, so it currently has no effect.
MIN_CHUNK_SIZE = 1000
# Overlap between consecutive chunks, passed as `chunk_overlap` in split_html.
CHUNK_OVERLAP = 100
# Number of attempts result_extractor makes before re-raising the last error.
MAX_RETRIES = 3
# NOTE(review): not referenced by any visible cell; the driver loop follows every link it finds.
MAX_N_LINKS_TO_SEARCH = 3
# Query sent to the search tool by the driver cell.
SEARCH_QUERY = "Ring Jacket coats"

In [5]:
# TODO: Consider fine tuning a smaller model on GPT-4o's output to train it to
# parse clothing web pages consistently.
# That fine tuning would be the standout feature of this project.

# Primary model: used by get_parsing_script (script generation) and
# result_extractor (structured extraction into ClothingItemList).
llm = ChatOpenAI(
    model="gpt-4o",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    streaming=False,
)

# Smaller model: used for the true/false classification prompts in
# contains_clothing_item_info_or_links and is_clothing_product_link.
fast_llm = ChatOpenAI(
    model="gpt-4o-mini",
    api_key=os.getenv("OPENAI_API_KEY"),
    temperature=0.0,
    streaming=False,
)

In [6]:
class SimpleClothingItem(BaseModel):
    # One clothing product extracted from a page. Only `name` is required;
    # every other attribute defaults to None when it is not provided.
    # (Plain comments are used instead of a class docstring so the schema
    # generated from this model is left exactly as it was.)
    name: str = Field(
        default=...,
        description="Descriptive name of the clothing item",
    )
    price: Optional[float] = Field(
        default=None,
        description="Current price of the item",
    )
    image_url: Optional[str] = Field(
        default=None,
        description="URL of the item's image",
    )
    link: Optional[str] = Field(
        default=None,
        description="URL of the item's page",
    )


class ClothingItemList(BaseModel):
    # Container model used as the structured-output schema in result_extractor;
    # callers read the extracted products from `clothing_items`.
    clothing_items: list[SimpleClothingItem]

In [7]:
# Yes/no classification prompt for one HTML chunk. Rendered with the `html`
# variable in contains_clothing_item_info_or_links, which looks for the
# substring "true" in the model's reply.
html_contains_clothing_info_prompt = """
Given the following HTML snippet:
{html}

Does this HTML contain information about a clothing item or links that are promising for finding clothing items?
Return "true" if it does, "false" otherwise.
"""

In [8]:
# Code-generation prompt: asks the model for an async Playwright function named
# `parse_clothing_sections` that returns the list of product URLs it clicked.
# Rendered with the `html_content` variable in get_parsing_script; the reply is
# expected to be a single fenced code block (see extract_code_from_markdown).
# NOTE(review): the variable name is misspelled ("promt"). get_parsing_script
# references it under this spelling, so rename both together if it is fixed.
playwright_click_promt = """
Given the following HTML content, write a playwright script to click on all the product links.
Keep a list[str] of all the URLs that you click on in a result list and return it.

HTML CONTENT:
{html_content}

Use the async playwright python API to write your code.
Return the function to extract in a function called `parse_clothing_sections`.
Put all your python code in a code block in markdown format i.e ```code ...```
Return only the python code, nothing else.

The function should look like this:
```python
async def parse_clothing_sections(html: str) -> list[str]:
    ...
```

Parsing Script Python:
"""


In [9]:
# Code-generation prompt: asks the model for a BeautifulSoup-based function
# named `parse_clothing_sections` that extracts the clothing sections of the
# HTML supplied through the `html` variable.
# NOTE(review): no visible cell references this template; get_parsing_script
# uses playwright_click_promt instead.
generate_scraping_script_prompt = """
Given the following HTML:
{html}

Write a parser using BeautifulSoup that pulls out all sections involving clothing.
Return the function to extract in a function called `parse_clothing_sections`.
Put all your python code in a code block in markdown format i.e ```code ...```
Return only the python code, nothing else.

The function should look like this:
```python
async def parse_clothing_sections(html: str):
    ...
```

Parsing Script Python:
"""

In [10]:
# Yes/no classification prompt for a single URL. Rendered with the `url`
# variable in is_clothing_product_link, which looks for the substring "true"
# in the model's reply.
is_clothing_product_link_prompt = """
Given the following URL:
{url}

Is this a URL for a clothing product?
Return "true" if it is, "false" otherwise.
"""

In [11]:
def get_test_html() -> str:
    """Return the contents of the local fixture page ``test_web_page.html``.

    The file is looked up relative to the current working directory and is
    decoded as UTF-8 explicitly, so the result does not depend on the
    platform's default locale encoding.

    Raises:
        OSError: if the fixture file is missing or unreadable.
    """
    with open("test_web_page.html", "r", encoding="utf-8") as f:
        return f.read()

In [12]:
def split_html(html: str) -> list[str]:
    """Cut an HTML string into chunks for downstream LLM calls.

    Chunking is delegated to ``RecursiveCharacterTextSplitter`` configured
    with CHUNK_SIZE and CHUNK_OVERLAP, with lengths measured by ``len``.
    The splitter is plain-text based, so a chunk boundary may fall inside an
    HTML tag.

    Args:
        html: The raw HTML text to split.

    Returns:
        The list of chunk strings produced by the splitter.
    """
    splitter_options = {
        "chunk_size": CHUNK_SIZE,
        # "min_chunk_size": MIN_CHUNK_SIZE,  (left disabled, as before)
        "chunk_overlap": CHUNK_OVERLAP,
        "length_function": len,
        "is_separator_regex": False,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(html)

In [13]:
def extract_code_from_markdown(markdown: str) -> str:
    """Return the contents of the first fenced code block in ``markdown``.

    Blocks opened with ```` ```python ```` take priority. Otherwise the first
    generic fenced block is used, and a leading info string on the fence line
    (for example ``py``, ``Python`` or ``code``) is removed so it does not end
    up as the first line of the returned code.

    Args:
        markdown: Text that may contain one or more fenced code blocks.

    Returns:
        The text between the opening fence and the next fence (or the end of
        the input when the block is never closed), or ``""`` when the input
        contains no fence at all.
    """
    fence = "```"
    # Try to extract code blocks marked with ```python first
    if "```python" in markdown:
        return markdown.split("```python")[1].split(fence)[0]
    # If no python blocks found, fall back to the first generic code block
    if fence in markdown:
        body = markdown.split(fence)[1]
        # A word immediately after the fence and alone on that line is the
        # block's info string (language tag), not code. The lookahead keeps
        # the newline so the result has the same shape as the python branch.
        return re.sub(r"\A[\w+#.-]+[ \t]*(?=\r?\n)", "", body)
    return ""

In [14]:
class SearchResult(BaseModel):
    # One chunk of a fetched page: the URL it came from, the chunk's HTML and
    # the depth label supplied by whoever fetched it (0 unless stated).
    url: str = Field(default=..., description="URL of the search result")
    html_content: Optional[str] = Field(
        default=None,
        description="HTML content of the search result",
    )
    depth: int = Field(
        default=0,
        description="Depth of the search result in the BFS search",
    )


def get_search_results_from_url(
    url: str, depth: int = 0, timeout: float = 10.0
) -> list[SearchResult]:
    """Download a page and wrap each of its HTML chunks in a SearchResult.

    Args:
        url: Address of the page to fetch.
        depth: Depth label stored on every returned SearchResult.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on an unresponsive
            host, which would stall the whole notebook.

    Returns:
        One SearchResult per chunk produced by ``split_html`` for the page
        body, all sharing ``url`` and ``depth``.

    Raises:
        requests.RequestException: on connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return [
        SearchResult(url=url, html_content=chunk, depth=depth)
        for chunk in split_html(response.text)
    ]


def get_search_results(search_query: str) -> list[SearchResult]:
    """Run a Tavily search and fetch the chunked HTML of every hit.

    Each hit is printed, then its ``url`` is downloaded and chunked with
    ``get_search_results_from_url`` at depth 0. A hit whose page cannot be
    fetched is reported on stdout and skipped.

    Args:
        search_query: Free-text query passed to the search tool.

    Returns:
        The SearchResult chunks of all pages that were fetched successfully,
        in search-result order.
    """
    searcher = TavilySearchResults(
        max_results=SEARCH_MAX_RESULTS, include_raw_content=True
    )
    collected = []
    for search_result in searcher.invoke({"query": search_query}):
        print(f"Search result: {search_result}")
        url = search_result["url"]
        try:
            page_chunks = get_search_results_from_url(url, 0)
        except Exception as e:
            print(f"Error fetching HTML from {url}: {e}")
        else:
            collected.extend(page_chunks)
    return collected

In [15]:
def get_parsing_script(search_results: dict) -> str:
    """Ask the primary LLM for a link-clicking script for some HTML.

    Args:
        search_results: Mapping whose ``"content"`` entry holds the HTML that
            is substituted into ``playwright_click_promt``.

    Returns:
        The code found in the model's reply by ``extract_code_from_markdown``
        (an empty string when the reply contains no fenced block).
    """
    template = PromptTemplate(
        template=playwright_click_promt,
        input_variables=["html_content"],
    )
    rendered_prompt = template.format(html_content=search_results["content"])
    llm_reply = llm.invoke(rendered_prompt)
    reply_text = AIMessage.model_validate(llm_reply).content
    return extract_code_from_markdown(reply_text)

In [16]:
def validate_parsing_script(script: str) -> bool:
    """
    Validate the generated parsing script for security concerns.

    Two checks are applied:

    1. A case-insensitive substring deny-list of risky operations.
    2. An import allow-list, evaluated on the parsed syntax tree so that every
       ``import`` / ``from ... import`` statement is seen wherever it appears
       (after a ``;``, inside a function, ...) and ordinary identifiers such
       as ``important`` or ``from_page`` are not mistaken for imports.

    This is a heuristic filter, not a sandbox: passing it does not make
    LLM-generated code safe to execute.

    Args:
        script: Python source code produced by the model.

    Returns:
        True if the script passes both checks, False otherwise (including
        when the script is not valid Python). The reason for a rejection is
        printed to stdout.
    """
    # List of forbidden operations/keywords that could be dangerous
    forbidden = [
        "import os",
        "import sys",
        "subprocess",
        "eval(",
        "exec(",
        "__import__",
        "open(",
        "write",
        "delete",
        "remove",
    ]

    # Check for forbidden operations (lower-case once, not once per keyword)
    lowered_script = script.lower()
    for item in forbidden:
        if item in lowered_script:
            print(f"Security violation: Found forbidden operation '{item}'")
            return False

    # Only allow importing safe parsing libraries
    allowed_imports = {
        "json",
        "asyncio",
        "playwright",
        "beautifulsoup4",
        "bs4",
        "lxml",
        "html.parser",
        "re",
    }

    try:
        tree = ast.parse(script)
    except (SyntaxError, ValueError) as e:
        # Code that cannot be parsed cannot be inspected, so reject it.
        print(f"Security violation: Script is not valid Python ({e})")
        return False

    for node in ast.walk(tree):
        if isinstance(node, ast.Import):
            modules = [alias.name for alias in node.names]
        elif isinstance(node, ast.ImportFrom):
            # Relative imports keep their leading dots and are never allowed.
            modules = ["." * node.level + (node.module or "")]
        else:
            continue

        for module in modules:
            # Accept either the exact dotted name (e.g. "html.parser") or a
            # module whose top-level package is allow-listed (e.g. "bs4.element").
            if module in allowed_imports or module.split(".")[0] in allowed_imports:
                continue
            print(f"Security violation: Unauthorized import '{module}'")
            return False

    return True

In [17]:
class FunctionExtractor(ast.NodeVisitor):
    """AST visitor that records every function definition it walks over.

    Each entry appended to ``self.functions`` is a dict with the function's
    ``name``, its ``type`` (``'regular'`` for ``def``, ``'async'`` for
    ``async def``) and ``body`` (the full definition re-rendered by
    ``ast.unparse``). Nested definitions are recorded too, because the
    visitor keeps descending into each function it finds.
    """

    def __init__(self):
        self.functions = []

    def _record(self, node, kind):
        # Shared bookkeeping for both kinds of definition.
        entry = {
            'name': node.name,
            'type': kind,
            'body': ast.unparse(node),
        }
        self.functions.append(entry)
        # Continue into the body so inner definitions are collected as well.
        self.generic_visit(node)

    def visit_FunctionDef(self, node):
        self._record(node, 'regular')

    def visit_AsyncFunctionDef(self, node):
        self._record(node, 'async')

def extract_functions_from_code(code):
    """Parse ``code`` and list the function definitions it contains.

    Returns the ``functions`` list built by ``FunctionExtractor``: one dict
    per ``def`` / ``async def`` with ``name``, ``type`` and ``body`` keys.
    Raises ``SyntaxError`` (from ``ast.parse``) when ``code`` is not valid
    Python.
    """
    collector = FunctionExtractor()
    collector.visit(ast.parse(code))
    return collector.functions

In [18]:
async def execute_parsing_script(script: str, html: str) -> list:
    """Execute a generated script and call its ``parse_clothing_sections``.

    The script runs in a single namespace seeded with a copy of this module's
    globals. Using one dict for both globals and locals matters: with separate
    dicts, names created at the script's top level (its own imports and helper
    functions) are not visible from inside the functions it defines.

    SECURITY: this calls ``exec`` on LLM-generated code. Nothing here sandboxes
    the script; only run code that has passed review/validation, and treat the
    validation as a heuristic rather than a guarantee.

    Args:
        script: Python source expected to define ``parse_clothing_sections``.
        html: HTML passed to that function as its only argument.

    Returns:
        Whatever ``parse_clothing_sections`` returns (awaited when it is a
        coroutine), or an empty list when the script fails, does not define
        the function, or the function returns None. Errors are printed, not
        raised.
    """
    import inspect  # local import keeps this cell self-contained

    # Copy so the script can use the notebook's imports without being able to
    # rebind the notebook's own global names.
    namespace = dict(globals())
    # Make sure only a definition coming from the script itself is picked up.
    namespace.pop("parse_clothing_sections", None)
    try:
        exec(script, namespace)
        parse_clothing_sections = namespace.get("parse_clothing_sections")
        if not callable(parse_clothing_sections):
            raise ValueError("parse_clothing_sections function not found")
        result = parse_clothing_sections(html)
        # Accept both `async def` and plain `def` implementations.
        if inspect.isawaitable(result):
            result = await result
        return [] if result is None else result
    except Exception as e:
        print(f"Error executing parsing script: {str(e)}")
        return []

In [19]:
def result_extractor(raw_results: str) -> ClothingItemList:
    """Turn raw page text into a validated ClothingItemList via the LLM.

    The primary model is invoked with ClothingItemList as its structured
    output schema. Up to MAX_RETRIES attempts are made; each failure except
    the last is printed and retried, and the last failure is re-raised.

    Args:
        raw_results: Text (for example an HTML chunk) to extract items from.

    Returns:
        The validated ClothingItemList from the first successful attempt.
    """
    extractor = llm.with_structured_output(ClothingItemList)
    final_attempt = MAX_RETRIES - 1

    for attempt in range(MAX_RETRIES):
        try:
            raw_res = extractor.invoke(raw_results)
            print(f"Raw res: {raw_res}")
            validated = ClothingItemList.model_validate(raw_res)
            return validated
        except Exception as e:
            out_of_attempts = attempt == final_attempt
            if out_of_attempts:
                raise
            print(f"Attempt {attempt + 1} failed: {str(e)}. Retrying...")

In [20]:
async def parse_search_result(raw_html: str) -> list[SimpleClothingItem]:
    """Extract the clothing items described in an HTML chunk.

    The chunk is passed straight to ``result_extractor`` (structured LLM
    output). Any failure is printed and turned into an empty list so one bad
    chunk does not abort the caller's loop.

    An earlier, disabled variant of this function generated a parsing script
    instead: ``get_parsing_script`` -> ``validate_parsing_script`` ->
    ``extract_functions_from_code`` (expecting exactly one function) ->
    ``execute_parsing_script`` -> ``result_extractor`` on the script's output.
    That commented-out code was unreachable and has been removed; the helper
    functions it relied on are still defined in this notebook.

    Args:
        raw_html: One HTML chunk of a product page.

    Returns:
        The extracted items, or ``[]`` if extraction failed.
    """
    try:
        return result_extractor(raw_html).clothing_items
    except Exception as e:
        print(f"Error parsing results: {str(e)}")
        return []

In [21]:
def contains_clothing_item_info_or_links(html_chunk: str) -> bool:
    """
    Ask the fast LLM whether an HTML chunk is worth parsing.

    The chunk is substituted into ``html_contains_clothing_info_prompt`` and
    the reply is treated as positive when it contains the substring "true"
    (case-insensitive).

    Args:
        html_chunk: HTML text to classify.

    Returns:
        True when the model's reply contains "true", False otherwise.
    """
    template = PromptTemplate(
        template=html_contains_clothing_info_prompt,
        input_variables=["html"],
    )
    llm_reply = fast_llm.invoke(template.format(html=html_chunk))
    answer = AIMessage.model_validate(llm_reply).content
    # TODO: Consider structured output in the future
    return "true" in answer.lower()

In [22]:
async def click_links_and_get_results(parent_url: str, raw_html: str) -> list[str]:
    """
    Extract the distinct HTTP(S) links from a raw HTML string.

    Despite its name this function does not drive a browser: it parses the
    HTML with BeautifulSoup and reads the ``href`` of every ``<a>`` element.

    Args:
        parent_url: URL of the page the HTML came from; relative hrefs are
            resolved against it.
        raw_html: Raw HTML string to parse

    Returns:
        Absolute ``http://`` / ``https://`` URLs in document order, each
        listed once. Same-page anchors (``#...``) and links with other schemes
        (``mailto:``, ``javascript:``, ``tel:``, ...) are skipped because they
        cannot be fetched as pages.
    """
    web_schemes = ("http://", "https://")
    links = []
    seen = set()  # avoids returning the same URL twice
    try:
        # Parse HTML and get all links
        soup = BeautifulSoup(raw_html, "html.parser")
        anchors = soup.find_all("a")

        for anchor in anchors:
            try:
                # Get the href attribute
                href = anchor.get("href")
                if not isinstance(href, str):
                    continue  # missing attribute

                href = href.strip()
                if not href or href.startswith("#"):  # Skip anchor links
                    continue

                if not href.lower().startswith(web_schemes):
                    # Relative URL - join with parent URL
                    href = urljoin(parent_url, href)
                    if not href.lower().startswith(web_schemes):
                        # Still not a web address (mailto:, javascript:, ...)
                        continue

                if href not in seen:
                    seen.add(href)
                    links.append(href)

            except Exception as e:
                print(f"Error processing link: {str(e)}")
                continue

    except Exception as e:
        print(f"Error parsing HTML: {str(e)}")

    return links

In [23]:
def is_clothing_product_link(url: str) -> bool:
    """Ask the fast LLM whether a URL looks like a clothing product page.

    The URL is substituted into ``is_clothing_product_link_prompt`` and the
    reply counts as positive when it contains the substring "true"
    (case-insensitive).

    Args:
        url: The link to classify.

    Returns:
        True when the model's reply contains "true", False otherwise.
    """
    template = PromptTemplate(
        template=is_clothing_product_link_prompt,
        input_variables=["url"],
    )
    llm_reply = fast_llm.invoke(template.format(url=url))
    answer = AIMessage.model_validate(llm_reply).content
    return "true" in answer.lower()

In [None]:
# HACK: This code exploits a common pattern in web pages: clothing items are usually one click away from the main page.
# Therefore no BFS is required; we search only one level down in the link graph and keep the nodes that involve clothing.

In [24]:
# full_html = get_test_html()
# TODO: Consider implementing the link clicking with normal playwright code
search_results = get_search_results(SEARCH_QUERY)
final_res = []
for search_result in search_results:
    for chunk in split_html(search_result.html_content):
        clicked_links = await click_links_and_get_results(search_result.url, chunk)
        # print(f"Clicked Links: {clicked_links}")
        for clicked_link in clicked_links:
            if is_clothing_product_link(clicked_link):
                print(f"Found clothing product link: {clicked_link}")
                try:
                    raw_html_content = requests.get(clicked_link).text
                    for split_html_content in split_html(raw_html_content):
                        if not contains_clothing_item_info_or_links(split_html_content):
                            print("Skipping chunk")
                            continue
                        result = await parse_search_result(split_html_content)
                        print(f"Parsed results: {result}")
                        final_res.extend(result)
                except Exception as e:
                    print(f"Error parsing chunk: {str(e)}")
                    continue

print(f"final res: {final_res}")

Search result: {'url': 'https://www.supplyandadvise.com/collections/ring-jacket', 'content': "RING JACKET | Supply & Advise FREE SHIPPING ON ORDERS OVER $250 FREE SHIPPING ON ORDERS OVER $250 CREATE AN ACCOUNT Account Brands ▾ Brands ENGINEERED GARMENTS LEVI'S VINTAGE CLOTHING RANDOLPH ENGINEERING RING JACKET TANNER GOODS Categories ▾ Shoe Care & Accessories Suits & Jackets RING JACKET Established in 1954, Ring Jacket is regarded as one of the finest ready-to-wear producers of suiting and tailored clothing in the world. Ring Jacket works with the finest mills in the world like VBC, Loro Piana & also with smaller local mills to produce proprietary fabrics that have become iconic. RING JACKET Shipping Brands ENGINEERED GARMENTS LEVI'S VINTAGE CLOTHING RANDOLPH ENGINEERING RING JACKET TANNER GOODS Shoe Care & Accessories Suits & Jackets Account"}
Search result: {'url': 'https://www.therealreal.com/designers/ring-jacket', 'content': "Shop authentic Ring Jacket at up to 90% off. The RealRea