In [151]:
import os
import re
import json
import io
import requests
import ast
from typing import Optional
from datetime import date
import tempfile

from dotenv import load_dotenv

from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.partition.html import partition_html

In [152]:
# URL of the page fetched by the live-download cell further down, which reads
# `test_url`. Exactly one assignment must be active, otherwise a fresh
# Restart & Run All fails with NameError. The AWS page is enabled because it is
# the one the recorded output of that cell was produced from; swap the active
# line to try another site.
# test_url = "https://www.nordstrom.com/browse/men/shoes/boots"
# test_url = "https://www.nordstrom.com/browse/men/clothing/jeans?filterByColor=blue"
test_url = "https://aws.amazon.com/ec2/instance-types/g4/"
# test_url = "https://www.gucci.com/us/en/women/clothing/dresses"
# test_url = "https://www.gucci.com/us/en/ca/women/handbags-c-women-handbags"

In [153]:
def download_html(url: str, timeout: float = 30.0) -> str:
    """
    Downloads HTML content from a given URL.

    Args:
        url: The URL to download HTML from
        timeout: Seconds to wait for the server before giving up. Passed
            straight through to ``requests.get``.

    Returns:
        The HTML content as a string

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an error status (raised by
            ``raise_for_status``)
    """
    # requests applies no timeout by default; without one a stalled server
    # would block this call (and the notebook kernel) indefinitely.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


In [154]:
# html_content = download_html(test_url)
# print(f"HTML content: {html_content}")


In [155]:
def get_test_html(path: str = "test_web_page.html") -> str:
    """
    Reads a saved HTML page from disk and returns its full text.

    Args:
        path: Location of the HTML file. Defaults to ``test_web_page.html``,
            resolved against the current working directory.

    Returns:
        The file's content as a string

    Raises:
        OSError: If the file cannot be opened
        UnicodeDecodeError: If the file is not valid UTF-8
    """
    # Pin the encoding: the platform default differs between systems, which
    # would make the same fixture decode differently (or fail) per machine.
    with open(path, "r", encoding="utf-8") as f:
        return f.read()

In [156]:
# Live fetch of `test_url` (which must already be defined by an earlier cell),
# followed by a pass through unstructured's HTML partitioner.
# requests applies no timeout by default, so set one explicitly to keep a
# stalled server from hanging the kernel.
response = requests.get(test_url, timeout=30)
response.raise_for_status()
print(f"Response: {response}")
html_content = response.text
# Show only the head of the document; printing the whole page floods the cell
# output and bloats the saved notebook.
print(f"HTML content (first 1000 of {len(html_content)} chars): {html_content[:1000]}")
extracted_elements = partition_html(text=html_content)
print([element.text for element in extracted_elements])

# # Save HTML to temporary file
# with tempfile.NamedTemporaryFile(mode='w', suffix='.html', delete=False) as tmp_file:
#     tmp_file.write(html_content)
#     tmp_path = tmp_file.name

#     # Read from temporary file
#     extracted_elements = partition_html(filename=tmp_path)
#     print([element.text for element in extracted_elements])

Response: <Response [200]>
HTML content: <!doctype html>
<html class="no-js aws-lng-en_US aws-with-target" lang="en-US" data-static-assets="https://a0.awsstatic.com" data-js-version="1.0.590" data-css-version="1.0.506">
 <head> 
  <meta http-equiv="Content-Security-Policy" content="default-src 'self' data: https://a0.awsstatic.com https://prod.us-east-1.ui.gcr-chat.marketing.aws.dev; base-uri 'none'; connect-src 'self' https://*.analytics.console.aws.a2z.com https://*.panorama.console.api.aws https://*.prod.chc-features.uxplatform.aws.dev https://112-tzm-766.mktoresp.com https://112-tzm-766.mktoutil.com https://a0.awsstatic.com https://a0.p.awsstatic.com https://a1.awsstatic.com https://amazonwebservices.d2.sc.omtrdc.net https://amazonwebservicesinc.tt.omtrdc.net https://api.regional-table.region-services.aws.a2z.com https://api.us-west-2.prod.pricing.aws.a2z.com https://auth.aws.amazon.com https://aws.amazon.com https://aws.amazon.com/p/sf/ https://aws.demdex.net https://b0.p.awsstati

In [157]:
# TODO: partition_html returns a flat list of elements and does not know which elements
# are associated with each other (e.g. which price goes with which product name).
# Open question: is it better to use the custom parser system, or partition_html out of the box?

In [158]:
# def split_html(html: str) -> list[str]:
#     text_splitter = RecursiveCharacterTextSplitter(
#         chunk_size=5000,
#         chunk_overlap=100,
#         length_function=len,
#         is_separator_regex=False,
#     )
#     return text_splitter.split_text(html)

In [159]:
# Parse the saved fixture page instead of hitting the network.
test_file = get_test_html()
elements = partition_html(text=test_file)
print(f"len(elements): {len(elements)}")

# Inspect one window of the parsed elements: their text first, then the link
# URLs carried in each element's metadata.
inspected = elements[420:500]
element_texts = [el.text for el in inspected]
print(element_texts)
element_links = [el.metadata.link_urls for el in inspected]
print(element_links)


len(elements): 649
['New In: Men', "Explore new in men's ready-to-wear collection and new arrivals in men's shoes.", 'Lasered denim jacket', '$ 2,550', 'Shop This', "Men's Gucci Re-Web sneaker", '$ 1,150', 'Shop This', 'Tapered denim pant with Web', '$ 1,300', 'Shop This', 'Rib knit wool hat', '$ 480', 'Shop This', 'G-Timeless watch, 38mm', '$ 1,550', 'Shop This', 'Square frame sunglasses', '$ 480', 'Shop This', 'Shop This', 'Wool sweater with Gucci intarsia', '$ 1,250', 'Shop This', 'VIRTUAL TRY-ON', "Men's Horsebit 1953 loafer", '$ 990', 'Shop This', 'Polyester GG jacquard pant with Web', '$ 1,750', 'Shop This', 'Icon 18k GG thin ring', '$ 1,850', 'Shop This', 'RUNWAY', 'Jackie large shoulder bag', '$ 4,800', 'Shop This', 'GG Marmont chain bracelet', '$ 340', 'Shop This', 'Rib knit wool hat', '$ 480', 'Shop This', 'Denim shirt with lasered Gucci detail', '$ 2,650', 'Shop This', "Men's ankle boot with Horsebit", '$ 1,270', 'Shop This', 'RUNWAY', 'GG Marmont thin belt', '$ 530', 'Shop 