In [169]:
import os
import re
import json
from functools import partial
import io
from typing import Any, Callable
import requests
import ast
from typing import Optional
from datetime import date
import tempfile
import concurrent.futures
import time

from dotenv import load_dotenv

from pydantic import BaseModel, Field
from bs4 import BeautifulSoup
from langchain_core.prompts import PromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.messages import AIMessage
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.partition.html import partition_html

In [None]:
# NOTE: partition_html seems to be very slow, so the plan is to use the
# first method in clothing_item_extraction instead.

In [170]:
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API keys for the search/LLM
# clients imported above — confirm which variables are required.
load_dotenv()

True

In [171]:
# Passed as `max_results` to TavilySearchResults in get_search_results.
SEARCH_MAX_RESULTS = 3
# Example query; in this notebook it is only referenced by the commented-out
# process_search_results(get_search_results(SEARCH_QUERY)) call.
SEARCH_QUERY = "men's jeans"

In [172]:
# test_url = "https://www.nordstrom.com/browse/men/shoes/boots"
# test_url = "https://www.nordstrom.com/browse/men/clothing/jeans?filterByColor=blue"
# test_url = "https://aws.amazon.com/ec2/instance-types/g4/"
# test_url = "https://www.gucci.com/us/en/women/clothing/dresses"
# test_url = "https://www.gucci.com/us/en/ca/women/handbags-c-women-handbags"

In [173]:
def run_with_timeout(function: Callable, timeout_seconds: float = 30) -> Any:
    """
    Run a zero-argument callable in a worker thread, waiting at most
    ``timeout_seconds`` for its result.

    Args:
        function: Callable taking no arguments (bind arguments beforehand,
            e.g. with ``functools.partial``).
        timeout_seconds: Maximum time in seconds to wait (default: 30).

    Returns:
        The callable's return value, or None if the timeout elapsed first.
        A callable that itself returns None is indistinguishable from a
        timeout.

    Raises:
        Exception: Whatever ``function`` raises is re-raised to the caller.
    """
    # Deliberately NOT a ``with`` block: leaving the context manager calls
    # shutdown(wait=True), which blocks until the worker finishes and would
    # make the timeout ineffective.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(function)
        return future.result(timeout=timeout_seconds)
    except concurrent.futures.TimeoutError:
        print(f"Function timed out ({timeout_seconds}s)")
        return None
    finally:
        # A running thread cannot be killed; a timed-out call keeps running
        # in the background, but the caller is no longer blocked on it.
        executor.shutdown(wait=False)

In [174]:
def get_search_results(search_query: str, request_timeout: float = 10.0) -> list[str]:
    """
    Run a Tavily web search and download the HTML of every result page.

    Pages that cannot be downloaded (connection error, HTTP error status,
    timeout, ...) are reported on stdout and skipped, so the returned list
    may be shorter than the number of search results.

    Args:
        search_query: The query string sent to the search tool.
        request_timeout: Seconds to wait for each page's server to connect
            or send data before giving up on that page (default: 10). This
            is the ``requests`` timeout, not a cap on total download time.

    Returns:
        The HTML text of each successfully downloaded page, in result order.
    """
    tavily_search = TavilySearchResults(
        max_results=SEARCH_MAX_RESULTS, include_raw_content=True
    )
    search_results = tavily_search.invoke({"query": search_query})
    extracted_htmls = []
    for res in search_results:
        print(f"Search result: {res}")
        url = res["url"]
        try:
            # Without a timeout, requests waits indefinitely on an
            # unresponsive host and stalls the whole loop.
            response = requests.get(url, timeout=request_timeout)
            response.raise_for_status()
            extracted_htmls.append(response.text)
        except Exception as e:
            # Best effort: one bad page must not abort the remaining results.
            print(f"Error fetching HTML from {url}: {e}")
            continue
    return extracted_htmls

In [175]:
def download_html(url: str, timeout: float = 10.0) -> str:
    """
    Downloads HTML content from a given URL.

    Args:
        url: The URL to download HTML from
        timeout: Seconds to wait for the server to connect or send data
            before giving up (default: 10)

    Returns:
        The HTML content as a string

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status
    """
    # Without a timeout, requests can wait forever on an unresponsive host.
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return response.text


In [176]:
# html_content = download_html(test_url)
# print(f"HTML content: {html_content}")


In [177]:
def get_test_html() -> str:
    """
    Read the local HTML fixture ``test_web_page.html`` from the current
    working directory.

    Returns:
        The file's content as a string.

    Raises:
        FileNotFoundError: If ``test_web_page.html`` does not exist.
    """
    # Decode explicitly as UTF-8 instead of the platform default encoding,
    # so the fixture reads the same on every OS/locale.
    with open("test_web_page.html", "r", encoding="utf-8") as f:
        return f.read()

In [178]:
def _partition_html_with_timeout(html, timeout_seconds):
    """Run ``partition_html`` on one document, waiting at most ``timeout_seconds``."""
    # One single-worker pool per document, shut down without waiting: a
    # partition call that overran keeps running in its own background thread
    # and cannot delay the documents processed after it.
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(partition_html, text=html)
        return future.result(timeout=timeout_seconds)
    finally:
        executor.shutdown(wait=False)


def process_search_results(search_results, timeout_seconds=30) -> list:
    """
    Partition each HTML document into elements, enforcing a per-document
    timeout.

    Documents whose partitioning times out or raises are reported on stdout
    and skipped.

    Args:
        search_results: Iterable of HTML documents (strings) to partition
        timeout_seconds: Maximum time in seconds to wait for partitioning
            one document (default: 30)

    Returns:
        A flat list with the elements extracted from every document that
        was partitioned successfully, in input order.
    """
    extracted_elements = []
    for search_result in search_results:
        try:
            # Report only the size: printing whole pages floods the output.
            print(f"HTML content length: {len(search_result)} characters")
            elems = _partition_html_with_timeout(search_result, timeout_seconds)
            extracted_elements.extend(elems)
        except concurrent.futures.TimeoutError:
            print(f"Timeout exceeded ({timeout_seconds}s) while partitioning HTML")
            continue
        except Exception as e:
            print(f"Error partitioning HTML: {e}")
            continue
    return extracted_elements

In [179]:
# process_search_results(get_search_results(SEARCH_QUERY))

In [181]:
# Live end-to-end run: search the web for the query, download each result
# page, partition the HTML into elements, then preview the text of the first
# ten elements.
# NOTE(review): the saved output of this cell ends in a KeyboardInterrupt, so
# the recorded run did not complete.
query = "Gucci handbags"
extracted_elements = process_search_results(get_search_results(query))
print(f"len(extracted_elements): {len(extracted_elements)}")
print([element.text for element in extracted_elements[0:10]])

Search result: {'url': 'https://www.farfetch.com/shopping/women/gucci/bags-purses-1/items.aspx', 'content': 'Shop Gucci handbags for women at FARFETCH US, featuring the Dionysus, Ophidia, GG Marmont and more. Find your perfect Gucci bag in various sizes, colors and styles.'}
Error fetching HTML from https://www.farfetch.com/shopping/women/gucci/bags-purses-1/items.aspx: ('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))
Search result: {'url': 'https://www.gucci.com/us/en/ca/women/handbags-c-women-handbags', 'content': 'Shop Designer Handbags, Crossbody Bags, Belt Bags & Shoulder Bags for Women at GUCCI.com. Enjoy Free Shipping, Returns & Complimentary Gift Wrapping.'}


KeyboardInterrupt: 