In [1]:
from unstructured import partition
from unstructured.partition.html import partition_html

In [2]:
import requests
import os
from urllib.parse import urlparse
from pathlib import Path

def download_and_save_html(url, save_dir="raw_html", filename=None):
    """
    Download an HTML page and save it to a local file.
    
    Args:
        url (str): The URL of the webpage to download
        save_dir (str): Directory to save the HTML file (default: "raw_html")
        filename (str): Custom filename (optional). If not provided, will generate from URL
    
    Returns:
        str: Path to the saved HTML file
    """
    try:
        # Create directory if it doesn't exist
        Path(save_dir).mkdir(parents=True, exist_ok=True)
        
        # Generate filename if not provided
        if filename is None:
            parsed_url = urlparse(url)
            # Use domain name and path for filename
            domain = parsed_url.netloc.replace('www.', '')
            path = parsed_url.path.replace('/', '_').strip('_')
            if path:
                filename = f"{domain}_{path}.html"
            else:
                filename = f"{domain}.html"
        
        # Ensure filename has .html extension
        if not filename.endswith('.html'):
            filename += '.html'
        
        # Download the webpage
        headers = {
            'User-Agent': "ErfanResearchBot/1.0 (e.ghiyasvand@gmail.com)"
        }
        
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()  # Raise an exception for bad status codes
        
        # Save to file
        file_path = os.path.join(save_dir, filename)
        with open(file_path, 'w', encoding='utf-8') as f:
            f.write(response.text)
        
        print(f"Successfully downloaded and saved: {file_path}")
        print(f"File size: {os.path.getsize(file_path)} bytes")
        
        return file_path
        
    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return None
    except Exception as e:
        print(f"Error saving file: {e}")
        return None


In [3]:
# Example usage of the download_and_save_html function

# Download the Stardew Valley Energy page
url = "https://stardewvalleywiki.com/Energy"
saved_file = download_and_save_html(url)

# You can also specify a custom filename
# saved_file = download_and_save_html(url, filename="energy_page.html")

# Or save to a different directory
# saved_file = download_and_save_html(url, save_dir="my_html_files", filename="energy.html")


Successfully downloaded and saved: raw_html\stardewvalleywiki.com_Energy.html
File size: 69369 bytes


In [4]:
def partition_saved_html(html_file_path):
    """
    Partition a saved HTML file using unstructured library.
    
    Args:
        html_file_path (str): Path to the saved HTML file
    
    Returns:
        list: List of partitioned elements from the HTML file
    """
    try:
        # Check if file exists
        if not os.path.exists(html_file_path):
            print(f"Error: File {html_file_path} does not exist")
            return None
        
        # Partition the HTML file
        elements = partition_html(filename=html_file_path)
        
        print(f"Successfully partitioned {html_file_path}")
        print(f"Found {len(elements)} elements")
        
        return elements
        
    except Exception as e:
        print(f"Error partitioning HTML file: {e}")
        return None


In [43]:
def partition_saved_html_with_strategy(html_file_path):

    try:
        # Check if file exists
        if not os.path.exists(html_file_path):
            print(f"Error: File {html_file_path} does not exist")
            return None
        
        # Partition the HTML file
        elements = partition_html(
            filename=html_file_path,
            infer_table_structure=True,
            strategy="hi_res",
            chunking_strategy="by_title",
            include_page_breaks=True,
            # max_characters=10000,
            # combine_text_under_n_chars=1000,
            )
        
        print(f"Successfully partitioned {html_file_path}")
        print(f"Found {len(elements)} elements")
        
        return elements
        
    except Exception as e:
        print(f"Error partitioning HTML file: {e}")
        return None

In [5]:
# Example: Partition the saved HTML file
if saved_file:
    elements = partition_saved_html(saved_file)
    
    # Display some information about the elements
    if elements:
        print("\nElement types found:")
        element_types = {}
        for element in elements:
            element_type = type(element).__name__
            element_types[element_type] = element_types.get(element_type, 0) + 1
        
        for element_type, count in element_types.items():
            print(f"  {element_type}: {count}")
        
        # Show first few elements
        print(f"\nFirst 3 elements:")
        for i, element in enumerate(elements[:3]):
            print(f"  {i+1}. {type(element).__name__}: {str(element)[:100]}...")


Successfully partitioned raw_html\stardewvalleywiki.com_Energy.html
Found 70 elements

Element types found:
  Title: 11
  Text: 4
  NarrativeText: 21
  Table: 3
  Image: 5
  ListItem: 26

First 3 elements:
  1. Title: Energy...
  2. Text: From Stardew Valley Wiki...
  3. NarrativeText: Jump to navigation Jump to search...


In [44]:
elements2 = partition_saved_html_with_strategy(saved_file)

Successfully partitioned raw_html\stardewvalleywiki.com_Energy.html
Found 22 elements


In [45]:
for i , el in enumerate(elements2):
    print("-------- Element", i, "--------")
    print(el)

-------- Element 0 --------
Energy

From Stardew Valley Wiki

Jump to navigation Jump to search

Getting Started The Player Options Controls Mobile Controls Health Energy Skills Day Cycle

EnergyBar.png

A slightly depleted energy bar

A player requires Energy to use tools, with the exception of scythes, weapons, and pans. Picking up or moving a chest with your empty hands also requires energy. Currently available energy is indicated on the player's energy bar, which reflects the decreases as a day progresses.
-------- Element 1 --------
Energy can be replenished up to its maximum by eating positive-energy food. Consuming negative-energy foods such as Sap or Void Mayonnaise reduces energy. To consume food, put it on the hotbar (the top row of inventory) and press action (right-click) with it. A notification in the bottom left of the screen will display energy and health gained by eating food. Typically, more expensive food provides more energy than cheaper food items.
-------- Element 

In [29]:
len(elements2[1].text)

438