In [12]:
import requests
import spacy
from readability import Document
import trafilatura
from bs4 import BeautifulSoup
from html import escape
from urllib.parse import urljoin

# Loaded spaCy pipelines keyed by model name, so repeated calls reuse the
# already-loaded pipeline instead of calling spacy.load() every time.
_SPACY_PIPELINES: dict = {}


def _get_spacy_pipeline(model_name: str):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def process_webpage_url(
    url: str,
    timeout: float = 30.0,
    spacy_model: str = "en_core_web_sm",
) -> dict:
    """
    Fetch a web page, reduce it to its main content, and run NER on the text.

    Steps:
    1) Fetch the webpage HTML via requests.
    2) Use Readability to create a simplified HTML and rewrite every
       <img src="..."> to an absolute URL.
    3) Convert that HTML to text for spaCy (NER) using Trafilatura, falling
       back to BeautifulSoup's get_text() when Trafilatura returns nothing.
    4) Run the spaCy pipeline over the text and collect its entities.

    Args:
        url: Address of the page to fetch; also the base for resolving
            relative image URLs.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, a stalled server could block the call forever.
        spacy_model: Name of the spaCy pipeline to load for NER.

    Returns:
        A dict with the keys:
        - 'title': Readability's short title, or "No Title" if it is empty.
        - 'readability_html': cleaned HTML with absolute <img> URLs.
        - 'final_text': the plain text that was fed to spaCy.
        - 'entities': list of {"text": ..., "label": ...} dicts, one per
          entity found by spaCy.

    Raises:
        Exception: if the response status code is not 200.
        requests.RequestException: propagated from ``requests.get`` (for
            example on connection failure or when ``timeout`` elapses).
    """
    # Step 1: Fetch the raw HTML (bounded by `timeout` so the call cannot hang).
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch page (status {response.status_code}): {url}")

    # When the server does not declare a charset, requests falls back to
    # ISO-8859-1 for text responses, which garbles UTF-8 pages. In that case
    # use the encoding detected from the body instead.
    content_type = response.headers.get("Content-Type", "")
    if "charset" not in content_type.lower():
        response.encoding = response.apparent_encoding
    html_content = response.text

    # Step 2: Readability
    # Note: readability-lxml does not have a simple "keep_images=True" argument,
    #       but some images deemed "important" should remain.
    doc = Document(html_content)
    title = doc.short_title() or "No Title"

    # Minimal HTML containing the main content (often includes key images).
    readability_html = doc.summary()

    # Convert relative <img> URLs to absolute so that images remain valid
    # after the HTML is saved locally.
    soup_cleaned = BeautifulSoup(readability_html, "html.parser")
    for img_tag in soup_cleaned.find_all("img"):
        src = img_tag.get("src")
        if src:
            img_tag["src"] = urljoin(url, src)
    readability_html_with_images = str(soup_cleaned)

    # Step 3: Extract text for spaCy. Trafilatura is tried first; if it
    # returns None/empty, fall back to BeautifulSoup's plain-text extraction.
    final_text = trafilatura.extract(readability_html_with_images)
    if not final_text:
        final_text = soup_cleaned.get_text(separator="\n", strip=True)

    # Step 4: Named Entity Recognition with spaCy (pipeline is cached).
    nlp = _get_spacy_pipeline(spacy_model)
    doc_spacy = nlp(final_text)
    entities = [{"text": ent.text, "label": ent.label_} for ent in doc_spacy.ents]

    return {
        "title": title,
        "readability_html": readability_html_with_images,  # Cleaned HTML with images
        "final_text": final_text,  # Plain text for NER
        "entities": entities,
    }

def generate_html_output(title: str, cleaned_html: str, entities: list) -> str:
    """
    Assemble a standalone HTML page from the extraction results.

    The page contains, in order:
    - the title, HTML-escaped, in both <title> and an <h1>;
    - ``cleaned_html`` inserted verbatim (it is NOT escaped, so any markup,
      including <img> tags, is kept as-is);
    - a "Named Entities" list with one <li> per entity, where both the
      entity text and its label are HTML-escaped. When ``entities`` is
      empty, a single "No entities found." item is emitted instead.

    Each element of ``entities`` must be a mapping with "text" and "label" keys.
    """
    heading = escape(title)

    # One <li> line per entity, or a single placeholder line when there are none.
    if entities:
        list_items = [
            f"  <li>{escape(item['text'])} [{escape(item['label'])}]</li>\n"
            for item in entities
        ]
    else:
        list_items = ["  <li>No entities found.</li>\n"]

    entity_section = "<h2>Named Entities</h2>\n<ul>\n" + "".join(list_items) + "</ul>\n"

    # The page is built line by line; the trailing empty string yields the
    # final newline after </html>.
    page_lines = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '  <meta charset="UTF-8">',
        f"  <title>{heading}</title>",
        "</head>",
        "<body>",
        f"  <h1>{heading}</h1>",
        f"  {cleaned_html}",
        f"  {entity_section}",
        "</body>",
        "</html>",
        "",
    ]
    return "\n".join(page_lines)





In [13]:
if __name__ == "__main__":
    # Demo run: extract one page, render it with its entity list, save to disk.
    url_to_process = "https://hebbarskitchen.com/paneer-ki-sabji-quick-paneer-curry/"
    output_filename = "extracted_output_with_images.html"

    # Fetch the page and get its title, cleaned HTML, and named entities.
    result = process_webpage_url(url_to_process)

    # Combine the cleaned HTML (images included) with the entity list.
    html_output = generate_html_output(
        result["title"],
        result["readability_html"],
        result["entities"],
    )

    # Write the page as UTF-8, matching the charset declared in its <meta> tag.
    with open(output_filename, mode="w", encoding="utf-8") as f:
        f.write(html_output)

    print(f"HTML saved to: {output_filename}")


HTML saved to: extracted_output_with_images.html
