In [13]:
import requests
from readability import Document
import trafilatura
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from html import escape
import ollama

def fetch_and_clean_html(url: str, timeout: float = 30.0) -> dict:
    """
    Download a web page and reduce it to its main article content.

    Steps:
      1) Fetch raw HTML with requests (bounded by ``timeout``).
      2) Use readability-lxml to get minimal 'main content' HTML.
      3) Rewrite relative <img src> URLs as absolute URLs against ``url`` so
         they still resolve when the saved page is opened from another
         location (the images themselves are not downloaded).
      4) Extract text with Trafilatura (fallback: BeautifulSoup).

    Args:
        url: Address of the page to fetch; also the base for resolving
            relative image URLs.
        timeout: Value forwarded to ``requests.get(..., timeout=...)``.
            Without it a stalled server would block this call forever.

    Returns:
        A dict with three keys:
          - "title": readability's short title, or "No Title" if empty.
          - "cleaned_html": main-content HTML with image URLs made absolute.
          - "final_text": plain text extracted from the cleaned HTML.

    Raises:
        RuntimeError: if the server answers with any status other than 200.
        Errors raised by ``requests.get`` itself (connection failure,
        timeout, ...) propagate unchanged.
    """
    # --- Step 1: Fetch raw HTML ---
    # Always pass a timeout: requests waits indefinitely by default.
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to fetch page (status {response.status_code}): {url}"
        )
    html_content = response.text

    # --- Step 2: Readability ---
    doc = Document(html_content)
    title = doc.short_title() or "No Title"
    readability_html = doc.summary()  # Minimal HTML focusing on main content

    # --- Step 3: Make image URLs absolute ---
    soup = BeautifulSoup(readability_html, "html.parser")
    for img_tag in soup.find_all("img"):
        src = img_tag.get("src")
        if src:  # skip <img> tags with a missing or empty src
            img_tag["src"] = urljoin(url, src)
    cleaned_html = str(soup)

    # --- Step 4: Extract text for summarization ---
    extracted_text = trafilatura.extract(cleaned_html)
    if not extracted_text:
        # Fallback to BeautifulSoup plain text if Trafilatura returns nothing
        extracted_text = soup.get_text(separator="\n", strip=True)

    return {
        "title": title,
        "cleaned_html": cleaned_html,
        "final_text": extracted_text,
    }

def summarize_with_ollama(text: str, model: str = "llama2") -> str:
    """
    Summarize the given text using a locally running Ollama service.

    ``ollama.generate`` is called without streaming, so it returns a single
    response object rather than a sequence of tokens. The generated text is
    stored in that object's ``response`` field. Iterating over the object
    instead (as an earlier version of this function did) yields its field
    names, not the summary.

    Args:
        text: The article text to summarize.
        model: Name of the Ollama model to run (default "llama2").

    Returns:
        The stripped summary text, or "No summary produced." when the input
        is blank or the model returns no text.
    """
    # Nothing to summarize: skip the model call so it cannot invent content.
    if not text or not text.strip():
        return "No summary produced."

    prompt_text = (
        "You are an expert summarizer. "
        "Read the following text and provide a concise, accurate summary.\n\n"
        f"{text}\n\n"
        "Summary:"
    )

    response = ollama.generate(model=model, prompt=prompt_text)

    # Depending on the client version the reply is a plain dict or a
    # subscriptable response object; fall back to attribute access for
    # objects that do not support item lookup.
    try:
        generated = response["response"]
    except (KeyError, TypeError):
        generated = getattr(response, "response", None)

    full_summary = (generated or "").strip()
    return full_summary if full_summary else "No summary produced."

def generate_final_html(title: str, cleaned_html: str, summary: str) -> str:
    """
    Assemble a standalone HTML page from an article and its summary.

    The page consists of the title (used for both <title> and <h1>), the
    cleaned article markup, a horizontal rule, and a "Summary" section.

    Args:
        title: Page title; HTML-escaped before insertion.
        cleaned_html: Article markup; inserted verbatim, without escaping,
            so its tags and images render.
        summary: Summary text; HTML-escaped and shown in a paragraph that
            preserves whitespace.

    Returns:
        The complete HTML document as one string ending in a newline.
    """
    heading = escape(title)
    summary_markup = escape(summary)

    page_lines = [
        "<!DOCTYPE html>",
        "<html>",
        "<head>",
        '  <meta charset="UTF-8"/>',
        f"  <title>{heading}</title>",
        "</head>",
        "<body>",
        f"  <h1>{heading}</h1>",
        "",
        f"  {cleaned_html}",
        "",
        "  <hr/>",
        "  <h2>Summary</h2>",
        f'  <p style="white-space: pre-wrap;">{summary_markup}</p>',
        "</body>",
        "</html>",
        "",  # trailing empty entry produces the final newline
    ]
    return "\n".join(page_lines)

In [14]:
if __name__ == "__main__":
    # Page to archive and summarise
    url_to_process = "https://hebbarskitchen.com/paneer-ki-sabji-quick-paneer-curry/"

    # Stage 1: download the page and pull out the title, article HTML and text
    results = fetch_and_clean_html(url_to_process)
    title, cleaned_html, article_text = (
        results[field] for field in ("title", "cleaned_html", "final_text")
    )

    # Stage 2: ask the local Ollama model for a summary of the article text
    summary_text = summarize_with_ollama(article_text, model="llama2")

    # Stage 3: combine the article markup and the summary into one page
    final_output_html = generate_final_html(title, cleaned_html, summary_text)

    # Stage 4: write the page to disk as UTF-8
    out_file = "article_with_summary.html"
    with open(out_file, mode="w", encoding="utf-8") as f:
        f.write(final_output_html)

    print(f"HTML saved to: {out_file}")

HTML saved to: article_with_summary.html
