In [None]:
!pip install beautifulsoup4 dateparser

In [1]:
from bs4 import BeautifulSoup
from typing import Dict, Optional
import dateparser

def _first_meta_content(soup, patterns):
    """Return the stripped ``content`` of the first matching <meta> tag, else None.

    Patterns are attribute dicts tried in order. A tag whose ``content`` is
    missing or blank is skipped so that a later pattern can still supply a value.
    """
    for attrs in patterns:
        meta_tag = soup.find('meta', attrs)
        if meta_tag is None:
            continue
        content = (meta_tag.get('content') or '').strip()
        if content:
            return content
    return None


def _author_from_markup(soup):
    """Return an author string taken from common byline markup, else None."""
    # Look for an authors div with nested spans or links
    authors_div = soup.find('div', class_=['authors', 'contributor', 'contributors'])
    if authors_div:
        names = [elem.get_text().strip() for elem in authors_div.find_all(['span', 'a'])]
        # Markup such as <a><span>Name</span></a> matches both the <a> and the
        # <span>; dict.fromkeys drops the repeated name while keeping order.
        names = list(dict.fromkeys(name for name in names if name))
        if names:
            return ', '.join(names)

    # Other common patterns: use the first match that actually carries text
    # (the first match may be an image-only avatar link with no text).
    byline_elements = soup.select(
        'a[rel="author"], .author, .byline, .c-author, [itemprop="author"], .contributor, .contributors'
    )
    for elem in byline_elements:
        text = elem.get_text().strip()
        if text:
            return text
    return None


def _traditional_extract_metadata(html_content: str) -> Dict[str, Optional[str]]:
    """Traditional metadata extraction method as fallback.

    Args:
        html_content: Raw HTML of the page.

    Returns:
        A dict with the keys ``'author'``, ``'publication_date'`` and
        ``'title'``. ``'publication_date'`` is formatted as ``YYYY-MM-DD``.
        Any field that cannot be found (or is blank) is ``None``.

    Extraction is best-effort: a failure in one field is reported on stdout
    and does not prevent the other fields from being extracted.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    metadata: Dict[str, Optional[str]] = {
        'author': None,
        'publication_date': None,
        'title': None,
    }

    # Author: meta tags first, then visible byline markup
    try:
        author_patterns = [
            {'name': 'author'},
            {'property': 'author'},
            {'property': 'article:author'},
            {'name': 'byl'},
            {'name': 'twitter:creator'},
        ]
        metadata['author'] = (
            _first_meta_content(soup, author_patterns) or _author_from_markup(soup)
        )
    except Exception as e:
        print(f"Error extracting author: {str(e)}")

    # Publication date extraction with standardized format
    try:
        date_patterns = [
            {'name': 'publication_date'},
            {'property': 'article:published_time'},
            {'property': 'article:published'},
            {'name': 'date'},
            {'itemprop': 'datePublished'},
            {'name': 'publishedDate'},
        ]
        date = _first_meta_content(soup, date_patterns)

        # If no meta tag found, fall back to the first <time> element that
        # carries a machine-readable value
        if not date:
            for time_elem in soup.find_all('time'):
                date = time_elem.get('datetime') or time_elem.get('data-timestamp')
                if date:
                    break

        # Standardize date format if a date was found
        if date:
            parsed_date = dateparser.parse(date)
            if parsed_date:
                metadata['publication_date'] = parsed_date.strftime('%Y-%m-%d')
    except Exception as e:
        # Reported the same way as author failures instead of being swallowed
        print(f"Error extracting publication date: {str(e)}")

    # Title: text of the first <title> element; blank titles are treated as missing
    title_tag = soup.find('title')
    if title_tag is not None:
        metadata['title'] = title_tag.get_text().strip() or None

    return metadata

In [3]:
from bs4 import BeautifulSoup
from typing import Optional, List
import json
import re

def _jsonld_article_body(soup) -> Optional[str]:
    """Return the first non-empty ``articleBody`` from Article-typed JSON-LD, else None.

    Must be called before <script> elements are removed from ``soup``.
    """
    for script in soup.find_all("script", type="application/ld+json"):
        try:
            data = json.loads(script.string or "")
        except (TypeError, ValueError, RecursionError):
            # Malformed/empty block: skip it, a later block may still be valid
            continue

        # Normalize to a flat list of dict items, including members of a
        # top-level "@graph" container
        items: List[dict] = []
        for entry in (data if isinstance(data, list) else [data]):
            if not isinstance(entry, dict):
                continue
            items.append(entry)
            graph = entry.get("@graph")
            if isinstance(graph, list):
                items.extend(node for node in graph if isinstance(node, dict))

        for item in items:
            # Article-like types (Article, NewsArticle, ...); @type may be a list
            t = item.get("@type") or item.get("type")
            type_names = t if isinstance(t, list) else [t]
            is_article = any(str(x).lower().endswith("article") for x in type_names if x)
            if not (is_article and item.get("articleBody")):
                continue
            text = str(item.get("articleBody")).strip()
            if text:
                # Normalize whitespace
                text = re.sub(r"\s+\n", "\n", text)
                text = re.sub(r"\n{3,}", "\n\n", text).strip()
                if text:
                    return text
    return None


def _traditional_extract_article_body(html_content: str) -> Optional[str]:
    """Traditional article-body extraction as a fallback.

    Tries JSON-LD ``articleBody`` first, then common containers: scores
    candidates, removes boilerplate, and returns cleaned text.

    Args:
        html_content: Raw HTML of the page.

    Returns:
        The extracted article text with paragraphs separated by blank lines,
        or ``None`` if no text could be extracted.
    """
    soup = BeautifulSoup(html_content, "html.parser")

    # 1) Try JSON-LD articleBody if available (some sites embed full text here).
    #    This has to run BEFORE the noise removal below, which deletes every
    #    <script> element, including the application/ld+json ones.
    jsonld_text = _jsonld_article_body(soup)
    if jsonld_text:
        return jsonld_text

    # 2) Remove obvious noise
    for tag in soup.find_all(["script", "style", "noscript", "svg", "canvas", "form"]):
        tag.decompose()
    # Remove common non-content blocks (ads, nav, etc.)
    for sel in [
        "[role='navigation']",
        "nav",
        "header",
        "footer",
        "aside",
        # "ad-" is matched only at the start of the id/class value, at the
        # start of a class token, or as a "-ad-" segment. A plain substring
        # match would also hit "head-", "lead-", "read-more", "download-", ...
        ".ads, .ad, .advert, [id^='ad-'], [id*='-ad-'], "
        "[class^='ad-'], [class*=' ad-'], [class*='-ad-']",
        ".subscribe, .paywall, .newsletter",
        ".share, .social, .comments, #comments",
        ".breadcrumbs, .tags, .related, .recommendations",
        "figure, .figure, .media, .video, .photo, .gallery",
    ]:
        for el in soup.select(sel):
            el.decompose()

    # 3) Candidate selectors for typical CMS structures
    selectors = [
        "main article",
        "article",
        "[itemprop='articleBody']",
        "section[itemprop='articleBody']",
        "div[itemprop='articleBody']",
        "div[data-component='article-body']",
        "section[data-component='article-body']",
        ".article-body, .articleBody, .article__body, .c-article-body, .l-articleBody",
        ".content__article-body, .content-article__body",
        ".entry-content, .post-content, .post__content, .td-post-content",
        ".story-body, #story-body, #article-body",
        ".article-content, .article__content, .article--content, .articleText, .article-text",
        ".field-name-body, .content-body, .single-content",
        ".rich-text, .wysiwyg-content",
        "main .content, main .container, main .post, main .entry",
    ]

    def score_node(node) -> int:
        """Heuristic score: prefer many <p>, long text; penalize link-heavy blocks."""
        if not node:
            return -1
        # Gather paragraph text
        ps = node.find_all("p")
        p_text = "\n".join(p.get_text(separator=" ", strip=True) for p in ps)
        p_len = len(p_text)

        # Link density penalty; a node without paragraph text counts as all links
        links = node.find_all("a")
        link_text_len = sum(len(a.get_text() or "") for a in links)
        link_density = (link_text_len / p_len) if p_len > 0 else 1.0

        # Headings can be informative; slight boost
        h_count = len(node.find_all(["h2", "h3"]))

        score = (
            (len(ps) * 30) +           # more paragraphs is good
            (p_len // 50) +            # longer text is good
            (h_count * 5)              # some headings help
        )
        # Penalize high link density
        score -= int(link_density * 20)

        return score

    # 4) Collect candidates and score
    candidates = []
    for sel in selectors:
        for node in soup.select(sel):
            candidates.append((score_node(node), node))

    # 5) If nothing matched, broaden: look for the largest <main> or central column
    if not candidates:
        broad_selectors = ["main", "#main", ".main", ".content", "#content", ".container", ".article"]
        for sel in broad_selectors:
            for node in soup.select(sel):
                candidates.append((score_node(node), node))

    # 6) Choose best candidate and build text
    best_text = ""
    if candidates:
        # max() returns the earliest candidate among equal top scores
        _, best = max(candidates, key=lambda c: c[0])

        text_tags = ["h2", "h3", "p", "li"]

        def is_nested(el) -> bool:
            """True if ``el`` sits inside another collected tag within ``best``."""
            for parent in el.parents:
                if parent is best:
                    return False
                if parent.name in text_tags:
                    return True
            return False

        # Inside the best node, prefer paragraphs and simple headings
        parts: List[str] = []
        for el in best.find_all(text_tags):
            # The text of e.g. <li><p>..</p></li> is already emitted via the
            # outer <li>; emitting the inner <p> too would duplicate it.
            if is_nested(el):
                continue
            t = el.get_text(separator=" ", strip=True)
            # Skip empty items and very short crumbs / nav-like items
            if len(t) < 3:
                continue
            parts.append(t)

        best_text = "\n\n".join(parts).strip()

    # 7) Ultimate fallback: all <p> in document (but try to keep it sane)
    if not best_text:
        ps = [p.get_text(separator=" ", strip=True) for p in soup.find_all("p")]
        ps = [t for t in ps if len(t) > 30]  # filter out tiny snippets
        best_text = "\n\n".join(ps).strip()

    # 8) Clean & normalize whitespace
    if best_text:
        best_text = re.sub(r"[ \t]+\n", "\n", best_text)
        best_text = re.sub(r"\n{3,}", "\n\n", best_text)
        best_text = re.sub(r"[ \t]{2,}", " ", best_text).strip()

    return best_text or None


In [None]:
!pip install newspaper4k lxml_html_clean
import nltk
nltk.download('punkt_tab')

(['איתי יעקב'],
 '2025-10-24',
 'בלילה שבין מותם הטרגי של אמה, מיכל זקס, בעלת עסק לבניית ציפורניים, ואחיה, רב טוראי איתן זקס, מפגיעת טיל ישירה בביתם שבבאר שבע בחודש יוני, לבין טקס הלוויה, ישבה אליענה זקס והכינה שתי רשימות. האחת מורכבת מ-20 דברים שהיא רוצה להגשים עבורם, כמו לטייל בכל המדינות שאחיה בן ה-18 חלם ולא הספיק לראות בעיניו; השנייה היא 20 הרגלים אישיים, עשרה לכל אחד, שהיא רוצה לאמץ כדי לברוא אליענה חדשה. "אני חושבת שהם היו אנשים הרבה יותר טובים ממני, אנשים של נתינה", היא אומרת בריאיון מיוחד ל-מגזין סופ"ש של ynet ארבעה חודשים לאחר אותה טרגדיה.\n\nמה היית רוצה שאנשים ידעו וילמדו מהם? "הייתי רוצה שחבר\'ה בגילו של אחי ילמדו ממנו אהבת הארץ. למרות הכול. הוא אהב את המדינה ואהב אנשים. הוא היה אוזן קשבת לכל אדם שהוא פגש והייתה לו יכולת פנומנלית להתחבר לאנשים מכל גיל ומעמד. מאז שהוא נפטר, עשרות החברים שלו לא עזבו את אבא שלי, מה שמראה איזה בן אדם של אנשים הוא היה. גם אמא שלי הייתה אדם מאוד חברותי ומלאה בנתינה, שהקדישה חלק גדול מחייה לאחרים – בין אם זה לאסוף סלי מזון לחיילים ולהסיע אותם לבס