In [None]:
!pip install beautifulsoup4 dateparser

In [2]:
from bs4 import BeautifulSoup
from typing import Dict, Optional
import dateparser

# <meta> attribute patterns, tried in order; the first tag with non-empty
# ``content`` wins.
_AUTHOR_META_PATTERNS = (
    {'name': 'author'},
    {'property': 'author'},
    {'property': 'article:author'},
    {'name': 'byl'},
    {'name': 'twitter:creator'},
)

_DATE_META_PATTERNS = (
    {'name': 'publication_date'},
    {'property': 'article:published_time'},
    {'property': 'article:published'},
    {'name': 'date'},
    {'itemprop': 'datePublished'},
    {'name': 'publishedDate'},
)

# CSS selectors used as the last-resort author lookup.
_AUTHOR_SELECTOR = (
    'a[rel="author"], .author, .byline, .c-author, '
    '[itemprop="author"], .contributor, .contributors'
)


def _first_meta_content(soup: 'BeautifulSoup', patterns) -> Optional[str]:
    """Return the stripped ``content`` of the first matching <meta> tag, or None."""
    for attrs in patterns:
        meta_tag = soup.find('meta', attrs=attrs)
        if meta_tag is None:
            continue
        content = (meta_tag.get('content') or '').strip()
        if content:
            return content
    return None


def _author_from_markup(soup: 'BeautifulSoup') -> Optional[str]:
    """Find an author name in common byline markup; return None if absent."""
    container = soup.find('div', class_=['authors', 'contributor', 'contributors'])
    if container is not None:
        names = []
        for elem in container.find_all(['span', 'a']):
            text = elem.get_text().strip()
            # find_all returns nested matches too, so <span><a>Name</a></span>
            # yields the same text twice; keep only the first occurrence.
            if text and text not in names:
                names.append(text)
        if names:
            return ', '.join(names)

    # Fall back to generic byline selectors, skipping elements with no text
    # so an empty match cannot shadow a later, populated one.
    for elem in soup.select(_AUTHOR_SELECTOR):
        text = elem.get_text().strip()
        if text:
            return text
    return None


def _raw_publication_date(soup: 'BeautifulSoup') -> Optional[str]:
    """Return the unparsed publication date string from meta or <time> tags."""
    raw_date = _first_meta_content(soup, _DATE_META_PATTERNS)
    if raw_date:
        return raw_date

    for time_elem in soup.find_all('time'):
        value = time_elem.get('datetime') or time_elem.get('data-timestamp')
        if value:
            return value
    return None


def _traditional_extract_metadata(html_content: str) -> Dict[str, Optional[str]]:
    """Extract author, publication date and title from an HTML document.

    Fallback extractor based on common meta tags and byline markup. Each
    field is extracted independently and on a best-effort basis: a failure
    while extracting one field is printed and leaves that field as ``None``
    without affecting the others.

    Args:
        html_content: The HTML markup to parse (handed to BeautifulSoup with
            the ``html.parser`` backend).

    Returns:
        A dict with the keys ``'author'``, ``'publication_date'`` and
        ``'title'``. ``'publication_date'`` is formatted as ``YYYY-MM-DD``
        when a date string was found and ``dateparser`` could parse it.
        Any field that could not be determined is ``None``.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    metadata = {
        'author': None,
        'publication_date': None,
        'title': None,
    }

    # Author: meta tags first, then byline markup in the page body.
    try:
        metadata['author'] = (
            _first_meta_content(soup, _AUTHOR_META_PATTERNS)
            or _author_from_markup(soup)
        )
    except Exception as e:
        print(f"Error extracting author: {str(e)}")

    # Publication date, standardized to YYYY-MM-DD.
    try:
        raw_date = _raw_publication_date(soup)
        if raw_date:
            parsed_date = dateparser.parse(raw_date)
            if parsed_date:
                metadata['publication_date'] = parsed_date.strftime('%Y-%m-%d')
    except Exception as e:
        # Report the failure the same way the author section does instead of
        # swallowing it silently.
        print(f"Error extracting publication date: {str(e)}")

    # Title: text of the <title> element, if there is one.
    title_tag = soup.find('title')
    if title_tag is not None:
        metadata['title'] = title_tag.get_text().strip() or None

    return metadata

In [4]:
import requests

# Fetch a sample article and run the fallback metadata extractor on it.
url = "https://www.ynet.co.il/news/article/skttx811ba"
# requests has no default timeout; without one this cell can hang forever.
r = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx rather than extracting metadata from an error page.
r.raise_for_status()
# Pass the raw bytes (not r.text) so BeautifulSoup does its own encoding detection.
_traditional_extract_metadata(r.content)

{'author': None,
 'publication_date': '2023-11-26',
 'title': 'ארה"ב: יותר מ-1,000 מפגינים יהודים פרו-פלסטינים חסמו את גשר מנהטן בניו יורק'}