In [3]:
import requests
from bs4 import BeautifulSoup, NavigableString
import newspaper
import pandas as pd
from datetime import datetime
import re
import os
from dataclasses import dataclass, asdict
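# Browser-like User-Agent header passed to the requests.get calls below, so the
# pages are requested as a regular Firefox client instead of the default python-requests agent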
headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:97.0) Gecko/20100101 Firefox/97.0'}

# data classes
@dataclass
class LandingPageArticle:
    """A teaser collected from the landing page: its headline text and the link it points to."""
    title: str
    url: str

    def __str__(self):
        # Same layout as the generated dataclass repr, but the values are not quoted.
        return 'LandingPageArticle(title={}, url={})'.format(self.title, self.url)

@dataclass
class Article:
    title: str
    url: str
    summary: str
    last_edit_date: datetime
    authors: list[str]
    text: str
    topic: str | None = None
    source: str | None = None
    tags: str | None = None

In [4]:
def get_article_links(zeit_online_url, timeout: float = 30) -> list[LandingPageArticle]:
    """Collect the standard teasers linked from a ZEIT ONLINE landing page.

    A teaser is kept when its <article> element has the CSS class
    'zon-teaser--standard', carries a non-empty 'data-zplus' attribute whose value is
    not 'zplus' (presumably the marker for subscriber-only content -- TODO confirm),
    and its first link points to 'https://www.zeit.de/'.

    Args:
        zeit_online_url: URL of the landing page to download.
        timeout: Seconds to wait for the server. Without a timeout ``requests`` may
            block forever on an unresponsive connection.

    Returns:
        One ``LandingPageArticle`` per accepted teaser, in page order.

    Raises:
        Exception: If the page does not answer with HTTP 200.
        requests.RequestException: If the request itself fails or times out.
    """
    response = requests.get(zeit_online_url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        raise Exception("Can't get todays articles")
    soup = BeautifulSoup(response.content, 'html.parser')

    articles: list[LandingPageArticle] = []
    for article_tag in soup.find_all('article'):
        if 'zon-teaser--standard' not in article_tag.get('class', []):
            continue

        zplus = article_tag.get('data-zplus')
        if not zplus or zplus == 'zplus':
            continue

        # Only the first link inside the teaser is considered.
        a_tag = article_tag.find('a', href=True)
        if a_tag is None or not a_tag['href'].startswith('https://www.zeit.de/'):
            continue

        articles.append(LandingPageArticle(title=a_tag.get_text(strip=True), url=a_tag['href']))

    return articles


def scrape_article(article_url: str, timeout: float = 30) -> Article | None:
    """Download one article page and extract its fields with BeautifulSoup.

    Every field is optional on the page: a missing heading, kicker, summary, metadata
    block, time stamp or tag list yields ``None`` (or an empty list) for that field
    instead of discarding the whole article.

    Args:
        article_url: Address of the article page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The populated ``Article``, or ``None`` when the page could not be fetched
        (network error, non-200 status) or parsing failed unexpectedly. A message is
        printed in each failure case.
    """
    try:
        response = requests.get(article_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Best effort: one unreachable article must not abort the whole scraping run.
        print(f"Error scraping article: {article_url}! ({exc})")
        return None
    if response.status_code != 200:
        print(f"Error scraping article: {article_url}!")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')
    try:
        title = None
        topic = None
        article_heading = soup.find('h1', class_='article-heading')
        if article_heading is not None:
            title_tag = article_heading.find('span', class_='article-heading__title')
            if title_tag is not None:
                title = title_tag.text.strip()
            kicker_tag = article_heading.find('span', class_='article-heading__kicker')
            if kicker_tag is not None:
                # Join only the kicker's direct text nodes; nested elements are skipped.
                topic = ''.join(child for child in kicker_tag if isinstance(child, NavigableString)).strip()

        summary_tag = soup.find('div', class_='summary')
        summary = summary_tag.text.strip() if summary_tag is not None else None

        author_links = soup.find_all('a', href=lambda href: href and href.startswith('/autoren/'))
        authors = [link.get("title", "") for link in author_links]

        date = None
        source = None
        metadata_tag = soup.find('div', class_='metadata')
        if metadata_tag is not None:
            time_tag = metadata_tag.find('time')
            if time_tag is not None and time_tag.get('datetime'):
                date = datetime.fromisoformat(time_tag.get('datetime'))

            source_tag = metadata_tag.find('span', class_='metadata__source')
            if source_tag is not None:
                # Keep every child except links to author pages, which are already in `authors`.
                source_parts = [child.text for child in source_tag
                                if child.name != "a" or not child.get("href", "").startswith('/autoren/')]
                # Removing the author links leaves dangling ",": strip them out.
                source = re.sub(r'\n\s*,', '', ''.join(source_parts).strip()).rstrip(',')

        paragraphs = soup.find_all('p', class_='paragraph article__item')
        text = "\n".join([p.text.strip().replace('\n', '') for p in paragraphs])

        tag_list = soup.find('ul', class_='article-tags__list')
        tags = [tag.text.strip() for tag in tag_list.find_all('a')] if tag_list is not None else []
    except Exception as exc:
        # Deliberately broad (parsing is best effort), but no longer a bare `except:` that
        # would also swallow KeyboardInterrupt; the cause is reported for debugging.
        print(f"Error scraping article: {article_url}! ({type(exc).__name__}: {exc})")
        return None

    return Article(title=title,
                   topic=topic,
                   url=article_url,
                   summary=summary,
                   authors=authors,
                   source=source,
                   last_edit_date=date,
                   text=text,
                   tags=tags)


def scrape_article_with_newspaper4k(article_url: str) -> Article | None:
    """Scrape one article through ``newspaper.article``.

    Args:
        article_url: Address of the article page.

    Returns:
        An ``Article`` built from the title, meta description, authors, publish date and
        text reported by newspaper4k (``topic``, ``source`` and ``tags`` keep their
        defaults), or ``None`` when the call raised; the failure is printed with its cause.
    """
    try:
        article = newspaper.article(article_url)
    except Exception as e:
        # Best effort: report why this URL failed and let the caller continue.
        print(f"Error scraping article: {article_url} with newspaper4k! ({e})")
        return None
    return Article(title=article.title,
                   url=article_url,
                   summary=article.meta_description,
                   authors=article.authors,
                   last_edit_date=article.publish_date,
                   text=article.text)
    
def scrape_articles_and_save_as_csv(article_urls: list[str], newspaper: bool = False) -> pd.DataFrame:
    scraped_articles = []
    for url in article_urls:
        scraped_article = scrape_article_with_newspaper4k(url) if newspaper else scrape_article(url)
        if scraped_article is not None:
            scraped_articles.append(scraped_article)
            
    df = pd.DataFrame([asdict(scraped_article) for scraped_article in scraped_articles])
    if len(df) == 0:
        return None
    
    if newspaper:
        df.drop(columns=['topic', 'source', 'tags'], inplace=True)
    
    os.makedirs('./data/newspaper4k', exist_ok=True)
    os.makedirs('./data/self', exist_ok=True)
    df.to_csv(f'./data/{"newspaper4k" if newspaper else "self"}/articles{datetime.now().strftime("%Y%m%d%H%M")}.csv', index=False)
    
    return df

In [5]:
# Teasers currently on the landing page. Fixing the columns keeps the frame usable when
# no teaser matched; duplicate URLs are dropped so no article is scraped or recorded twice.
articles = get_article_links("https://www.zeit.de/")
df_current_articles = pd.DataFrame(
    [asdict(article) for article in articles], columns=['title', 'url']
).drop_duplicates(subset='url')

# Teasers recorded by earlier runs (empty on the first run).
old_articles_path = './data/all_articles.csv'
df_old_articles = pd.read_csv(old_articles_path) if os.path.exists(old_articles_path) else pd.DataFrame(columns=df_current_articles.columns)
df_new_articles = df_current_articles[~df_current_articles['url'].isin(df_old_articles['url'])]

new_articles_list = df_new_articles['url'].tolist()
scrape_articles_and_save_as_csv(new_articles_list) # own scraping method
scrape_articles_and_save_as_csv(new_articles_list, True) # with newspaper4k

# The scraping calls create ./data only when they save something, so make sure it exists.
os.makedirs(os.path.dirname(old_articles_path), exist_ok=True)
# NOTE(review): every new URL is recorded here even if scraping it failed, so failed
# articles are not retried on later runs -- confirm this is intended.
df_all_articles = pd.concat([df_old_articles, df_new_articles])
df_all_articles.to_csv(old_articles_path, index=False)
print("All Articles Count: " + str(len(df_all_articles)))

All Articles Count: 71


In [6]:
# Display the landing-page teasers whose URL was not yet in all_articles.csv before this run.
df_new_articles

Unnamed: 0,title,url
0,Deutschland – Schweiz: Retro zum Gruppensieg,https://www.zeit.de/sport/2024-06/deutschland-...
3,Bundeshaushalt 2025: FDP lehnt Lockerung der S...,https://www.zeit.de/politik/deutschland/2024-0...
4,Hadsch: Mehr als 1.300 Tote durch Hitze in Mekka,https://www.zeit.de/gesellschaft/zeitgeschehen...
7,Frankreich: Macron will unabhängig vom Ausgang...,https://www.zeit.de/politik/ausland/2024-06/fr...
