In [2]:
import datetime
import sqlite3
from bs4 import BeautifulSoup
import pandas as pd
from selenium import webdriver as wd
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import TimeoutException
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Global configuration for the local system.
# NOTE(review): these are absolute, machine-specific Windows paths. Adjust them
# (or derive them from an environment variable) before running this notebook
# on another machine.
# Directory that holds the tracker file and the SQLite database.
BASEPATH = r"C:\Users\sdola\Nextcloud\Technikum\NLP\Project"
# Text file read by get_most_recent_download() and appended to by
# scrape_derstandard() after each archive day.
TRACKER_FILE = rf"{BASEPATH}\derstandard_tracker.txt"
# SQLite database file used by create_database_and_tables() and
# scrape_derstandard().
SQLITE_DATABASE = rf"{BASEPATH}\derstandard.db"
# chromedriver executable handed to Selenium's Service in setup_driver().
CHROMEDRIVER_PATH = r"C:\Users\sdola\Documents\chromedriver-win64\chromedriver.exe"
# Base URL of the archive front page; callers append a YYYY/MM/DD path.
FRONTPAGE_URL = "https://www.derstandard.at/frontpage/"


def get_most_recent_download():
    """Return the date stored on the last line of the tracker file.

    The last tab-separated field of the last non-blank line in
    ``TRACKER_FILE`` is parsed as a ``YYYY-MM-DD`` date.

    Returns:
        datetime.date: the parsed date, or ``False`` when the tracker file
        does not exist or contains no entries. ``False`` is kept as the
        sentinel so existing callers keep working.

    Raises:
        ValueError: if the last non-blank line does not end in a valid
            ``YYYY-MM-DD`` date.
    """
    try:
        with open(TRACKER_FILE, mode="r") as f:
            # Skip blank lines so an empty file or a trailing newline does not
            # crash with IndexError / ValueError.
            entries = [line.strip() for line in f if line.strip()]
    except FileNotFoundError:
        return False

    if not entries:
        return False

    dl_dt = entries[-1].split("\t")[-1]
    return datetime.datetime.strptime(dl_dt, "%Y-%m-%d").date()

def create_database_and_tables():
    """Create the ``urls`` table in ``SQLITE_DATABASE`` if it does not exist.

    The statement uses ``CREATE TABLE IF NOT EXISTS``, so calling this
    function repeatedly is harmless. The connection is closed even when the
    statement fails.

    Raises:
        sqlite3.Error: if the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect(SQLITE_DATABASE)
    try:
        conn.execute("""
    CREATE TABLE IF NOT EXISTS urls (
        URL_ID INTEGER PRIMARY KEY AUTOINCREMENT,
        URL TEXT UNIQUE,
        publication_date DATE,
        url_download_date DATE
    )
    """)
        conn.commit()
    finally:
        # Release the file handle on the error path as well.
        conn.close()


def setup_driver(headless=True):
    """Start a Chrome WebDriver and try to dismiss the front-page popup.

    The driver is created with a fixed user agent, image loading disabled and
    ``page_load_strategy = 'none'``. Today's archive page is then opened and
    the button inside the popup iframe is clicked on a best-effort basis.

    Args:
        headless: when true, Chrome is started with ``--headless``.

    Returns:
        The ready-to-use Chrome WebDriver, switched back to the top-level
        document.

    Raises:
        Exception: any unexpected error during the initial page load is
            re-raised after the browser has been shut down, so no Chrome
            process is left behind.
    """
    chrome_options = wd.ChromeOptions()
    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36")
    if headless:
        chrome_options.add_argument("--headless")

    chrome_options.page_load_strategy = 'none'
    chrome_options.add_argument("--blink-settings=imagesEnabled=false")

    service = Service(executable_path=CHROMEDRIVER_PATH)
    driver = wd.Chrome(service=service, options=chrome_options)

    try:
        # Dismiss the popup.
        driver.get(FRONTPAGE_URL + datetime.datetime.today().strftime("%Y/%m/%d"))
        time.sleep(5)
        try:
            WebDriverWait(driver, 10).until(
                lambda d: d.execute_script("return document.readyState") == 'complete'
            )
            driver.switch_to.frame(driver.find_element(By.XPATH, "/html/body/div/iframe"))
            driver.find_element(By.XPATH, "/html/body/div[1]/div[2]/div[3]/div[1]/button").click()
        except (NoSuchElementException, TimeoutException):
            # Best effort: a page that never finishes loading is treated like
            # a missing popup instead of aborting driver creation.
            print("popup nicht gefunden")
        finally:
            # Always return to the top-level document, including when the
            # iframe was entered but the button lookup failed.
            driver.switch_to.default_content()
    except Exception:
        # Do not orphan the browser process if setup fails unexpectedly.
        driver.quit()
        raise

    return driver


def scrape_derstandard():
    """Collect article URLs from the daily archive pages into the ``urls`` table.

    Scraping resumes on the day after the last date recorded in the tracker
    file, or on 1998-11-29 when no tracker entry exists, and stops before
    today. For every day the first link of each ``<article>`` element is
    inserted into ``urls``. The rows are committed and only then is the day
    appended to the tracker file, so the tracker never runs ahead of the
    database.

    Raises:
        TimeoutException: when loading a page times out more than
            ``max_retries`` times in a row.
        sqlite3.Error: if the database cannot be opened or committed.
    """
    # get_most_recent_download() returns False when nothing has been tracked
    # yet; adding a timedelta to False would raise TypeError, so branch first.
    last_download = get_most_recent_download()
    if last_download:
        start = last_download + datetime.timedelta(days=1)
    else:
        start = datetime.date(1998, 11, 29)

    max_retries = 5  # consecutive timeouts tolerated for a single day
    retries = 0

    driver = setup_driver()
    conn = None
    try:
        conn = sqlite3.connect(SQLITE_DATABASE)
        cursor = conn.cursor()

        while start < datetime.date.today():
            archive_url = FRONTPAGE_URL + start.strftime("%Y/%m/%d")
            try:
                driver.get(archive_url)
            except TimeoutException:
                # Restart the browser and retry the same day, but give up
                # instead of looping forever if the site stays unreachable.
                driver.quit()
                driver = None
                retries += 1
                if retries > max_retries:
                    raise
                driver = setup_driver()
                continue
            retries = 0

            # NOTE(review): setup_driver() uses page_load_strategy 'none' and
            # nothing waits for the document here, so page_source may be read
            # before the archive page has rendered - confirm whether an
            # explicit wait is needed. A day is also tracked as done when no
            # <article> element was found.
            soup = BeautifulSoup(driver.page_source, "html5lib")
            publication_date = start.isoformat()  # explicit text, no implicit date adapter
            for article in soup.find_all("article"):
                link = article.find("a")
                url = link.get("href") if link is not None else None
                if not url:
                    # Nothing to store for an article without a link target.
                    continue
                try:
                    cursor.execute(
                        "INSERT INTO urls (URL, publication_date, url_download_date) VALUES (?, ?, ?)",
                        (url, publication_date, None),
                    )
                except sqlite3.IntegrityError:
                    # URL already stored (UNIQUE constraint) - skip it.
                    continue
                except Exception as e:
                    print(f"Error: {e}")
                    continue

            # Persist the day's rows before recording the day as done.
            conn.commit()
            print(f"{start} erfolgreich gescraped." )

            with open(TRACKER_FILE, "a") as tracker_file:
                tracker_file.write(f"{start}\n")

            start += datetime.timedelta(days=1)
    finally:
        if driver is not None:
            driver.quit()
        if conn is not None:
            conn.close()




In [3]:
def create_comments_table(db_path='derstandard.db'):
    """Create the ``comments`` table if it does not exist yet.

    Args:
        db_path: SQLite database file to open. The default keeps the
            original behaviour of using ``derstandard.db`` relative to the
            current working directory.
            NOTE(review): the URL scraper writes to ``SQLITE_DATABASE``
            instead - confirm both are meant to be the same file.

    Raises:
        sqlite3.Error: if the database cannot be opened or the DDL fails.
    """
    conn = sqlite3.connect(db_path)
    try:
        conn.execute('''
    CREATE TABLE IF NOT EXISTS comments (
        commentID INTEGER PRIMARY KEY,
        articleID INTEGER,
        username TEXT,
        datetime DATETIME,
        comment_header TEXT,
        comment TEXT,
        upvotes INTEGER,
        downvotes INTEGER,
        user_followers INTEGER,
        reply_on_comment INTEGER,
        FOREIGN KEY(articleID) REFERENCES Articles(articleID),
        FOREIGN KEY(reply_on_comment) REFERENCES comments(commentID)
    )
    ''')
        conn.commit()
    finally:
        # Close the connection on the error path as well.
        conn.close()

# Create the comments table as soon as this cell is run.
create_comments_table()

In [4]:
def select_articles_for_party(conn, party, year, limit=10):
    """Select articles of one year whose single keyword is the given party.

    An article qualifies when it has exactly one row in ``Article_Keywords``,
    that keyword equals ``party``, its ``datetime`` falls into ``year`` and
    its ``kicker`` is not ``'Switchlist'``.

    Args:
        conn: open DB connection accepted by ``pandas.read_sql_query``.
        party: keyword to match exactly.
        year: calendar year (anything ``int()`` accepts).
        limit: maximum number of rows to return (coerced with ``int()``).

    Returns:
        pandas.DataFrame with the columns ``articleID`` and ``url``, ordered
        by ``articleID`` so repeated runs select the same articles.
    """
    limit = int(limit)
    year = int(year)

    # Half-open range [Jan 1 of year, Jan 1 of year + 1). The previous
    # "BETWEEN ... AND 'YYYY-12-31'" dropped Dec 31 rows that carry a time
    # component, because 'YYYY-12-31 10:00' sorts after 'YYYY-12-31'.
    # Assumes a.datetime is stored as ISO-8601 text - TODO confirm.
    start_date = f'{year}-01-01'
    end_date_exclusive = f'{year + 1}-01-01'

    # NOTE(review): "a.kicker != ?" also filters out rows whose kicker is
    # NULL - confirm that this is intended.
    query = '''
    SELECT a.articleID, u.url
    FROM Articles a
    JOIN Urls u ON a.urlID = u.urlID
    WHERE a.articleID IN (
        -- Artikel mit genau einem Keyword
        SELECT ak.articleID
        FROM Article_Keywords ak
        GROUP BY ak.articleID
        HAVING COUNT(*) = 1
    )
    AND a.articleID IN (
        -- Artikel, deren einziges Keyword die gewünschte Partei ist
        SELECT ak.articleID
        FROM Article_Keywords ak
        JOIN Keywords k ON ak.keywordID = k.keywordID
        WHERE k.keyword = ?
    )
    AND a.datetime >= ? AND a.datetime < ?
    AND a.kicker != ?
    ORDER BY a.articleID
    LIMIT ?
    '''

    params = [party, start_date, end_date_exclusive, 'Switchlist', limit]
    return pd.read_sql_query(query, conn, params=params)


In [5]:
import dateparser

def parse_comment_datetime(datetime_str):
    """Parse a comment timestamp string with ``dateparser``, German only.

    Args:
        datetime_str: timestamp text as shown next to a comment.

    Returns:
        Whatever ``dateparser.parse`` returns for the given text.
    """
    german_only = ['de']
    parsed = dateparser.parse(datetime_str, languages=german_only)
    return parsed


In [6]:
def _count_from_tag(tag, default=0):
    """Return the integer shown in ``tag``; ``default`` if absent or not numeric."""
    if tag is None or not tag.text:
        return default
    text = tag.text.strip()
    try:
        return int(text)
    except ValueError:
        # Tolerate formatted numbers such as "1.234" instead of losing the
        # whole comment over a counter that int() cannot parse directly.
        digits = "".join(ch for ch in text if ch.isdecimal())
        return int(digits) if digits else default


def _parse_posting(posting, articleID):
    """Turn one posting element into the dict shape stored in ``comments``."""
    parent_id = posting.get('data-parentpostingid')

    # Date and time of the comment.
    timestamp_tag = posting.find('span', class_='js-timestamp')
    if timestamp_tag and timestamp_tag.text:
        comment_datetime = parse_comment_datetime(timestamp_tag.text.strip())
    else:
        comment_datetime = None

    header_tag = posting.find('h4', class_='upost-title')
    body_tag = posting.find('div', class_='upost-text')

    return {
        'commentID': int(posting.get('data-postingid')),
        'articleID': articleID,
        # Postings without a community name are stored under a placeholder.
        'username': posting.get('data-communityname') or 'gelöschtes Profil',
        'datetime': comment_datetime,
        'comment_header': header_tag.text.strip() if header_tag else None,
        'comment': body_tag.get_text(separator=' ', strip=True) if body_tag else None,
        'upvotes': _count_from_tag(posting.find('span', class_='js-ratings-positive-count')),
        'downvotes': _count_from_tag(posting.find('span', class_='js-ratings-negative-count')),
        'user_followers': _count_from_tag(posting.find('span', class_='upost-follower')),
        'reply_on_comment': int(parent_id) if parent_id else None,
    }


def scrape_comments_for_article(driver, article_url, articleID, max_comments=10, wait_seconds=5):
    """Load an article page and extract its first postings as dicts.

    Args:
        driver: Selenium WebDriver used to load the page.
        article_url: absolute URL of the article.
        articleID: value stored as ``articleID`` on every returned comment.
        max_comments: maximum number of postings to parse (default 10, as
            before); ``None`` parses all postings found.
        wait_seconds: fixed pause after navigation so the page can load
            (default 5, as before).

    Returns:
        list[dict]: one dict per successfully parsed posting with the keys
        ``commentID``, ``articleID``, ``username``, ``datetime``,
        ``comment_header``, ``comment``, ``upvotes``, ``downvotes``,
        ``user_followers`` and ``reply_on_comment``. A posting that fails to
        parse is reported via ``print`` and skipped.
    """
    comments_data = []
    driver.get(article_url)
    time.sleep(wait_seconds)  # wait until the page has loaded

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    postings = soup.find_all('div', class_='posting', attrs={'data-postingid': True})

    for posting in postings[:max_comments]:
        try:
            comments_data.append(_parse_posting(posting, articleID))
        except Exception as e:
            # Best effort: one malformed posting must not abort the article.
            print(f"Fehler beim Verarbeiten eines Kommentars in Artikel {articleID}: {e}")
            continue

    return comments_data


In [7]:
def _comment_timestamp_for_db(value):
    """Render a ``datetime`` as ``YYYY-MM-DD HH:MM:SS`` text; pass others through."""
    if isinstance(value, datetime.datetime):
        # Same text the sqlite3 default adapter produced, written explicitly
        # because that adapter is deprecated since Python 3.12.
        return value.isoformat(" ")
    return value


def insert_comments_to_db(conn, comments_data):
    """Insert scraped comments into the ``comments`` table.

    Rows whose ``commentID`` already exists are skipped (``INSERT OR
    IGNORE``). All rows are written in one transaction: it is committed on
    success and rolled back if any row fails.

    Args:
        conn: open ``sqlite3`` connection.
        comments_data: iterable of dicts with the keys ``commentID``,
            ``articleID``, ``username``, ``datetime``, ``comment_header``,
            ``comment``, ``upvotes``, ``downvotes``, ``user_followers`` and
            ``reply_on_comment``.

    Raises:
        KeyError: if a comment dict lacks one of the keys above.
        sqlite3.Error: if the insert fails.
    """
    rows = [
        (
            comment['commentID'],
            comment['articleID'],
            comment['username'],
            _comment_timestamp_for_db(comment['datetime']),
            comment['comment_header'],
            comment['comment'],
            comment['upvotes'],
            comment['downvotes'],
            comment['user_followers'],
            comment['reply_on_comment'],
        )
        for comment in comments_data
    ]
    # The connection context manager commits on success and rolls back on error.
    with conn:
        conn.executemany('''
        INSERT OR IGNORE INTO comments (
            commentID, articleID, username, datetime, comment_header,
            comment, upvotes, downvotes, user_followers, reply_on_comment
        ) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        ''', rows)


In [8]:
def scrape_comments():
    """Scrape and store comments for a sample of articles per party and year.

    For each party in a fixed list and each year from 2015 up to the current
    year, up to 10 articles are selected with ``select_articles_for_party``;
    their comments are scraped with ``scrape_comments_for_article`` and
    written with ``insert_comments_to_db``. The browser and the database
    connection are released even when scraping fails midway.
    """
    create_comments_table()
    parties = ['ÖVP', 'SPÖ', 'FPÖ', 'Grüne', 'NEOS']
    years = range(2015, datetime.datetime.now().year + 1)

    # NOTE(review): this opens 'derstandard.db' relative to the working
    # directory, while the URL scraper uses SQLITE_DATABASE - confirm both
    # point at the same file.
    conn = sqlite3.connect('derstandard.db')
    driver = None
    try:
        driver = setup_driver(headless=True)

        for party in parties:
            for year in years:
                print(f"Scrape Kommentare für {party} im Jahr {year}")
                articles_df = select_articles_for_party(conn, party, year, limit=10)
                for _, row in articles_df.iterrows():
                    # Plain int so sqlite3 binds it as INTEGER regardless of
                    # the dtype pandas used for the column.
                    articleID = int(row['articleID'])
                    article_url = 'https://www.derstandard.at' + row['url']
                    comments_data = scrape_comments_for_article(driver, article_url, articleID)
                    if comments_data:
                        insert_comments_to_db(conn, comments_data)
                        print(f"{len(comments_data)} Kommentare für Artikel {articleID} gesammelt")
                    else:
                        print(f"Keine Kommentare gefunden für Artikel {articleID}")
    finally:
        if driver is not None:
            driver.quit()
        conn.close()


In [None]:

# Run the whole pipeline; %time reports how long each step took.
# Step 1: make sure the urls table exists.
%time create_database_and_tables()
# Step 2: make sure the comments table exists (scrape_comments() calls this
# again itself, so this step is redundant but harmless).
%time create_comments_table()
# Step 3: collect article URLs from the daily archive pages.
%time scrape_derstandard()
# Step 4: scrape comments for the selected articles.
%time scrape_comments()
