In [1]:
from transformers import pipeline
from nltk.tokenize import sent_tokenize, word_tokenize
import torch
import pandas as pd
import numpy as np
import sqlite3
from tqdm import tqdm
import nltk

# Load the sentiment-analysis pipeline once at module level and expose its
# tokenizer under its own name (compute_article_features uses it to count tokens).
sentiment_pipeline = pipeline(
    task="sentiment-analysis",
    model="oliverguhr/german-sentiment-bert",
)
tokenizer = sentiment_pipeline.tokenizer

In [2]:
def create_features_table(conn):
    """
    Create the Article_Features table unless it is already present.

    Args:
        conn: Open sqlite3 connection. The DDL statement is run through a
            cursor obtained from it and the connection is committed afterwards.
    """
    ddl = '''
    CREATE TABLE IF NOT EXISTS Article_Features (
        articleID INTEGER PRIMARY KEY,
        sentence_count INTEGER,
        word_count INTEGER,
        avg_word_length REAL,
        token_count INTEGER,
        FOREIGN KEY(articleID) REFERENCES Articles(articleID)
    )
    '''
    conn.cursor().execute(ddl)
    conn.commit()

def compute_article_features(text, language="german"):
    """
    Compute simple length statistics for a single article text.

    Args:
        text: Article body. Any value that is not a ``str``, or a string that
            is empty or whitespace-only, is treated as "no text".
        language: Name of the NLTK Punkt model passed to ``sent_tokenize`` and
            ``word_tokenize``. Defaults to ``"german"`` because NLTK otherwise
            falls back to its English model, which mis-splits German
            abbreviations and sentence boundaries. The matching NLTK Punkt
            data must be installed.

    Returns:
        tuple: ``(sentence_count, word_count, avg_word_length, token_count)``.
        ``avg_word_length`` is a plain ``float``; the other three are ``int``.
        For missing/blank text the result is ``(0, 0, 0.0, 0)``.
    """
    if not isinstance(text, str) or not text.strip():
        return 0, 0, 0.0, 0

    # Number of sentences
    sentence_count = len(sent_tokenize(text, language=language))

    # Word count and mean word length. NLTK's word_tokenize also returns
    # punctuation marks as separate tokens, so they are part of both figures.
    words = word_tokenize(text, language=language)
    word_count = len(words)
    # Cast to a built-in float so the value is not a numpy scalar when stored.
    avg_word_length = float(np.mean([len(word) for word in words])) if words else 0.0

    # Number of model (sub-word) tokens, without special tokens. The tokenizer
    # may warn that the sequence exceeds the model maximum; that is harmless
    # here because the ids are only counted and never fed to the model.
    token_count = len(tokenizer.encode(text, add_special_tokens=False))

    return sentence_count, word_count, avg_word_length, token_count

def process_and_store_features(conn, df, batch_size=100):
    """
    Compute article features batch by batch and upsert them into Article_Features.

    Each batch is written with one ``executemany`` call and committed on its
    own. If the insert fails, the batch is rolled back as a whole, an error is
    printed and processing continues with the next batch (best effort).

    Args:
        conn: Open sqlite3 connection that already contains Article_Features.
        df: DataFrame with at least the columns ``articleID`` and
            ``article_text``.
        batch_size: Number of rows per batch / transaction. Must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # A negative step would make the range below empty and silently skip
        # all work; zero would fail later with a less helpful message.
        raise ValueError("batch_size must be a positive integer")

    cursor = conn.cursor()

    # Walk over the DataFrame in slices of batch_size rows
    for start in tqdm(range(0, len(df), batch_size), desc="Processing Feature Batches"):
        batch_df = df.iloc[start:start + batch_size]

        # Rows to be written for this batch
        feature_results = []

        # Iterating the two needed columns avoids building a Series per row
        # as iterrows() does.
        for article_id, text in zip(batch_df['articleID'], batch_df['article_text']):
            # sqlite3 cannot bind numpy integer scalars as INTEGER values,
            # so convert any numpy scalar to its built-in equivalent.
            if isinstance(article_id, np.generic):
                article_id = article_id.item()
            try:
                sentence_count, word_count, avg_word_length, token_count = compute_article_features(text)
            except Exception as e:
                print(f"Fehler bei Artikel-ID {article_id}: {e}")
                # Explicit NULLs instead of relying on SQLite turning NaN into NULL
                sentence_count, word_count, avg_word_length, token_count = (None, None, None, None)

            feature_results.append((article_id, sentence_count, word_count, avg_word_length, token_count))

        # Write the batch to the database
        try:
            cursor.executemany('''
                INSERT OR REPLACE INTO Article_Features (articleID, sentence_count, word_count, avg_word_length, token_count)
                VALUES (?, ?, ?, ?, ?)
            ''', feature_results)
            conn.commit()
        except sqlite3.Error as e:
            # Discard the rows inserted before the failure; otherwise they
            # would stay in the open transaction and be committed together
            # with the next batch.
            conn.rollback()
            print(f"SQLite Fehler beim Einfügen der Feature-Daten: {e}")

In [3]:
# Parties that are currently of interest
current_parties = ['ÖVP', 'FPÖ', 'NEOS', 'Grüne', 'SPÖ', 'KPÖ']

# One "?" placeholder per party, followed by the date and kicker parameters
placeholders = ','.join(['?'] * len(current_parties))
params = current_parties + ['2015-01-01', 'Switchlist']

# Select every article tagged with one of the parties above.
# NOTE(review): `a.kicker != ?` also drops rows whose kicker is NULL
# (SQL three-valued logic) — confirm that this is intended.
query = f'''
SELECT DISTINCT a.*
FROM Articles a
JOIN Article_Keywords ak ON a.articleID = ak.articleID
JOIN Keywords k ON ak.keywordID = k.keywordID
WHERE k.keyword IN ({placeholders}) AND a.datetime > ? AND a.kicker != ?
'''

# Open the SQLite database; try/finally guarantees the connection is closed
# even if table creation, the query or the feature computation raises.
conn = sqlite3.connect('derstandard.db')
try:
    # Not used in this cell; kept so the name stays defined as before.
    cursor = conn.cursor()

    # Create the target table if it does not exist yet
    create_features_table(conn)

    # Run the query
    df = pd.read_sql_query(query, conn, params=params)
    df['datetime'] = pd.to_datetime(df['datetime'])

    # Compute the article features batch-wise and store the results
    process_and_store_features(conn, df, batch_size=100)
finally:
    # Close the connection
    conn.close()

Processing Feature Batches:   0%|          | 0/746 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (560 > 512). Running this sequence through the model will result in indexing errors
Processing Feature Batches: 100%|██████████| 746/746 [04:29<00:00,  2.77it/s]
