In [8]:
from transformers import pipeline
import torch
import pandas as pd
import sqlite3
import numpy as np
from tqdm import tqdm
from nltk.tokenize import sent_tokenize
import nltk

# Fetch the Punkt sentence-tokenizer data used by sent_tokenize (no-op when already present).
nltk.download('punkt')

# Load the sentiment-analysis pipeline with the desired model.
# Run on the first CUDA GPU when one is available; device=-1 keeps the
# pipeline on the CPU, which is what happens when no device is passed.
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model="oliverguhr/german-sentiment-bert",
    device=0 if torch.cuda.is_available() else -1,
)
# Expose the pipeline's tokenizer so chunk sizes can be measured in model tokens.
tokenizer = sentiment_pipeline.tokenizer

In [9]:
def split_text_into_chunks(text, max_chunk_size=480, language="german"):
    """Split ``text`` into sentence-aligned chunks of bounded token length.

    Sentences are detected with NLTK's Punkt tokenizer and measured with the
    module-level ``tokenizer`` (special tokens excluded). Consecutive
    sentences are packed into one chunk as long as the summed token count
    stays within ``max_chunk_size``. A single sentence longer than
    ``max_chunk_size`` is cut on token boundaries and each piece is decoded
    back to text.

    Args:
        text: The text to split.
        max_chunk_size: Maximum number of tokens per chunk; must be positive.
        language: Punkt language model passed to ``sent_tokenize``. Defaults
            to ``"german"`` because the sentiment model in use is a German
            one (NLTK itself defaults to English).

    Returns:
        A list of ``(chunk_text, token_count)`` tuples in document order.
        The list is empty when ``text`` is empty.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    if max_chunk_size <= 0:
        raise ValueError("max_chunk_size must be a positive integer")
    if not text:
        return []

    chunks = []
    pending_sentences = []  # sentences collected for the chunk being built
    pending_length = 0      # summed token count of pending_sentences

    for sentence in sent_tokenize(text, language=language):
        # Encoding a very long sentence may log a "sequence length is longer
        # than the specified maximum" warning; it is split below.
        tokens = tokenizer.encode(sentence, add_special_tokens=False)
        sentence_length = len(tokens)

        if sentence_length > max_chunk_size:
            # Flush the pending chunk first so chunks stay in document order
            # and text before/after the long sentence is not glued together.
            if pending_sentences:
                chunks.append((" ".join(pending_sentences), pending_length))
                pending_sentences, pending_length = [], 0

            # Split the long sentence into token windows of max_chunk_size.
            for start in range(0, sentence_length, max_chunk_size):
                sub_tokens = tokens[start:start + max_chunk_size]
                sub_chunk = tokenizer.decode(sub_tokens, clean_up_tokenization_spaces=True)
                chunks.append((sub_chunk, len(sub_tokens)))
            continue

        # The sentence would overflow the current chunk: close that chunk and
        # let the sentence start the next one.
        if pending_sentences and pending_length + sentence_length > max_chunk_size:
            chunks.append((" ".join(pending_sentences), pending_length))
            pending_sentences, pending_length = [], 0

        pending_sentences.append(sentence)
        pending_length += sentence_length

    # Emit whatever is left once all sentences have been consumed.
    if pending_sentences:
        chunks.append((" ".join(pending_sentences), pending_length))

    return chunks

def analyze_sentiment(text, max_chunk_size=480):
    """Compute a length-weighted sentiment score and class for ``text``.

    The text is split with ``split_text_into_chunks``; every chunk is
    classified by the module-level ``sentiment_pipeline``. Each chunk
    contributes ``polarity * score`` (polarity is -1 / 0 / +1 for
    negative / neutral / positive, 0 for any other label), weighted by the
    chunk's token count.

    Args:
        text: The text to analyse.
        max_chunk_size: Maximum number of tokens per chunk.

    Returns:
        A tuple ``(average_sentiment, sentiment_class)``.
        ``average_sentiment`` is a float; ``sentiment_class`` is
        ``'negative'`` below -0.5, ``'positive'`` from 0.5 upwards and
        ``'neutral'`` in between. ``(0.0, 'neutral')`` is returned when the
        text yields no chunks.
    """
    # Step 1: split the text into chunks.
    chunks = split_text_into_chunks(text, max_chunk_size)

    if not chunks:
        return 0.0, 'neutral'  # default when there is no text

    # Unpack into parallel sequences of chunk texts and chunk token counts.
    chunk_texts, chunk_lengths = zip(*chunks)

    # Step 2: run the sentiment model.
    # Chunk sizes are estimated by summing per-sentence token counts without
    # special tokens, and decoded sub-chunks can re-tokenize differently, so
    # the model limit is not strictly guaranteed. truncation=True clips an
    # over-long chunk instead of letting the model raise on it.
    results = sentiment_pipeline(list(chunk_texts), truncation=True)

    # Map the predicted classes to numeric polarities; unknown labels count
    # as neutral.
    polarity_by_label = {'negative': -1, 'neutral': 0, 'positive': 1}

    # Step 3: length-weighted average of the signed confidence scores.
    weighted_sum = 0.0
    for result, length in zip(results, chunk_lengths):
        polarity = polarity_by_label.get(result['label'].lower(), 0)
        weighted_sum += polarity * result['score'] * length

    total_weight = sum(chunk_lengths)
    average_sentiment = weighted_sum / total_weight if total_weight else 0.0

    # Step 4: bucket the average into the three classes.
    if average_sentiment < -0.5:
        sentiment_class = 'negative'
    elif average_sentiment < 0.5:
        sentiment_class = 'neutral'
    else:
        sentiment_class = 'positive'

    return average_sentiment, sentiment_class


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sdola\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
def create_sentiment_table(conn):
    """
    Create the Article_Sentiment table if it does not exist yet.

    The table holds one row per article: ``articleID`` (integer primary key
    referencing ``Articles.articleID``), ``sentiment_value`` (REAL) and
    ``sentiment_category`` (TEXT). The statement is committed on ``conn``.
    """
    # CREATE TABLE IF NOT EXISTS makes repeated calls harmless.
    ddl_statement = '''
    CREATE TABLE IF NOT EXISTS Article_Sentiment (
        articleID INTEGER PRIMARY KEY,
        sentiment_value REAL,
        sentiment_category TEXT,
        FOREIGN KEY(articleID) REFERENCES Articles(articleID)
    )
    '''
    db_cursor = conn.cursor()
    db_cursor.execute(ddl_statement)
    conn.commit()

In [12]:
def process_and_store_sentiment(conn, df, batch_size=100):
    """
    Compute sentiment for every row of ``df`` and store it in Article_Sentiment.

    The frame is processed in slices of ``batch_size`` rows. For each row the
    ``article_text`` column is passed to ``analyze_sentiment``; rows whose
    text is missing, not a string or blank, and rows for which the analysis
    raises, are stored with ``np.nan`` as value and ``'NA'`` as category.
    Each batch is written with ``INSERT OR REPLACE`` and committed on its own.
    If the write fails, the batch is rolled back, the error is printed and
    processing continues with the next batch.

    Args:
        conn: Open sqlite3 connection containing the Article_Sentiment table.
        df: DataFrame with at least the columns ``articleID`` and
            ``article_text``.
        batch_size: Number of rows analysed and written per transaction.
    """
    cursor = conn.cursor()

    try:
        # Iterate over the DataFrame in batches.
        for start in tqdm(range(0, len(df), batch_size), desc="Processing Batches"):
            batch_df = df.iloc[start:start + batch_size]

            # Rows to insert for this batch.
            sentiment_results = []

            for article_id, text in zip(batch_df['articleID'], batch_df['article_text']):
                # sqlite3 only binds native Python scalars; unwrap NumPy ones.
                if isinstance(article_id, np.generic):
                    article_id = article_id.item()

                if not isinstance(text, str) or not text.strip():
                    average_sentiment = np.nan
                    sentiment_class = 'NA'
                else:
                    try:
                        average_sentiment, sentiment_class = analyze_sentiment(text)
                    except Exception as e:
                        # Best effort: one failing article must not abort the run.
                        print(f"Fehler bei Artikel-ID {article_id}: {e}")
                        average_sentiment = np.nan
                        sentiment_class = 'NA'

                sentiment_results.append((article_id, average_sentiment, sentiment_class))

            # Write the batch to the database.
            try:
                cursor.executemany('''
                    INSERT OR REPLACE INTO Article_Sentiment (articleID, sentiment_value, sentiment_category)
                    VALUES (?, ?, ?)
                ''', sentiment_results)
                conn.commit()
            except sqlite3.Error as e:
                # Discard the partially written batch so it is not committed
                # together with the next one.
                conn.rollback()
                print(f"SQLite Fehler beim Einfügen der Sentiment-Daten: {e}")
    finally:
        cursor.close()

In [13]:
# Currently relevant parties
current_parties = ['ÖVP', 'FPÖ', 'NEOS', 'Grüne', 'SPÖ', 'KPÖ']

# Build the placeholders and parameters for the query
placeholders = ','.join(['?'] * len(current_parties))
params = current_parties + ['2015-01-01', 'Switchlist']

# Select the articles tagged with one of the current parties, dated after
# 2015-01-01, whose kicker is not 'Switchlist'.
# NOTE(review): `a.kicker != ?` also drops rows whose kicker is NULL
# (SQL three-valued logic) — confirm that this is intended.
query = f'''
SELECT DISTINCT a.*
FROM Articles a
JOIN Article_Keywords ak ON a.articleID = ak.articleID
JOIN Keywords k ON ak.keywordID = k.keywordID
WHERE k.keyword IN ({placeholders}) AND a.datetime > ? AND a.kicker != ?
'''

# Open the connection to the SQLite database
conn = sqlite3.connect('derstandard.db')
try:
    cursor = conn.cursor()

    # Create the Article_Sentiment table if it does not exist yet
    create_sentiment_table(conn)

    # Run the query
    df = pd.read_sql_query(query, conn, params=params)
    df['datetime'] = pd.to_datetime(df['datetime'])

    # Run the sentiment analysis batch by batch and store the results
    process_and_store_sentiment(conn, df, batch_size=100)
finally:
    # Always release the database, even if the query or the long-running
    # processing fails or is interrupted.
    conn.close()

Processing Batches:  17%|█▋        | 127/746 [1:28:52<5:03:06, 29.38s/it] Token indices sequence length is longer than the specified maximum sequence length for this model (516 > 512). Running this sequence through the model will result in indexing errors
Processing Batches: 100%|██████████| 746/746 [8:53:14<00:00, 42.89s/it]    
