In [None]:
import pandas as pd

# Load the pre-cleaned dataset from its Parquet file, read with the pyarrow engine.
df = pd.read_parquet(
    path="cleaned_data/cleaned_data.parquet",
    engine="pyarrow",
)

In [None]:
# Case-insensitive search for headlines containing the phrase; rows whose
# "head" value is missing count as non-matches (na=False).
gefundene_zeilen = df.loc[
    df["head"].str.contains("Statistiken und Vergleich", case=False, na=False)
]

# Show the matching rows as the cell output
gefundene_zeilen

### Analyse von Dokumenten an Tagen mit geringerer Nachrichtenaktivität, z. B. an Sonntagen

In [None]:
# 16.03: only 179, otherwise around 600 (presumably documents per day).
# TODO: data from the previous day is not available yet.
# NOTE(review): the note above mentions 16.03, but the filter below selects
# 2025-04-01 — confirm which date is intended.
subset_df = df.loc[
    df['pubtime'].dt.date == pd.to_datetime('2025-04-01').date()
]
subset_df.shape


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px
import groq
import re
import json
from collections import Counter
import nltk
import os
from dotenv import load_dotenv


# Load variables from the .env file one directory up so that GROQ_API_KEY can
# be read from the environment below (presumably defined there).
load_dotenv(dotenv_path='../.env')

# nltk.sent_tokenize needs the Punkt sentence models. Older NLTK releases load
# them from the "punkt" package, newer ones from "punkt_tab", so fetch both.
# nltk.download() only reports a package it cannot find and does not raise,
# which keeps this safe on either NLTK generation.
nltk.download('punkt')
nltk.download('punkt_tab')

groq_key = os.getenv("GROQ_API_KEY")
# Initialize Groq client
client = groq.Groq(api_key=groq_key)

def split_text_sentencewise(text, max_length=1000):
    """Split ``text`` into chunks of at most ``max_length`` words.

    Sentences (as found by ``nltk.sent_tokenize``) are kept together and packed
    greedily into chunks. Length is measured in whitespace-separated words, used
    here as a rough stand-in for tokens.

    A single sentence that is longer than ``max_length`` words is cut at word
    boundaries so that no returned chunk exceeds the limit.

    Args:
        text: The text to split. Anything that is not a non-blank string
            (e.g. ``None`` or NaN from a DataFrame) yields an empty list.
        max_length: Maximum number of words per chunk; values below 1 are
            treated as 1.

    Returns:
        A list of chunk strings in original order (possibly empty).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    limit = max(1, max_length)
    chunks = []
    current_chunk = []
    current_length = 0

    for sentence in nltk.sent_tokenize(text):
        words = sentence.split()
        sentence_length = len(words)

        if sentence_length > limit:
            # The sentence alone is too long: close the running chunk, then
            # emit the sentence in word-aligned pieces of at most `limit` words.
            if current_chunk:
                chunks.append(' '.join(current_chunk))
                current_chunk = []
                current_length = 0
            for start in range(0, sentence_length, limit):
                chunks.append(' '.join(words[start:start + limit]))
            continue

        # If adding this sentence would exceed the limit, start a new chunk
        if current_length + sentence_length > limit:
            if current_chunk:
                chunks.append(' '.join(current_chunk))
            current_chunk = [sentence]
            current_length = sentence_length
        else:
            current_chunk.append(sentence)
            current_length += sentence_length

    # Add the last chunk if any
    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks

def parse_json_response(response):
    """Extract the first JSON object embedded in a model response.

    The response is scanned for ``{`` characters and the first position from
    which a complete JSON object can be decoded wins. Text before or after the
    object (including stray ``}`` characters) is ignored.

    Args:
        response: The raw response text. Non-string values such as ``None``
            are accepted and yield an empty dict.

    Returns:
        The decoded object as a dict, or ``{}`` if no JSON object is found.
    """
    if not isinstance(response, str):
        return {}

    decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', response):
        try:
            # raw_decode stops at the end of the object, so trailing text is fine.
            parsed, _ = decoder.raw_decode(response, brace.start())
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed
    return {}

def get_wikipedia_suggestions(text):
    """Ask the Groq model for Wikipedia article titles that fit ``text``.

    The text is split into sentence-wise chunks and one request is sent per
    chunk. The prompt instructs the model to return only exact, existing
    article titles, but the titles are NOT checked against Wikipedia here.

    Args:
        text: The article text. Anything that is not a non-blank string
            (e.g. NaN from a DataFrame) yields an empty list without any
            API call.

    Returns:
        A flat list of title strings collected over all chunks, possibly with
        duplicates. Chunks whose request fails are skipped after printing the
        error (best effort).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # Split the text into chunks sentence-wise to avoid exceeding LLM input size limit
    chunks = split_text_sentencewise(text)

    all_titles = []

    for chunk in chunks:
        # The format example uses double quotes so that it is valid JSON,
        # matching the JSON response format requested below.
        prompt = (
            "Gib 3-5 relevante Wikipedia-Artikel-Titel für diesen Text als JSON. "
            "Die Titel müssen echte Wikipedia-Artikel sein, d.h. sie müssen genau übereinstimmen. "
            'Format: {"titles": ["Artikel1", "Artikel2"]}. Nur exakte Artikelnamen, keine Vermutungen oder Platzhalter:\n\n'
            f"{chunk}"  # Each chunk of the text
        )

        try:
            completion = client.chat.completions.create(
                model="llama3-70b-8192",
                messages=[
                    {"role": "system", "content": "Du bist ein hilfreicher Assistent für Wikipedia-Recherche."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.5,
                max_tokens=200,
                response_format={"type": "json_object"}
            )
            response = completion.choices[0].message.content
            titles = parse_json_response(response).get('titles', [])
            if isinstance(titles, str):
                titles = [titles]
            elif not isinstance(titles, list):
                titles = []
            # Keep only non-empty strings: other JSON values (numbers, nested
            # objects) are not titles and may be unhashable when counted later.
            all_titles.extend(
                title.strip()
                for title in titles
                if isinstance(title, str) and title.strip()
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"API Fehler bei Chunk: {str(e)}")

    return all_titles

def get_common_suggestions(cluster_texts):
    """Report the Wikipedia titles suggested most often across a cluster.

    Suggestions are fetched per text via ``get_wikipedia_suggestions`` and
    tallied. A title is reported when it occurs at least
    ``max(2, len(cluster_texts) // 3)`` times; at most five titles are listed.

    Args:
        cluster_texts: The texts belonging to one cluster.

    Returns:
        One "<title> (<count>x)" line per reported title joined by newlines,
        or a German notice when nothing was suggested or nothing recurs.
    """
    title_counts = Counter()
    for document in cluster_texts:
        title_counts.update(get_wikipedia_suggestions(document) or [])

    if not title_counts:
        return "Keine relevanten Artikel gefunden"

    threshold = max(2, len(cluster_texts) // 3)

    lines = []
    # most_common() is ordered by descending count, so the first title below
    # the threshold ends the scan.
    for title, count in title_counts.most_common():
        if count < threshold or len(lines) == 5:
            break
        lines.append(f"{title} ({count}x)")

    if not lines:
        return "Keine konsistenten Artikel"
    return "\n".join(lines)

In [None]:
def _dbscan_labels_in_target_range(embeddings, eps, min_samples, target_clusters,
                                   step=0.025, max_iterations=40, min_eps=1e-3):
    """Fit DBSCAN repeatedly, nudging ``eps`` until the cluster count is in range.

    ``eps`` is raised when too few clusters are found and lowered when too many
    are found. Whenever the direction flips, the step is halved so the search
    cannot bounce between the same two values forever. ``eps`` never drops
    below ``min_eps`` (DBSCAN requires a positive value), and the search stops
    after ``max_iterations`` fits.

    Returns:
        The label array of the last fit. If the target range was not reached,
        a notice is printed and the last labels are returned anyway.
    """
    lower, upper = target_clusters
    previous_direction = 0

    for _ in range(max_iterations):
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(embeddings)
        # Label -1 marks noise points and does not count as a cluster.
        num_clusters = len(set(labels.tolist()) - {-1})
        if lower <= num_clusters <= upper:
            return labels

        direction = 1 if num_clusters < lower else -1
        if previous_direction and direction != previous_direction:
            step /= 2
        previous_direction = direction

        next_eps = max(min_eps, eps + direction * step)
        if next_eps == eps:
            # Already at the lower bound; further attempts would repeat this fit.
            break
        eps = next_eps

    print(
        f"Hinweis: DBSCAN hat keine Clusteranzahl im Zielbereich {lower}-{upper} erreicht "
        f"(zuletzt {num_clusters} Cluster)."
    )
    return labels


def df_plot_dbscan(df, target_clusters=(4, 6)):
    """Cluster articles with DBSCAN, plot them via t-SNE and describe each cluster.

    Headline and content are concatenated, embedded with the
    'all-MiniLM-L12-v2' sentence-transformer and clustered with DBSCAN. ``eps``
    is adjusted until the number of clusters lies within ``target_clusters``
    (bounded search, see ``_dbscan_labels_in_target_range``). Noise points are
    excluded from the plot and from the per-cluster output.

    Args:
        df: DataFrame with the columns 'head' and 'content'. It is not modified.
        target_clusters: Inclusive (min, max) range for the number of clusters.

    Returns:
        None. Headlines, a Plotly scatter plot and the most frequent Wikipedia
        suggestions per cluster are printed/shown as side effects.
    """
    # Create a copy of the DataFrame to avoid modifying the original
    working_df = df.copy()

    if len(working_df) <= 400:
        eps = 0.05
        min_samples = 4
    else:
        eps = 0.04
        min_samples = 6

    # With fewer rows than min_samples DBSCAN cannot form any cluster, and the
    # t-SNE projection below needs more than one point.
    if len(working_df) < min_samples:
        print(f"Zu wenige Artikel ({len(working_df)}) für DBSCAN-Clustering.")
        return

    # Add combined_text column; missing values become empty strings so the
    # concatenation never produces NaN.
    working_df["combined_text"] = (
        working_df["head"].fillna("") + " " + working_df["content"].fillna("")
    )

    model = SentenceTransformer('all-MiniLM-L12-v2')
    embeddings = model.encode(working_df['combined_text'].tolist())

    working_df['dbscan_cluster'] = _dbscan_labels_in_target_range(
        embeddings, eps, min_samples, target_clusters
    )

    # t-SNE requires perplexity < n_samples; 30 is the library default.
    perplexity = min(30, len(working_df) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(embeddings)
    working_df['tsne_x'] = tsne_results[:, 0]
    working_df['tsne_y'] = tsne_results[:, 1]

    # Drop noise points (label -1)
    filtered_df = working_df[working_df["dbscan_cluster"] >= 0]
    cluster_ids = sorted(filtered_df["dbscan_cluster"].unique())

    fig_dbscan = px.scatter(filtered_df, x='tsne_x', y='tsne_y', color='dbscan_cluster', hover_data=['head'])

    for cluster_id in cluster_ids:
        cluster_data = filtered_df[filtered_df['dbscan_cluster'] == cluster_id]
        print(f"Cluster {cluster_id} heads:")
        for title in cluster_data['head'].tolist():
            # f-string instead of "+" so a missing headline cannot raise.
            print(f"   {title}")

    fig_dbscan.show()

    # Analyze each cluster
    for cluster_id in cluster_ids:
        cluster_data = filtered_df[filtered_df["dbscan_cluster"] == cluster_id]
        print(f"\nCluster {cluster_id} (Size: {len(cluster_data)})")

        # Get common Wikipedia articles (only the first 1000 characters per text are sent)
        suggestions = get_common_suggestions(
            cluster_data["combined_text"].str[:1000].tolist()
        )
        print("\nHäufigste Wikipedia-Artikel:")
        print(suggestions)

In [None]:
# Run the DBSCAN topic analysis once per publication date; without a
# 'pubtime' column the whole dataset is analysed in a single pass.
if 'pubtime' not in df.columns:
    print("\n=== Analyse des gesamten Datensatzes ===")
    df_plot_dbscan(df)
else:
    for date in df['pubtime'].dt.date.unique():
        # Select the rows published on this calendar date
        date_df = df[df['pubtime'].dt.date == date]
        print(f"\n\n=== Themen für {date} ===")
        df_plot_dbscan(date_df)

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN, KMeans
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import plotly.express as px
import plotly.graph_objects as go
import groq
import re
import json
from collections import Counter
import nltk
import os
import numpy as np
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer

# Load environment variables from the .env file one directory up
load_dotenv(dotenv_path='../.env')

# Ensure NLTK resources are downloaded. Older NLTK releases load the Punkt
# sentence models from "punkt", newer ones from "punkt_tab", so fetch both.
# nltk.download() only reports a package it cannot find and does not raise.
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Groq client
groq_key = os.getenv("GROQ_API_KEY")
client = groq.Groq(api_key=groq_key)

def split_text_sentencewise(text, max_length=1000):
    """Split ``text`` into sentence-aligned chunks of at most ``max_length`` words.

    Sentences come from ``nltk.sent_tokenize`` and are packed greedily; the
    length of a sentence is its number of whitespace-separated words (a rough
    approximation of tokens). A sentence that on its own exceeds ``max_length``
    words is cut at word boundaries, so no returned chunk is over the limit.

    Args:
        text: The text to split. Values that are not a non-blank string
            (e.g. ``None`` or NaN) produce an empty list.
        max_length: Maximum number of words per chunk; values below 1 are
            treated as 1.

    Returns:
        The list of chunk strings in original order (possibly empty).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    limit = max(1, max_length)
    chunks = []
    pending = []        # sentences collected for the chunk being built
    pending_words = 0   # word count of `pending`

    def flush():
        # Close the chunk under construction, if there is one.
        if pending:
            chunks.append(' '.join(pending))
            pending.clear()

    for sentence in nltk.sent_tokenize(text):
        words = sentence.split()

        if len(words) > limit:
            # Oversized sentence: emit it in word-aligned pieces of its own.
            flush()
            pending_words = 0
            chunks.extend(
                ' '.join(words[start:start + limit])
                for start in range(0, len(words), limit)
            )
            continue

        if pending_words + len(words) > limit:
            # The sentence does not fit any more: start a new chunk with it.
            flush()
            pending_words = 0

        pending.append(sentence)
        pending_words += len(words)

    flush()
    return chunks

def parse_json_response(response):
    """Return the first JSON object contained in a model response.

    Decoding is attempted at every ``{`` in turn and the first position that
    yields a complete JSON object is used; surrounding text, including stray
    closing braces after the object, is ignored.

    Args:
        response: Raw response text. Non-string values (e.g. ``None`` when the
            model returned no content) are tolerated.

    Returns:
        The decoded dict, or ``{}`` when the response holds no JSON object.
    """
    if not isinstance(response, str):
        return {}

    decoder = json.JSONDecoder()
    position = response.find('{')
    while position != -1:
        try:
            # raw_decode parses one value starting at `position` and ignores the rest.
            candidate, _ = decoder.raw_decode(response, position)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        position = response.find('{', position + 1)
    return {}

def extract_keywords_from_text(text, top_n=15):
    """Return the highest-weighted terms (unigrams and bigrams) of one text.

    A ``TfidfVectorizer`` is fitted on the single document after removing a
    small list of German stop words. With only one document the inverse
    document frequency is the same for every term, so the ranking effectively
    follows the normalised term frequency.

    Args:
        text: The text to analyse. Values that are not a non-blank string
            yield an empty list.
        top_n: Maximum number of terms to return.

    Returns:
        Up to ``top_n`` terms ordered by descending weight; ``[]`` when the
        text contains no usable terms.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # German stop words. This is a list on purpose: scikit-learn validates
    # `stop_words` and accepts 'english', a list or None, not a set.
    german_stopwords = [
        'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einen', 'einem',
        'eines', 'und', 'oder', 'aber', 'auch', 'als', 'zu', 'bei', 'mit', 'von',
        'für', 'ist', 'sind', 'war', 'wird', 'werden', 'wurde', 'wurden', 'dass',
        'daß', 'hat', 'haben', 'hatte', 'hätte', 'auf', 'aus', 'nach', 'über', 'unter',
        'vor', 'in', 'an', 'am', 'um', 'durch', 'gegen', 'so', 'da', 'wie', 'wo',
        'wann', 'was', 'wer', 'warum', 'wieso', 'welche', 'welcher', 'welches'
    ]

    # max_df must be 1.0 here: the corpus is a single document, so every term
    # has a document frequency of 100 % and any lower cut-off would discard
    # the whole vocabulary (scikit-learn then raises a ValueError).
    vectorizer = TfidfVectorizer(
        min_df=1, max_df=1.0,
        stop_words=german_stopwords,
        ngram_range=(1, 2)  # Include single words and bigrams
    )

    try:
        X = vectorizer.fit_transform([text])
    except ValueError:
        # Raised for an empty vocabulary, e.g. a text made up of stop words only.
        return []

    feature_names = vectorizer.get_feature_names_out()

    # Rank the terms by their weight in the (only) document
    tfidf_scores = zip(feature_names, X.toarray()[0])
    sorted_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)

    return [term for term, score in sorted_scores[:top_n]]

def get_wikipedia_suggestions(text):
    """Ask the Groq model for German Wikipedia article titles that fit ``text``.

    Keywords are extracted from the first 5000 characters to guide the model;
    the first 3000 characters are split into sentence-wise chunks and one
    request is sent per chunk. The prompt demands exact, existing titles, but
    the titles are NOT verified against Wikipedia here.

    Args:
        text: The article text. Values that are not a non-blank string
            (e.g. NaN from a DataFrame) yield an empty list without any
            API call.

    Returns:
        A flat list of title strings collected over all chunks, possibly with
        duplicates. Chunks whose request fails are skipped after printing the
        error (best effort).
    """
    if not isinstance(text, str) or not text.strip():
        return []

    # Extract keywords to help Groq focus on important terms
    keywords = extract_keywords_from_text(text[:5000])
    keywords_text = ", ".join(keywords[:10])

    # Split the text into chunks
    chunks = split_text_sentencewise(text[:3000])  # Limit to first 3000 chars for efficiency
    all_titles = []

    for chunk in chunks:
        # The request below asks for a JSON object response. The prompt
        # therefore names JSON explicitly and shows a valid (double-quoted)
        # example; JSON mode expects the instructions to request JSON.
        prompt = (
            "Identifiziere 3-5 relevante deutsche Wikipedia-Artikel für diesen Text. "
            "Die Titel MÜSSEN exakt mit echten deutschsprachigen Wikipedia-Artikeln übereinstimmen. "
            f"Wichtige Begriffe im Text: {keywords_text}\n\n"
            'Antworte ausschließlich als JSON-Objekt im Format: {"titles": ["Artikeltitel1", "Artikeltitel2"]}\n\n'
            f"Text: {chunk}"
        )

        try:
            completion = client.chat.completions.create(
                model="llama3-70b-8192",
                messages=[
                    {"role": "system", "content": "Du bist ein präziser Assistent für die Identifikation von deutschen Wikipedia-Artikeln zu Nachrichtentexten. Du gibst nur exakte, existierende Artikeltitel zurück."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,
                max_tokens=250,
                response_format={"type": "json_object"}
            )
            response = completion.choices[0].message.content
            titles = parse_json_response(response).get('titles', [])
            if isinstance(titles, str):
                titles = [titles]
            elif not isinstance(titles, list):
                titles = []
            # Keep only non-empty strings: other JSON values (numbers, nested
            # objects) are not titles and may be unhashable when counted later.
            all_titles.extend(
                title.strip()
                for title in titles
                if isinstance(title, str) and title.strip()
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk.
            print(f"API Fehler bei Chunk: {str(e)}")

    return all_titles

def get_common_suggestions(cluster_texts):
    """List the Wikipedia titles suggested most often for a cluster.

    Only the first three texts are sent to ``get_wikipedia_suggestions`` to
    limit API usage. The eight most frequent titles are considered, every
    title occurring at least once qualifies, and at most seven are reported.

    Args:
        cluster_texts: The texts belonging to one cluster.

    Returns:
        One "<title> (<count>x)" line per reported title joined by newlines,
        or a German notice when no titles were suggested.
    """
    collected = []

    # Limit to 3 texts per cluster for efficiency
    for document in cluster_texts[:3]:
        collected += get_wikipedia_suggestions(document) or []

    if not collected:
        return "Keine relevanten Artikel gefunden"

    # Lower threshold for smaller datasets
    min_occurrences = 1

    lines = []
    for title, count in Counter(collected).most_common(8):
        if count >= min_occurrences:
            lines.append(f"{title} ({count}x)")

    if not lines:
        return "Keine konsistenten Artikel"
    return "\n".join(lines[:7])

def extract_cluster_key_terms(texts, n_terms=8):
    """Return the terms with the highest mean TF-IDF weight within a cluster.

    A ``TfidfVectorizer`` (unigrams and bigrams, German stop words removed) is
    fitted on the cluster's texts and the terms are ranked by their mean
    weight over all texts.

    Args:
        texts: The texts of one cluster. Entries that are not non-blank
            strings (e.g. NaN) are ignored.
        n_terms: Number of terms to return.

    Returns:
        The top terms joined by ", ", or a German notice when there are no
        usable texts or the extraction fails.
    """
    documents = [text for text in texts if isinstance(text, str) and text.strip()]

    # If not enough texts, return a notice
    if not documents:
        return "Nicht genügend Artikel für Schlüsselwortextraktion"

    # German stop words, including a few frequent filler adverbs
    german_stopwords = [
        'der', 'die', 'das', 'den', 'dem', 'des', 'ein', 'eine', 'einen', 'einem',
        'eines', 'und', 'oder', 'aber', 'auch', 'als', 'zu', 'bei', 'mit', 'von',
        'für', 'ist', 'sind', 'war', 'wird', 'werden', 'wurde', 'wurden', 'dass',
        'daß', 'hat', 'haben', 'hatte', 'hätte', 'auf', 'aus', 'nach', 'über', 'unter',
        'vor', 'in', 'an', 'am', 'um', 'durch', 'gegen', 'so', 'da', 'wie', 'wo',
        'wann', 'was', 'wer', 'warum', 'wieso', 'welche', 'welcher', 'welches',
        'mehr', 'noch', 'sehr', 'schon', 'wieder', 'immer', 'nur', 'etwa', 'bereits'
    ]

    # max_df=0.9 drops terms that occur in (almost) every text of the cluster.
    # With a single text every term has a document frequency of 100 %, so the
    # cut-off has to be disabled there or no vocabulary would remain.
    max_df = 0.9 if len(documents) > 1 else 1.0

    vectorizer = TfidfVectorizer(
        min_df=1, max_df=max_df,
        stop_words=german_stopwords,
        ngram_range=(1, 2)  # Include both single words and bigrams
    )

    try:
        # Fit the vectorizer to all texts in the cluster
        X = vectorizer.fit_transform(documents)
        feature_names = vectorizer.get_feature_names_out()

        # Calculate average TF-IDF for each term (flattened to a 1-D array)
        tfidf_means = np.asarray(X.mean(axis=0)).ravel()

        # Get top terms by mean TF-IDF score, highest first
        top_indices = tfidf_means.argsort()[-n_terms:][::-1]
        top_terms = [feature_names[i] for i in top_indices]

        return ", ".join(top_terms)
    except Exception as e:
        # Best effort: report the problem and return a placeholder text.
        print(f"Fehler bei Schlüsselwortextraktion: {str(e)}")
        return "Fehler bei der Extraktion von Schlüsselwörtern"

def get_cluster_summary(texts, max_length=150):
    """Ask the Groq model for a short summary of a cluster of articles.

    The prompt contains the first sentence of up to three texts (limited to
    1000 characters) and up to ten keywords extracted from all texts.

    Args:
        texts: The texts of one cluster. Entries that are not non-blank
            strings (e.g. NaN) are ignored.
        max_length: Character limit communicated to the model; longer answers
            are cut to this length and suffixed with "...".

    Returns:
        The summary text, or "Zusammenfassung nicht verfügbar" when there is
        no usable text, the model returns nothing or the request fails.
    """
    documents = [text for text in texts if isinstance(text, str) and text.strip()]
    if not documents:
        return "Zusammenfassung nicht verfügbar"

    # Combine the first sentences of each article for context
    first_sentences = []
    for text in documents[:3]:  # Use up to 3 texts
        sentences = nltk.sent_tokenize(text)
        if sentences:
            first_sentences.append(sentences[0])

    context = " ".join(first_sentences)

    # Extract keywords to guide summary
    keywords = extract_keywords_from_text(" ".join(documents), top_n=10)

    # The limit in the prompt follows max_length so that the instruction and
    # the truncation below agree.
    prompt = (
        "Fasse den Hauptnachrichtentrend dieser Artikelgruppe in 1-2 Sätzen zusammen "
        f"(maximal {max_length} Zeichen). Benutze einen neutralen, nachrichtlichen Stil:\n\n"
        f"Schlüsselwörter: {', '.join(keywords)}\n\n"
        f"Kontext: {context[:1000]}"
    )

    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "Du bist ein präziser Nachrichtenredakteur. Fasse den Haupttrend in 1-2 kurzen, prägnanten Sätzen zusammen."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=100
        )
        # The content may be missing; treat that like an empty answer.
        summary = (completion.choices[0].message.content or "").strip()
        if not summary:
            return "Zusammenfassung nicht verfügbar"
        # Truncate if too long
        if len(summary) > max_length:
            summary = summary[:max_length] + "..."
        return summary
    except Exception as e:
        # Best effort: report the failure and fall back to a placeholder.
        print(f"Fehler bei Zusammenfassung: {str(e)}")
        return "Zusammenfassung nicht verfügbar"

def df_cluster_news(df, n_clusters=None):
    """Cluster articles with KMeans, plot them via t-SNE and describe each cluster.

    Headline and content are combined (headline repeated three times to weight
    it more heavily), embedded with the
    'paraphrase-multilingual-mpnet-base-v2' sentence-transformer and clustered
    with KMeans. For each cluster, key terms, a short summary (clusters with at
    least two articles), up to five headlines and Wikipedia suggestions are
    printed; finally a Plotly scatter plot of the t-SNE projection is shown.

    Args:
        df: DataFrame with the columns 'head' and 'content'. It is not modified.
        n_clusters: Number of clusters. When None it is chosen from the number
            of articles (3 to 8). It is always capped at the number of articles.

    Returns:
        A copy of ``df`` with the added columns 'combined_text', 'cluster',
        'tsne_x' and 'tsne_y'. For an empty input the plain copy is returned.
    """
    # Create a copy of the DataFrame
    working_df = df.copy()
    n_articles = len(working_df)

    if n_articles == 0:
        print("Keine Artikel zum Clustern vorhanden.")
        return working_df

    # Set number of clusters based on dataset size
    if n_clusters is None:
        if n_articles <= 50:
            n_clusters = 3
        elif n_articles <= 200:
            n_clusters = 5
        elif n_articles <= 500:
            n_clusters = 6
        else:
            n_clusters = min(8, n_articles // 200 + 5)  # Scale with dataset size

    # KMeans cannot form more clusters than there are samples.
    n_clusters = max(1, min(n_clusters, n_articles))

    print(f"Clustering {n_articles} Artikel in {n_clusters} Cluster...")

    # Weight headlines more heavily in the combined text. The separator is
    # repeated together with the headline so that the copies do not run into
    # each other ("...BarFoo..."); missing values become empty strings.
    headlines = working_df["head"].fillna("").astype(str)
    contents = working_df["content"].fillna("").astype(str)
    working_df["combined_text"] = (headlines + " ").str.repeat(3) + contents

    # Use a multilingual model for the German text
    model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    print("Erstelle Embeddings...")
    embeddings = model.encode(working_df['combined_text'].tolist(), show_progress_bar=True)

    # Apply KMeans clustering
    print(f"Wende KMeans mit {n_clusters} Clustern an...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    working_df['cluster'] = kmeans.fit_predict(embeddings)

    # Reduce dimensionality for visualization
    print("Reduziere Dimensionen für Visualisierung...")
    # First reduce to 50 dimensions with PCA for efficiency; seeded so that
    # repeated runs give the same projection.
    if n_articles > 100:
        pca = PCA(n_components=min(50, n_articles - 1), random_state=42)
        reduced_data = pca.fit_transform(embeddings)
    else:
        reduced_data = embeddings

    # Then apply t-SNE. Its perplexity must stay below the number of samples.
    if n_articles >= 3:
        perplexity_value = min(30, max(5, n_articles // 10), n_articles - 1)
        tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity_value)
        tsne_results = tsne.fit_transform(reduced_data)
    else:
        # Too few points for a meaningful t-SNE: place them on a line instead.
        tsne_results = np.column_stack(
            (np.arange(n_articles, dtype=float), np.zeros(n_articles))
        )

    working_df['tsne_x'] = tsne_results[:, 0]
    working_df['tsne_y'] = tsne_results[:, 1]

    # Cluster ids are categories, not quantities: plot them as strings so that
    # Plotly uses the qualitative palette as discrete colours with a legend
    # (a qualitative palette is not a meaningful continuous scale).
    plot_df = working_df.assign(cluster_label=working_df['cluster'].astype(str))
    fig = px.scatter(
        plot_df,
        x='tsne_x',
        y='tsne_y',
        color='cluster_label',
        color_discrete_sequence=px.colors.qualitative.Bold,
        category_orders={'cluster_label': [str(i) for i in range(n_clusters)]},
        hover_data=['head'],
        title=f"Themen-Clustering ({n_clusters} Cluster)"
    )

    # Enhance the visualization
    fig.update_traces(marker=dict(size=10, opacity=0.7))
    fig.update_layout(
        legend_title_text='Cluster',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
    )

    # Add a text label at the centre (mean t-SNE position) of every cluster
    for cluster_id in range(n_clusters):
        cluster_df = working_df[working_df['cluster'] == cluster_id]
        if cluster_df.empty:
            continue

        fig.add_annotation(
            x=cluster_df['tsne_x'].mean(), y=cluster_df['tsne_y'].mean(),
            text=f"Cluster {cluster_id}",
            showarrow=False,
            font=dict(size=14, color="black", family="Arial Black"),
            bgcolor="rgba(255, 255, 255, 0.7)",
            bordercolor="black",
            borderwidth=1
        )

    # Sort clusters by size (value_counts orders by descending count)
    clusters_by_size = working_df['cluster'].value_counts().index

    # Process each cluster
    for cluster_id in clusters_by_size:
        cluster_data = working_df[working_df['cluster'] == cluster_id]
        cluster_size = len(cluster_data)

        print(f"\n=== Cluster {cluster_id} (Größe: {cluster_size}) ===")

        # Extract key terms
        key_terms = extract_cluster_key_terms(cluster_data["combined_text"].tolist())
        print(f"Schlüsselwörter: {key_terms}")

        # Get cluster summary
        if cluster_size >= 2:
            summary = get_cluster_summary(cluster_data["combined_text"].tolist())
            print(f"Zusammenfassung: {summary}")

        # Show the first headlines of the cluster
        print("\nSchlagzeilen:")
        for title in cluster_data['head'].tolist()[:5]:  # Show up to 5 headlines
            print(f"• {title}")

        # Get common Wikipedia articles (only the first 1500 characters per text are sent)
        print("\nWikipedia-Artikel:")
        suggestions = get_common_suggestions(
            cluster_data["combined_text"].str[:1500].tolist()
        )
        print(suggestions)

    # Show plot
    fig.show()

    return working_df

# Function to process by date
def process_by_date(df):
    """Run ``df_cluster_news`` once per publication date, in chronological order.

    Rows whose 'pubtime' is missing (NaT) are skipped because they cannot be
    assigned to a date. If the DataFrame has no 'pubtime' column, the whole
    dataset is analysed in a single pass.

    Args:
        df: DataFrame of articles; 'pubtime', if present, must be a datetime
            column.

    Returns:
        None. All results are printed/shown by ``df_cluster_news``.
    """
    if 'pubtime' not in df.columns:
        print("\n=== Analyse des gesamten Datensatzes ===")
        df_cluster_news(df)
        return

    # Compute the calendar dates once instead of once per loop iteration.
    pub_dates = df['pubtime'].dt.date
    valid_dates = pub_dates.dropna().unique()

    if len(valid_dates) == 0:
        print("Keine Artikel mit gültigem Veröffentlichungsdatum verfügbar.")
        return

    for date in sorted(valid_dates):
        date_df = df[pub_dates == date]
        print(f"\n\n=== Themen für {date} ===")
        df_cluster_news(date_df)
# Run the per-date topic clustering on the loaded dataset
process_by_date(df=df)