In [None]:
import pandas as pd

# Load the pre-cleaned article table from its Parquet file (read through pyarrow).
df = pd.read_parquet(
    path="cleaned_data/cleaned_data.parquet",
    engine="pyarrow",
)

In [None]:
# Keep only the articles whose headline contains the phrase, ignoring case;
# rows without a headline count as non-matching (na=False).
gefundene_zeilen = df.loc[
    df["head"].str.contains("Statistiken und Vergleich", case=False, na=False)
]

# Show the matching rows as the cell output
gefundene_zeilen

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px
import groq
import re
import json
from collections import Counter
import nltk
import os
import numpy as np
from dotenv import load_dotenv

# Load environment variables (GROQ_API_KEY is read below) from the .env file
# one directory above the notebook
load_dotenv(dotenv_path='../.env')

# Make sure the Punkt sentence-tokenizer data used by nltk.sent_tokenize is
# available. NLTK 3.9 and later look up the pickle-free 'punkt_tab' resource
# instead of 'punkt', so both are fetched to keep sentence splitting working
# regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Initialize Groq client
groq_key = os.getenv("GROQ_API_KEY")
client = groq.Groq(api_key=groq_key)

def split_text_sentencewise(text, max_length=1000):
    """Group the sentences of ``text`` into consecutive chunks.

    The text is split with ``nltk.sent_tokenize`` and sentences are collected
    greedily: a chunk is closed as soon as adding the next sentence would push
    its whitespace-separated word count above ``max_length``. Sentences are
    never cut, so a single sentence longer than ``max_length`` words still
    becomes a chunk of its own.

    Args:
        text: The text to split. Empty or non-string input yields ``[]``.
        max_length: Word budget per chunk.

    Returns:
        list[str]: The chunks, each one being its sentences joined by a space.
    """
    if not (text and isinstance(text, str)):
        return []

    pieces = []
    pending = []
    pending_words = 0

    for sent in nltk.sent_tokenize(text):
        n_words = len(sent.split())

        if pending_words + n_words <= max_length:
            # Still within budget: keep filling the open chunk.
            pending.append(sent)
            pending_words += n_words
            continue

        # Budget exceeded: close the open chunk (if any) and start a new one
        # with the current sentence.
        if pending:
            pieces.append(' '.join(pending))
        pending = [sent]
        pending_words = n_words

    # Flush whatever is left in the last open chunk.
    if pending:
        pieces.append(' '.join(pending))

    return pieces

def _as_prompt_text(value):
    """Return ``value`` if it is a string, otherwise an empty string."""
    # Missing values coming out of pandas (None / NaN) are not sliceable and
    # would only add noise such as "nan" to the prompt.
    return value if isinstance(value, str) else ""


def analyze_cluster_with_groq(texts, titles, max_chars=2000):
    """Ask the Groq chat model for a structured summary of one article cluster.

    At most the first five (title, text) pairs are used. Every text is cut to
    300 characters and the joined sample is cut to ``max_chars`` characters
    before it is embedded into a German prompt that requests a JSON object.
    Titles or texts that are not strings are treated as empty.

    Args:
        texts: Sequence of article texts of the cluster.
        titles: Sequence of headlines, aligned with ``texts``.
        max_chars: Upper bound (characters) for the article sample in the prompt.

    Returns:
        dict: The parsed JSON object returned by the model. If the request or
        the parsing fails, or the answer is not a JSON object, a placeholder
        dict with the keys ``hauptthema``, ``zusammenfassung``,
        ``schlüsselwörter``, ``entitäten``, ``perspektiven`` and ``kontext``
        is returned instead and the error is printed.
    """
    # Create a sample of texts and titles
    sample_size = min(5, len(texts))
    sample_texts = texts[:sample_size]
    sample_titles = titles[:sample_size]

    # Create a condensed representation; non-string entries are blanked so a
    # single missing article body cannot abort the whole analysis run.
    combined_sample = "\n\n".join([f"Titel: {_as_prompt_text(title)}\nText: {_as_prompt_text(text)[:300]}..."
                                 for title, text in zip(sample_titles, sample_texts)])

    # Trim if too long
    if len(combined_sample) > max_chars:
        combined_sample = combined_sample[:max_chars] + "..."

    prompt = f"""Analysiere diese Gruppe von Nachrichtenartikeln und erstelle eine detaillierte Zusammenfassung:

{combined_sample}

Liefere folgende Informationen im JSON-Format:
1. "hauptthema": Ein prägnanter Titel für das Hauptthema (max. 10 Wörter)
2. "zusammenfassung": Eine klare Zusammenfassung des Themas (50-100 Wörter)
3. "schlüsselwörter": 5-8 zentrale Begriffe oder Konzepte
4. "entitäten": Wichtige Personen, Organisationen oder Orte
5. "perspektiven": Unterschiedliche Blickwinkel oder Meinungen (falls vorhanden)
6. "kontext": Wichtiger gesellschaftlicher, wirtschaftlicher oder politischer Kontext

Antwort nur im JSON-Format."""

    try:
        completion = client.chat.completions.create(
            model="llama3-70b-8192",
            messages=[
                {"role": "system", "content": "Du bist ein Experte für Medienanalyse und Themenkategorisierung."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            max_tokens=1000,
            response_format={"type": "json_object"}
        )
        analysis = json.loads(completion.choices[0].message.content)
        # Callers read the result with .get(); anything but an object is unusable.
        if not isinstance(analysis, dict):
            raise ValueError("Antwort ist kein JSON-Objekt")
        return analysis
    except Exception as e:
        # Deliberate best effort: one failed cluster must not stop the others.
        print(f"Fehler bei Clusteranalyse: {str(e)}")
        return {
            "hauptthema": "Analysefehler",
            "zusammenfassung": f"Fehler bei der Analyse: {str(e)}",
            "schlüsselwörter": [],
            "entitäten": [],
            "perspektiven": [],
            "kontext": ""
        }

def optimize_dbscan_params(embeddings, target_clusters=(4, 8)):
    """Grid-search DBSCAN's ``eps`` and ``min_samples`` for the given embeddings.

    Every combination of a fixed ``eps`` list and a fixed ``min_samples`` list
    is fitted once and reported on stdout. Combinations that produce no
    cluster, more than twice the upper target, or more than 70 % noise points
    are discarded. The remaining ones are scored as

        distance of the cluster count to the target range + 2 * noise ratio

    where lower is better; on ties the combination tried first is kept.

    Args:
        embeddings: Samples handed unchanged to ``DBSCAN.fit_predict``.
        target_clusters: ``(lower, upper)`` desired number of clusters, inclusive.

    Returns:
        tuple: ``(eps, min_samples)`` of the best combination, or ``(0.3, 3)``
        if no combination qualified.
    """
    print("Suche optimale DBSCAN-Parameter...")

    lower_target, upper_target = target_clusters[0], target_clusters[1]

    # Candidate grid, visited eps-major in the order listed here
    eps_grid = [0.5, 0.45, 0.4, 0.35, 0.3, 0.25, 0.2, 0.18, 0.16, 0.14, 0.12, 0.1]
    min_samples_grid = [2, 3, 4, 5, 6]
    candidates = [(e, m) for e in eps_grid for m in min_samples_grid]

    winner = None
    winner_score = float('inf')
    winner_clusters = 0
    winner_noise = 1.0

    for eps, min_samples in candidates:
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(embeddings)

        # Label -1 marks noise and does not count as a cluster.
        distinct = set(labels)
        num_clusters = len(distinct) - int(-1 in distinct)
        noise_ratio = sum(1 for label in labels if label == -1) / len(labels)

        print(f"DBSCAN mit eps={eps}, min_samples={min_samples}: {num_clusters} Cluster, {noise_ratio:.1%} Ausreißer")

        # Only keep results with some clusters, not far too many, and limited noise
        usable = num_clusters != 0 and num_clusters <= upper_target * 2 and noise_ratio <= 0.7
        if not usable:
            continue

        # Inside the target range there is no penalty, outside it grows with
        # the distance to the nearer bound.
        if lower_target <= num_clusters <= upper_target:
            range_penalty = 0
        else:
            range_penalty = min(
                abs(num_clusters - lower_target),
                abs(num_clusters - upper_target)
            )

        # Weighted sum of the cluster count penalty and the noise ratio
        score = range_penalty + (noise_ratio * 2)

        # Only a strictly better score replaces the current winner
        if winner is not None and not score < winner_score:
            continue

        winner = (eps, min_samples)
        winner_score = score
        winner_clusters = num_clusters
        winner_noise = noise_ratio
        print(f"  ✓ Neue beste Parameter gefunden (Score: {score:.2f})")

    if not winner:
        print("Keine optimalen Parameter gefunden. Verwende Standardwerte.")
        return (0.3, 3)  # Conservative default

    print(f"Optimale Parameter: eps={winner[0]}, min_samples={winner[1]}")
    print(f"  → {winner_clusters} Cluster mit {winner_noise:.1%} Ausreißern")
    return winner

def _format_items(value):
    """Turn one field of the LLM answer into a printable, comma-separated string."""
    # The model is asked for lists of strings but may answer with a plain
    # string, nested lists or dicts. A bare ', '.join() raises on non-string
    # items and splits a plain string into single characters.
    if value is None:
        return ""
    if isinstance(value, str):
        return value
    if isinstance(value, dict):
        return ', '.join(f"{key}: {_format_items(item)}" for key, item in value.items())
    if isinstance(value, (list, tuple, set)):
        return ', '.join(_format_items(item) for item in value)
    return str(value)


def df_plot_dbscan(df, target_clusters=(4, 8)):
    """Cluster articles with DBSCAN on sentence embeddings, plot and describe them.

    Steps: build a headline-weighted text per article, embed it with a
    multilingual SentenceTransformer, pick DBSCAN parameters via
    ``optimize_dbscan_params``, project the embeddings to 2D with t-SNE, show
    an interactive Plotly scatter of all non-noise articles and print an
    LLM-generated description (``analyze_cluster_with_groq``) for every cluster.

    The input frame is not modified; all derived columns live on a copy.
    Missing ``head`` / ``content`` values are treated as empty strings.

    Args:
        df: DataFrame with the string columns ``head`` (headline) and
            ``content`` (article body).
        target_clusters: ``(lower, upper)`` desired number of clusters, passed
            on to ``optimize_dbscan_params``.

    Returns:
        pandas.DataFrame: Copy of the non-noise rows with the extra columns
        ``combined_text``, ``cluster``, ``tsne_x`` and ``tsne_y``. For fewer
        than two articles nothing is clustered and an empty frame with these
        columns is returned.
    """
    print(f"Clustering {len(df)} Artikel...")

    # Work on a private copy: callers frequently pass a slice of a larger
    # frame, and writing columns into it would trigger SettingWithCopyWarning
    # and leave hidden state behind in the notebook.
    work = df.copy()
    work["head"] = work["head"].fillna("").astype(str)
    work["content"] = work["content"].fillna("").astype(str)

    # Weight headlines more in the combined text (3x). The space appended to
    # the headline keeps the repetitions from being glued into one long
    # pseudo-word ("FooFooFoo"), which would distort the embedding.
    work["combined_text"] = (work["head"] + " ").str.repeat(3) + work["content"]

    # Neither DBSCAN nor t-SNE can do anything useful with 0 or 1 samples.
    if len(work) < 2:
        print("Zu wenige Artikel für ein Clustering – übersprungen.")
        return work.assign(cluster=-1, tsne_x=np.nan, tsne_y=np.nan).iloc[0:0]

    # Use a better model for German text
    print("Erstelle Embeddings...")
    model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')
    embeddings = model.encode(work['combined_text'].tolist(), show_progress_bar=True)

    # Find optimal parameters
    eps, min_samples = optimize_dbscan_params(embeddings, target_clusters)

    # Apply DBSCAN with best parameters
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    work['cluster'] = dbscan.fit_predict(embeddings)

    # Get cluster statistics (label -1 marks noise)
    num_clusters = work.loc[work['cluster'] >= 0, 'cluster'].nunique()
    noise_points = int((work['cluster'] == -1).sum())
    print(f"Clustering ergab {num_clusters} Cluster und {noise_points} Ausreißer ({noise_points/len(work):.1%})")

    # Create t-SNE visualization
    print("Erstelle t-SNE Visualisierung...")
    # scikit-learn requires perplexity < n_samples; without the last bound the
    # floor of 5 makes small (e.g. per-day) frames raise a ValueError.
    perplexity = min(30, max(5, len(work) // 15), len(work) - 1)
    tsne = TSNE(n_components=2, random_state=42, perplexity=perplexity)
    tsne_results = tsne.fit_transform(embeddings)
    work['tsne_x'] = tsne_results[:, 0]
    work['tsne_y'] = tsne_results[:, 1]

    # Filter out noise points
    filtered_df = work[work['cluster'] >= 0]

    # Plotly maps numeric color columns to a continuous scale and then ignores
    # color_discrete_sequence and the legend; string labels make the clusters
    # categorical. The label column only exists on the plotting frame.
    plot_df = filtered_df.assign(cluster_label=filtered_df['cluster'].astype(str))
    cluster_order = [str(label) for label in sorted(filtered_df['cluster'].unique())]

    # Create plot
    fig = px.scatter(
        plot_df,
        x='tsne_x',
        y='tsne_y',
        color='cluster_label',
        hover_data=['head'],
        title=f"Themencluster ({num_clusters} Cluster)",
        color_discrete_sequence=px.colors.qualitative.Bold,
        category_orders={'cluster_label': cluster_order},
        opacity=0.7
    )

    fig.update_layout(
        legend_title_text='Thema',
        xaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title=""),
        yaxis=dict(showgrid=False, zeroline=False, showticklabels=False, title="")
    )

    # Add one annotation per cluster at the mean of its t-SNE coordinates
    centers = filtered_df.groupby('cluster')[['tsne_x', 'tsne_y']].mean()
    for cluster_id, center in centers.iterrows():
        fig.add_annotation(
            x=center['tsne_x'],
            y=center['tsne_y'],
            text=f"Cluster {cluster_id}",
            showarrow=False,
            font=dict(size=14, color="black"),
            bgcolor="rgba(255, 255, 255, 0.7)",
            bordercolor="black",
            borderwidth=1,
            borderpad=4
        )

    fig.show()

    # Analyze clusters by size (value_counts is already sorted descending)
    for cluster_id, cluster_size in filtered_df['cluster'].value_counts().items():
        # Skip tiny clusters
        if cluster_size < 2:
            continue

        cluster_data = filtered_df[filtered_df['cluster'] == cluster_id]

        print(f"\n\n=== Cluster {cluster_id} (Größe: {cluster_size}) ===")

        # Analyze cluster with Groq. The article body is sent rather than
        # combined_text: the latter starts with the headline three times, so
        # the 300-character excerpt used in the prompt would contain almost
        # nothing but the title that is passed separately anyway.
        print("Analysiere Cluster...")
        analysis = analyze_cluster_with_groq(
            cluster_data["content"].tolist(),
            cluster_data["head"].tolist()
        )

        # Print analysis results
        print(f"Hauptthema: {analysis.get('hauptthema', 'Nicht verfügbar')}")
        print(f"Zusammenfassung: {analysis.get('zusammenfassung', 'Nicht verfügbar')}")
        print(f"Schlüsselwörter: {_format_items(analysis.get('schlüsselwörter'))}")
        print(f"Entitäten: {_format_items(analysis.get('entitäten'))}")

        # Print additional insights if available
        if analysis.get('perspektiven'):
            print(f"Perspektiven: {analysis.get('perspektiven')}")
        if analysis.get('kontext'):
            print(f"Kontext: {analysis.get('kontext')}")

        # Show sample headlines
        print("\nSchlagzeilen:")
        for title in cluster_data['head'].tolist()[:5]:
            print(f"• {title}")

    return filtered_df

def df_cluster_news(df, target_clusters=(4, 8)):
    """Cluster the given articles; thin alias that delegates to ``df_plot_dbscan``.

    Args:
        df: Article frame, forwarded unchanged.
        target_clusters: Desired ``(lower, upper)`` cluster count, forwarded unchanged.

    Returns:
        Whatever ``df_plot_dbscan`` returns for these arguments.
    """
    clustered = df_plot_dbscan(df, target_clusters)
    return clustered

In [None]:
def process_by_date(df, target_clusters=(4, 8)):
    """Cluster the articles per publication day, or all at once.

    If ``df`` has a ``pubtime`` column, the articles are grouped by calendar
    day (in ascending order) and ``df_cluster_news`` is run once per day on a
    copy of that day's rows. Rows whose ``pubtime`` is missing are skipped and
    their number is reported. Without a ``pubtime`` column the whole frame is
    clustered in one go.

    The ``pubtime`` column of ``df`` is not modified: values that are not
    datetimes yet are parsed into a temporary Series only.

    Args:
        df: Article frame; must contain whatever ``df_cluster_news`` needs,
            optionally plus ``pubtime``.
        target_clusters: Desired ``(lower, upper)`` cluster count, forwarded
            to ``df_cluster_news``.

    Returns:
        None. Results are printed / plotted by ``df_cluster_news``.
    """
    if 'pubtime' not in df.columns:
        print(f"\n\n{'='*50}")
        print(f"=== Analyse des gesamten Datensatzes ({len(df)} Artikel) ===")
        print(f"{'='*50}")
        df_cluster_news(df, target_clusters)
        return

    # Convert to datetime if it isn't already - on a local Series, so the
    # caller's column keeps its original values.
    pubtime = df['pubtime']
    if not pd.api.types.is_datetime64_any_dtype(pubtime):
        pubtime = pd.to_datetime(pubtime)

    # Calendar day per row, computed once; rows without a timestamp have no
    # day and cannot be assigned to any group.
    day_values = pubtime.dt.date.to_numpy()
    has_day = pd.notna(day_values)
    undated = int((~has_day).sum())
    if undated:
        print(f"{undated} Artikel ohne gültiges Datum übersprungen.")

    # Process each date separately. Plain arrays are used as mask and group
    # key so that a non-unique index cannot cause alignment problems.
    for date, date_df in df[has_day].groupby(day_values[has_day], sort=True):
        print(f"\n\n{'='*50}")
        print(f"=== Themen für {date} ({len(date_df)} Artikel) ===")
        print(f"{'='*50}")
        # Hand over an independent copy: the clustering step adds columns.
        df_cluster_news(date_df.copy(), target_clusters)

# Example usage with the dataframe
# process_by_date(df, target_clusters=(3, 7))

In [None]:
# Cluster every article, using the default target range for the number of clusters
process_by_date(df, target_clusters=(4, 8))

# A different target range can be requested instead, e.g.:
# process_by_date(df, target_clusters=(3, 6))