In [None]:
### Data Cleaning

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import re

# Read the xz-compressed, tab-separated export and strip stray whitespace
# from every column name in one chained expression.
file_path = '58ecabf6-9618-407a-86a5-0d162c326ed8__2025_03_28T07_03_57.tsv.xz'
df = (
    pd.read_csv(file_path, sep='\t', compression='xz')
    .rename(columns=str.strip)
)


# Function to clean text by removing HTML tags, URLs, and extra spaces
def clean_text(text):
    """Strip markup noise from a single text value.

    Missing values (as detected by ``pd.isna``) become an empty string. Otherwise
    HTML tags, http(s)/www URLs and HTML entities are each replaced by a space,
    whitespace runs are collapsed to one space and the result is trimmed.
    """
    if pd.isna(text):
        return ''

    # Applied in this order: tags, then URLs, then entities.
    noise_patterns = (
        r'<[^>]+>',
        r'https?://\S+|www\.\S+',
        r'&[a-zA-Z0-9#]+;',
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, ' ', text)

    return re.sub(r'\s+', ' ', text).strip()


def remove_similar_rows(df, threshold=0.995):
    """Remove exact and near-duplicate articles from ``df``.

    Steps:
      1. Drop rows that share a ``content_id``.
      2. Clean the ``content`` column with ``clean_text``.
      3. Drop rows that share an identical ``head``.
      4. Compare the TF-IDF vectors of ``content`` pairwise; whenever a row's cosine
         similarity to an earlier, still-kept row exceeds ``threshold`` it is dropped,
         so exactly one representative of every near-duplicate group survives.

    Args:
        df: DataFrame with the columns ``content_id``, ``head`` and ``content``.
            The input frame is not modified.
        threshold: Cosine-similarity cutoff above which two rows count as duplicates.

    Returns:
        A new DataFrame without the duplicates and with a fresh 0..n-1 index.
    """
    df = df.drop_duplicates(subset=['content_id'])

    # assign() builds a new frame instead of writing into a possibly shared one.
    df = df.assign(content=df['content'].apply(clean_text))

    # Reset the index so that row positions in the similarity matrix below are
    # identical to the index labels of the frame.
    df = df.drop_duplicates(subset=["head"]).reset_index(drop=True)

    # Nothing to compare with fewer than two rows (and TF-IDF cannot be fitted on no rows).
    if len(df) < 2:
        return df

    # Keep the TF-IDF matrix sparse; cosine_similarity accepts sparse input directly.
    tfidf_matrix = TfidfVectorizer().fit_transform(df['content'])
    similarity = cosine_similarity(tfidf_matrix)

    # Greedy de-duplication: walk the rows in order and, for every row that is still
    # kept, discard all later rows that are too similar to it. Only the part of the
    # row right of the diagonal is inspected, so a pair never removes both members.
    keep = np.ones(len(df), dtype=bool)
    for i in range(len(df)):
        if not keep[i]:
            continue
        later_duplicates = np.flatnonzero(similarity[i, i + 1:] > threshold) + i + 1
        keep[later_duplicates] = False

    return df.loc[keep].reset_index(drop=True)

# Drop identical or nearly identical articles (cosine-similarity threshold 0.65)
df = remove_similar_rows(df, 0.65)

# Convert pubtime to a pandas datetime column
df['pubtime'] = pd.to_datetime(df['pubtime'])

# Save as Parquet file
df.to_parquet("bereinigte_daten.parquet", engine="pyarrow", index=False)

# Kept as the last expression of the cell so the notebook actually displays it
# (a bare expression in the middle of a cell is evaluated but never shown).
df.shape




In [None]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px
import torch

def df_plot_dbscan(df, target_clusters=(4, 6), max_eps_steps=40):
    """Embed articles with LeoLM, cluster them with DBSCAN and show a t-SNE scatter plot.

    Headline and content of every row are concatenated and embedded as the mean of
    the model's last hidden state. DBSCAN is run on these embeddings and ``eps`` is
    moved in steps of 0.025 until the number of clusters (noise label -1 excluded)
    lies inside ``target_clusters`` or the search budget is used up. The headlines of
    every cluster are printed and an interactive Plotly figure of all non-noise
    points is shown.

    Args:
        df: DataFrame with the columns 'head' and 'content'. It is not modified;
            all helper columns are added to an internal copy.
        target_clusters: Inclusive (min, max) range for the desired cluster count.
        max_eps_steps: Maximum number of eps adjustments. Bounds the search, which
            could otherwise oscillate forever or push eps to a non-positive value.

    Returns:
        None. Output is printed / displayed.
    """
    # Work on a copy so that a slice passed in by the caller is never written to.
    df = df.copy()
    if len(df) < 2:
        print("Need at least two articles to cluster; nothing to plot.")
        return

    # Starting DBSCAN parameters depend on the size of the DataFrame.
    if len(df) <= 400:
        eps = 0.88
        min_samples = 4
    else:
        eps = 0.85
        min_samples = 6

    # Combine headline and cleaned content; missing values count as empty text.
    df['combined_text'] = df['head'].fillna('') + ' ' + df['content'].fillna('')

    # Load the pretrained LeoLM model and tokenizer.
    # NOTE(review): the checkpoint name says 13b; running it per article on CPU is
    # presumably very slow — confirm this is intended.
    tokenizer = AutoTokenizer.from_pretrained('LeoLM/leo-hessianai-13b')
    model = AutoModel.from_pretrained('LeoLM/leo-hessianai-13b')

    # Turn every text into one embedding vector (mean over the token dimension).
    # Each call encodes a single sequence, so no padding is requested.
    pooled = []
    for text in df['combined_text'].tolist():
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=8000)
        with torch.no_grad():
            outputs = model(**inputs)
        pooled.append(outputs.last_hidden_state.mean(dim=1).squeeze(0))
    # One 2-D array (n_articles, hidden_size) for scikit-learn.
    embeddings = torch.stack(pooled).float().cpu().numpy()

    def run_dbscan(eps_value):
        """Return (labels, number of clusters without the noise label -1)."""
        labels = DBSCAN(eps=eps_value, min_samples=min_samples).fit_predict(embeddings)
        return labels, len({int(label) for label in labels if label != -1})

    low, high = target_clusters
    labels, num_clusters = run_dbscan(eps)

    # Too few clusters -> increase eps, too many -> decrease eps (bounded search).
    for _ in range(max_eps_steps):
        if low <= num_clusters <= high:
            break
        next_eps = eps + 0.025 if num_clusters < low else eps - 0.025
        if next_eps <= 0:
            # DBSCAN requires a strictly positive eps.
            break
        eps = next_eps
        labels, num_clusters = run_dbscan(eps)

    if not low <= num_clusters <= high:
        print(f"Warning: could not reach {low}-{high} clusters; "
              f"using eps={eps:.3f} with {num_clusters} clusters.")

    df['dbscan_cluster'] = labels

    # Reduce to two dimensions with t-SNE; perplexity must stay below n_samples.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df) - 1))
    tsne_results = tsne.fit_transform(embeddings)
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]

    # Interactive Plotly visualisation of all non-noise points.
    fig_dbscan = px.scatter(df[df['dbscan_cluster'] >= 0], x='tsne_x', y='tsne_y', color='dbscan_cluster', hover_data=['head'])

    # Print the headlines of every real cluster. Iterating over the actual labels
    # (instead of assuming a noise label always exists) never skips the last cluster.
    for cluster_id in sorted(label for label in df['dbscan_cluster'].unique() if label != -1):
        cluster_data = df[df['dbscan_cluster'] == cluster_id]
        print(f"Cluster {cluster_id} heads:")
        for title in cluster_data['head'].tolist():
            print(f"   {title}")

    # Show the DBSCAN figure.
    fig_dbscan.show()

# Beispielhafte Anwendung
# Angenommen, df ist Ihr DataFrame mit den Spalten 'head', 'content' und 'pubtime'
# unique_dates = df['pubtime'].dt.date.unique()
# for date in unique_dates:
#     subset_df = df[df['pubtime'].dt.date == date]
#     print(f"Topics of {date}:")
#     df_plot_dbscan(subset_df, target_clusters=(3, 6))


In [None]:
# Run the clustering and plotting routine on the complete cleaned DataFrame `df`.
df_plot_dbscan(df)

### Analyse von Dokumenten an Tagen mit geringerer Nachrichtenaktivität, z. B. an Sonntagen

In [None]:
# Author's note: 16 March has only 179 (presumably articles), other days around 600.
# NOTE(review): the filter below selects 2025-03-12, not 16 March — confirm which day is intended.
# .copy() yields an independent frame, so column assignments made on it later
# do not trigger pandas' SettingWithCopyWarning.
subset_df = df[df['pubtime'].dt.date == pd.to_datetime('2025-03-12').date()].copy()
subset_df.shape

In [None]:
import pandas as pd
from transformers import LongformerModel, LongformerTokenizer
import torch
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px

def df_plot_dbscan(df, eps=0.85, min_samples=6):
    """Embed articles with Longformer, cluster them with DBSCAN and plot a t-SNE map.

    Headline and content of every row are concatenated, truncated to 4096 tokens and
    embedded as the mean of Longformer's last hidden state. DBSCAN is run once with
    the given parameters, the headlines of every non-noise cluster are printed and
    an interactive Plotly scatter plot of the non-noise points is shown.

    Args:
        df: DataFrame with the columns 'head' and 'content'. It is not modified;
            helper columns are added to an internal copy.
        eps: DBSCAN neighbourhood radius.
        min_samples: DBSCAN minimum number of neighbours for a core point.

    Returns:
        None. Output is printed / displayed.
    """
    # Work on a copy so that a slice passed in by the caller is never written to.
    df = df.copy()
    if len(df) < 2:
        print("Need at least two articles to cluster; nothing to plot.")
        return

    # Combine headline and cleaned content; missing values count as empty text.
    df["combined_text"] = df["head"].fillna("") + " " + df["content"].fillna("")

    # Load the Longformer model and tokenizer.
    model_name = 'allenai/longformer-base-4096'
    tokenizer = LongformerTokenizer.from_pretrained(model_name)
    model = LongformerModel.from_pretrained(model_name)

    # Pick the device and move the model once instead of once per text.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    def get_longformer_embedding(text):
        """Return the mean-pooled last hidden state of ``text`` as a CPU tensor."""
        inputs = tokenizer(text, return_tensors='pt', truncation=True, max_length=4096, padding=True)
        inputs = {key: value.to(device) for key, value in inputs.items()}

        with torch.no_grad():
            outputs = model(**inputs)

        # Average over the token dimension; squeeze(0) drops the batch dimension of 1.
        return outputs.last_hidden_state.mean(dim=1).squeeze(0).cpu()

    # One 2-D array (n_articles, hidden_size) for scikit-learn.
    embeddings = torch.stack(
        [get_longformer_embedding(text) for text in df["combined_text"].tolist()]
    ).numpy()

    # DBSCAN clustering with the given parameters.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)
    df['dbscan_cluster'] = dbscan.fit_predict(embeddings)

    # Reduce to two dimensions with t-SNE; perplexity must stay below n_samples.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df) - 1))
    tsne_results = tsne.fit_transform(embeddings)
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]

    # Interactive Plotly visualisation of all non-noise points.
    fig_dbscan = px.scatter(df[df["dbscan_cluster"] >= 0], x='tsne_x', y='tsne_y', color='dbscan_cluster', hover_data=['head'])

    # Print the headlines of every cluster, skipping the noise label -1.
    for cluster_id in sorted(df['dbscan_cluster'].unique()):
        if cluster_id == -1:
            continue
        cluster_data = df[df['dbscan_cluster'] == cluster_id]
        print(f"Cluster {cluster_id} heads:")
        for title in cluster_data['head'].tolist():
            print(f"   {title}")

    # Show the DBSCAN figure.
    fig_dbscan.show()


In [None]:
# Cluster and plot only the single-day subset held in `subset_df`.
df_plot_dbscan(subset_df)

### Algorithmus: Tägliche Themenanalyse basierend auf Nachrichtenaktivität (DBSCAN-Cluster pro Tag)

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px

import re
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

def split_into_sentence_windows(text, window_size=3, overlap=1):
    """Split ``text`` into sentences and return overlapping windows of sentences.

    Sentences are separated at whitespace that follows '.', '!' or '?'. Consecutive
    windows start ``window_size - overlap`` sentences apart. Every sentence ends up
    in at least one window: if the regular stride would leave sentences at the end
    uncovered, a final (possibly shorter) window containing them is added.

    Args:
        text: The text to split.
        window_size: Number of sentences per window (>= 1).
        overlap: Number of sentences shared by neighbouring windows (< window_size).

    Returns:
        List of window strings; a text with fewer than ``window_size`` sentences
        yields a single window with the whole text.

    Raises:
        ValueError: If ``window_size`` < 1 or ``overlap`` >= ``window_size``
            (the stride would be zero or negative).
    """
    if window_size < 1 or overlap >= window_size:
        raise ValueError("window_size must be >= 1 and overlap must be smaller than window_size")

    sentences = re.split(r'(?<=[.!?])\s+', text)  # split after sentence punctuation
    step = window_size - overlap

    windows = []
    for start in range(0, len(sentences), step):
        windows.append(" ".join(sentences[start:start + window_size]))
        # Stop as soon as a window reaches the last sentence.
        if start + window_size >= len(sentences):
            break
    return windows

def is_outlier(row, means, stds, max_distance_factor):
    """Tell whether a point lies too far from the centre of its own cluster.

    A point is an outlier when, on the t-SNE x or y axis, its absolute distance to
    the cluster mean exceeds ``max_distance_factor`` times the cluster's standard
    deviation. Points whose cluster id is not present in ``means`` are never outliers.
    """
    label = row['dbscan_cluster']
    if label not in means.index:
        return False

    x_limit = max_distance_factor * stds.loc[label, 'tsne_x']
    y_limit = max_distance_factor * stds.loc[label, 'tsne_y']
    x_offset = abs(row['tsne_x'] - means.loc[label, 'tsne_x'])
    y_offset = abs(row['tsne_y'] - means.loc[label, 'tsne_y'])
    return x_offset > x_limit or y_offset > y_limit

def df_plot_dbscan(df, target_clusters=(4, 6), max_distance_factor=1.5, max_eps_steps=40):
    """Cluster articles via their sentence windows and plot one point per article.

    Every article (headline + content) is split into overlapping sentence windows,
    the windows are embedded with a SentenceTransformer and clustered with DBSCAN.
    An article receives the cluster label most of its windows got (majority vote).
    ``eps`` is moved in steps of 0.025 until the number of article-level clusters
    (noise label -1 excluded) lies inside ``target_clusters`` or the search budget is
    used up. For plotting, the window embeddings of each article are averaged and
    projected with t-SNE; noise articles and per-cluster outliers are removed.

    Args:
        df: DataFrame with the columns 'head' and 'content'. It is not modified.
        target_clusters: Inclusive (min, max) range for the desired cluster count.
        max_distance_factor: Outlier cutoff in cluster standard deviations,
            passed on to ``is_outlier``.
        max_eps_steps: Maximum number of eps adjustments; bounds the search, which
            could otherwise never terminate or reach a non-positive eps.

    Returns:
        None. Output is printed / displayed.
    """
    # Private copy with a clean 0..n-1 index: row position == index label, which
    # is what the window -> article aggregation below relies on.
    df = df.copy().reset_index(drop=True)
    if len(df) < 2:
        print("Need at least two articles to cluster; nothing to plot.")
        return

    if len(df) <= 400:
        eps = 0.05
        min_samples = 4
    else:
        eps = 0.04
        min_samples = 6

    # Missing headline/content counts as empty text.
    df["combined_text"] = df["head"].fillna("") + " " + df["content"].fillna("")

    # Overlapping sentence windows for every article.
    df["sentence_windows"] = df["combined_text"].apply(split_into_sentence_windows)

    # One entry per window; the index of the exploded Series still carries the
    # row position of the article each window came from.
    exploded = df["sentence_windows"].explode()
    article_pos = exploded.index.to_numpy()

    # Embeddings of the sentence windows.
    model = SentenceTransformer('all-MiniLM-L12-v2')
    embeddings = model.encode(exploded.tolist())

    def cluster_articles(eps_value):
        """Cluster the windows and return (article labels, article-level cluster count)."""
        window_labels = DBSCAN(eps=eps_value, min_samples=min_samples).fit_predict(embeddings)
        article_labels = (
            pd.Series(window_labels)
            .groupby(article_pos)
            .agg(lambda votes: votes.value_counts().idxmax())
        )
        return article_labels, article_labels[article_labels >= 0].nunique()

    low, high = target_clusters
    article_labels, num_clusters = cluster_articles(eps)

    # Too few clusters -> increase eps, too many -> decrease eps (bounded search).
    for _ in range(max_eps_steps):
        if low <= num_clusters <= high:
            break
        next_eps = eps + 0.025 if num_clusters < low else eps - 0.025
        if next_eps <= 0:
            # DBSCAN requires a strictly positive eps.
            break
        eps = next_eps
        article_labels, num_clusters = cluster_articles(eps)

    if not low <= num_clusters <= high:
        print(f"Warning: could not reach {low}-{high} clusters; "
              f"using eps={eps:.3f} with {num_clusters} clusters.")

    # article_labels is indexed by row position, which equals df's index labels.
    df["dbscan_cluster"] = article_labels

    # t-SNE on one vector per article (mean of its window embeddings) so that the
    # coordinates have the same length as df. Perplexity must stay below n_samples.
    article_embeddings = pd.DataFrame(embeddings).groupby(article_pos).mean().to_numpy()
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(df) - 1))
    tsne_results = tsne.fit_transform(article_embeddings)
    df['tsne_x'] = tsne_results[:, 0]
    df['tsne_y'] = tsne_results[:, 1]

    # Drop noise articles, then per-cluster outliers.
    filtered_df = df[df["dbscan_cluster"] >= 0]
    if filtered_df.empty:
        print("All articles were labelled as noise; nothing to plot.")
        return

    cluster_means = filtered_df.groupby("dbscan_cluster")[["tsne_x", "tsne_y"]].mean()
    cluster_stds = filtered_df.groupby("dbscan_cluster")[["tsne_x", "tsne_y"]].std()

    outlier_mask = filtered_df.apply(
        lambda row: is_outlier(row, cluster_means, cluster_stds, max_distance_factor),
        axis=1,
    ).astype(bool)
    filtered_df = filtered_df[~outlier_mask]

    # Scatter plot of the remaining articles.
    fig_dbscan = px.scatter(filtered_df, x='tsne_x', y='tsne_y', color='dbscan_cluster', hover_data=['head'])

    for cluster_id in sorted(filtered_df["dbscan_cluster"].unique()):
        cluster_data = filtered_df[filtered_df['dbscan_cluster'] == cluster_id]
        print(f"Cluster {cluster_id} heads:")
        for title in cluster_data['head'].tolist():
            print(f"   {title}")

    fig_dbscan.show()

# Run the topic clustering once per publication day, in order of first appearance.
# The calendar date of every row is computed once instead of once per iteration.
pub_dates = df['pubtime'].dt.date
unique_dates = pub_dates.unique()

for date in unique_dates:
    # .copy() hands the function an independent frame, so the columns it adds
    # do not trigger pandas' SettingWithCopyWarning on a filtered slice.
    subset_df = df[pub_dates == date].copy()
    print(f"Topics of {date}:")
    df_plot_dbscan(subset_df, target_clusters=(3, 6))

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE
import plotly.express as px
import groq
import re
import json
from collections import Counter
import nltk

import os

# Download the NLTK sentence-tokenizer data needed by nltk.sent_tokenize
nltk.download('punkt_tab')
nltk.download('punkt')

# Initialize Groq client.
# SECURITY: the API key must never be committed with the notebook. It is read from
# the GROQ_API_KEY environment variable; the key that used to be hard-coded here
# has to be treated as leaked and should be revoked.
_groq_api_key = os.environ.get("GROQ_API_KEY")
if not _groq_api_key:
    raise RuntimeError("GROQ_API_KEY is not set; export it before running this cell.")
client = groq.Groq(api_key=_groq_api_key)

def split_text_sentencewise(text, max_length=1000):
    """Group the sentences of ``text`` into chunks of at most ``max_length`` words.

    Sentences come from ``nltk.sent_tokenize`` and are never split; the length of a
    sentence is its whitespace-separated word count (a rough token estimate). A
    single sentence longer than ``max_length`` therefore forms a chunk of its own.
    """
    chunks = []
    pending = []        # sentences collected for the chunk being built
    pending_words = 0   # word count of `pending`

    for sentence in nltk.sent_tokenize(text):
        n_words = len(sentence.split())

        if pending_words + n_words <= max_length:
            # Still fits: keep growing the current chunk.
            pending.append(sentence)
            pending_words += n_words
            continue

        # Would overflow: close the current chunk (if any) and start a new one.
        if pending:
            chunks.append(' '.join(pending))
        pending = [sentence]
        pending_words = n_words

    # Flush whatever is left.
    if pending:
        chunks.append(' '.join(pending))

    return chunks

def parse_json_response(response):
    """Extract a JSON object from an LLM response string.

    The span from the first '{' to the last '}' is parsed as JSON, which tolerates
    extra text before or after the object.

    Args:
        response: Raw response text. Anything that is not a string (for example
            ``None``) is treated as "no JSON found".

    Returns:
        The parsed dict, or ``{}`` if there is no brace-delimited span, the span is
        not valid JSON, or the input is not a string.
    """
    if not isinstance(response, str):
        return {}

    match = re.search(r'\{.*\}', response, re.DOTALL)
    if match is None:
        return {}

    try:
        parsed = json.loads(match.group())
    except json.JSONDecodeError:
        return {}

    # Callers use .get() on the result, so only ever hand back a dict.
    return parsed if isinstance(parsed, dict) else {}

def get_wikipedia_suggestions(text):
    """Ask the Groq LLM for Wikipedia article titles that match ``text``.

    The text is split into sentence-wise chunks (to stay below the model's input
    limit) and one chat completion is requested per chunk. The prompt asks for exact
    Wikipedia titles, but the returned titles are NOT checked against Wikipedia here.

    Args:
        text: The text to find article titles for.

    Returns:
        List of title strings collected over all chunks (duplicates are kept).
        Chunks whose API call or parsing fails are reported on stdout and skipped.
    """
    # Split the text into chunks sentence-wise to avoid exceeding LLM input size limit
    chunks = split_text_sentencewise(text)

    all_titles = []

    for chunk in chunks:
        prompt = (
            "Gib 3-5 relevante Wikipedia-Artikel-Titel für diesen Text als JSON. "
            "Die Titel müssen echte Wikipedia-Artikel sein, d.h. sie müssen genau übereinstimmen. "
            "Format: {'titles': ['Artikel1', 'Artikel2']}. Nur exakte Artikelnamen, keine Vermutungen oder Platzhalter:\n\n"
            f"{chunk}"  # Each chunk of the text
        )

        try:
            completion = client.chat.completions.create(
                model="llama3-70b-8192",
                messages=[
                    {"role": "system", "content": "Du bist ein hilfreicher Assistent für Wikipedia-Recherche."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.5,
                max_tokens=200,
                response_format={"type": "json_object"}
            )
            response = completion.choices[0].message.content
            titles = parse_json_response(response).get('titles', [])

            # The model is free to return any JSON value under 'titles'. A bare string
            # is treated as one title (extending a list with a string would add its
            # characters one by one); anything that is not a list is ignored.
            if isinstance(titles, str):
                titles = [titles]
            if isinstance(titles, list):
                # Keep only strings so the titles stay hashable for later counting.
                all_titles.extend(title for title in titles if isinstance(title, str))
        except Exception as e:
            # Best effort: report the failed chunk and continue with the next one.
            print(f"API Fehler bei Chunk: {str(e)}")

    return all_titles

def get_common_suggestions(cluster_texts):
    """Summarise which Wikipedia titles recur across the texts of one cluster.

    Titles are collected for every text via ``get_wikipedia_suggestions`` and
    counted. A title is reported when it occurs at least
    ``max(2, len(cluster_texts) // 3)`` times; at most five titles are returned,
    most frequent first, one per line in the form ``"Title (Nx)"``.
    """
    suggestions = [
        title
        for text in cluster_texts
        for title in (get_wikipedia_suggestions(text) or [])
    ]
    if not suggestions:
        return "Keine relevanten Artikel gefunden"

    # Minimum number of mentions a title needs to count as "common".
    required = max(2, len(cluster_texts) // 3)

    lines = []
    for title, hits in Counter(suggestions).most_common():
        if hits >= required:
            lines.append(f"{title} ({hits}x)")

    if not lines:
        return "Keine konsistenten Artikel"
    return "\n".join(lines[:5])

import re
import pandas as pd
import plotly.express as px
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
from sklearn.manifold import TSNE

def split_into_sentence_windows(text, window_size=3, overlap=1):
    """Split ``text`` into sentences and return overlapping windows of sentences.

    Sentences are separated at whitespace that follows '.', '!' or '?'. Consecutive
    windows start ``window_size - overlap`` sentences apart. Every sentence ends up
    in at least one window: if the regular stride would leave sentences at the end
    uncovered, a final (possibly shorter) window containing them is added.

    Args:
        text: The text to split.
        window_size: Number of sentences per window (>= 1).
        overlap: Number of sentences shared by neighbouring windows (< window_size).

    Returns:
        List of window strings; a text with fewer than ``window_size`` sentences
        yields a single window with the whole text.

    Raises:
        ValueError: If ``window_size`` < 1 or ``overlap`` >= ``window_size``
            (the stride would be zero or negative).
    """
    if window_size < 1 or overlap >= window_size:
        raise ValueError("window_size must be >= 1 and overlap must be smaller than window_size")

    sentences = re.split(r'(?<=[.!?])\s+', text)  # split after sentence punctuation
    step = window_size - overlap

    windows = []
    for start in range(0, len(sentences), step):
        windows.append(" ".join(sentences[start:start + window_size]))
        # Stop as soon as a window reaches the last sentence.
        if start + window_size >= len(sentences):
            break
    return windows

def is_outlier(row, means, stds, max_distance_factor):
    """Check whether a point is an outlier within its DBSCAN cluster.

    The point's absolute t-SNE distance to the cluster mean is compared, per axis,
    with ``max_distance_factor`` times the cluster's standard deviation; exceeding
    the limit on either axis makes it an outlier. A cluster id that is missing from
    ``means`` always yields False.
    """
    cluster = row['dbscan_cluster']
    if cluster not in means.index:
        return False

    limit_x = max_distance_factor * stds.loc[cluster, 'tsne_x']
    limit_y = max_distance_factor * stds.loc[cluster, 'tsne_y']
    delta_x = abs(row['tsne_x'] - means.loc[cluster, 'tsne_x'])
    delta_y = abs(row['tsne_y'] - means.loc[cluster, 'tsne_y'])
    return delta_x > limit_x or delta_y > limit_y

def df_plot_dbscan(df, target_clusters=(4, 6), max_distance_factor=1.5, max_eps_steps=40):
    """Cluster sentence windows of the articles, plot them and suggest Wikipedia topics.

    Every article (headline + content) is split into overlapping sentence windows,
    the windows are embedded with a SentenceTransformer and clustered with DBSCAN.
    ``eps`` is moved in steps of 0.025 until the number of article-level clusters
    (majority vote over each article's windows, noise label -1 excluded) lies inside
    ``target_clusters`` or the search budget is used up. The windows are projected
    with t-SNE; noise windows and per-cluster outliers are removed, the remaining
    windows are plotted and, per cluster, common Wikipedia titles are printed.

    Args:
        df: DataFrame with the columns 'head' and 'content'. It is not modified.
        target_clusters: Inclusive (min, max) range for the desired cluster count.
        max_distance_factor: Outlier cutoff in cluster standard deviations,
            passed on to ``is_outlier``.
        max_eps_steps: Maximum number of eps adjustments; bounds the search, which
            could otherwise never terminate or reach a non-positive eps.

    Returns:
        None. Output is printed / displayed.
    """
    if df.empty:
        print("No articles to cluster.")
        return

    if len(df) <= 400:
        eps = 0.05
        min_samples = 4
    else:
        eps = 0.04
        min_samples = 6

    # Create combined text column on a private copy (avoids SettingWithCopyWarning).
    # Missing headline/content counts as empty text.
    df = df.copy()
    df["combined_text"] = df["head"].fillna("") + " " + df["content"].fillna("")

    # Create sentence windows for each entry
    df["sentence_windows"] = df["combined_text"].apply(split_into_sentence_windows)

    # One row per window. The originating article label is taken from the exploded
    # index directly instead of relying on reset_index() naming the column "index".
    exploded = df.explode("sentence_windows")
    article_ids = exploded.index.to_numpy()
    expanded_df = exploded.reset_index(drop=True)

    if len(expanded_df) < 2:
        print("Need at least two sentence windows to cluster; nothing to plot.")
        return

    # Calculate embeddings for the sentence windows
    model = SentenceTransformer('all-MiniLM-L12-v2')
    embeddings = model.encode(expanded_df['sentence_windows'].tolist())

    def cluster_windows(eps_value):
        """Return (window labels, per-article majority labels, article cluster count)."""
        window_labels = DBSCAN(eps=eps_value, min_samples=min_samples).fit_predict(embeddings)
        article_labels = (
            pd.Series(window_labels)
            .groupby(article_ids)
            .agg(lambda votes: votes.value_counts().idxmax())
        )
        return window_labels, article_labels, article_labels[article_labels >= 0].nunique()

    low, high = target_clusters
    window_labels, article_labels, num_clusters = cluster_windows(eps)

    # Adjust eps to reach target cluster range (bounded search):
    # too few clusters -> increase eps, too many -> decrease eps.
    for _ in range(max_eps_steps):
        if low <= num_clusters <= high:
            break
        next_eps = eps + 0.025 if num_clusters < low else eps - 0.025
        if next_eps <= 0:
            # DBSCAN requires a strictly positive eps.
            break
        eps = next_eps
        window_labels, article_labels, num_clusters = cluster_windows(eps)

    if not low <= num_clusters <= high:
        print(f"Warning: could not reach {low}-{high} clusters; "
              f"using eps={eps:.3f} with {num_clusters} clusters.")

    expanded_df['dbscan_cluster'] = window_labels
    # Majority label per article, aligned on the article's index label.
    df["dbscan_cluster"] = article_labels

    # TSNE for visualization; perplexity must stay below the number of windows.
    tsne = TSNE(n_components=2, random_state=42, perplexity=min(30, len(expanded_df) - 1))
    tsne_results = tsne.fit_transform(embeddings)
    expanded_df['tsne_x'] = tsne_results[:, 0]
    expanded_df['tsne_y'] = tsne_results[:, 1]

    # Filter out noise points (cluster = -1)
    filtered_df = expanded_df[expanded_df["dbscan_cluster"] >= 0]
    if filtered_df.empty:
        print("All sentence windows were labelled as noise; nothing to plot.")
        return

    # Calculate cluster statistics
    cluster_means = filtered_df.groupby("dbscan_cluster")[["tsne_x", "tsne_y"]].mean()
    cluster_stds = filtered_df.groupby("dbscan_cluster")[["tsne_x", "tsne_y"]].std()

    # Remove outliers
    outlier_mask = filtered_df.apply(
        lambda row: is_outlier(row, cluster_means, cluster_stds, max_distance_factor),
        axis=1,
    ).astype(bool)
    filtered_df = filtered_df[~outlier_mask]

    # Create scatter plot
    fig_dbscan = px.scatter(
        filtered_df,
        x='tsne_x',
        y='tsne_y',
        color='dbscan_cluster',
        hover_data=['sentence_windows']
    )
    fig_dbscan.show()

    # Print cluster information
    for cluster_id in sorted(filtered_df["dbscan_cluster"].unique()):
        cluster_data = filtered_df[filtered_df['dbscan_cluster'] == cluster_id]
        print(f"\nCluster {cluster_id} (Size: {len(cluster_data)})")

        # Get common Wikipedia articles.
        # NOTE: every sentence window is passed on as a separate text, so each
        # window results in at least one LLM request.
        suggestions = get_common_suggestions(
            cluster_data["sentence_windows"].tolist()
        )
        print("\nHäufigste Wikipedia-Artikel:")
        print(suggestions)
# Run the clustering once per publication day, in chronological order.
if 'pubtime' in df.columns:
    publication_days = df['pubtime'].dt.date
    for date in sorted(publication_days.unique()):
        date_df = df[publication_days == date]
        print(f"\n\n=== Themen für {date} ===")
        df_plot_dbscan(date_df)

