
Dieses Notebook vektorisiert Reddit-Posts mit dem GTE-Multilingual-Base-Modell.

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import torch
from tqdm import tqdm
import pickle
import os
from datetime import datetime

In [None]:
# Select the compute device: CUDA when torch reports it as available,
# otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

# Load the GTE multilingual sentence-embedding model onto that device.
# NOTE(review): trust_remote_code=True is forwarded to the model loader;
# presumably this allows custom code shipped with the checkpoint to run --
# confirm the model source is trusted.
print("Lade GTE-Multilingual-Base Model...")
model = SentenceTransformer(
    'Alibaba-NLP/gte-multilingual-base',
    device=device,
    trust_remote_code=True,
)
print(
    "Model geladen. Embedding Dimension: "
    f"{model.get_sentence_embedding_dimension()}"
)

In [None]:
# Read the raw Reddit export and print a short overview: row count,
# column names and the first three rows.
print("Lade CSV Datei...")
df = pd.read_csv('data/Reddit_Data.csv')

print(f"Datensatz geladen: {df.shape[0]} Zeilen")
print(f"Spalten: {list(df.columns)}")
print("Erste 3 Zeilen:", df.head(3), sep="\n")

In [None]:
# Build the text that will be embedded.
print("Bereite Texte vor...")

# Join title and body with a single space. Missing values are treated as
# empty strings, and leading/trailing whitespace is trimmed afterwards.
df['combined_text'] = (
    df['title'].fillna('') + ' ' + df['text'].fillna('')
).str.strip()

# Keep only the rows whose combined text is not empty.
df = df.loc[df['combined_text'] != '']

print(f"Nach Bereinigung: {df.shape[0]} Texte")
print(
    "Durchschnittliche Textlänge: "
    f"{df['combined_text'].str.len().mean():.1f} Zeichen"
)

In [None]:
# Batching configuration for the encoding step.
BATCH_SIZE = 32  # tune to the available GPU memory
texts = df['combined_text'].tolist()

# Ceiling division: a partially filled last batch still counts as a batch.
total_batches = (len(texts) + BATCH_SIZE - 1) // BATCH_SIZE

print(
    f"Verarbeite {len(texts)} Texte in {total_batches} Batches "
    f"(Batch Size: {BATCH_SIZE})"
)
# NOTE(review): the estimate below is computed as 2-5 *minutes per batch*,
# which looks far too pessimistic for batches of 32 texts -- confirm the
# intended unit before relying on it.
print(
    f"Geschätzte Zeit: {total_batches * 2:.1f} - "
    f"{total_batches * 5:.1f} Minuten"
)

In [None]:
# Encode all prepared texts into embeddings, one batch at a time, and stack
# the per-batch results into a single 2-D array.
print("Starte Vektorisierung...")

# Fail early with a clear message: with no texts, np.vstack below would raise
# an opaque "need at least one array" ValueError, and the per-text timing
# would divide by zero.
if not texts:
    raise ValueError("Keine Texte zum Vektorisieren vorhanden.")

start_time = datetime.now()

all_embeddings = []

# enumerate(..., start=1) yields the 1-based batch number used for the
# periodic cache cleanup; `start` is the offset of the batch within `texts`.
for batch_no, start in enumerate(
    tqdm(range(0, len(texts), BATCH_SIZE), desc="Vektorisierung"),
    start=1,
):
    batch_texts = texts[start:start + BATCH_SIZE]

    # Encode the current batch; the result is returned as a NumPy array.
    batch_embeddings = model.encode(
        batch_texts,
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        convert_to_numpy=True,
    )

    all_embeddings.append(batch_embeddings)

    # Release cached GPU memory every 100 batches (only relevant on CUDA).
    if device == 'cuda' and batch_no % 100 == 0:
        torch.cuda.empty_cache()

# Stack the per-batch arrays row-wise into one array.
embeddings_array = np.vstack(all_embeddings)

end_time = datetime.now()
duration = end_time - start_time

print("\nVektorisierung abgeschlossen!")
print(f"Dauer: {duration}")
print(f"Embeddings Shape: {embeddings_array.shape}")
print(f"Durchschnittliche Zeit pro Text: {duration.total_seconds() / len(texts):.4f} Sekunden")

In [None]:
# Persist the embeddings, the run metadata and the per-post metadata.
print("Speichere Embeddings...")

# Use one variable per output path for both writing and reporting, so the
# names cannot drift apart (file names are case-sensitive on many
# filesystems, so 'reddit_...' and 'Reddit_...' are different files).
embeddings_path = 'data/Reddit_embeddings.npy'
run_metadata_path = 'data/embedding_metadata.pkl'
post_metadata_path = 'data/Reddit_metadata.csv'

# Save the embedding matrix as a NumPy array.
np.save(embeddings_path, embeddings_array)

# Save information about this embedding run.
metadata = {
    'model_name': 'Alibaba-NLP/gte-multilingual-base',
    'embedding_dimension': embeddings_array.shape[1],
    'num_texts': embeddings_array.shape[0],
    'processing_time': str(duration),
    'batch_size': BATCH_SIZE,
    'device_used': device,
}

with open(run_metadata_path, 'wb') as f:
    pickle.dump(metadata, f)

# Save the post columns needed to map an embedding row back to its post.
# The index is reset so that row i of the CSV is expected to line up with
# row i of the embedding matrix.
meta_columns = ['title', 'text', 'score', 'created']
# The similarity test at the end of the notebook prints the subreddit of each
# hit, so keep that column whenever the dataset provides it.
if 'subreddit' in df.columns:
    meta_columns.append('subreddit')
df_reduced = df[meta_columns].reset_index(drop=True)
df_reduced.to_csv(post_metadata_path, index=False)

print(f"Embeddings gespeichert als '{embeddings_path}'")
print(f"Metadaten gespeichert als '{run_metadata_path}'")
print(f"Post-Metadaten gespeichert als '{post_metadata_path}'")
print(f"\nDateigröße Embeddings: {os.path.getsize(embeddings_path) / 1024 / 1024:.1f} MB")

In [None]:
# Smoke test: reload the saved artifacts and list the posts most similar to
# an example query.
from sklearn.metrics.pairwise import cosine_similarity

print("\nTest: Lade Embeddings und finde ähnliche Posts...")

# Reload the embeddings and the post metadata from disk.
loaded_embeddings = np.load('data/Reddit_embeddings.npy')
loaded_metadata = pd.read_csv('data/Reddit_metadata.csv')

print(f"Geladene Embeddings Shape: {loaded_embeddings.shape}")

# Example query, embedded with the same model as the posts.
query = "Tesla stock price prediction"
query_embedding = model.encode([query])

# Cosine similarity between the query and every stored embedding.
similarities = cosine_similarity(query_embedding, loaded_embeddings)[0]

# Indices of the five highest similarities, best match first.
top_indices = np.argsort(similarities)[-5:][::-1]

# The metadata CSV does not necessarily contain a 'subreddit' column; only
# print it when present instead of failing with a KeyError.
has_subreddit = 'subreddit' in loaded_metadata.columns

print(f"\nTop 5 ähnlichste Posts zu '{query}':")
for rank, idx in enumerate(top_indices, start=1):
    row = loaded_metadata.iloc[idx]
    print(f"{rank}. Similarity: {similarities[idx]:.3f}")
    if has_subreddit:
        print(f"   Subreddit: {row['subreddit']}")
    # A post without a title comes back from the CSV as NaN (a float), which
    # cannot be sliced; treat it as an empty title.
    title = row['title']
    title = '' if pd.isna(title) else str(title)
    print(f"   Title: {title[:100]}...")
    print()