In [None]:
# %%
import os
import pickle
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import scipy.sparse as sp
from dotenv import load_dotenv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# NOTE(review): this silences every warning category for the whole kernel,
# which also hides deprecation notices from pandas / scikit-learn.
# Consider narrowing it to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

# Load optional overrides from a local .env file into the environment.
load_dotenv()

# Input/output locations: taken from the environment, with relative defaults.
TRAIN_PATH = os.getenv("TRAIN_PATH", "data/train.csv")
VAL_PATH = os.getenv("VAL_PATH", "data/val.csv")
TEST_PATH = os.getenv("TEST_PATH", "data/test.csv")
RETRIEVER_OUTPUT_DIR = os.getenv("RETRIEVER_OUTPUT_DIR", "data/retriever")

# Create the artifact directory up front so later cells can write into it.
Path(RETRIEVER_OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print("✓ Imports und Konfiguration geladen")
print(f"  Train: {TRAIN_PATH}")
print(f"  Val:   {VAL_PATH}")
print(f"  Test:  {TEST_PATH}")

# %%

In [None]:
# Load the train / validation / test splits from the configured CSV paths.
print("\n📖 Lade Daten-Splits...")

train_df = pd.read_csv(TRAIN_PATH)
val_df = pd.read_csv(VAL_PATH)
test_df = pd.read_csv(TEST_PATH)

print(f"  ✓ Train-Set:      {len(train_df)} Dokumente")
print(f"  ✓ Validation-Set: {len(val_df)} Dokumente")
print(f"  ✓ Test-Set:       {len(test_df)} Dokumente")

# Quick sanity overview: total row count and the columns of the training split.
total_docs = len(train_df) + len(val_df) + len(test_df)
print(f"\n  Gesamt: {total_docs} Dokumente")
print(f"  Spalten: {list(train_df.columns)}")

# %%

In [None]:
# Fit the TF-IDF vectorizer on the training split only, so that no vocabulary
# or document-frequency statistics come from validation/test data.
print("\n🔧 Trainiere TF-IDF Vectorizer...")

# Only the 'oz_bez' column is used as document text. Missing values become
# empty strings because TfidfVectorizer raises on NaN documents.
train_texts = train_df['oz_bez'].fillna('').astype(str)

# Character-level uni- and bi-grams, lower-cased and accent-stripped.
tfidf = TfidfVectorizer(
    max_features=10000,
    min_df=2,
    max_df=0.8,
    ngram_range=(1, 2),
    analyzer='char',
    strip_accents='unicode',
    lowercase=True
)

# Learn the vocabulary and transform the training texts in one pass.
tfidf_matrix_train = tfidf.fit_transform(train_texts)
print(f"  ✓ TF-IDF Vocabulary-Größe: {len(tfidf.get_feature_names_out())}")
print(f"  ✓ Training-Matrix Shape: {tfidf_matrix_train.shape}")

# Persist the fitted vectorizer so queries can be transformed later.
# NOTE: pickle files must only ever be loaded from trusted sources.
with open(f"{RETRIEVER_OUTPUT_DIR}/tfidf_vectorizer.pkl", "wb") as f:
    pickle.dump(tfidf, f)

print("  ✓ Vectorizer gespeichert")

# %%

In [None]:
# Transform the validation and test splits with the vectorizer fitted on train.
print("\n📊 Transformiere Validation & Test Sets...")

# Missing texts are mapped to empty strings; TfidfVectorizer raises on NaN.
tfidf_matrix_val = tfidf.transform(val_df['oz_bez'].fillna('').astype(str))
tfidf_matrix_test = tfidf.transform(test_df['oz_bez'].fillna('').astype(str))

print(f"  ✓ Validation-Matrix Shape: {tfidf_matrix_val.shape}")
print(f"  ✓ Test-Matrix Shape: {tfidf_matrix_test.shape}")

# Persist all three sparse matrices for later use.
# `sp` (scipy.sparse) is imported in the notebook's top import cell.
sp.save_npz(f"{RETRIEVER_OUTPUT_DIR}/tfidf_train.npz", tfidf_matrix_train)
sp.save_npz(f"{RETRIEVER_OUTPUT_DIR}/tfidf_val.npz", tfidf_matrix_val)
sp.save_npz(f"{RETRIEVER_OUTPUT_DIR}/tfidf_test.npz", tfidf_matrix_test)

# %%

In [None]:
# Retriever: find the top-k most similar documents for a query
def retrieve_documents(query_text, k=5, tfidf_vect=None, doc_matrix=None, doc_ids=None):
    """
    Return the ``k`` documents most similar to a query by cosine similarity.

    Args:
        query_text: Query string; vectorized via ``tfidf_vect.transform``.
        k: Maximum number of results. Values <= 0 yield an empty list; values
            larger than the number of documents return every document.
        tfidf_vect: Fitted vectorizer exposing ``transform`` (required).
        doc_matrix: Document matrix handed to ``cosine_similarity`` as the
            second argument (required).
        doc_ids: Optional positional lookup from row position to identifier.
            Objects with ``.iloc`` are indexed through it, anything else via
            ``doc_ids[position]``. When omitted, the row position itself is
            reported as ``doc_id``.

    Returns:
        List of dicts, ordered by descending score, each with the keys
        ``'index'`` (row position in ``doc_matrix``), ``'score'`` (cosine
        similarity to the query) and ``'doc_id'``.

    Raises:
        ValueError: If ``tfidf_vect`` or ``doc_matrix`` is not provided.
    """
    if tfidf_vect is None or doc_matrix is None:
        raise ValueError("retrieve_documents requires both tfidf_vect and doc_matrix")

    # Guard explicitly: with k == 0 the slice [-k:] below would select *all*
    # rows, and a negative k would select an arbitrary tail of the ranking.
    if k <= 0:
        return []

    query_vec = tfidf_vect.transform([query_text])
    similarities = cosine_similarity(query_vec, doc_matrix).flatten()

    # argsort is ascending: take the last k positions and reverse them so the
    # best match comes first. Slicing tolerates k > number of documents.
    top_k_indices = np.argsort(similarities)[-k:][::-1]

    use_iloc = hasattr(doc_ids, 'iloc')

    results = []
    for idx in top_k_indices:
        if doc_ids is None:
            doc_id = idx
        elif use_iloc:
            doc_id = doc_ids.iloc[idx]
        else:
            doc_id = doc_ids[idx]

        results.append({
            'index': idx,
            'score': similarities[idx],
            'doc_id': doc_id
        })

    return results

# %%

In [None]:
# Smoke test: run the retriever against the validation set.
print("\n🧪 Teste Retriever auf Validation-Set...")

# Coerce to str so the [:50] slice below also works for rows whose 'oz_bez'
# value is missing (NaN is a float and cannot be sliced).
val_texts = val_df['oz_bez'].fillna('').astype(str)

# Example queries: two truncated validation documents plus one free-text query.
test_queries = [
    val_texts.iloc[10][:50] if len(val_texts) > 10 else "Test",
    val_texts.iloc[50][:50] if len(val_texts) > 50 else "Test",
    "schraube bolzen"
]

# Positional ids 0..n-1 matching the rows of tfidf_matrix_val; built once
# instead of on every loop iteration.
val_doc_ids = val_df.reset_index(drop=True).index

for i, query in enumerate(test_queries, start=1):
    print(f"\n  Query {i}: '{query}'")
    results = retrieve_documents(
        query,
        k=3,
        tfidf_vect=tfidf,
        doc_matrix=tfidf_matrix_val,
        doc_ids=val_doc_ids
    )

    for j, result in enumerate(results, start=1):
        print(f"    {j}. Score: {result['score']:.4f} | Doc-ID: {result['doc_id']}")

print("\n✓ Retriever-Test abgeschlossen")

# %%

In [None]:
# Reranker (placeholder; could later be extended, e.g. with an LLM)
def rerank_documents(query_text, retrieved_docs, val_df, method='cosine'):
    """
    Re-rank previously retrieved documents.

    Current implementation is a placeholder: it only orders the given results
    by their existing ``'score'`` value, highest first. ``query_text``,
    ``val_df`` and ``method`` are accepted for a future, richer reranker but
    are not used yet.

    Args:
        query_text: Original query (currently unused).
        retrieved_docs: Iterable of result dicts, each carrying a ``'score'`` key.
        val_df: DataFrame holding the documents (currently unused).
        method: Name of the reranking method (currently unused).

    Returns:
        A new list with the documents sorted by descending score; the input
        is left unmodified and ties keep their incoming order.
    """
    reranked = list(retrieved_docs)
    reranked.sort(key=lambda doc: doc['score'], reverse=True)
    return reranked

# %%

In [None]:
# Evaluation metrics
print("\n📈 Evaluierungs-Metriken...")

def evaluate_retriever(queries, ground_truth_indices, retriever_func, k=5):
    """
    Evaluate a retriever with Mean Reciprocal Rank (MRR) at cut-off ``k``.

    Only MRR is computed here; NDCG is not implemented.

    Args:
        queries: Iterable of query strings.
        ground_truth_indices: Iterable with the expected document index for
            each query, aligned with ``queries``.
        retriever_func: Callable invoked as ``retriever_func(query, k=k)``
            that returns a ranked list of dicts carrying an ``'index'`` key.
        k: Number of results requested from the retriever per query.

    Returns:
        Dict with ``'MRR'`` (mean of the per-query reciprocal ranks, ``0.0``
        when there are no queries) and ``'scores'`` (the per-query values).

    Raises:
        ValueError: If ``queries`` and ``ground_truth_indices`` differ in length.
    """
    queries = list(queries)
    ground_truth_indices = list(ground_truth_indices)

    # zip() would silently drop the surplus entries and skew the metric.
    if len(queries) != len(ground_truth_indices):
        raise ValueError(
            f"queries ({len(queries)}) and ground_truth_indices "
            f"({len(ground_truth_indices)}) must have the same length"
        )

    mrr_scores = []
    for query, gt_idx in zip(queries, ground_truth_indices):
        results = retriever_func(query, k=k)

        # 1-based rank of the first hit, or None if the target was not retrieved.
        rank = next(
            (pos for pos, r in enumerate(results, start=1) if r['index'] == gt_idx),
            None,
        )
        mrr_scores.append(1 / rank if rank is not None else 0.0)

    # np.mean([]) would yield nan (plus a RuntimeWarning); report 0.0 instead.
    mean_mrr = float(np.mean(mrr_scores)) if mrr_scores else 0.0
    return {'MRR': mean_mrr, 'scores': mrr_scores}

print("✓ Evaluierungs-Funktionen vorbereitet")

# %%

In [None]:
# Final summary: list only the artifacts this notebook actually wrote to the
# output directory. The CSV splits are inputs and live at their own paths.
print("\n🎉 Retriever & Reranker Setup abgeschlossen!")
print(f"\n   Speicherort: {RETRIEVER_OUTPUT_DIR}")
print("   - tfidf_vectorizer.pkl")
print("   - tfidf_train.npz, tfidf_val.npz, tfidf_test.npz")
print(f"\n   Eingabe-Splits: {TRAIN_PATH}, {VAL_PATH}, {TEST_PATH}")