# In [None]:
# Notebook 3: Test RAG Pipeline
# Ce notebook teste le pipeline complet RAG

# Cellule 1: Imports
import sys
sys.path.append('../src')

from modules.ingestion import DocumentIngestion
from modules.chunking import TextChunker
from modules.embeddings import EmbeddingModel
from modules.retrieval import FAISSRetriever
from modules.generation import ResponseGenerator
from modules.config import config
from pathlib import Path

# Confirmation message printed once the imports above have run.
print("‚úÖ Tous les modules import√©s")

# Cell 2: initialize all the components
# Each component is built with its default constructor arguments and bound to
# a module-level name; the helper functions defined further down read these
# names directly (ingestion, chunker, retriever, generator).
print("üöÄ Initialisation du pipeline RAG...")

ingestion = DocumentIngestion()
chunker = TextChunker()
retriever = FAISSRetriever()
generator = ResponseGenerator()

print("‚úÖ Pipeline initialis√©")

# Cell 3: helper that runs the complete indexing pipeline on one document
def process_document_full_pipeline(file_path: Path):
    """
    Run one document through ingestion, chunking, embedding and indexing.

    Progress is printed after every stage. The function works on the
    module-level ``ingestion``, ``chunker`` and ``retriever`` instances.

    Args:
        file_path: Path of the document to process.

    Returns:
        A dict holding the ingestion result (``doc_info``), the chunks
        created from it (``chunks``) and the size of the index once the new
        vectors have been added (``num_vectors``).
    """
    print(f"\nüìÑ Traitement de: {file_path.name}")

    # Stage 1 - text extraction
    print("1Ô∏è‚É£ Extraction du texte...")
    document = ingestion.process_document(file_path)
    print(f"   ‚úÖ {document['num_characters']} caract√®res extraits")

    # Stage 2 - chunking
    print("2Ô∏è‚É£ Cr√©ation des chunks...")
    text, source_name = document['content'], document['filename']
    pieces = chunker.create_chunks_with_metadata(text, source_name)
    print(f"   ‚úÖ {len(pieces)} chunks cr√©√©s")

    # Stage 3 - embeddings, computed from the text of every chunk
    print("3Ô∏è‚É£ G√©n√©ration des embeddings...")
    texts = [piece['content'] for piece in pieces]
    vectors = retriever.embedding_model.encode(texts)
    print(f"   ‚úÖ Embeddings: {vectors.shape}")

    # Stage 4 - indexing
    print("4Ô∏è‚É£ Ajout √† l'index FAISS...")
    retriever.add_to_index(vectors, pieces)
    total_vectors = retriever.index.ntotal
    print(f"   ‚úÖ Index: {total_vectors} vecteurs")

    return dict(doc_info=document, chunks=pieces, num_vectors=total_vectors)

# Cell 4: process one or several documents
documents_dir = config.DOCUMENTS_DIR
pdf_files = list(documents_dir.glob("*.pdf"))
txt_files = list(documents_dir.glob("*.txt"))

# PDF files first, then plain-text files.
all_files = [*pdf_files, *txt_files]

if not all_files:
    print("‚ö†Ô∏è Aucun document trouv√© dans data/documents/")
else:
    print(f"\nüìö {len(all_files)} document(s) trouv√©(s)")

    # Only the first three files are processed to keep the test short.
    for file_path in all_files[:3]:
        try:
            result = process_document_full_pipeline(file_path)
            print(f"‚úÖ {file_path.name} trait√© avec succ√®s")
        except Exception as exc:
            # Report the failure and carry on with the next document.
            print(f"‚ùå Erreur avec {file_path.name}: {exc}")

    # Save the index once the loop is over.
    retriever.save_index()
    print(f"\nüíæ Index sauvegard√©")

# Cell 5: helper that asks the RAG system one question
def ask_question(question: str, top_k: int = 5, verbose: bool = True):
    """
    Answer a question by retrieving passages and generating a response.

    Args:
        question: The question to ask.
        top_k: Number of chunks requested from the retriever.
        verbose: When true, print the question, the first three retrieved
            passages and the generated answer.

    Returns:
        Whatever ``generator.generate_answer`` returns for the question and
        the retrieved chunks.
    """
    # Quiet mode: run both stages without printing anything.
    if not verbose:
        hits = retriever.search(question, top_k=top_k)
        return generator.generate_answer(question, hits)

    rule = "=" * 80
    print(f"\n‚ùì Question: {question}")
    print(rule)

    # Stage 1 - retrieval
    print("\nüîç Recherche des passages pertinents...")
    hits = retriever.search(question, top_k=top_k)
    print(f"‚úÖ {len(hits)} chunks r√©cup√©r√©s")
    print("\nüìÑ Top 3 passages:")
    for rank, hit in enumerate(hits[:3], start=1):
        print(f"\n{rank}. Score: {hit['score']:.4f}")
        print(f"   Document: {hit['document_name']}")
        print(f"   Extrait: {hit['content'][:200]}...")

    # Stage 2 - generation
    print("\nü§ñ G√©n√©ration de la r√©ponse...")
    result = generator.generate_answer(question, hits)
    print("\nüí° R√âPONSE:")
    print(rule)
    print(result['answer'])
    print(rule)
    print(f"\nüìö Sources: {result['context_used']} chunks utilis√©s")

    return result

# Cell 6: try the pipeline on a few sample questions
if not retriever.index or retriever.index.ntotal <= 0:
    # Nothing has been indexed, so there is nothing to query.
    print("‚ö†Ô∏è Aucun document index√©. Ex√©cutez d'abord la cellule 4.")
else:
    questions = [
        "Quel est le sujet principal du document ?",
        "Quels sont les points cl√©s abord√©s ?",
        "Y a-t-il des dates importantes mentionn√©es ?",
        "Qui sont les personnes cit√©es ?",
        "Quelles sont les conclusions ?"
    ]

    # Banner: blank line, rule, title, rule.
    print("\n" + "=" * 80, "üß™ TEST DU PIPELINE RAG", "=" * 80, sep="\n")

    # Only the first two questions are asked.
    for question in questions[:2]:
        response = ask_question(question)
        print(f"\n{'-' * 80}\n")

# Cell 7: relevance evaluation
def evaluate_retrieval(question: str, top_k: int = 10):
    """
    Print score statistics for a query and plot the score of every result.

    The search is run through the module-level ``retriever``. When it
    returns no result, a short notice is printed and the statistics and the
    plot are skipped.

    Args:
        question: Query sent to the retriever.
        top_k: Number of results requested from the retriever.

    Returns:
        The list of results returned by ``retriever.search``.
    """
    results = retriever.search(question, top_k=top_k)
    scores = [r['score'] for r in results]

    print(f"\nüìä Analyse pour: '{question}'")
    print(f"Nombre de r√©sultats: {len(results)}")

    # With no score, the mean would divide by zero and max()/min() would
    # raise ValueError on the empty list, so stop here.
    if not scores:
        print("Aucun score disponible pour cette question")
        return results

    print(f"Score moyen: {sum(scores)/len(scores):.4f}")
    print(f"Score max: {max(scores):.4f}")
    print(f"Score min: {min(scores):.4f}")

    # Plot the scores; matplotlib is imported only when a plot is drawn.
    import matplotlib.pyplot as plt

    plt.figure(figsize=(10, 5))
    plt.bar(range(len(scores)), scores)
    plt.xlabel("Rang du r√©sultat")
    plt.ylabel("Score de similarit√©")
    plt.title(f"Distribution des scores de pertinence\n'{question}'")
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()

    return results

# Cell 8: evaluation test
# Runs only when the retriever has an index holding at least one vector.
if retriever.index and retriever.index.ntotal > 0:
    test_query = "Quelles sont les informations principales ?"
    eval_results = evaluate_retrieval(test_query)

# Cell 9: comparison of several questions
def compare_queries(queries: list):
    """
    Build a table comparing retrieval scores across several queries.

    Each query is searched with ``top_k=5`` through the module-level
    ``retriever``.

    Args:
        queries: Query strings to compare.

    Returns:
        A pandas DataFrame with one row per query: the query text (cut to
        50 characters plus "..." when longer), the mean and best score
        formatted with four decimals ("N/A" when the search returned
        nothing), and the number of results.
    """
    import pandas as pd

    comparison = []

    for query in queries:
        results = retriever.search(query, top_k=5)
        scores = [r['score'] for r in results]

        # An empty result list has no mean or max: report "N/A" instead of
        # failing on a division by zero / max() of an empty sequence.
        if scores:
            avg_text = f"{sum(scores) / len(scores):.4f}"
            max_text = f"{max(scores):.4f}"
        else:
            avg_text = max_text = "N/A"

        # Add the ellipsis only when the query really was truncated.
        label = query if len(query) <= 50 else query[:50] + "..."

        comparison.append({
            'Question': label,
            'Score moyen': avg_text,
            'Score max': max_text,
            'Nb r√©sultats': len(results)
        })

    return pd.DataFrame(comparison)

# Comparison test
# Runs only when the retriever has an index holding at least one vector.
if retriever.index and retriever.index.ntotal > 0:
    test_queries = [
        "Quel est le contexte ?",
        "Quels sont les objectifs ?",
        "Quelles sont les m√©thodes utilis√©es ?",
        "Quels sont les r√©sultats obtenus ?"
    ]
    
    comparison_df = compare_queries(test_queries)
    print("\nüìä Comparaison des requ√™tes:")
    # NOTE(review): `display` is not defined or imported in this file;
    # presumably the IPython/Jupyter display helper - confirm.
    display(comparison_df)

# Cell 10: Export the results
import json
from datetime import datetime

def export_test_results(question: str, response: dict, output_dir: Path = None):
    """
    Write the outcome of one test to a timestamped JSON file.

    Args:
        question: The question that was asked.
        response: Mapping providing the 'answer', 'sources' and
            'context_used' keys.
        output_dir: Destination directory, as a path or a string. When
            omitted, ``config.DATA_DIR / "test_results"`` is used.

    Returns:
        The path of the JSON file that was written.

    Raises:
        KeyError: If ``response`` lacks one of the expected keys.
    """
    if output_dir:
        # Accept plain strings as well as Path objects.
        output_dir = Path(output_dir)
    else:
        output_dir = config.DATA_DIR / "test_results"

    # parents=True: the destination may sit under directories that do not
    # exist yet (e.g. a data directory never created on a fresh checkout).
    output_dir.mkdir(parents=True, exist_ok=True)

    # The file name has one-second resolution.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    output_file = output_dir / f"test_{timestamp}.json"

    test_data = {
        'timestamp': timestamp,
        'question': question,
        'answer': response['answer'],
        'sources': response['sources'],
        'context_used': response['context_used']
    }

    # ensure_ascii=False keeps accented characters readable in the file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(test_data, f, ensure_ascii=False, indent=2)

    print(f"‚úÖ R√©sultats export√©s: {output_file}")
    return output_file

# Export example
# `response` holds the answer to the LAST question asked in the test loop, so
# it must be exported with that loop's `question`; pairing it with
# `questions[0]` would store a question next to the answer to another one.
if 'response' in locals() and 'question' in locals():
    export_test_results(question, response)