# SnapLearn - Démo Pipeline Complet

Ce notebook démontre le pipeline complet de SnapLearn:
1. Traitement de documents (PDF/Image)
2. Recherche RAG dans la base de connaissance
3. Génération de script audio
4. Synthèse vocale
5. Quiz interactif

In [None]:
# Imports
import sys
sys.path.append('..')

from src.universal_document_processor import UniversalDocumentProcessor
from src.amu_knowledge_base import AMUKnowledgeBase
from src.audio_script_generator import AudioScriptGenerator
from src.audio_generator import AudioGenerator

import os
from dotenv import load_dotenv

# Load environment variables from a .env file so that later os.getenv()
# calls (e.g. for GEMINI_API_KEY) can see them.
load_dotenv()

# Fixed: the confirmation message was mis-encoded (mojibake).
print("✅ Imports réussis")

## 1. Test de la Base de Connaissance RAG

In [None]:
# Build the knowledge base from the corpus file shipped with the project.
kb = AMUKnowledgeBase('../data/amu_datascience_corpus.json')

# Smoke-test retrieval with a sample question, keeping the three best hits.
query = "Qu'est-ce qu'un transformer?"
results = kb.search(query, top_k=3)

print(f"Recherche: '{query}'\n")
if not results:
    # Make an empty result set visible instead of printing only the header.
    # NOTE(review): assumes search() returns a list-like of dicts - confirm.
    print("Aucun résultat trouvé")
for i, result in enumerate(results, 1):
    # Each hit is read as a mapping with 'score', 'metadata' and 'text' keys.
    # Fixed: the "Résultat" label was mis-encoded (mojibake).
    print(f"Résultat {i} (score: {result['score']:.2f})")
    print(f"Type: {result['metadata']['type']}")
    # Only the first 200 characters are shown to keep the output short.
    print(f"Texte: {result['text'][:200]}...")
    print("-" * 80)

## 2. Test du Traitement de Documents

In [None]:
# Create the document processor; the Gemini API key is read from the
# environment (os.getenv returns None when the variable is not set).
processor = UniversalDocumentProcessor(os.getenv('GEMINI_API_KEY'))

# To try the processor on a real file, uncomment the lines below and point
# test_file at one of your own documents.
# test_file = '../data/course_materials/m2/s2/week_1_intro.pdf'
# result = processor.process_document(test_file)

# print(f"Type: {result['type']}")
# if result['type'] == 'pdf':
#     print(f"Pages: {result['metadata']['total_pages']}")
#     print(f"Mots totaux: {result['total_words']}")

# Fixed: the warning message was mis-encoded (mojibake).
print("⚠️ Décommenter et fournir un fichier de test")

## 3. Test de Génération de Script

In [None]:
# Create the script generator from the Gemini API key and the knowledge base.
script_gen = AudioScriptGenerator(os.getenv('GEMINI_API_KEY'), kb)

# Hand-written stand-in for a processed document.
# NOTE(review): presumably mirrors the structure returned by
# processor.process_document() - confirm against that implementation.
# Fixed: the sample text was mis-encoded (mojibake), so garbled characters
# were being sent to the generator.
test_doc = {
    'type': 'pdf',
    'metadata': {'title': 'Introduction aux Transformers'},
    'pages': [
        {
            'text': 'Les transformers sont une architecture révolutionnaire en NLP...',
            'word_count': 500,
        },
    ],
    'total_words': 500,
    'enhanced_metadata': {'main_topic': 'transformers'},
}

# Generate the script for the simulated document.
script = script_gen.generate_script(test_doc, target_duration=180, style='conversational')

print("Script généré:")
# Read 'intro' defensively, like 'quiz_questions' below, so a script without
# an intro prints an empty preview instead of raising KeyError/TypeError.
print(f"\nIntro: {(script.get('intro') or '')[:150]}...")
quiz_questions = script.get('quiz_questions', [])
print(f"\nQuiz: {len(quiz_questions)} questions")
if quiz_questions:
    print(f"Question 1: {quiz_questions[0]['question']}")

## 4. Test de Génération Audio

In [None]:
# Create the audio generator.
audio_gen = AudioGenerator()

# Turn the previously generated script into audio.
audio_path = audio_gen.generate_podcast(script)

# Fixed: the confirmation message was mis-encoded (mojibake).
print(f"✅ Audio généré: {audio_path}")

# Embed an audio player only when IPython is available. The import used to be
# unconditional, so running this code outside IPython raised ImportError.
# display() is called explicitly because an expression inside try/else is not
# the cell's last expression and would not be rendered automatically.
try:
    from IPython.display import Audio, display
except ImportError:
    print("IPython indisponible: lecteur audio non affiché")
else:
    display(Audio(audio_path))

## 5. Test Complet du Pipeline

In [None]:
def test_complete_pipeline(file_path, *, doc_processor=None,
                           script_generator=None, audio_generator=None):
    """Run the whole pipeline on one document and return the audio path.

    The steps are: process the document, generate a script from it, generate
    audio from the script, then print the quiz questions found in the script.
    Progress is reported with print().

    Args:
        file_path: Path of the document handed to ``process_document``.
        doc_processor: Optional object exposing ``process_document(path)``.
            Defaults to the notebook-level ``processor``.
        script_generator: Optional object exposing
            ``generate_script(doc, target_duration=...)``. Defaults to the
            notebook-level ``script_gen``.
        audio_generator: Optional object exposing ``generate_podcast(script)``.
            Defaults to the notebook-level ``audio_gen``.

    Returns:
        Whatever ``generate_podcast`` returns (printed as the audio path).

    Raises:
        NameError: If an override is omitted and the matching notebook-level
            object has not been created yet.
        KeyError: If the processed document has no ``'type'`` key or a quiz
            entry has no ``'question'`` key.
    """
    # Fall back to the notebook globals; each global is only looked up when
    # the corresponding override is absent.
    doc_processor = processor if doc_processor is None else doc_processor
    script_generator = script_gen if script_generator is None else script_generator
    audio_generator = audio_gen if audio_generator is None else audio_generator

    # Fixed: every progress message below was mis-encoded (mojibake).
    print("🚀 Test du pipeline complet\n")

    # 1. Document processing
    print("1️⃣ Traitement du document...")
    doc_result = doc_processor.process_document(file_path)
    print(f"   ✅ Type: {doc_result['type']}\n")

    # 2. Script generation
    print("2️⃣ Génération du script...")
    script = script_generator.generate_script(doc_result, target_duration=180)
    print(f"   ✅ {len(script.get('main_content', ''))} caractères\n")

    # 3. Audio generation
    print("3️⃣ Génération de l'audio...")
    audio_path = audio_generator.generate_podcast(script)
    print(f"   ✅ {audio_path}\n")

    # 4. Quiz
    print("4️⃣ Quiz généré:")
    for i, q in enumerate(script.get('quiz_questions', []), 1):
        print(f"   Q{i}: {q['question']}")

    return audio_path


# Despite its name this is a demo helper with a required argument, not a
# pytest test; opt out of collection in case this code is run under pytest.
test_complete_pipeline.__test__ = False

# To run the pipeline on your own file, uncomment the call below and adjust
# the path.
# audio = test_complete_pipeline('../data/course_materials/m2/s2/week_1_intro.pdf')

# Fixed: the hint message was mis-encoded (mojibake).
print("⚠️ Fournir un fichier de test pour exécuter")