In [1]:
pip install ollama


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import ollama


In [3]:
import psycopg
from psycopg import Cursor
import os
import glob

In [16]:
data_dir = os.path.join(os.path.dirname(os.getcwd()), "data")

OLLAMA_MODEL = "nomic-embed-text"

db_connection_str = "postgresql://postgres:postgres@localhost:5433/chatbot"

print(f"‚úì Dossier de donn√©es : {data_dir}")
print(f"‚úì Mod√®le Ollama : {OLLAMA_MODEL}")
print(f"‚úì Configuration charg√©e")

‚úì Dossier de donn√©es : C:\Users\bahri\OneDrive\Bureau\chabot-old\Chatbot-RAG\data
‚úì Mod√®le Ollama : nomic-embed-text
‚úì Configuration charg√©e


In [17]:
try:
    models = ollama.list()
    
    # Extraire correctement tous les noms disponibles
    if isinstance(models, dict) and 'models' in models:
        model_names = [m.get('name') or m.get('model') for m in models['models']]
    elif isinstance(models, list):
        model_names = [m.get('name') or m.get('model') for m in models]
    else:
        model_names = []

    if OLLAMA_MODEL in model_names:
        print(f"‚úì Mod√®le {OLLAMA_MODEL} disponible")
    else:
        print(f"‚ö†Ô∏è  Mod√®le {OLLAMA_MODEL} non trouv√©")
        print("   Mod√®les disponibles :", ", ".join(model_names))

    # Test embeddings
    print("\nüß™ Test d'embedding...")
    test_response = ollama.embeddings(model=OLLAMA_MODEL, prompt="test")
    print(f"‚úì Ollama fonctionne correctement")

except Exception as e:
    print(f"‚ùå Erreur de connexion √† Ollama : {e}")

‚ö†Ô∏è  Mod√®le nomic-embed-text non trouv√©
   Mod√®les disponibles : 

üß™ Test d'embedding...
‚úì Ollama fonctionne correctement


In [18]:
def create_conversation_list(file_path: str) -> list[str]:
    """Lit le fichier avec le bon encodage et filtre les lignes"""
    try:
        with open(file_path, "r", encoding="utf-8") as file:
            text = file.read()
    except UnicodeDecodeError:
        print(f"‚ö†Ô∏è  Encodage UTF-8 invalide pour {file_path} ‚Äî r√©essayage avec cp1252")
        with open(file_path, "r", encoding="cp1252", errors="replace") as file:
            text = file.read()

    text_list = text.split("\n")
    filtered_list = [
        chaine.removeprefix("     ")
        for chaine in text_list
        if not chaine.startswith("<") and chaine.strip()
    ]
    print(f"‚úì {len(filtered_list)} lignes extraites")
    return filtered_list


def calculate_embeddings(corpus: str) -> list[float]:
    """Calcule les embeddings avec Ollama"""
    if not corpus or not corpus.strip():
        raise ValueError("Le corpus ne peut pas √™tre vide")
    
    # Appel √† Ollama pour g√©n√©rer l'embedding
    response = ollama.embeddings(
        model=OLLAMA_MODEL,
        prompt=corpus
    )
    return response["embedding"]


def save_embedding(corpus: str, embedding: list[float], cursor: Cursor) -> None:
    """Sauvegarde le corpus et son embedding"""
    cursor.execute(
        '''INSERT INTO embeddings (corpus, embedding) VALUES (%s, %s)''',
        (corpus, embedding)
    )


def similar_corpus(input_corpus: str, db_connection_str: str, top_k: int = 5) -> list[tuple]:
    """
    Recherche les textes similaires dans la base de donn√©es
    """
    query_embedding = calculate_embeddings(input_corpus)
    
    with psycopg.connect(db_connection_str) as conn:
        with conn.cursor() as cur:
            cur.execute("""
                SELECT id, corpus, 
                       1 - (embedding <=> %s::vector) as similarity
                FROM embeddings
                ORDER BY embedding <=> %s::vector
                LIMIT %s
            """, (query_embedding, query_embedding, top_k))
            
            return cur.fetchall()

print("‚úì Fonctions d√©finies")


‚úì Fonctions d√©finies


In [19]:
try:
    test_text = "Ceci est un test d'embedding"
    test_embedding = calculate_embeddings(test_text)
    print(f"‚úì Test r√©ussi !")
    print(f"  Texte : '{test_text}'")
    print(f"  Dimension de l'embedding : {len(test_embedding)}")
    print(f"  Premiers valeurs : {test_embedding[:5]}")
    
    # Stocker la dimension pour la cr√©ation de table
    EMBEDDING_DIM = len(test_embedding)
    print(f"\n‚úì Dimension d√©tect√©e : {EMBEDDING_DIM}")
except Exception as e:
    print(f"‚ùå Erreur lors du test : {e}")
    EMBEDDING_DIM = 768  # Valeur par d√©faut pour mistral

‚úì Test r√©ussi !
  Texte : 'Ceci est un test d'embedding'
  Dimension de l'embedding : 768
  Premiers valeurs : [0.6738312840461731, 0.7894376516342163, -3.0094993114471436, -1.5439304113388062, 1.3889840841293335]

‚úì Dimension d√©tect√©e : 768


In [23]:
print("=" * 70)
print("üöÄ CR√âATION DE LA BASE D'EMBEDDINGS")
print("=" * 70)

with psycopg.connect(db_connection_str) as conn:
    conn.autocommit = True
    with conn.cursor() as cur:
        # Supprimer l'index d'abord (pour √©viter les erreurs)
        cur.execute("""DROP INDEX IF EXISTS embeddings_embedding_idx""")
        print("‚úì Index existant supprim√©")
        
        # Supprimer la table si elle existe
        cur.execute("""DROP TABLE IF EXISTS embeddings CASCADE""")
        print("‚úì Table existante supprim√©e")
        
        # Cr√©er l'extension pgvector
        cur.execute("""CREATE EXTENSION IF NOT EXISTS vector""")
        print("‚úì Extension pgvector cr√©√©e")
        
        # Cr√©er la table avec la dimension d√©tect√©e
        cur.execute(f"""
            CREATE TABLE embeddings (
                id SERIAL PRIMARY KEY, 
                corpus TEXT,
                embedding VECTOR({EMBEDDING_DIM})
            )
        """)
        print(f"‚úì Table embeddings cr√©√©e avec VECTOR({EMBEDDING_DIM})")
        
        # Cr√©er un index pour acc√©l√©rer les recherches
        # Note: L'index sera cr√©√© apr√®s l'insertion des donn√©es pour de meilleures performances
        print("‚úì Table pr√™te (l'index sera cr√©√© apr√®s insertion)")

üöÄ CR√âATION DE LA BASE D'EMBEDDINGS
‚úì Index existant supprim√©
‚úì Table existante supprim√©e
‚úì Extension pgvector cr√©√©e
‚úì Table embeddings cr√©√©e avec VECTOR(768)
‚úì Table pr√™te (l'index sera cr√©√© apr√®s insertion)


In [24]:
print("\n" + "=" * 70)
print("üìÇ CHARGEMENT DES DONN√âES")
print("=" * 70)

text_files = sorted(glob.glob(os.path.join(data_dir, "*.txt")))

if not text_files:
    print(f"‚ö†Ô∏è  Aucun fichier .txt trouv√© dans le dossier {data_dir}")
else:
    print(f"‚úì {len(text_files)} fichier(s) trouv√©(s)")
    for i, file in enumerate(text_files[:5], 1):  # Afficher les 5 premiers
        print(f"  {i}. {os.path.basename(file)}")
    if len(text_files) > 5:
        print(f"  ... et {len(text_files) - 5} autre(s)")


üìÇ CHARGEMENT DES DONN√âES
‚úì 41 fichier(s) trouv√©(s)
  1. 017_00000012.txt
  2. 018_00000013.txt
  3. 019_00000014.txt
  4. 020_00000015.txt
  5. 022_00000017.txt
  ... et 36 autre(s)


In [15]:
print("\n" + "=" * 70)
print("‚öôÔ∏è  TRAITEMENT DES EMBEDDINGS")
print("=" * 70)

success_count = 0
error_count = 0

with psycopg.connect(db_connection_str) as conn:
    conn.autocommit = True
    with conn.cursor() as cur:
        total_files = len(text_files)
        
        for file_idx, file_path in enumerate(text_files, 1):
            print(f"\nüî∏ Traitement du fichier [{file_idx}/{total_files}] : {os.path.basename(file_path)}")
            corpus_list = create_conversation_list(file_path=file_path)

            for i, corpus in enumerate(corpus_list, 1):
                try:
                    embedding = calculate_embeddings(corpus)
                    save_embedding(corpus=corpus, embedding=embedding, cursor=cur)
                    success_count += 1
                
                    # Afficher un aper√ßu
                    preview = corpus[:50] + "..." if len(corpus) > 50 else corpus
                    print(f"‚úì [{i}/{len(corpus_list)}] {preview}")
                
                except Exception as e:
                    error_count += 1
                    print(f"‚úó [{i}/{len(corpus_list)}] ERREUR: {e}")


‚öôÔ∏è  TRAITEMENT DES EMBEDDINGS


OperationalError: connection failed: connection to server at "127.0.0.1", port 5433 failed: FATAL:  database "chatbot" does not exist
Multiple connection attempts failed. All failures were:
- host: 'localhost', port: '5433', hostaddr: '::1': connection timeout expired
- host: 'localhost', port: '5433', hostaddr: '127.0.0.1': connection failed: connection to server at "127.0.0.1", port 5433 failed: FATAL:  database "chatbot" does not exist

In [None]:
print("\n" + "=" * 70)
print("üìä R√âSUM√â")
print("=" * 70)
print(f"‚úì Succ√®s: {success_count}")
print(f"‚úó Erreurs: {error_count}")
print(f"üì¶ Total sauvegard√©: {success_count}")

# Cr√©er l'index maintenant que les donn√©es sont ins√©r√©es
if success_count > 0:
    print("\nüîß Cr√©ation de l'index de recherche...")
    with psycopg.connect(db_connection_str) as conn:
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute("""
                CREATE INDEX IF NOT EXISTS embeddings_embedding_idx 
                ON embeddings USING ivfflat (embedding vector_cosine_ops)
                WITH (lists = 100)
            """)
            print("‚úì Index de recherche cr√©√©")

In [None]:
if success_count > 0:
    print("\n" + "=" * 70)
    print("üîç TEST DE RECHERCHE")
    print("=" * 70)
    
    test_query = "stage anglais espagnol"
    print(f"Requ√™te: '{test_query}'")
    
    try:
        results = similar_corpus(test_query, db_connection_str, top_k=3)
        print(f"\nüìå Top 3 r√©sultats:")
        for doc_id, corpus, similarity in results:
            preview = corpus[:60] + "..." if len(corpus) > 60 else corpus
            print(f"  [Score: {similarity:.4f}] {preview}")
    except Exception as e:
        print(f"‚ùå Erreur: {e}")

In [None]:
def recherche_interactive(query: str, top_k: int = 5):
    """
    Fonction pour faire des recherches facilement dans le notebook
    """
    print(f"üîç Recherche: '{query}'")
    print("-" * 70)
    
    try:
        results = similar_corpus(query, db_connection_str, top_k=top_k)
        
        if not results:
            print("Aucun r√©sultat trouv√©")
            return
        
        for i, (doc_id, corpus, similarity) in enumerate(results, 1):
            print(f"\nüìÑ R√©sultat {i} (Score: {similarity:.4f})")
            print(f"   ID: {doc_id}")
            print(f"   Texte: {corpus[:100]}{'...' if len(corpus) > 100 else ''}")
            
    except Exception as e:
        print(f"‚ùå Erreur: {e}")

# Exemple d'utilisation :
# recherche_interactive("votre requ√™te ici", top_k=3)

print("‚úì Fonction de recherche interactive disponible")
print("  Usage: recherche_interactive('votre requ√™te', top_k=5)")