Nettoyer les Données

Installation des dépendances, téléchargement du modèle spaCy et exécution du script de nettoyage des données

In [2]:
# Installer les dépendances
!pip install pandas spacy fr_core_news_sm  # ou en_core_web_sm

# Télécharger le modèle spaCy
!python -m spacy download fr_core_news_sm

# Lancer le script
!python clean_data.py

[31mERROR: Could not find a version that satisfies the requirement fr_core_news_sm (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for fr_core_news_sm[0m[31m
[0mCollecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m94.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
python3: can't open file '/content/clean_dat

In [3]:
!pip install langdetect

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/981.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m153.6/981.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m972.8/981.5 kB[0m [31m15.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m13.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25l[?25hdone
  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993223 sha256=f53d8b32bac3bc4651057e005a298c4c6934a854c4dfb2eb57fc29477e36488f
  Stored in directory: /root/.cache/pip/wheels/0a/f2/b2/e5ca405801e05eb7c8ed5b3b4bcf1fcabcd62

In [8]:
!pip install pandas spacy langdetect tqdm
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m97.2 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Installation des dépendances, téléchargement du modèle spaCy et exécution du script de nettoyage des données

In [10]:
# clean_data.py
import pandas as pd
import re
import spacy
from typing import List, Dict
from unicodedata import normalize
from langdetect import detect
import warnings
from tqdm import tqdm  # Pour une barre de progression

# Configuration initiale
warnings.filterwarnings("ignore", category=UserWarning)
tqdm.pandas()  # Active la barre de progression pour pandas

# Initialiser spaCy avec une limite étendue
try:
    nlp = spacy.load("fr_core_news_sm")
    nlp.max_length = 2000000  # Double la limite de caractères
except OSError:
    print("⚠️ Modèle spaCy non trouvé. Veuillez l'installer avec :")
    print("python -m spacy download fr_core_news_sm")
    exit(1)

def is_chinese(text: str) -> bool:
    """Détecte si le texte est en chinois"""
    try:
        if not isinstance(text, str) or len(text.strip()) < 10:
            return False
        # On vérifie seulement les premiers 1000 caractères pour la détection de langue
        return detect(text[:1000]) == 'zh'
    except:
        return False

def normalize_text(text: str) -> str:
    """
    Nettoie et normalise le texte avec gestion des longs textes
    """
    if not isinstance(text, str) or is_chinese(text):
        return ""

    # Limite la taille du texte pour spaCy (500k caractères max)
    processing_text = text[:500000]

    # Normalisation Unicode
    processing_text = normalize("NFKD", processing_text).encode("ASCII", "ignore").decode("utf-8")

    # Nettoyage de base
    processing_text = re.sub(r"http\S+|@\w+|#\w+", "", processing_text)
    processing_text = re.sub(r"[^a-zA-Z0-9\séèêëàâäîïôöùûüç]", " ", processing_text)

    # Traitement par morceaux si le texte est trop long
    chunk_size = 100000
    text_chunks = [processing_text[i:i+chunk_size]
                  for i in range(0, len(processing_text), chunk_size)]

    final_tokens = []
    for chunk in text_chunks:
        doc = nlp(chunk.lower())
        final_tokens.extend([token.lemma_ for token in doc
                           if not token.is_stop and not token.is_punct])

    return " ".join(final_tokens).strip()

def preprocess_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """Applique le prétraitement de manière optimisée"""
    # 1. Nettoyage initial
    df = df.drop_duplicates(subset=["URL"])
    df = df.dropna(subset=["Contenu", "Titre"])
    print(f"📊 Après nettoyage initial : {len(df)} articles")

    # 2. Filtrage des articles en chinois
    print("🔍 Filtrage des articles en chinois...")
    df['is_chinese'] = df['Contenu'].progress_apply(is_chinese)
    chinese_count = df['is_chinese'].sum()
    print(f"🚮 {chinese_count} articles en chinois détectés et supprimés")
    df = df[~df['is_chinese']].copy()

    # 3. Normalisation du texte avec barre de progression
    text_columns = ["Titre", "Auteur", "Description", "Contenu"]
    for col in text_columns:
        if col in df.columns:
            print(f"🔄 Normalisation de la colonne {col}...")
            df[col] = df[col].progress_apply(normalize_text)

    # 4. Formatage des dates
    if "Date" in df.columns:
        df["Date"] = pd.to_datetime(df["Date"], errors="coerce").dt.strftime("%Y-%m-%d")

    # 5. Ajout d'ID
    df["ID"] = range(1, len(df) + 1)

    return df[["ID", "Titre", "Auteur", "Date", "Source", "URL", "Contenu"]]

if __name__ == "__main__":
    try:
        print("📂 Chargement des données...")
        df = pd.read_csv("/content/drive/MyDrive/merged_articles.csv")
        print(f"🔍 Données brutes chargées : {len(df)} articles")

        cleaned_df = preprocess_dataframe(df)

        print("💾 Sauvegarde des données nettoyées...")
        cleaned_df.to_csv("/content/drive/MyDrive/cleaned_articles.csv", index=False)
        print(f"✅ {len(cleaned_df)} articles sauvegardés")
        print("📊 Aperçu final :")
        print(cleaned_df.head(3).to_markdown(tablefmt="grid"))

    except Exception as e:
        print(f"❌ Erreur : {str(e)}")

📂 Chargement des données...
🔍 Données brutes chargées : 3755 articles
📊 Après nettoyage initial : 2480 articles
🔍 Filtrage des articles en chinois...


100%|██████████| 2480/2480 [00:13<00:00, 180.93it/s]


🚮 0 articles en chinois détectés et supprimés
🔄 Normalisation de la colonne Titre...


100%|██████████| 2480/2480 [00:42<00:00, 58.36it/s]


🔄 Normalisation de la colonne Auteur...


100%|██████████| 2480/2480 [00:32<00:00, 76.43it/s]


🔄 Normalisation de la colonne Description...


100%|██████████| 2480/2480 [00:45<00:00, 54.55it/s]


🔄 Normalisation de la colonne Contenu...


100%|██████████| 2480/2480 [06:41<00:00,  6.17it/s]


💾 Sauvegarde des données nettoyées...
✅ 2480 articles sauvegardés
📊 Aperçu final :
+----+------+----------------------------------------------+----------------------------------+------------+---------------+-------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

Construction du Système RAG

Installation des packages essentiels pour traitement de données et indexation vectorielle (FAISS CPU et GPU optionnel)

In [11]:
!pip install sentence-transformers faiss-cpu numpy pandas tqdm
# Pour GPU NVIDIA (optionnel) :
!pip install faiss-gpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.8 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_6

In [13]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/cleaned_articles.csv')
df

Unnamed: 0,ID,Titre,Auteur,Date,Source,URL,Contenu
0,1,mumps programming language,contributor to wikimedia project,2025-06-13,Wikipedia.org,https://en.wikipedia.org/wiki/MUMPS,mumps massachusetts general hospital utilit...
1,2,yet another insignificer programming note,,2025-06-23,Bitbucket.io,https://chua.bitbucket.io,avaliabl and \n last modified march 20...
2,3,multi stage programming with splice variable,,2025-06-27,Tsung-ju.org,https://tsung-ju.org/icfp25/,thi is an interactif demonstration of the icfp...
3,4,oxcaml set of extension to the ocaml progra...,,2025-06-13,Oxcaml.org,https://oxcaml.org/,it is both jan street production compiler we...
4,5,programming language design in the era of llm ...,kiran gopinathan,2025-06-17,Kirancodes.me,https://kirancodes.me/posts/log-lang-design-ll...,the most exciting part of programming language...
...,...,...,...,...,...,...,...
2475,2476,custom dialog closure waterflow nesting st...,share to x,2025-06-28,Dev.to,https://dev.to/qingkouwei/custom-dialog-closur...,privacy policy pop up is set the launch page w...
2476,2477,dialog and navigation conflict rich text s...,share to x,2025-06-28,Dev.to,https://dev.to/qingkouwei/dialog-and-navigatio...,when the app starts for the first tim privac...
2477,2478,state variable common page visible area ch...,share to x,2025-06-28,Dev.to,https://dev.to/qingkouwei/state-variables-comm...,wrapping the function in proper closure allows...
2478,2479,drag to adjust list order tab rebound cust...,share to x,2025-06-28,Dev.to,https://dev.to/qingkouwei/drag-to-adjust-list-...,refer to \n\n when the tab componer slide to t...


In [14]:
print(df.columns)

Index(['ID', 'Titre', 'Auteur', 'Date', 'Source', 'URL', 'Contenu'], dtype='object')


Système RAG complet : génération d’embeddings, indexation FAISS, recherche sémantique interactive et sauvegarde dans Google Drive

In [15]:
# rag_system.py
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm
import os

# Configuration des chemins Google Drive
DRIVE_PATH = "/content/drive/MyDrive/rag_project"
os.makedirs(DRIVE_PATH, exist_ok=True)

# 1. Charger les données nettoyées
INPUT_PATH = "/content/drive/MyDrive/cleaned_articles.csv"
print(f"📂 Chargement des données depuis {INPUT_PATH}...")
df = pd.read_csv(INPUT_PATH)

# Vérification des colonnes disponibles
print("📊 Colonnes disponibles:", df.columns.tolist())

# Utilisation des colonnes existantes
texts = df["Contenu"].fillna("").tolist()
metadata_columns = ["ID", "Titre", "URL"]  # Colonnes obligatoires
metadata = df[metadata_columns].to_dict('records')

# 2. Génération des embeddings
EMBEDDINGS_PATH = f"{DRIVE_PATH}/article_embeddings.npy"
print("🔧 Création des embeddings...")
model = SentenceTransformer('all-MiniLM-L6-v2')

# Génération par batch pour les grands datasets
embeddings = []
batch_size = 32 if len(texts) > 1000 else 64

for i in tqdm(range(0, len(texts), batch_size), desc="Embedding des articles"):
    batch = texts[i:i + batch_size]
    embeddings.append(model.encode(batch, show_progress_bar=False))

embeddings = np.vstack(embeddings)
np.save(EMBEDDINGS_PATH, embeddings)
print(f"💾 Embeddings sauvegardés dans {EMBEDDINGS_PATH}")

# 3. Création de l'index FAISS
INDEX_PATH = f"{DRIVE_PATH}/faiss_index.index"
dimension = embeddings.shape[1]
index = faiss.IndexFlatIP(dimension)
index.add(embeddings)
faiss.write_index(index, INDEX_PATH)
print(f"🏗️ Index FAISS sauvegardé dans {INDEX_PATH}")

# 4. Fonction de recherche améliorée
def search(query: str, top_k: int = 5, min_similarity: float = 0.5):
    """Recherche les articles les plus pertinents"""
    try:
        query_embedding = model.encode([query])
        distances, indices = index.search(query_embedding, top_k)

        results = []
        for idx, score in zip(indices[0], distances[0]):
            if idx >= 0 and score >= min_similarity:
                result = metadata[idx].copy()
                result["score"] = float(score)

                # Ajout d'un extrait du contenu (premières 100 caractères)
                result["extrait"] = texts[idx][:100] + "..." if len(texts[idx]) > 100 else texts[idx]
                results.append(result)

        return sorted(results, key=lambda x: x["score"], reverse=True)
    except Exception as e:
        print(f"❌ Erreur lors de la recherche: {str(e)}")
        return []

# 5. Test du système
print("\n🧪 Phase de test - Tapez 'exit' pour quitter")
while True:
    query = input("\n🔎 Entrez votre requête: ")
    if query.lower() == 'exit':
        break

    results = search(query)

    if not results:
        print("Aucun résultat trouvé. Essayez avec d'autres termes.")
        continue

    print(f"\n📚 Meilleurs résultats pour '{query}':")
    for i, res in enumerate(results, 1):
        print(f"\n{i}. {res['Titre']} (score: {res['score']:.2f})")
        print(f"   📝 Extrait: {res['extrait']}")
        print(f"   🔗 Lien: {res['URL']}")

print("\n✅ Système RAG prêt! Tous les fichiers sont sauvegardés dans Google Drive:")
print(f"- Index FAISS: {INDEX_PATH}")
print(f"- Embeddings: {EMBEDDINGS_PATH}")
print(f"- Données originales: {INPUT_PATH}")

📂 Chargement des données depuis /content/drive/MyDrive/cleaned_articles.csv...
📊 Colonnes disponibles: ['ID', 'Titre', 'Auteur', 'Date', 'Source', 'URL', 'Contenu']
🔧 Création des embeddings...


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Embedding des articles: 100%|██████████| 78/78 [06:22<00:00,  4.90s/it]


💾 Embeddings sauvegardés dans /content/drive/MyDrive/rag_project/article_embeddings.npy
🏗️ Index FAISS sauvegardé dans /content/drive/MyDrive/rag_project/faiss_index.index

🧪 Phase de test - Tapez 'exit' pour quitter

🔎 Entrez votre requête: exit

✅ Système RAG prêt! Tous les fichiers sont sauvegardés dans Google Drive:
- Index FAISS: /content/drive/MyDrive/rag_project/faiss_index.index
- Embeddings: /content/drive/MyDrive/rag_project/article_embeddings.npy
- Données originales: /content/drive/MyDrive/cleaned_articles.csv


Script de test du système RAG : chargement de l’index FAISS, recherche sémantique et affichage interactif des résultats

In [16]:
# rag_test.py
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
import pandas as pd

# Chemins vers les fichiers sauvegardés
DRIVE_PATH = "/content/drive/MyDrive/rag_project"
INDEX_PATH = f"{DRIVE_PATH}/faiss_index.index"
EMBEDDINGS_PATH = f"{DRIVE_PATH}/article_embeddings.npy"
DATA_PATH = "/content/drive/MyDrive/cleaned_articles.csv"

# Charger les ressources existantes
print("⚙️ Chargement des ressources...")
model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.read_index(INDEX_PATH)
df = pd.read_csv(DATA_PATH)
texts = df["Contenu"].fillna("").tolist()
metadata = df[["ID", "Titre", "URL"]].to_dict('records')

def search(query: str, top_k: int = 3):
    """Fonction de recherche optimisée"""
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)

    results = []
    for idx, score in zip(indices[0], distances[0]):
        if idx >= 0:
            result = metadata[idx].copy()
            result["score"] = float(score)
            result["extrait"] = texts[idx][:150] + "..." if len(texts[idx]) > 150 else texts[idx]
            results.append(result)

    return sorted(results, key=lambda x: x["score"], reverse=True)

# Interface de test
print("\n🔍 Testez votre système RAG (tapez 'exit' pour quitter)")
while True:
    query = input("\nEntrez votre requête : ")
    if query.lower() == 'exit':
        break

    results = search(query)

    if not results:
        print("Aucun résultat trouvé.")
        continue

    print(f"\n🔎 {len(results)} résultats pour '{query}':")
    for i, res in enumerate(results, 1):
        print(f"\n{i}. [{res['score']:.2f}] {res['Titre']}")
        print(f"   {res['extrait']}")
        print(f"   {res['URL']}")

print("\n✅ Test terminé")

⚙️ Chargement des ressources...

🔍 Testez votre système RAG (tapez 'exit' pour quitter)

Entrez votre requête : Python pour débutants

🔎 3 résultats pour 'Python pour débutants':

1. [0.36] mastering python in 2025   simple 6 step guide for beginners    dev community
   python is one of the most popular programming language in the worldand for good reason   it beginner friendly   incredibly versatile   and used in eve...
   https://dev.to/vishal_more_02990955c9358/mastering-python-in-2025-a-simple-6-step-guide-for-beginners-2m3d

2. [0.32] getting started with nexios asgi python framework    dev community
   in the ever evolving landscape of python web framework   developer ar constantly seeking solution that offer the perfect balancer of performance   sim...
   https://dev.to/techwithdunamix/getting-started-with-nexios-asgi-python-framework-1f7g

3. [0.31] what id do to get python backend job in 2025   if were starting today     dev community
   hey everyone reading thi 
 im pretty ne

In [18]:
!pip install langchain-community openai

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.10.1-py3-none-any.whl.metadata (3.4 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.1-py3-none-any.whl.metadata (9.4 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 k

In [22]:
!pip install langchain-community openai langchain-huggingface faiss-cpu

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.0-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.0-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.0


In [24]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [26]:
import os
print("Fichiers dans rag_project:", os.listdir("/content/drive/MyDrive/rag_project"))

Fichiers dans rag_project: ['article_embeddings.npy', 'faiss_index.index']


Démarrage asynchrone du serveur Ollama via un thread séparé

In [32]:
import subprocess
import threading

def run_ollama():
    subprocess.run(["ollama", "serve"], check=True)

# Démarrer dans un thread séparé
threading.Thread(target=run_ollama, daemon=True).start()

In [33]:
!curl http://localhost:11434
# Doit retourner "Ollama is running"

Ollama is running

In [34]:
!ollama pull mistral

[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h[?25l[A[1G[?25h[?2026l[?2026h

Chatbot interactif pour recherche d’articles avec embeddings SentenceTransformer et index FAISS

In [36]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

# Charger les ressources
DRIVE_PATH = "/content/drive/MyDrive/rag_project"
INDEX_PATH = f"{DRIVE_PATH}/faiss_index.index"
DATA_PATH = "/content/drive/MyDrive/cleaned_articles.csv"

print("⚙️ Chargement des ressources...")
model = SentenceTransformer('all-MiniLM-L6-v2')
index = faiss.read_index(INDEX_PATH)
df = pd.read_csv(DATA_PATH)
texts = df["Contenu"].fillna("").tolist()
metadata = df[["ID", "Titre", "URL"]].to_dict('records')

def search_articles(query: str, top_k: int = 5):
    """Recherche les articles les plus pertinents"""
    query_embedding = model.encode([query])
    distances, indices = index.search(query_embedding, top_k)
    results = []
    for idx, score in zip(indices[0], distances[0]):
        if idx >= 0:
            result = metadata[idx].copy()
            result["score"] = float(score)
            result["extrait"] = texts[idx][:200] + "..." if len(texts[idx]) > 200 else texts[idx]
            results.append(result)
    return sorted(results, key=lambda x: x["score"], reverse=True)

def main():
    print("\n=== CHATBOT D'ARTICLES ===")
    while True:
        question = input("\nPose ta question (ou 'exit'): ").strip()
        if question.lower() == 'exit':
            break
        articles = search_articles(question, top_k=5)
        if not articles:
            print("Aucun article trouvé.")
            continue
        print(f"\n{len(articles)} articles trouvés pour ta question :\n")
        for i, art in enumerate(articles, 1):
            print(f"{i}. {art['Titre']}")
            print(f"   🔗 Lien : {art['URL']}")
            print(f"   📝 Résumé : {art['extrait']}\n")

if __name__ == "__main__":
    main()

⚙️ Chargement des ressources...

=== CHATBOT D'ARTICLES ===

5 articles trouvés pour ta question :

1. mastering python in 2025   simple 6 step guide for beginners    dev community
   🔗 Lien : https://dev.to/vishal_more_02990955c9358/mastering-python-in-2025-a-simple-6-step-guide-for-beginners-2m3d
   📝 Résumé : python is one of the most popular programming language in the worldand for good reason   it beginner friendly   incredibly versatile   and used in everything from web developmer to   whether you r sta...

2. python isnt just survivingit thriving     dev community
   🔗 Lien : https://dev.to/jayesh_malviya_50f3081df5/python-isnt-just-surviving-its-thriving-2jij
   📝 Résumé : readability    faster learning 
 python clean   english like syntax mean you spend les tim debugging and more time building   comparer 
 python 
 def greet name  
     print f hello    name    
 v 
 j...

3. day 9 100   whil loops with real world example    dev community
   🔗 Lien : https://dev.to/therahul_gu

In [37]:
from google.colab import files
files.download('/content/drive/MyDrive/cleaned_articles.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>