In [15]:
# ================================================
# IMPORTS
# ================================================
import os
import re
import json
import numpy as np
from dotenv import load_dotenv
from mistralai import Mistral
import faiss
import time
import random

# ================================================
# 0. Charger clé API
# ================================================
# Load variables from a local .env file (if any) into the process environment.
load_dotenv()
api_key = os.getenv("MISTRAL_API_KEY")
# Fail fast: without a key every later API call would fail, and the error
# would surface far from its actual cause.
if not api_key:
    raise RuntimeError(
        "MISTRAL_API_KEY introuvable : ajoutez-la dans le fichier .env "
        "ou dans l'environnement."
    )
# Shared client used by the embedding and chat calls below.
client = Mistral(api_key=api_key)

# ================================================
# 1. Fonctions utilitaires
# ================================================
def show_tree(root):
    """Return a plain-text, indented directory tree of ``root``.

    Directories listed in ``excluded_dirs`` are pruned (and not descended
    into). Files are hidden when their name is in ``excluded_files`` or when
    they end with one of its ``"*.ext"`` suffix patterns, which is the same
    rule ``iter_project_files`` applies.

    Args:
        root: Path of the folder to render.

    Returns:
        One line per directory (suffixed with ``/``) and per file, indented
        by four spaces per depth level and joined with newlines. Entries are
        sorted so the output is stable across platforms.
    """
    # "*.ext" entries are suffix patterns, not literal names; a plain
    # membership test would never match them.
    excluded_suffixes = tuple(
        name[1:] for name in excluded_files if name.startswith("*.")
    )

    lines = []
    for dirpath, dirnames, filenames in os.walk(root):
        # In-place assignment is what makes os.walk skip the pruned folders.
        dirnames[:] = sorted(d for d in dirnames if d not in excluded_dirs)
        visible = sorted(
            f for f in filenames
            if f not in excluded_files and not f.endswith(excluded_suffixes)
        )

        # Depth is derived from the relative path: string-replacing ``root``
        # miscounts when root ends with a separator or recurs in a sub-path.
        relative = os.path.relpath(dirpath, root)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1

        indent = " " * 4 * level
        # normpath drops a trailing separator, which would otherwise make
        # basename return an empty string for the root entry.
        name = os.path.basename(os.path.normpath(dirpath))
        lines.append(f"{indent}{name}/")

        subindent = " " * 4 * (level + 1)
        lines.extend(f"{subindent}{f}" for f in visible)
    return "\n".join(lines)



def read_code(file_path):
    """Return the source code contained in ``file_path``.

    For ``.ipynb`` files the JSON is parsed and only the ``code`` cells are
    kept, joined with newlines. Any other file is returned as-is.

    Args:
        file_path: Path to a source file or Jupyter notebook.

    Returns:
        The code as a single string.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a notebook is not valid JSON.
    """
    ext = os.path.splitext(file_path)[1].lower()

    # "utf-8-sig" transparently drops a leading BOM (which json.load rejects),
    # and errors="replace" keeps one badly encoded file (e.g. cp1252) from
    # aborting the whole run with UnicodeDecodeError.
    with open(file_path, "r", encoding="utf-8-sig", errors="replace") as f:
        if ext != ".ipynb":
            return f.read()
        nb = json.load(f)

    # A cell's "source" is joined without a separator; cells are then
    # separated by a newline.
    return "\n".join(
        "".join(cell.get("source", []))
        for cell in nb.get("cells", [])
        if cell.get("cell_type") == "code"
    )


def split_functions(code):
    """Split Python source into one chunk per function definition.

    A chunk starts at a ``def`` / ``async def`` line (at any indentation),
    includes single-line decorators directly above it, and runs until the
    next non-blank line that is not indented deeper than the ``def`` itself.
    Nested functions stay inside their enclosing chunk.

    The previous regex used ``re.DOTALL``, so ``.+`` ran across newlines and
    the first function swallowed the rest of the file into a single chunk.

    Args:
        code: Source text to split.

    Returns:
        A list of chunk strings, or ``[code]`` when no function definition is
        found (e.g. non-Python sources).
    """
    header = re.compile(r"^([ \t]*)(?:async[ \t]+)?def[ \t]+\w+[ \t]*\(")
    lines = code.splitlines()
    chunks = []
    consumed = 0  # index of the first line not yet part of any chunk
    i = 0

    while i < len(lines):
        match = header.match(lines[i])
        if match is None:
            i += 1
            continue

        base_indent = len(match.group(1))

        # Pull in decorators sitting directly above the definition.
        start = i
        while start > consumed:
            above = lines[start - 1]
            above_indent = len(above) - len(above.lstrip())
            if above_indent != base_indent or not above.lstrip().startswith("@"):
                break
            start -= 1

        # The body is every following line that is blank or indented deeper.
        end = i + 1
        while end < len(lines):
            text = lines[end]
            stripped = text.strip()
            if stripped:
                indent = len(text) - len(text.lstrip())
                # A dedented ")" closes a multi-line signature; it is still
                # part of the definition, not the start of new code.
                if indent <= base_indent and not stripped.startswith(")"):
                    break
            end += 1

        block = lines[start:end]
        # Blank lines after the body belong to the gap between definitions.
        while block and not block[-1].strip():
            block.pop()
        chunks.append("\n".join(block))

        consumed = end
        i = end

    return chunks or [code]


def gitignore(dossier_path):
    """Summarise the entries of the ``.gitignore`` found in ``dossier_path``.

    Blank lines and lines whose first character is ``#`` are skipped; the
    remaining lines are stripped of surrounding whitespace.

    Args:
        dossier_path: Folder expected to contain a ``.gitignore`` file.

    Returns:
        A header line followed by one entry per line, or a fixed message when
        the folder has no ``.gitignore``.
    """
    path = os.path.join(dossier_path, ".gitignore")
    if not os.path.exists(path):
        return "Aucun fichier .gitignore trouv√©."

    entries = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            # The comment test is made on the raw line, so an indented "#"
            # line is kept as an entry.
            if raw.startswith("#"):
                continue
            entry = raw.strip()
            if entry:
                entries.append(entry)

    header = "Fichiers ignor√©s par .gitignore :\n"
    return header + "\n".join(entries)


def extract_imports(code):
    """Collect the module names imported by ``code``.

    Handles ``import a``, ``import a as b``, ``import a, b`` and
    ``from a import b``. Dotted names are returned unchanged
    (``import os.path`` yields ``"os.path"``).

    Args:
        code: Python source text.

    Returns:
        A set of module name strings.
    """
    imports = set()
    for raw in code.splitlines():
        # Drop trailing comments so their words and commas are not mistaken
        # for module names.
        line = raw.split("#", 1)[0].strip()
        if line.startswith("import "):
            # "import a, b as c" declares several modules on one line; the
            # first token of each comma-separated part is the module name.
            for part in line[len("import "):].split(","):
                tokens = part.split()
                if tokens:
                    imports.add(tokens[0])
        elif line.startswith("from "):
            tokens = line.split()
            if len(tokens) > 1:
                imports.add(tokens[1])
    return imports


# ================================================
# 2. Embeddings + gestion des erreurs
# ================================================
def create_embeddings(chunks, batch_size=16, max_retries=5):
    """Embed ``chunks`` with the ``codestral-embed`` model, batch by batch.

    A request whose error text contains "429" is retried with exponential
    backoff plus jitter. Any other error is re-raised immediately.

    Args:
        chunks: List of strings to embed.
        batch_size: Number of chunks sent per API request.
        max_retries: Maximum number of attempts per batch.

    Returns:
        A list of ``float32`` NumPy vectors, one per chunk, in input order.

    Raises:
        RuntimeError: If a batch is still rate-limited after ``max_retries``
            attempts. Previously the batch was skipped silently, leaving
            fewer vectors than chunks.
    """
    embeddings = []

    for start in range(0, len(chunks), batch_size):
        batch = chunks[start:start + batch_size]
        last_error = None

        for attempt in range(max_retries):
            try:
                resp = client.embeddings.create(
                    model="codestral-embed",
                    inputs=batch
                )
            except Exception as e:
                # Rate limiting is recognised from the error text; anything
                # else propagates unchanged.
                if "429" not in str(e):
                    raise
                last_error = e
                # No point in waiting after the final attempt.
                if attempt + 1 < max_retries:
                    wait = 2 ** attempt + random.random()
                    print(f"Erreur 429‚Ä¶ Retry dans {wait:.2f}s")
                    time.sleep(wait)
            else:
                embeddings.extend(
                    np.array(emb.embedding, dtype=np.float32)
                    for emb in resp.data
                )
                break
        else:
            # Every attempt was rate-limited: fail loudly so the result can
            # never be shorter than ``chunks``.
            batch_number = start // batch_size + 1
            raise RuntimeError(
                f"Erreur 429 persistante : {max_retries} tentatives "
                f"infructueuses pour le lot {batch_number}."
            ) from last_error

    return embeddings


# ================================================
# 3. Pipe RAG : indexation + génération README
# ================================================

# 🔹 Liste des dossiers à exclure (Python, Node.js, Java, etc.)
# Directory names that are pruned while walking a project tree.
excluded_dirs = {
    # Python
    "venv",
    ".venv",
    "env",
    ".env",
    "__pycache__",
    "site-packages",
    # Node.js / JS
    "node_modules",
    "bower_components",
    ".npm",
    ".yarn",
    # Java / JVM
    "target",
    "build",
    "out",
    ".gradle",
    ".mvn",
    # C/C++ / Rust / Go / .NET
    "cmake-build-debug",
    "cmake-build-release",
    "bin",
    "obj",
    "pkg",
    "dist",
    "Debug",
    "Release",
    # IDE / SCM metadata
    ".git",
    ".svn",
    ".hg",
    ".idea",
    ".vscode",
}

# 🔹 Liste des fichiers à exclure (souvent générés automatiquement ou inutiles à l'analyse)
# File names skipped during analysis. Entries of the form "*.ext" are treated
# as suffix patterns by iter_project_files; all others are exact names.
excluded_files = {
    # SCM / VCS
    ".gitignore",
    ".gitattributes",
    ".gitmodules",
    ".hgignore",
    ".svnignore",
    # Config / lockfiles
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "poetry.lock",
    "Pipfile.lock",
    "Cargo.lock",
    "Gemfile.lock",
    # OS metadata and debug logs
    "Thumbs.db",
    "Desktop.ini",
    ".DS_Store",  # macOS
    "npm-debug.log",
    "yarn-error.log",
    # Environment files
    ".env",
    ".env.local",
    ".env.production",
    ".env.development",
    # Binaries / build artefacts
    "*.pyc",
    "*.pyo",
    "*.pyd",
    "*.class",
    "*.jar",
    "*.war",
    "*.ear",
    "*.dll",
    "*.so",
    "*.dylib",
    "*.exe",
    "*.out",
    "*.o",
    "*.obj",
    "*.a",
    "*.lib",
    # Archives
    "*.zip",
    "*.tar",
    "*.gz",
    "*.bz2",
    "*.rar",
    # Miscellaneous project files
    "README.md",
    "LICENSE",
    "COPYING",
    "CHANGELOG",
    "TODO",
    "Makefile",
}


def iter_project_files(folder_path, code_ext):
    """Yield the paths of the source files found under ``folder_path``.

    Folders named in ``excluded_dirs`` are not descended into. A file is
    skipped when its name is in ``excluded_files`` or ends with the suffix of
    one of its ``"*.ext"`` entries. Remaining files are yielded only when
    their lower-cased extension is in ``code_ext``.

    Args:
        folder_path: Root folder to walk.
        code_ext: Collection of accepted extensions, each with a leading dot.

    Yields:
        The path of each accepted file, joined onto the walked directory.
    """
    for current_dir, subdirs, names in os.walk(folder_path):
        # Slice assignment prunes the walk in place.
        subdirs[:] = [name for name in subdirs if name not in excluded_dirs]

        for name in names:
            if name in excluded_files:
                continue

            # "*.pyc" style entries exclude by suffix (".pyc").
            matches_pattern = any(
                entry.startswith("*.") and name.endswith(entry[1:])
                for entry in excluded_files
            )
            if matches_pattern:
                continue

            extension = os.path.splitext(name)[1].lower()
            if extension in code_ext:
                yield os.path.join(current_dir, name)



def generate_readme_RAG(folder_path, output_file, max_chunks=100):
    """Generate a README for ``folder_path`` and write it to ``output_file``.

    Pipeline: read the code files, split them into chunks, embed the chunks,
    index them with FAISS, retrieve the chunks closest to a README-oriented
    query, and ask the chat model to write the README from the directory tree
    and those chunks.

    Compared with the earlier version, the index is actually queried (it used
    to be built and then ignored), each chunk is sent as readable text
    labelled with its file path instead of a Python list repr, and a source
    comment no longer leaks into the prompt.

    Args:
        folder_path: Root folder of the project to document.
        output_file: Path of the README file to write (overwritten).
        max_chunks: Maximum number of chunks included in the prompt.

    Raises:
        ValueError: If no non-empty code file is found.
        RuntimeError: If the number of embeddings differs from the number of
            chunks.
    """
    # ---- 3.1 Read code files ----
    code_ext = {".py", ".js", ".ts", ".cpp", ".c", ".java", ".ipynb", ".php", ".html"}
    all_files = []
    all_codes = []

    for fp in iter_project_files(folder_path, code_ext):
        code = read_code(fp)
        if code.strip():
            all_files.append(fp)
            all_codes.append(code)

    if not all_codes:
        raise ValueError("Aucun fichier code trouv√© dans ce dossier !")

    # ---- 3.2 Chunking ----
    chunks = []
    chunk_paths = []  # chunk_paths[i] is the file chunks[i] came from

    for file_path, code in zip(all_files, all_codes):
        ch = split_functions(code)
        chunks.extend(ch)
        chunk_paths.extend([file_path] * len(ch))

    print(f"Total chunks : {len(chunks)}")

    # ---- 3.3 Embeddings ----
    embeddings = create_embeddings(chunks, batch_size=16)
    # Index positions are used below to look chunks up, so the two lists
    # must line up exactly.
    if len(embeddings) != len(chunks):
        raise RuntimeError(
            f"Embeddings incomplets : {len(embeddings)} vecteurs "
            f"pour {len(chunks)} chunks."
        )
    matrix = np.ascontiguousarray(embeddings, dtype=np.float32)

    # ---- 3.4 FAISS indexing ----
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)

    # ---- 3.5 Retrieval ----
    query = (
        "Project overview: purpose, main entry point, key functions, "
        "dependencies and how to run it"
    )
    query_vec = np.ascontiguousarray(create_embeddings([query]), dtype=np.float32)
    k = max(1, min(max_chunks, len(chunks)))
    _, neighbours = index.search(query_vec, k)
    # FAISS pads missing results with -1. Sorting restores source order so
    # chunks from the same file stay together in the prompt.
    selected = sorted(int(i) for i in neighbours[0] if i >= 0)

    context = "\n\n".join(
        f"### {os.path.relpath(chunk_paths[i], folder_path)}\n{chunks[i]}"
        for i in selected
    )

    # ---- 3.6 Directory tree ----
    tree = show_tree(folder_path)

    # ---- 3.7 Build the final RAG prompt ----
    prompt = f"""
Tu es un expert en analyse de code. 
Voici une liste de chunks extraits du projet.

Ton r√¥le :
- Reconstituer le sens du projet
- Faire un README.md professionnel
- Inclure : 
    # Titre du projet
    # Pr√©sentation g√©n√©rale
    # Arborescence du dossier
    # Biblioth√®ques n√©cessaires
    # Fonctionnement global
    # R√©sum√© fichier par fichier
    # Comment lancer le projet

Voici l'arborescence du dossier :
-----------------
{tree}
-----------------

Voici les chunks utiles (non ordonn√©s) :
-----------------
{context}
-----------------

G√©n√®re maintenant un README Markdown complet et propre.
"""

    response = client.chat.complete(
        model="codestral-2508",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2,
        max_tokens=2000
    )

    readme_text = response.choices[0].message.content

    # ---- 3.8 Save ----
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(readme_text)

    print("\n README g√©n√©r√© :", output_file)


# ================================================
# 4. DEMANDE UTILISATEUR
# ================================================
folder = input("Dossier √† analyser : ").strip()
# A mistyped path would otherwise only surface later, as the misleading
# "no code file found" error raised by generate_readme_RAG.
if not os.path.isdir(folder):
    raise NotADirectoryError(f"Dossier introuvable : {folder}")
output = os.path.join(folder, "README.md")

# The existing README is deliberately not deleted up front:
# generate_readme_RAG opens the output in "w" mode, which overwrites it, and
# deleting first would lose the old file whenever generation fails.
generate_readme_RAG(folder, output)



Total chunks : 6
Erreur 429‚Ä¶ Retry dans 1.61s
Erreur 429‚Ä¶ Retry dans 2.13s

 README g√©n√©r√© : D:\33611\Documents\MASTER\M2\Open_data\projet\streamlit-planes\README.md
