In [None]:
%pip install langchain llama-cpp-python chromadb unstructured pdfminer.six sentence-transformers transformers torch soundfile bark flask flask-cors

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


: 

In [1]:
from transformers import MarianMTModel, MarianTokenizer

model_path = "models/opus-mt-fr-en"

try:
    tokenizer_fr_en = MarianTokenizer.from_pretrained(model_path, local_files_only=True)
    model_fr_en = MarianMTModel.from_pretrained(model_path, local_files_only=True, from_tf=True)
    print("Modèle TensorFlow chargé avec succès !")
except Exception as e:
    print(f"Erreur lors du chargement du modèle : {e}")

All TF 2.0 model weights were used when initializing MarianMTModel.

Some weights of MarianMTModel were not initialized from the TF 2.0 model and are newly initialized: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Modèle TensorFlow chargé avec succès !


In [2]:
# Verification et Chargement du Modèle de Traduction en Local
from transformers import MarianMTModel, MarianTokenizer
import os

model_path = "models/opus-mt-fr-en"

if not os.path.exists(model_path):
    raise FileNotFoundError(
        f"Modèle de traduction non trouvé dans {model_path}. "
        f"Télécharge-le avec : `huggingface-cli download Helsinki-NLP/opus-mt-fr-en --local-dir {model_path}`."
    )

try:
    tokenizer_fr_en = MarianTokenizer.from_pretrained(model_path, local_files_only=True)

    try:
        print("Tentative de chargement en PyTorch...")
        model_fr_en = MarianMTModel.from_pretrained(model_path, local_files_only=True)
        print("Modèle PyTorch chargé avec succès !")

    except Exception as e:
        print(f"Échec du chargement en PyTorch : {e}")
        print("Basculement vers le chargement TensorFlow...")

        model_fr_en = MarianMTModel.from_pretrained(model_path, local_files_only=True, from_tf=True)
        print("Modèle TensorFlow chargé avec succès !")

except Exception as e:
    print(f"Erreur lors du chargement du modèle : {e}")

Tentative de chargement en PyTorch...
Modèle PyTorch chargé avec succès !


In [3]:
import os
import shutil
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

data_path = "data/"
lexique_path = os.path.join(data_path, "lexique.txt")
persist_dir = "models/chromadb_lexique"

if not os.path.exists(lexique_path):
    raise FileNotFoundError(f"Le fichier {lexique_path} est introuvable !")

if os.path.exists(persist_dir):
    shutil.rmtree(persist_dir)

os.makedirs(persist_dir, exist_ok=True)

print(f"Indexation du lexique dans ChromaDB (dossier: {persist_dir})...")

embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

loader = TextLoader(lexique_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
documents = text_splitter.split_documents(documents)

vectorstore_lexique = Chroma.from_documents(
    documents,
    embedding=embedding_model,
    persist_directory=persist_dir
)
vectorstore_lexique.persist()

print("Indexation terminée avec succès dans ChromaDB !")

Indexation du lexique dans ChromaDB (dossier: models/chromadb_lexique)...


  embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")


Indexation terminée avec succès dans ChromaDB !


  vectorstore_lexique.persist()


In [4]:
# interroge ChromaDB pour voir si un terme technique a une traduction spécifique
def retrieve_translation(term, top_k=1):
    results = vectorstore_lexique.similarity_search(term, k=top_k)
    
    if results:
        best_match = results[0].page_content.strip()

        if " = " in best_match:
            parts = best_match.split(" = ", 1) 
            if len(parts) == 2:
                src, tgt = parts[0].strip(), parts[1].strip()
                
                if src.lower() == term.lower():
                    return tgt
        
        print(f"Format invalide ou incohérence dans ChromaDB : '{best_match}'")
    
    return None

In [5]:
import torch

# Utilise ChromaDB pour les termes techniques
# Complète avec une traduction automatique pour le reste
def translate_text_rag(text, model, tokenizer, max_length=100):
    words = text.split()
    translated_words = []
    
    for word in words:
        rag_translation = retrieve_translation(word)
        if rag_translation:
            translated_words.append(rag_translation)
        else:
            inputs = tokenizer(word, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            with torch.no_grad():
                translated_ids = model.generate(**inputs)
            translated_word = tokenizer.batch_decode(translated_ids, skip_special_tokens=True)[0]
            translated_words.append(translated_word)
    
    return " ".join(translated_words)

In [None]:
# déploiement de l'API Flask
# L'API traduit une phrase, lit la phrase originale en français et sa traduction en anglais
import sys
sys.setrecursionlimit(3000)

from flask import Flask, request, jsonify
from flask_cors import CORS
import numpy as np
import soundfile as sf
from bark import generate_audio, preload_models
import os
import sys
from threading import Thread
import torch

app = Flask(__name__)
CORS(app, resources={r"/*": {"origins": "*"}}, supports_credentials=True)

port = 7448

original_torch_load = torch.load

def safe_load(*args, **kwargs):
    kwargs["weights_only"] = False
    return original_torch_load(*args, **kwargs)

torch.load = safe_load

bark_model_path = "models/bark"
os.makedirs(bark_model_path, exist_ok=True)
os.environ["BARK_CACHE_DIR"] = bark_model_path

torch.serialization.add_safe_globals(["numpy.core.multiarray.scalar"])

print("Téléchargement et chargement des modèles Bark...")
sys.stdout.flush()
preload_models()

def save_and_play_audio(audio_array, filename):
    sf.write(filename, audio_array, samplerate=24000)
    os.system(f"afplay {filename}")

@app.route("/translate", methods=["POST"])
def translate():
    user_input = request.json.get("text", "").strip()
    translation = translate_text_rag(user_input, model_fr_en, tokenizer_fr_en)
    print(f"Phrase originale : {user_input}")
    print(f"Traduction : {translation}")
    sys.stdout.flush()

    audio_fr = generate_audio(user_input, history_prompt="v2/fr_speaker_1")
    save_and_play_audio(audio_fr, "french.wav")

    audio_en = generate_audio(translation, history_prompt="v2/en_speaker_1")
    save_and_play_audio(audio_en, "english.wav")

    return jsonify({"original": user_input, "translated": translation})

print(f"Flask démarre sur le port {port}")
sys.stdout.flush()

def run():
    app.run(host="0.0.0.0", port=port, debug=True, use_reloader=False)

Thread(target=run).start()

Téléchargement et chargement des modèles Bark...


No GPU being used. Careful, inference might be very slow!


Flask démarre sur le port 7448


  WeightNorm.apply(module, name, dim)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:7448
 * Running on http://10.2.165.35:7448
[33mPress CTRL+C to quit[0m
127.0.0.1 - - [06/Mar/2025 12:04:14] "OPTIONS /translate HTTP/1.1" 200 -


Format invalide ou incohérence dans ChromaDB : 'descente de gradient = gradient descent
ensemble de données = dataset
analyse de données = data analysis
prétraitement des données = data preprocessing
vision par ordinateur = computer vision
traitement d'images = image processing
détection d'objets = object detection
reconnaissance faciale = facial recognition
synthèse vocale = speech synthesis
reconnaissance vocale = speech recognition
chatbot = chatbot
traitement du langage naturel = natural language processing'
Format invalide ou incohérence dans ChromaDB : 'apprentissage par renforcement = reinforcement learning
matrice de confusion = confusion matrix
classification binaire = binary classification
classification multi-classe = multi-class classification
détection d'anomalies = anomaly detection
mise en production = production deployment'
Format invalide ou incohérence dans ChromaDB : 'descente de gradient = gradient descent
ensemble de données = dataset
analyse de données = data anal

100%|██████████| 100/100 [00:10<00:00,  9.79it/s]
100%|██████████| 8/8 [00:35<00:00,  4.41s/it]
100%|██████████| 100/100 [00:09<00:00, 10.80it/s]
100%|██████████| 9/9 [00:34<00:00,  3.83s/it]
127.0.0.1 - - [06/Mar/2025 12:06:04] "POST /translate HTTP/1.1" 200 -


In [7]:
# Crée un fichier trad.html pour interagir avec l'API Flask
html_code = """
<!DOCTYPE html>
<html>
<head>
    <title>Traduction RAG</title>
    <script>
        async function sendMessage() {
            let userInput = document.getElementById("user_input").value;

            let response = await fetch("http://127.0.0.1:7448/translate", {
                method: "POST",
                headers: { "Content-Type": "application/json" },
                body: JSON.stringify({ text: userInput })
            });

            let data = await response.json();
            document.getElementById("translation").innerText = "Traduction : " + data.translated;
        }
    </script>
</head>
<body>
    <h2>Traduction RAG</h2>
    <input type="text" id="user_input" placeholder="Entrez une phrase en français...">
    <button onclick="sendMessage()">Traduire</button>
    <p id="translation"></p>
</body>
</html>
"""

with open("trad.html", "w") as f:
    f.write(html_code)

print("Interface web `trad.html` générée !")

Interface web `trad.html` générée !
