In [1]:
import os
import json
import boto3
import time

# === CONFIGURATION ===
FOLDER_PATH = "./notices_txt"
OUTPUT_FILE = "embeddings.json"
REGION = "us-east-1"
MODEL_ID = "amazon.titan-embed-text-v2:0"
DELAY = 0.5
LIMIT = 200  # ✅ Limite du nombre de fichiers à traiter

# === INIT CLIENT ===
client = boto3.client("bedrock-runtime", region_name=REGION)

# === FONCTION POUR VECTORISER UN TEXTE ===
def get_embedding(text):
    body = { "inputText": text }
    response = client.invoke_model(
        modelId=MODEL_ID,
        body=json.dumps(body),
        contentType="application/json"
    )
    response_body = json.loads(response['body'].read())
    return response_body['embedding']

# === TRAITEMENT LIMITÉ À 200 FICHIERS ===
embeddings = {}
file_count = 0

for filename in os.listdir(FOLDER_PATH):
    if file_count >= LIMIT:
        break

    if filename.endswith(".txt"):
        filepath = os.path.join(FOLDER_PATH, filename)
        with open(filepath, "r", encoding="utf-8") as file:
            content = file.read().strip()

            if not content:
                print(f"⚠️ Fichier vide ignoré : {filename}")
                continue

            try:
                print(f"🔄 Vectorisation : {filename}")
                vector = get_embedding(content)
                embeddings[filename] = vector
                file_count += 1
                time.sleep(DELAY)
            except Exception as e:
                print(f"❌ Erreur sur {filename} : {e}")

# === SAUVEGARDE ===
with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
    json.dump(embeddings, out_file)

print(f"✅ {file_count} fichiers vectorisés. Résultats dans {OUTPUT_FILE}")


🔄 Vectorisation : ABACAVIR VIATRIS 300 mg.txt
❌ Erreur sur ABACAVIR VIATRIS 300 mg.txt : An error occurred (ValidationException) when calling the InvokeModel operation: 400 Bad Request: Too many input tokens. Max input tokens: 8192, request input token count: 8693 
🔄 Vectorisation : ABACAVIR-LAMIVUDINE BIOGARAN 600 mg-300 mg.txt
❌ Erreur sur ABACAVIR-LAMIVUDINE BIOGARAN 600 mg-300 mg.txt : An error occurred (ValidationException) when calling the InvokeModel operation: 400 Bad Request: Too many input tokens. Max input tokens: 8192, request input token count: 9044 
🔄 Vectorisation : ABACAVIR-LAMIVUDINE SANDOZ 600 mg-300 mg.txt
❌ Erreur sur ABACAVIR-LAMIVUDINE SANDOZ 600 mg-300 mg.txt : An error occurred (ValidationException) when calling the InvokeModel operation: 400 Bad Request: Too many input tokens. Max input tokens: 8192, request input token count: 8757 
🔄 Vectorisation : ABACAVIR-LAMIVUDINE VIATRIS 600 mg-300 mg.txt
❌ Erreur sur ABACAVIR-LAMIVUDINE VIATRIS 600 mg-300 mg.txt : An er