In [11]:
import os

# Folder whose entries are inspected below.
directory = '/home/sagemaker-user/shared/directive_extraction/directives_translated'

# Print the character count of every regular file; anything else is only reported.
for file in os.listdir(directory):
    full_path = os.path.join(directory, file)
    if not os.path.isfile(full_path):
        print(f"{file}: ce n'est pas un fichier")
        continue
    with open(full_path, 'r', encoding='utf-8') as f:
        content = f.read()
    print(f"{file}: {len(content)} caract√®res")

.ipynb_checkpoints: ce n'est pas un fichier
1.DIRECTIVE (UE) 20192161 DU PARLEMENT EUROP√âEN ET DU CONSEIL_en.json: 107766 caract√®res
2.H.R.1 - One Big Beautiful Bill Act_en.json: 938820 caract√®res
4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL_en.json: 654115 caract√®res
5.‰∏≠Âçé‰∫∫Ê∞ëÂÖ±ÂíåÂõΩËÉΩÊ∫êÊ≥ï__‰∏≠ÂõΩÊîøÂ∫úÁΩë_en.json: 88898 caract√®res
6.‰∫∫Â∑•Áü•ËÉΩÈñ¢ÈÄ£ÊäÄË°ì„ÅÆÁ†îÁ©∂ÈñãÁô∫Âèä„Å≥Ê¥ªÁî®„ÅÆÊé®ÈÄ≤„Å´Èñ¢„Åô„ÇãÊ≥ïÂæã_en.json: 20908 caract√®res
3.H.R.5376 - Inflation Reduction Act of 2022_en.json: 779679 caract√®res


In [None]:
from bs4 import BeautifulSoup

# Parse the local HTML file, then flatten it to plain text with the text
# fragments joined by newlines. The parser reads the handle while the file is open.
with open("directive.html", "r", encoding="utf-8") as f:
    soup = BeautifulSoup(f, "html.parser")
full_text = soup.get_text(separator="\n")


In [9]:
import os
import boto3
from bs4 import BeautifulSoup
import json

# --- Configuration ---
# `directory` holds the source documents; `translated_dir` receives the JSON output.
directory = '/home/sagemaker-user/shared/directive_extraction/directives'
translated_dir = '/home/sagemaker-user/shared/directive_extraction/directives_translated'
# Create the output folder up front; no error if it already exists.
os.makedirs(translated_dir, exist_ok=True)

# --- Amazon Translate client ---
translate = boto3.client('translate', region_name='us-west-2')

# --- Extract the text ---
def extract_text_from_file(file_path):
    """Return the text of a markup file, with text fragments joined by newlines.

    The file is read as UTF-8 and parsed with BeautifulSoup's ``html.parser``.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    return BeautifulSoup(markup, 'html.parser').get_text(separator="\n")

# --- Split the text by size in bytes ---
def split_text_by_bytes(text, max_bytes=9000):
    """Split ``text`` into chunks of at most ``max_bytes`` UTF-8 bytes.

    Lines (separated by "\\n") are packed greedily into a chunk. A single line
    that is larger than ``max_bytes`` is cut into pieces on character
    boundaries. Every chunk is stripped of surrounding whitespace, and chunks
    that are empty after stripping are dropped.

    Args:
        text: The text to split.
        max_bytes: Maximum encoded size of a chunk, in UTF-8 bytes.

    Returns:
        A list of non-empty strings, in document order.

    Raises:
        ValueError: If ``max_bytes`` is too small to hold one character of an
            oversized line (the slicing loop could otherwise never finish).
    """
    chunks = []

    def emit(piece):
        # Only keep chunks that still contain something once stripped.
        piece = piece.strip()
        if piece:
            chunks.append(piece)

    current_chunk = ""
    current_bytes = 0  # encoded size of current_chunk, tracked incrementally
    for para in text.split("\n"):
        para_bytes = len(para.encode('utf-8'))
        # +1 accounts for the newline appended after the line.
        if current_bytes + para_bytes + 1 <= max_bytes:
            current_chunk += para + "\n"
            current_bytes += para_bytes + 1
            continue

        emit(current_chunk)
        # A single line larger than max_bytes is cut into pieces.
        while para_bytes > max_bytes:
            # errors='ignore' drops a character cut in half at the slice end.
            slice_text = para.encode('utf-8')[:max_bytes].decode('utf-8', errors='ignore')
            if not slice_text:
                raise ValueError(
                    f"max_bytes={max_bytes} is too small to hold a single character"
                )
            emit(slice_text)
            para = para[len(slice_text):]
            para_bytes = len(para.encode('utf-8'))
        current_chunk = para + "\n"
        current_bytes = para_bytes + 1

    emit(current_chunk)
    return chunks

# --- Translate in several parts ---
def translate_text_in_chunks(text, source_lang='auto', target_lang='en'):
    """Translate ``text`` chunk by chunk with the module-level Amazon Translate client.

    The text is cut by ``split_text_by_bytes`` and each chunk is sent in its
    own ``translate_text`` request.

    Args:
        text: Text to translate.
        source_lang: Source language code passed as ``SourceLanguageCode``.
        target_lang: Target language code passed as ``TargetLanguageCode``.

    Returns:
        The list of translated chunks, in document order.
    """
    translated_chunks = []
    for chunk in split_text_by_bytes(text):
        # The TranslateText API requires at least one character of Text, so a
        # blank chunk would make the request fail and abort the whole document.
        if not chunk.strip():
            continue
        response = translate.translate_text(
            Text=chunk,
            SourceLanguageCode=source_lang,
            TargetLanguageCode=target_lang
        )
        translated_chunks.append(response['TranslatedText'])
    return translated_chunks

# --- Process every file ---
for file in os.listdir(directory):
    full_path = os.path.join(directory, file)
    if not os.path.isfile(full_path):
        print(f"{file} n'est pas un fichier")
        continue

    print(f"Traitement de {file}...")
    text = extract_text_from_file(full_path)
    translated_chunks = translate_text_in_chunks(text)

    # Save as JSON: the source file name plus the list of translated parts.
    output_file = os.path.join(translated_dir, f"{os.path.splitext(file)[0]}_en.json")
    with open(output_file, 'w', encoding='utf-8') as f_out:
        json.dump(
            {'original_file': file, 'translated_chunks': translated_chunks},
            f_out,
            ensure_ascii=False,
            indent=2,
        )

    print(f"{file} ‚Üí traduit et sauvegard√© dans {output_file}")


.ipynb_checkpoints n'est pas un fichier
Traitement de 3.H.R.5376 - Inflation Reduction Act of 2022.xml...
3.H.R.5376 - Inflation Reduction Act of 2022.xml ‚Üí traduit et sauvegard√© dans /home/sagemaker-user/shared/directive_extraction/directives_translated/3.H.R.5376 - Inflation Reduction Act of 2022_en.json
Traitement de 4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.html...
4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.html ‚Üí traduit et sauvegard√© dans /home/sagemaker-user/shared/directive_extraction/directives_translated/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL_en.json
Traitement de 5.‰∏≠Âçé‰∫∫Ê∞ëÂÖ±ÂíåÂõΩËÉΩÊ∫êÊ≥ï__‰∏≠ÂõΩÊîøÂ∫úÁΩë.html...
5.‰∏≠Âçé‰∫∫Ê∞ëÂÖ±ÂíåÂõΩËÉΩÊ∫êÊ≥ï__‰∏≠ÂõΩÊîøÂ∫úÁΩë.html ‚Üí traduit et sauvegard√© dans /home/sagemaker-user/shared/directive_extraction/directives_translated/5.‰∏≠Âçé‰∫∫Ê∞ëÂÖ±ÂíåÂõΩËÉΩÊ∫êÊ≥ï__‰∏≠ÂõΩÊîøÂ∫úÁΩë_en.json
Traitement de 6.‰∫∫Â∑•Áü•ËÉΩÈñ¢ÈÄ£ÊäÄË°ì„ÅÆÁ

In [95]:
# Instruction prompt prepended to the law text. The JSON schema below must use
# the key "summary": the merge step looks results up under exactly that key.
law_analysis_prompt = """
You are an expert legal and economic analyst. Your task is to carefully analyze the following law, directive, or regulation and summarize all its important measures. For each measure, extract and explain its impact on the following dimensions:

1. **Economic Impact**: Potential costs, fines, subsidies, incentives, compliance requirements, effect on revenues, margins, or valuations.  
2. **Geographic Scope**: Countries, regions, extraterritorial effects.  
3. **Political / Regulatory Impact**: Changes in governance, authority responsibilities, enforcement mechanisms.  
4. **Sector / Industry Impact**: Which industries, value chain segments, or business types are affected.  

Instructions:  
- For each measure, quote the **exact original text** from the law that justifies your analysis.  
- Use clear, structured language and categorize each measure under the dimensions above.  
- If a dimension is not applicable, write "N/A".  
- Provide a concise but thorough summary for decision-makers and analysts.  
- I want at least 5 measures of the text.

Output format (JSON example):

{
  "summary": {
    "overall_summary": "",
    "main_themes": [],
    "key_impacts": "",
    "recommendations": ""
  },
  "measures": [
    {
      "original_text": "Text quoted from the law with article number",
      "economic_impact": "...",
      "geography": "...",
      "political_impact": "...",
      "sector_industry_impact": "..."
    },
    {
      "original_text": "...",
      "economic_impact": "...",
      "geography": "...",
      "political_impact": "...",
      "sector_industry_impact": "..."
    }
  ]
}


Analyze the following law text and return your structured summary exactly in this JSON format and nothing more:
"""


In [83]:
import json
import boto3
from botocore.config import Config

# --- Bedrock configuration ---
# Explicit region and timeouts; the read timeout is 300 seconds.
config = Config(region_name="us-west-2", connect_timeout=30, read_timeout=300)
bedrock = boto3.client("bedrock-runtime", config=config)

# --- Load the JSON ---
# The code below reads the "translated_chunks" key of this file.
json_file = "/home/sagemaker-user/shared/directive_extraction/directives_translated/2.H.R.1 - One Big Beautiful Bill Act_en.json"
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)
# --- Build the prompt from the chunks ---
def build_prompt_from_chunks(chunks, law_analysis_prompt):
    """Append every chunk, numbered from 1, to the instruction prompt.

    Each chunk is added as a "Chunk <n>:" section, and a final sentence asking
    for a single JSON object closes the prompt.
    """
    sections = [f"\nChunk {idx}:\n{body}\n" for idx, body in enumerate(chunks, start=1)]
    closing = "\nReturn a single structured JSON object following the schema exactly."
    return law_analysis_prompt + "".join(sections) + closing

prompt = build_prompt_from_chunks(data["translated_chunks"], law_analysis_prompt)
# Alternative input kept for reference: prompt = law_analysis_prompt + "\n\n" + full_text

# --- Bedrock call ---
body = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 10000,
    "temperature": 0.1,
    "messages": [{"role": "user", "content": [{"type": "text", "text": prompt}]}],
}

response = bedrock.invoke_model(
    modelId="anthropic.claude-3-sonnet-20240229-v1:0",
    body=json.dumps(body),
    contentType="application/json",
)

# --- Retrieve the text returned by the model ---
response_body = json.loads(response["body"].read())
reviewed_translation = response_body["content"][0]["text"]

# --- Parse the reply; show the JSON when valid, the raw text otherwise ---
try:
    final_json = json.loads(reviewed_translation)
except json.JSONDecodeError:
    final_json = None
    print("Le mod√®le n'a pas renvoy√© un JSON valide. Voici le texte brut :")
    print(reviewed_translation)
else:
    if final_json:
        print(json.dumps(final_json, indent=2, ensure_ascii=False))


{
  "summary": {
    "overall_summary": "This law establishes the basic principles, policies, and institutional framework for promoting research, development, and utilization of artificial intelligence (AI) technology in Japan. It aims to contribute to improving people's lives and the healthy development of the national economy by comprehensively and systematically promoting measures related to AI technology.",
    "main_themes": [
      "Basic principles and philosophy for AI technology research, development, and utilization",
      "Responsibilities of the government, local public bodies, research institutions, businesses, and citizens",
      "Promotion of research, development, education, international cooperation, and infrastructure sharing",
      "Establishment of the Artificial Intelligence Strategy Headquarters and the Artificial Intelligence Basic Plan"
    ],
    "key_impacts": "The law sets the foundation for a coordinated national strategy to advance AI technology, involvi

In [88]:
import json

# Load one translated document and print size statistics for its chunks.
json_file = "/home/sagemaker-user/shared/directive_extraction/directives_translated/2.H.R.1 - One Big Beautiful Bill Act_en.json"
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

chunks = data["translated_chunks"]

print(f"Nombre total de chunks : {len(chunks)}")
# Character count of each chunk, in order.
lengths = list(map(len, chunks))
print(f"Taille moyenne des chunks : {sum(lengths)//len(lengths)} caract√®res")
print(f"Taille max : {max(lengths)}, Taille min : {min(lengths)}")
print(f"Somme totale : {sum(lengths)} caract√®res")


Nombre total de chunks : 105
Taille moyenne des chunks : 8638 caract√®res
Taille max : 8974, Taille min : 1979
Somme totale : 907043 caract√®res


In [99]:
import os
import json
import boto3
from botocore.config import Config
import time

# --- Bedrock configuration ---
# Explicit region and timeouts; the read timeout is 300 seconds.
config = Config(region_name="us-west-2", connect_timeout=30, read_timeout=300)
bedrock = boto3.client("bedrock-runtime", config=config)

# --- Parameters ---
# `name` is used both for the input file and for the output file.
name = "4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL_en.json"
json_file = os.path.join("/home/sagemaker-user/shared/directive_extraction/directives_translated", name)
max_chars_per_batch = 200000  # Target size per batch, in characters
output_file = os.path.join("/home/sagemaker-user/shared/directive_extraction/summary", name)

# --- Load the JSON ---
with open(json_file, "r", encoding="utf-8") as f:
    data = json.load(f)

chunks = data["translated_chunks"]

# Size statistics of the loaded chunks (character counts).
print(f"\nüì¶ Nombre de chunks d'origine : {len(chunks)}")
lengths = [len(c) for c in chunks]
# NOTE(review): an empty `chunks` list makes the next line fail (division by zero, max/min of an empty list).
print(f"Taille moyenne : {sum(lengths)//len(lengths)} | Max : {max(lengths)} | Min : {min(lengths)}")
print(f"Somme totale du texte : {sum(lengths)} caract√®res")
# --- Group the chunks into batches of at most max_chars characters ---
def group_chunks(chunks, max_chars):
    """Join consecutive chunks with newlines into batches of bounded length.

    Chunks are added to the current batch until the next one would push the
    joined text past ``max_chars``. The "\\n" separators are included in that
    count, so a returned batch only exceeds ``max_chars`` when a single chunk
    is itself longer than the limit (such a chunk forms a batch on its own).

    Args:
        chunks: Iterable of strings, in document order.
        max_chars: Maximum length of a joined batch, in characters.

    Returns:
        A list of batch strings; empty when ``chunks`` is empty.
    """
    grouped = []
    current_group = []
    current_len = 0  # length of "\n".join(current_group)
    for chunk in chunks:
        # A separator is only inserted between chunks, not before the first one.
        added = len(chunk) + (1 if current_group else 0)
        if current_group and current_len + added > max_chars:
            grouped.append("\n".join(current_group))
            current_group = [chunk]
            current_len = len(chunk)
        else:
            current_group.append(chunk)
            current_len += added
    if current_group:
        grouped.append("\n".join(current_group))
    return grouped

# Group the loaded chunks into batches bounded by max_chars_per_batch.
batches = group_chunks(chunks, max_chars_per_batch)

print(f"üß© Nombre de lots apr√®s regroupement : {len(batches)}")

# --- Build the prompt ---
def build_prompt(batch_text, law_analysis_prompt):
    """Return the instruction prompt, the batch text and a closing reminder, separated by blank lines."""
    reminder = "Return a single JSON following the schema exactly."
    parts = (law_analysis_prompt, batch_text, reminder)
    return "\n\n".join(f"{part}" for part in parts)

# --- Bedrock call ---
def call_bedrock(prompt):
    """Send ``prompt`` to the model through the module-level ``bedrock`` client.

    The reply text is parsed as JSON. If the whole reply is not valid JSON, the
    span from the first "{" to the last "}" is tried as well, so a reply
    wrapped in a code fence or surrounded by prose is still accepted.

    Args:
        prompt: Full prompt text, sent as a single user message.

    Returns:
        The decoded JSON value, or ``None`` when no valid JSON could be
        extracted. In that case the raw reply is printed.
    """
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 8000,
        "temperature": 0.1,
        "messages": [{"role": "user", "content": [{"type": "text", "text": prompt}]}],
    }

    response = bedrock.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=json.dumps(body),
        contentType="application/json",
    )

    response_body = json.loads(response["body"].read())
    text = response_body["content"][0]["text"]

    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Second chance: keep only the outermost {...} span of the reply.
    start, end = text.find("{"), text.rfind("}")
    if 0 <= start < end:
        try:
            return json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            pass

    print("‚ö†Ô∏è R√©ponse non-JSON valide, texte brut :")
    if response_body.get("stop_reason") == "max_tokens":
        # The reply was cut at the output limit, so the JSON cannot be complete.
        print("Sortie interrompue par la limite max_tokens : JSON probablement incomplet.")
    print(text)
    return None

# --- Process the batches ---
all_results = []

for i, batch in enumerate(batches, start=1):
    print(f"\nüîπ Traitement du lot {i}/{len(batches)} (taille : {len(batch)} caract√®res)...")
    prompt = build_prompt(batch, law_analysis_prompt)
    result = call_bedrock(prompt)
    if result:
        all_results.append(result)
    # Pause between requests.
    time.sleep(2)

# --- Final merge ---
final_json = {"summary": "", "measures": []}

for res in all_results:
    # Only JSON objects can carry the expected keys; skip any other JSON value.
    if not isinstance(res, dict):
        continue

    # Keep only the first summary found; later ones are ignored.
    if not final_json["summary"] and "summary" in res:
        final_json["summary"] = res["summary"]

    # Measures from every batch are always accumulated.
    if "measures" in res:
        final_json["measures"].extend(res["measures"])

# --- Save ---
# Create the output folder first so the collected results are not lost on a
# missing directory.
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(final_json, f, indent=2, ensure_ascii=False)

print(f"\n‚úÖ Analyse termin√©e. R√©sultat sauvegard√© dans : {output_file}")




üì¶ Nombre de chunks d'origine : 73
Taille moyenne : 8484 | Max : 8936 | Min : 3702
Somme totale du texte : 619372 caract√®res
üß© Nombre de lots apr√®s regroupement : 4

üîπ Traitement du lot 1/4 (taille : 198545 caract√®res)...

üîπ Traitement du lot 2/4 (taille : 199642 caract√®res)...

üîπ Traitement du lot 3/4 (taille : 199796 caract√®res)...
‚ö†Ô∏è R√©ponse non-JSON valide, texte brut :
{
  "summary": {
    "overall_summary": "The law text provides detailed regulations for artificial intelligence (AI) systems in the European Union. It establishes harmonized rules for the development, marketing, and use of AI systems, with a focus on ensuring safety, fundamental rights protection, and ethical principles.",
    "main_themes": [
      "Risk-based approach to regulating AI systems",
      "Requirements and obligations for providers and users of high-risk AI systems",
      "Prohibited AI practices",
      "Governance and enforcement mechanisms",
      "Support for innovation an

In [None]:
# Write `final_json` to `output_file` as indented UTF-8 JSON.
# NOTE(review): this cell has no imports or definitions of its own; it relies on
# `json`, `final_json` and `output_file` already existing in the kernel.
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(final_json, f, indent=2, ensure_ascii=False)

print(f"\n‚úÖ Analyse termin√©e. R√©sultat sauvegard√© dans : {output_file}")

In [100]:
import json

# --- Path of the JSON file to inspect ---
file_path = "/home/sagemaker-user/shared/law_on_companies/summary/1.portfolio_analysis.json"

# --- Load and display the content ---
with open(file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Readable display: indented, non-ASCII characters kept as-is
print(json.dumps(data, indent=2, ensure_ascii=False))


{
  "summary": {
    "overall_portfolio_impact": "Moderate to High",
    "key_risk_sectors": [
      "Technology",
      "Consumer Discretionary",
      "Industrials",
      "Consumer Staples"
    ],
    "key_risk_countries": [
      "United States",
      "China",
      "European Union"
    ],
    "recommendations": "Reduce exposure to high-risk sectors like Technology and Consumer Discretionary. Consider geographic diversification away from high regulatory risk regions like the US, China and EU. Replace holdings with high regulatory risk scores with lower risk alternatives in the same sectors."
  },
  "portfolio_analysis": [
    {
      "symbol": "NVDA",
      "company_name": "NVIDIA Corporation",
      "sector": "Information Technology",
      "country_exposure": "United States, China, Taiwan",
      "regulatory_risk_score": "High",
      "dependance_risk_score": "Moderate",
      "justify_regulatory_risk": "Article 7 paragraph 4 bis requires disclosure of ranking parameters for con

In [7]:
import os
import json
import pandas as pd
import unicodedata
import re

def robust_read_csv(path):
    """Read a CSV file, trying several text encodings in turn.

    Args:
        path: Path of the CSV file.

    Returns:
        The DataFrame produced by the first encoding that reads successfully.

    Raises:
        RuntimeError: If every attempt fails. The error of the last attempt is
            attached as ``__cause__`` so the real reason (missing file, parse
            error, ...) stays visible in the traceback.
    """
    last_error = None
    # Try UTF-8 (with, then without BOM handling) and finally latin1.
    for enc in ("utf-8-sig", "utf-8", "latin1"):
        try:
            return pd.read_csv(path, encoding=enc)
        except Exception as exc:
            last_error = exc
    raise RuntimeError(f"Impossible de lire le CSV: {path}") from last_error

def normalize_text(s):
    """Normalize a name for matching.

    Missing values give "". Otherwise the value is converted to a string,
    trimmed, lower-cased, stripped of combining marks (accents) after NFKD
    decomposition, has every character that is neither a word character nor
    whitespace replaced by a space, and has whitespace runs collapsed.
    """
    if pd.isna(s):
        return ""
    decomposed = unicodedata.normalize("NFKD", str(s).strip().lower())
    base_chars = [ch for ch in decomposed if not unicodedata.combining(ch)]
    # Punctuation becomes a space so that "S.A." and "S A" compare equal.
    spaced = re.sub(r"[^\w\s]", " ", "".join(base_chars))
    return re.sub(r"\s+", " ", spaced).strip()

def normalize_symbol(s):
    """Return the value as a trimmed, upper-cased string; missing values give ""."""
    return "" if pd.isna(s) else str(s).strip().upper()

def find_first_col(columns, candidates):
    """Return the first column matching a candidate name, ignoring case.

    Candidates are tried in order. The column is returned with its original
    spelling, or ``None`` when no candidate matches.
    """
    by_lower = {}
    for column in columns:
        by_lower[column.lower()] = column
    hits = (by_lower[cand.lower()] for cand in candidates if cand.lower() in by_lower)
    return next(hits, None)

def main(csv_path, json_path, output_path=None):
    """Enrich a CSV with the risk scores found in an analysis JSON.

    Records under the JSON key ``portfolio_analysis`` are matched to CSV rows
    by normalized symbol first, then by normalized company name for the rows
    still unmatched. The columns ``reglementary_score``, ``dependance_score``
    and ``recommandation`` are added, together with ``_match_key`` which is
    "symbol" or "name" depending on the key that matched, and empty for rows
    without a match. Rows without a match are also written to a separate
    report file.

    Args:
        csv_path: CSV file to enrich.
        json_path: JSON file holding a non-empty ``portfolio_analysis`` list.
        output_path: Destination CSV. Defaults to ``<csv_path stem>_augmented<ext>``.

    Raises:
        ValueError: If the JSON has no usable ``portfolio_analysis`` entries,
            if the CSV already contains one of the columns to add, or if the
            CSV has neither a symbol nor a company-name column.
    """
    # Output and report paths share the same stem.
    root, ext = os.path.splitext(csv_path if output_path is None else output_path)
    if output_path is None:
        output_path = f"{root}_augmented{ext}"
    unmatched_report = f"{root}_unmatched_report.csv"

    value_cols = ["reglementary_score", "dependance_score", "recommandation"]

    # 1) Load the files
    df_csv = robust_read_csv(csv_path)

    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    pa = data.get("portfolio_analysis", [])
    if not pa:
        raise ValueError("Le JSON ne contient pas de cl√© 'portfolio_analysis' non vide.")

    # Fail early and clearly instead of letting the merge choke on name clashes.
    conflicts = [c for c in value_cols if c in df_csv.columns]
    if conflicts:
        raise ValueError(
            f"Le CSV contient des colonnes en conflit avec l'enrichissement: {conflicts}"
        )

    df_json = pd.DataFrame(pa)

    # 2) Prepare the columns coming from the JSON (missing ones become None)
    for col in ["regulatory_risk_score", "dependance_risk_score",
                "justify_regulatory_risk", "justify_dependance_risk",
                "recommended_adjustments", "symbol", "company_name"]:
        if col not in df_json.columns:
            df_json[col] = None

    df_json["reglementary_score"] = df_json["regulatory_risk_score"]
    df_json["dependance_score"] = df_json["dependance_risk_score"]
    # The recommendation is the concatenation of the three free-text fields.
    df_json["recommandation"] = (
        df_json["justify_regulatory_risk"].fillna("").astype(str).str.strip() + " " +
        df_json["justify_dependance_risk"].fillna("").astype(str).str.strip() + " " +
        df_json["recommended_adjustments"].fillna("").astype(str).str.strip()
    ).str.replace(r"\s+", " ", regex=True).str.strip()

    df_json_small = df_json[["symbol", "company_name"] + value_cols].copy()

    # 3) Determine the join key(s)
    symbol_col_csv = find_first_col(
        df_csv.columns,
        ["symbol", "ticker", "SYM", "SYMBOL", "Ticker"]
    )
    company_col_csv = find_first_col(
        df_csv.columns,
        ["company_name", "Company", "Company Name", "Name", "Issuer", "Entreprise", "Soci√©t√©"]
    )

    df_json_small["_sym"] = df_json_small["symbol"].map(normalize_symbol)
    df_json_small["_name"] = df_json_small["company_name"].map(normalize_text)

    def _lookup(key, tie_breaker):
        # One row per non-empty key. Empty keys are excluded so that a CSV row
        # with a blank key is never paired with a JSON record with a blank key.
        usable = df_json_small[df_json_small[key] != ""]
        deduped = usable.sort_values(by=[key, tie_breaker]).drop_duplicates(subset=[key], keep="first")
        return deduped[[key] + value_cols]

    def _unmatched(frame):
        # A row is unmatched when none of the added columns received a value.
        return frame[value_cols].isna().all(axis=1)

    by_symbol = _lookup("_sym", "_name")
    by_name = _lookup("_name", "_sym")

    df_enriched = df_csv.copy()
    df_enriched["_match_key"] = None

    if symbol_col_csv is not None and not by_symbol.empty:
        # 4) First pass: match on the symbol
        df_enriched["_sym"] = df_enriched[symbol_col_csv].map(normalize_symbol)
        df_enriched = df_enriched.merge(by_symbol, on="_sym", how="left")
        df_enriched.loc[~_unmatched(df_enriched), "_match_key"] = "symbol"

        if company_col_csv is not None:
            pending = _unmatched(df_enriched)
            if pending.any():
                # 5) Second pass by name, restricted to the rows still unmatched
                # (the other rows get a missing key and therefore match nothing).
                df_enriched["_name"] = df_enriched[company_col_csv].map(normalize_text).where(pending)
                df_enriched = df_enriched.merge(
                    by_name, on="_name", how="left", suffixes=("", "_by_name")
                )
                # Fill only where the symbol pass left a gap.
                for col in value_cols:
                    df_enriched[col] = df_enriched[col].where(
                        df_enriched[col].notna(), df_enriched[f"{col}_by_name"]
                    )
                df_enriched = df_enriched.drop(columns=[f"{col}_by_name" for col in value_cols])

                matched_by_name = pending.to_numpy() & ~_unmatched(df_enriched).to_numpy()
                df_enriched.loc[matched_by_name, "_match_key"] = "name"
    elif company_col_csv is not None:
        # 6) No usable symbol: match directly on the name
        df_enriched["_name"] = df_enriched[company_col_csv].map(normalize_text)
        df_enriched = df_enriched.merge(by_name, on="_name", how="left")
        df_enriched.loc[~_unmatched(df_enriched), "_match_key"] = "name"
    else:
        raise ValueError(
            "Impossible de trouver une cl√© de jointure. "
            "Aucune colonne 'symbol/ticker' ni 'company_name/Company/Name' d√©tect√©e dans le CSV."
        )

    # 7) Report the unmatched rows
    unmatched_mask = _unmatched(df_enriched)
    if unmatched_mask.any():
        # Keep only the identifying columns in the report.
        keep_cols = [c for c in [symbol_col_csv, company_col_csv] if c is not None]
        keep_cols += ["_sym", "_name", "_match_key"]
        keep_cols = [c for c in keep_cols if c in df_enriched.columns]
        unmatched_rows = df_enriched.loc[unmatched_mask, keep_cols].drop_duplicates()
        unmatched_rows.to_csv(unmatched_report, index=False, encoding="utf-8-sig")
        print(f"‚ö†Ô∏è Lignes sans correspondance: {len(unmatched_rows)} ‚Äî rapport: {unmatched_report}")

    # 8) Remove the technical join columns
    df_enriched = df_enriched.drop(
        columns=[tmp for tmp in ("_sym", "_name") if tmp in df_enriched.columns]
    )

    # 9) Save
    df_enriched.to_csv(output_path, index=False, encoding="utf-8-sig")
    print(f"‚úÖ Fichier enrichi √©crit dans: {output_path}")
    print("Colonnes ajout√©es: reglementary_score, dependance_score, recommandation")

# --- Parameters to adapt / or run as-is if the paths are valid ---
csv_path = "/home/sagemaker-user/shared/streamlit-app/data/analysis_result/4/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.csv"
json_path = "/home/sagemaker-user/shared/law_on_companies/summary/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL.json"
# No output_path given, so main() derives the output file names from csv_path.
main(csv_path, json_path)

‚ö†Ô∏è Lignes sans correspondance: 40 ‚Äî rapport: /home/sagemaker-user/shared/streamlit-app/data/analysis_result/4/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL_unmatched_report.csv
‚úÖ Fichier enrichi √©crit dans: /home/sagemaker-user/shared/streamlit-app/data/analysis_result/4/4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL_augmented.csv
Colonnes ajout√©es: reglementary_score, dependance_score, recommandation
