In [None]:
elem_to_keep = ['name', 'industry', 'sub_industry', 'customer_segmentation', 'products', 'supplier_countries',
                'supply_chain', 'geographic_market_segment']


In [68]:
import os
import pandas as pd

# --- Paramètres ---
csv_path = "/home/sagemaker-user/shared/2025-08-15_composition_sp500.csv"
root_dir = "/home/sagemaker-user/shared/company_profile/profiles"
top_n = 20

# --- Lire le CSV ---
df = pd.read_csv(csv_path)

# --- Trier par 'Weight' décroissant et prendre plus de 50 pour compenser les absents ---
df_sorted = df.sort_values("Weight", ascending=False)

top_symbols = []
for symbol in df_sorted["Symbol"]:
    company_folder = os.path.join(root_dir, symbol)
    if os.path.isdir(company_folder):
        top_symbols.append(symbol)
    if len(top_symbols) == top_n:
        break

# --- Afficher la liste finale ---
print(f"Top {top_n} entreprises existantes dans le dossier:")
print(top_symbols)


Top 20 entreprises existantes dans le dossier:
['NVDA', 'MSFT', 'AAPL', 'AMZN', 'META', 'AVGO', 'GOOGL', 'GOOG', 'TSLA', 'JPM', 'WMT', 'ORCL', 'V', 'LLY', 'MA', 'NFLX', 'XOM', 'COST', 'PLTR', 'JNJ']


In [69]:
import os
import json

root_dir = "/home/sagemaker-user/shared/company_profile/profiles"

company_jsons = {}

for symbol in top_symbols:
    company_folder = os.path.join(root_dir, symbol)
    json_files = [f for f in os.listdir(company_folder) if f.lower().endswith(".json")]
    
    company_jsons[symbol] = []
    
    for jf in json_files:
        file_path = os.path.join(company_folder, jf)
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
                # Supprimer 'date' à la racine si présent
                if "date" in data:
                    del data["date"]
                # Supprimer 'date' dans law_info si présent
                if "law_info" in data and "date" in data["law_info"]:
                    del data["law_info"]["date"]
                company_jsons[symbol].append(data)
        except Exception as e:
            print(f"Erreur lors de la lecture de {file_path}: {e}")

json_50_text = json.dumps(company_jsons, ensure_ascii=False)

In [52]:
liste_files = os.listdir("/home/sagemaker-user/shared/directive_extraction/summary")

In [53]:
liste_files

['.ipynb_checkpoints',
 '1.DIRECTIVE (UE) 20192161_en.json',
 '2.H.R.1 - One Big Beautiful Bill Act_en.json',
 '3.H.R.5376 - Inflation Reduction Act of 2022_en.json',
 '4.REGULATION (EU) 20241689 OF THE EUROPEAN PARLIAMENT AND OF THE COUNCIL_en.json',
 '5.中华人民共和国能源法__中国政府网_en.json',
 '6.人工知能関連技術の研究開発及び活用の推進に関する法律_en.json']

In [70]:
import json
import os

# --- Chemin du fichier JSON ---
name = liste_files[6]
json_file = os.path.join("/home/sagemaker-user/shared/directive_extraction/summary", name)

# --- Vérifier que le fichier existe ---
if os.path.isfile(json_file):
    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

json_law_text = json.dumps(data, ensure_ascii=False)

In [71]:
import boto3
from botocore.config import Config

config = Config(region_name="us-west-2", connect_timeout=30, read_timeout=100)
bedrock = boto3.client("bedrock-runtime", config=config)

prompt = f"""
You are an expert legal, economic, and financial analyst.

You have the following inputs:

1) Regulatory measures extracted from a law or directive:
{json_law_text}

2) Portfolio of the 50 largest companies in the S&P 500 with detailed company information:
{json_50_text}

Task:
- Study the effects of the law on this portfolio.
- Aggregate the impacts to estimate the overall effect on the reference portfolio (S&P 500).
- Identify zones of regulatory risk concentration (by sector, by country, etc.).
- Propose qualitative or quantitative adjustments (sector rotation, replacement of holdings, geographic reallocation).
- Evaluate the expected impact on individual stocks.
- Provide at least a risk or impact score for each company.

Output:
Return an exact structured JSON with the following format:

{{
  "summary": {{
    "overall_portfolio_impact": "",
    "key_risk_sectors": [],
    "key_risk_countries": [],
    "recommendations": ""
  }},
    # exactly make the 50 companies, even if it is not very accurate
  "portfolio_analysis": [
    {{
      "symbol": "",
      "company_name": "",
      "sector": "",
      "country_exposure": "",
      "regulatory_risk_score": "", in High, Low, Moderate
      "dependance_risk_score": "", in High, Low, Moderate
      "justify_regulatory_risk": "", # quote the law articles
      "justify_dependance_risk": "",
      "recommended_adjustments": ""
    }}
  ]
}}
"""
print(len(prompt))

# --- Body pour Claude ---
body = {
    "anthropic_version": "bedrock-2023-05-31",
    "max_tokens": 10000,
    "temperature": 0.1,
    "messages": [{"role": "user", "content": [{"type": "text", "text": prompt}]}],
}

response = bedrock.invoke_model(
    modelId="anthropic.claude-3-sonnet-20240229-v1:0",
    body=json.dumps(body),
    contentType="application/json"
)

# --- Récupérer le texte renvoyé par le modèle ---
response_body = json.loads(response["body"].read())
analysis_text = response_body["content"][0]["text"]

# --- Essayer de convertir en JSON ---
try:
    analysis_json = json.loads(analysis_text)
except json.JSONDecodeError:
    print("Le modèle n'a pas renvoyé un JSON valide. Voici le texte brut :")
    print(analysis_text)

# --- Afficher ou sauvegarder ---
print(json.dumps(analysis_json, indent=2, ensure_ascii=False))
name = name.replace('_en.json', '.json')
output_file = os.path.join("/home/sagemaker-user/shared/law_on_companies/summary", name)
output_dir = os.path.dirname(output_file)
os.makedirs(output_dir, exist_ok=True)
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(analysis_json, f, indent=2, ensure_ascii=False)
print(f"Analyse sauvegardée dans {output_file}")


59295
Le modèle n'a pas renvoyé un JSON valide. Voici le texte brut :
```json
{
  "summary": {
    "overall_portfolio_impact": "The law aims to promote the research, development, and utilization of AI technology in Japan, which could have a positive impact on companies involved in AI and related technologies. However, the potential impacts are likely to vary across sectors and companies based on their level of exposure and dependence on AI technology.",
    "key_risk_sectors": [
      "Technology",
      "Semiconductors",
      "Software",
      "Cloud Computing"
    ],
    "key_risk_countries": [
      "Japan",
      "United States",
      "China"
    ],
    "recommendations": "Companies in the key risk sectors and countries should closely monitor the implementation of the law and align their AI strategies accordingly. Sector rotation or geographic reallocation may be considered for companies with high regulatory risk or dependence on AI technology. Companies with strong AI capabiliti

In [47]:
import json
import re
import os

def repair_and_parse_json(text):
    """Tente de convertir un texte libre en JSON valide."""
    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")

    # Remplacement de guillemets typographiques et caractères problématiques
    text = text.replace("“", '"').replace("”", '"').replace("’", "'")

    # Supprime les préfixes/suffixes avant ou après les accolades principales
    match = re.search(r"\{.*\}", text, re.DOTALL)
    if match:
        text = match.group(0)
    else:
        raise ValueError("Aucun objet JSON détecté dans le texte.")

    # Tentative directe
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        pass

    # Tentative de nettoyage supplémentaire
    text = re.sub(r"[\x00-\x1F]+", "", text)  # caractères non imprimables
    text = re.sub(r",\s*([\]}])", r"\1", text)  # virgules en trop
    text = re.sub(r"\\'", "'", text)  # échappements inutiles

    # Dernière tentative
    try:
        return json.loads(text)
    except Exception as e:
        print(f"Échec de conversion JSON : {e}")
        # Fallback
        return {"error": "invalid_json", "raw_text": text}


# --- Exemple d'utilisation ---
# analysis_text = <sortie texte de Claude>

analysis_json = repair_and_parse_json(analysis_text)

# --- Sauvegarde ---
output_file = "/home/sagemaker-user/shared/law_on_companies/summary/portfolio_analysis.json"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(analysis_json, f, indent=2, ensure_ascii=False)

print(f"✅ JSON nettoyé et sauvegardé dans : {output_file}")
print(json.dumps(analysis_json.get("summary", {}), indent=2, ensure_ascii=False))

Échec de conversion JSON : Expecting ',' delimiter: line 1 column 19799 (char 19798)
✅ JSON nettoyé et sauvegardé dans : /home/sagemaker-user/shared/law_on_companies/summary/portfolio_analysis.json
{}
