In [35]:
!git clone https://github.com/alphanome-ai/sec-parser.git
%cd sec-parser
!pip install .

Cloning into 'sec-parser'...
remote: Enumerating objects: 5000, done.[K
remote: Counting objects: 100% (582/582), done.[K
remote: Compressing objects: 100% (94/94), done.[K
remote: Total 5000 (delta 518), reused 488 (delta 488), pack-reused 4418 (from 2)[K
Receiving objects: 100% (5000/5000), 2.59 MiB | 260.00 KiB/s, done.
Resolving deltas: 100% (3556/3556), done.
Updating files: 100% (224/224), done.
/mnt/custom-file-systems/s3/shared/company_profile/sec-parser
Processing /mnt/custom-file-systems/s3/shared/company_profile/sec-parser
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: sec-parser
  Building wheel for sec-parser (pyproject.toml) ... [?25ldone
[?25h  Created wheel for sec-parser: filename=sec_parser-0.58.1-py3-none-any.whl size=77056 sha256=bcef295d4523328f2b2d6f51e131151f92167131cd602041cd723aef7a5e3f38
  Stored in

In [34]:
!pip uninstall -y sec-parser

Found existing installation: sec-parser 0.58.1
Uninstalling sec-parser-0.58.1:
  Successfully uninstalled sec-parser-0.58.1


In [31]:
!pip show sec-parser

[0m

In [32]:
!pip install sec-parser

Collecting sec-parser
  Using cached sec_parser-0.58.1-py3-none-any.whl.metadata (18 kB)
Using cached sec_parser-0.58.1-py3-none-any.whl (76 kB)
Installing collected packages: sec-parser
Successfully installed sec-parser-0.58.1


In [None]:
from sec_parser import Edgar10KParser
import json
import os
import re

root = "/home/sagemaker-user/shared"
input_dir = os.path.join(root, "fillings")
output_dir = os.path.join(root, "company_profile", "extracted_sections")

parser = Edgar10KParser()

# Parcourir chaque entreprise
for company_name in os.listdir(input_dir):
    company_path = os.path.join(input_dir, company_name)
    if not os.path.isdir(company_path):
        continue  # Ignorer les fichiers, ne prendre que les dossiers

    # Chercher le fichier 10-K dans le dossier de l'entreprise
    tenk_files = [f for f in os.listdir(company_path) if f.lower().endswith(".html")]
    if not tenk_files:
        print(f"Aucun fichier 10-K trouv√© pour {company_name}")
        continue

    file_path = os.path.join(company_path, tenk_files[0])  # Prendre le premier HTML trouv√©
    with open(file_path, "r", encoding="utf-8") as f:
        html = f.read()

    # Parser le HTML
    elements = parser.parse(html)
    print(elements)
    # Cr√©er le dossier de sortie pour cette entreprise
    company_output_folder = os.path.join(output_dir, company_name)
    os.makedirs(company_output_folder, exist_ok=True)

    current_section = None
    current_content = []

    for element in elements:
        # D√©tecter TopSectionTitle
        if "TopSectionTitle" in element.__class__.__name__:
            # Sauvegarder la section pr√©c√©dente
            if current_section and current_content:
                short_title = re.sub(r'[\\/*?:"<>|]', "_", current_section[:30])
                file_name = f"{short_title}.json"
                with open(os.path.join(company_output_folder, file_name), "w", encoding="utf-8") as f:
                    json.dump({"content": current_content}, f, indent=4, ensure_ascii=False)
            # Nouvelle section
            current_section = element.text
            current_content = []
        else:
            # Garder TitleElement et TextElement intacts
            current_content.append({
                "type": element.__class__.__name__.replace("Element", ""),
                "text": element.text
            })

    # Sauvegarder la derni√®re section
    if current_section and current_content:
        file_name = re.sub(r'[\\/*?:"<>|]', "_", current_section) + ".json"
        with open(os.path.join(company_output_folder, file_name), "w", encoding="utf-8") as f:
            json.dump({"content": current_content}, f, indent=4, ensure_ascii=False)

In [None]:
import os
import json
import boto3
from botocore.config import Config
import re

# --- Config AWS Bedrock ---
config = Config(
    region_name="us-west-2",
    connect_timeout=30,
    read_timeout=300,  # permet jusqu‚Äô√† 5 min par requ√™te
)
bedrock = boto3.client("bedrock-runtime", config=config)

# --- R√©pertoires et filtres ---
input_dir = "/home/sagemaker-user/shared/company_profile/extracted_sections/BRK-B"
selected_items = ["item 1", "item 1a", "item 1c", "item 2", "item 3", "item 7", "item 7a", "item 8"]
not_selected_items = ["item 1b", "item 10", "item 11", "item 12", "item 13", "item 14", "item 15", "item 16"]
fillings_dir = "/home/sagemaker-user/shared/fillings"


merged_data = {}

# --- Fusion des fichiers s√©lectionn√©s ---
for filename in os.listdir(input_dir):
    if filename.endswith(".json"):
        name = os.path.splitext(filename)[0].lower().strip()  # "item 1.business"
        main_name = name.split(".")[0]  # "item 1"

        # V√©rifie inclusion/exclusion
        if main_name in selected_items and main_name not in not_selected_items:
            file_path = os.path.join(input_dir, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)
            merged_data[filename] = data


# --- Fallback to full 10-K if any selected item is missing ---
if len(merged_data) < len(selected_items):
    filing_path = os.path.join(fillings_dir, company_name)
    if os.path.isdir(filing_path):
        for file in os.listdir(filing_path):
            if file.endswith(".html") or file.endswith(".txt"):
                with open(os.path.join(filing_path, file), "r", encoding="utf-8") as f:
                    content = f.read()
                merged_data = {"full_10k": content}
                break

# --- Fusion du texte ---
merged_text = "\n\n".join(
    f"--- {section} ---\n{json.dumps(content, ensure_ascii=False)}"
    for section, content in merged_data.items()
)

print(f"Taille du texte fusionn√© : {len(merged_text):,} caract√®res")

for key, values in merged_data.items():
    print(key)

# --- Fonction utilitaire pour d√©couper le texte ---
def chunk_text(text, max_length=200000):
    """D√©coupe un texte long en morceaux d‚Äôenviron max_length caract√®res."""
    return [text[i:i+max_length] for i in range(0, len(text), max_length)]

chunks = chunk_text(merged_text)

# --- Prompt principal (r√©utilis√© sur chaque chunk) ---
base_prompt = """
You are an expert financial analyst specializing in 10-K report analysis.

I want you to focus on several points:
Give the name of companies mentionned in the text and their relationship with the company of the report.
Where are the operations of the company ? What can you say about the ESG and R&D policies ?

From the following text, create a **concise JSON ** and try to complete the most keys you can.
Include measurement units for quantitative data and return **nothing else but the JSON**.

Expected structure:
{
  "date": "...",
  "name": "...",
  "industry": "...",
  "sub_industry": "...",
  "customer_segmentation": ["..."],
  "products": ["..."],
  "supplier_countries": ["..."],
  "supply_chain": "...",
  "geographic_market_segment": ["..."],
  "related_companies": [{"company_name": "...", "relationship_type": "..."}],
  "competitors": ["..."],
  "substitute_products": ["..."],
  "revenue": {"value": null, "unit": "...", "variation": null},
  "net_income": {"value": null, "unit": "..."},
  "gross_margin": {"value": null, "unit": "%"},
  "income_tax_expense": {"value": null, "unit": "..."},
  "share_buybacks": {"value": null, "unit": "..."},
  "dividends": {"value": null, "unit": "..."},
  "debt": {"value": null, "unit": "..."},
  "interest_expense": {"value": null, "unit": "..."},
  "depreciation": {"value": null, "unit": "..."},
  "free_cash_flow": {"value": null, "unit": "..."},
  "total_assets": {"value": null, "unit": "..."},
  "shareholders_equity": {"value": null, "unit": "..."},
  "ongoing_litigation": ["..."],
  "research_development_expense": {"value": null, "unit": "..."},
  "research_development_policy": ["..."],
  "ESG_policy": ["..."]
}

Most important, return **nothing else but the JSON in the right format**.
"""

# --- Fonction d‚Äôappel Claude ---
def call_claude(prompt_text):
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2500,
        "temperature": 0.1,
        "messages": [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}],
    }

    response = bedrock.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=json.dumps(body),
    )

    response_body = json.loads(response["body"].read())
    return response_body["content"][0]["text"]

# --- Appels successifs et fusion JSON ---
partial_results = []
print(f"\nTraitement du bloc de {len(chunks)} chunks")

for i, chunk in enumerate(chunks, 1):
    prompt_chunk = f"{base_prompt}\n\nSource text (part {i}/{len(chunks)}):\n{chunk}"

    try:
        json_text = call_claude(prompt_chunk).strip()
        print(json_text[:500])  # voir ce qui est renvoy√©
        if not json_text:
            print(f"Chunk {i}: r√©ponse vide, ignor√©.")
            continue

        # Tente de d√©tecter du JSON m√™me si la r√©ponse contient du texte autour
        match = re.search(r"\{.*\}", json_text, re.DOTALL)
        if match:
            json_text = match.group(0)
        else:
            print(f"Chunk {i}: pas de JSON d√©tect√©.")
            continue

        # Parse JSON
        parsed = json.loads(json_text)
        partial_results.append(parsed)
        print(f"Chunk {i} trait√© avec succ√®s.")

    except json.JSONDecodeError as e:
        print(f"Chunk {i}: JSON invalide ({e})")
        print(f"R√©ponse partielle : {json_text[:200]}...")
    except Exception as e:
        print(f"Erreur sur le chunk {i}: {e}")


# --- Fusion intelligente des JSON partiels ---
final_json = {}

for partial in partial_results:
    for key, value in partial.items():
        # Cl√© d√©j√† existante dans le JSON final
        if key in final_json:
            existing = final_json.get(key)

            # Cas 1 : liste
            if isinstance(value, list):
                # Si liste de dictionnaires (ex: related_companies)
                if all(isinstance(v, dict) for v in value):
                    seen = set()
                    merged_list = []
                    for item in existing + value:
                        identifier = json.dumps(item, sort_keys=True)
                        if identifier not in seen:
                            seen.add(identifier)
                            merged_list.append(item)
                    final_json[key] = merged_list
                else:
                    # Liste simple (texte, nombres, etc.)
                    final_json[key] = list(set(existing + value))

            # Cas 2 : dictionnaire num√©rique {"value": ..., "unit": ...}
            elif isinstance(value, dict) and "value" in value:
                if not existing.get("value") and value.get("value"):
                    final_json[key] = value
                elif not existing.get("unit") and value.get("unit"):
                    existing["unit"] = value["unit"]
                    final_json[key] = existing

            # Cas 3 : champ simple (str, int, etc.)
            else:
                if not existing and value:
                    final_json[key] = value

        else:
            # Premi√®re apparition de la cl√©
            final_json[key] = value

print(final_json)

In [None]:
import os
import json
import re
import boto3
from botocore.config import Config

# --- Configuration AWS Bedrock ---
config = Config(region_name="us-west-2", connect_timeout=30, read_timeout=300)
bedrock = boto3.client("bedrock-runtime", config=config)

# --- R√©pertoires racine ---
root_dir = "/home/sagemaker-user/shared/company_profile/extracted_sections"
output_root = "/home/sagemaker-user/shared/company_profile/profiles"

# --- S√©lection et exclusion des sections ---
selected_items = ["item 1", "item 1a", "item 1c", "item 2", "item 3", "item 7", "item 7a", "item 8"]
not_selected_items = ["item 1b", "item 10", "item 11", "item 12", "item 13", "item 14", "item 15", "item 16"]

# --- Fonction : d√©coupe un texte long ---
def chunk_text(text, max_length=200000):
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

# --- Fonction : appel Claude ---
def call_claude(prompt_text):
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2500,
        "temperature": 0.1,
        "messages": [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}],
    }
    response = bedrock.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=json.dumps(body),
    )
    response_body = json.loads(response["body"].read())
    return response_body["content"][0]["text"]

# --- Fusion intelligente de JSONs partiels ---
def merge_partial_jsons(partials):
    final_json = {}
    for partial in partials:
        for key, value in partial.items():
            if key in final_json:
                existing = final_json.get(key)

                # üß© Fusion des listes
                if isinstance(value, list):
                    if not isinstance(existing, list):
                        existing = [] if existing is None else [existing]
                    # Si ce sont des dictionnaires, on d√©doublonne proprement
                    if all(isinstance(v, dict) for v in value):
                        seen = set()
                        merged = []
                        for item in existing + value:
                            ident = json.dumps(item, sort_keys=True)
                            if ident not in seen:
                                seen.add(ident)
                                merged.append(item)
                        final_json[key] = merged
                    else:
                        final_json[key] = list(set(existing + value))

                # üßÆ Fusion d‚Äôobjets num√©riques (dicts contenant "value")
                elif isinstance(value, dict) and "value" in value:
                    if not isinstance(existing, dict):
                        existing = {}
                    if "value" not in existing or not existing.get("value"):
                        existing["value"] = value.get("value")
                    if "unit" not in existing or not existing.get("unit"):
                        existing["unit"] = value.get("unit")
                    final_json[key] = existing

                # üßæ Fusion des autres types simples
                elif existing in (None, "", []):
                    final_json[key] = value

                # üß© Si les deux sont des cha√Ænes, on peut les concat√©ner
                elif isinstance(existing, str) and isinstance(value, str):
                    if value not in existing:
                        final_json[key] = existing + " " + value
            else:
                final_json[key] = value
    return final_json


# --- Prompt principal ---
base_prompt = """
You are an expert financial analyst specializing in 10-K report analysis.

I want you to focus on several points:
Give the name of companies mentionned in the text and their relationship with the company of the report.
Where are the operations of the company ? What can you say about the ESG and R&D policies ?

From the following text, create a **concise JSON** and try to complete the most keys you can.
Include measurement units for quantitative data and return **nothing else but the JSON**.

For all quantitative data, keep the original number formatting, including commas (e.g., ‚Äú3,450‚Äù instead of ‚Äú3450‚Äù).

Expected structure:
{
  "date": "...",
  "name": "...",
  "industry": "...",
  "sub_industry": "...",
  "customer_segmentation": ["..."],
  "products": ["..."],
  "supplier_countries": ["..."],
  "supply_chain": "...",
  "geographic_market_segment": ["..."],
  "related_companies": [{"company_name": "...", "relationship_type": "..."}],
  "competitors": ["..."],
  "substitute_products": ["..."],
  "revenue": {"value": null, "unit": "...", "variation": null},
  "net_income": {"value": null, "unit": "..."},
  "gross_margin": {"value": null, "unit": "%"},
  "income_tax_expense": {"value": null, "unit": "..."},
  "share_buybacks": {"value": null, "unit": "..."},
  "dividends": {"value": null, "unit": "..."},
  "debt": {"value": null, "unit": "..."},
  "interest_expense": {"value": null, "unit": "..."},
  "depreciation": {"value": null, "unit": "..."},
  "free_cash_flow": {"value": null, "unit": "..."},
  "total_assets": {"value": null, "unit": "..."},
  "shareholders_equity": {"value": null, "unit": "..."},
  "ongoing_litigation": ["..."],
  "research_development_expense": {"value": null, "unit": "..."},
  "research_development_policy": ["..."],
  "ESG_policy": ["..."]
}
"""

# --- Liste des entreprises √† traiter plus tard ---
pending_companies = []

# --- √âtape 1 : traitement des entreprises normales ---
for company in sorted(os.listdir(root_dir)):
    company_folder = os.path.join(root_dir, company)
    if not os.path.isdir(company_folder):
        continue
    print(f"\nAnalyse de l‚Äôentreprise : {company}")

    merged_data = {}

    # Try to merge extracted sections
    if os.path.isdir(company_folder):
        for filename in os.listdir(company_folder):
            # Nettoyer les caract√®res invisibles et espaces multiples
            clean_filename = (
                filename.replace("\xa0", " ")  # supprime les espaces ins√©cables
                .replace("\t", " ")            # supprime les tabulations
                .strip()
            )
            clean_filename = " ".join(clean_filename.split())  # r√©duit les doubles espaces
        
            # Si le nom nettoy√© diff√®re, on renomme le fichier pour corriger sur disque
            if clean_filename != filename:
                old_path = os.path.join(company_folder, filename)
                new_path = os.path.join(company_folder, clean_filename)
                os.rename(old_path, new_path)
                filename = clean_filename  # mise √† jour du nom
        
            if filename.endswith(".json"):
                # Normaliser pour comparaison
                name = (
                    os.path.splitext(filename)[0]
                    .lower()
                    .replace(".", "")
                    .replace("\xa0", " ")
                    .strip()
                )
                name = " ".join(name.split())
        
                # S√©lection stricte
                if any(name == item or name.startswith(item + " ") for item in selected_items):
                    if not any(name == bad or name.startswith(bad + " ") for bad in not_selected_items):
                        file_path = os.path.join(company_folder, filename)
                        try:
                            with open(file_path, "r", encoding="utf-8") as f:
                                data = json.load(f)
                            merged_data[filename] = data
                        except FileNotFoundError:
                            print(f"‚ö†Ô∏è Fichier introuvable m√™me apr√®s nettoyage : {file_path}")


    # --- Fallback to full 10-K if any selected item is missing ---
    if len(merged_data) < len(selected_items):
        filing_path = os.path.join(fillings_dir, company_name)
        if os.path.isdir(filing_path):
            for file in os.listdir(filing_path):
                if file.endswith(".html") or file.endswith(".txt"):
                    with open(os.path.join(filing_path, file), "r", encoding="utf-8") as f:
                        content = f.read()
                    merged_data = {"full_10k": content}
                    break

    # --- Prepare merged text for Claude ---
    merged_text = "\n\n".join(
        f"--- {section} ---\n{json.dumps(content, ensure_ascii=False)}"
        for section, content in merged_data.items()
    )

    chunks = chunk_text(merged_text)

    # Si plus de 10 chunks ‚Üí on le traitera plus tard
    if len(chunks) > 10:
        print(f"Trop volumineux ({len(chunks)} blocs), ajout√© √† la file d‚Äôattente.")
        pending_companies.append(company)
        continue

    # Traitement imm√©diat
    partial_results = []
    print(f"Total blocs {len(chunks)}: {len(merged_text)}")
    for i, chunk in enumerate(chunks, 1):
        prompt_chunk = f"{base_prompt}\n\nSource text (part {i}/{len(chunks)}):\n{chunk}"
        try:
            json_text = call_claude(prompt_chunk).strip()
            match = re.search(r"\{.*\}", json_text, re.DOTALL)
            if match:
                parsed = json.loads(match.group(0))
                partial_results.append(parsed)
                print(f"Chunk {i} trait√© avec succ√®s.")
            else:
                print(f"Chunk {i}: pas de JSON d√©tect√©.")
        except Exception as e:
            print(f"Erreur sur chunk {i}: {e}")

    if not partial_results:
        print(f"Aucun r√©sultat pour {company}")
        continue

    final_json = merge_partial_jsons(partial_results)

    output_path = os.path.join(output_root, company, f"{company}.json")
    os.makedirs(output_root, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_json, f, indent=2, ensure_ascii=False)

# --- √âtape 2 : traitement diff√©r√© des gros fichiers ---
if pending_companies:
    print("\nTraitement diff√©r√© des entreprises volumineuses :")
    print(", ".join(pending_companies))
    # Tu peux ici relancer le m√™me pipeline, ou le r√©partir sur plusieurs workers.

In [14]:
import os
import json
import re
import boto3
from botocore.config import Config
from multiprocessing import Pool
import time

# --- Configuration AWS Bedrock ---
config = Config(region_name="us-west-2", connect_timeout=30, read_timeout=300)
bedrock = boto3.client("bedrock-runtime", config=config)

# --- R√©pertoires racine ---
root_dir = "/home/sagemaker-user/shared/company_profile/extracted_sections"
output_root = "/home/sagemaker-user/shared/company_profile/profiles"
fillings_dir = "/home/sagemaker-user/shared/fillings"

# --- S√©lection et exclusion des sections ---
selected_items = ["item 1", "item 1a", "item 1c", "item 2", "item 3", "item 7", "item 7a", "item 8"]
not_selected_items = ["item 1b", "item 10", "item 11", "item 12", "item 13", "item 14", "item 15", "item 16"]

# --- Fonction : d√©coupe un texte long ---
def chunk_text(text, max_length=200000):
    return [text[i:i + max_length] for i in range(0, len(text), max_length)]

# --- Fonction : appel Claude ---
def call_claude(prompt_text):
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2500,
        "temperature": 0.1,
        "messages": [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}],
    }
    response = bedrock.invoke_model(
        modelId="anthropic.claude-3-sonnet-20240229-v1:0",
        body=json.dumps(body),
    )
    response_body = json.loads(response["body"].read())
    return response_body["content"][0]["text"]

# --- Fusion intelligente de JSONs partiels ---
def merge_partial_jsons(partials):
    final_json = {}
    for partial in partials:
        for key, value in partial.items():
            if key in final_json:
                existing = final_json.get(key)

                # üß© Fusion des listes
                if isinstance(value, list):
                    if not isinstance(existing, list):
                        existing = [] if existing is None else [existing]
                    # Si ce sont des dictionnaires, on d√©doublonne proprement
                    if all(isinstance(v, dict) for v in value):
                        seen = set()
                        merged = []
                        for item in existing + value:
                            ident = json.dumps(item, sort_keys=True)
                            if ident not in seen:
                                seen.add(ident)
                                merged.append(item)
                        final_json[key] = merged
                    else:
                        final_json[key] = list(set(existing + value))

                # üßÆ Fusion d‚Äôobjets num√©riques (dicts contenant "value")
                elif isinstance(value, dict) and "value" in value:
                    if not isinstance(existing, dict):
                        existing = {}
                    if "value" not in existing or not existing.get("value"):
                        existing["value"] = value.get("value")
                    if "unit" not in existing or not existing.get("unit"):
                        existing["unit"] = value.get("unit")
                    final_json[key] = existing

                # üßæ Fusion des autres types simples
                elif existing in (None, "", []):
                    final_json[key] = value

                # üß© Si les deux sont des cha√Ænes, on peut les concat√©ner
                elif isinstance(existing, str) and isinstance(value, str):
                    if value not in existing:
                        final_json[key] = existing + " " + value
            else:
                final_json[key] = value
    return final_json

# --- Prompt principal ---
base_prompt = """
You are an expert financial analyst specializing in 10-K report analysis.

I want you to focus on several points:
Give the name of companies mentionned in the text and their relationship with the company of the report.
Where are the operations of the company ? What can you say about the ESG and R&D policies ?

From the following text, create a **concise JSON** and try to complete the most keys you can.
Include measurement units for quantitative data and return **nothing else but the JSON**.

For all quantitative data, keep the original number formatting, including commas (e.g., ‚Äú3,450‚Äù instead of ‚Äú3450‚Äù).

Expected structure:
{
  "date": "...",
  "name": "...",
  "industry": "...",
  "sub_industry": "...",
  "customer_segmentation": ["..."],
  "products": ["..."],
  "supplier_countries": ["..."],
  "supply_chain": "...",
  "geographic_market_segment": ["..."],
  "related_companies": [{"company_name": "...", "relationship_type": "..."}],
  "competitors": ["..."],
  "substitute_products": ["..."],
  "revenue": {"value": null, "unit": "...", "variation": null},
  "net_income": {"value": null, "unit": "..."},
  "gross_margin": {"value": null, "unit": "%"},
  "income_tax_expense": {"value": null, "unit": "..."},
  "share_buybacks": {"value": null, "unit": "..."},
  "dividends": {"value": null, "unit": "..."},
  "debt": {"value": null, "unit": "..."},
  "interest_expense": {"value": null, "unit": "..."},
  "depreciation": {"value": null, "unit": "..."},
  "free_cash_flow": {"value": null, "unit": "..."},
  "total_assets": {"value": null, "unit": "..."},
  "shareholders_equity": {"value": null, "unit": "..."},
  "ongoing_litigation": ["..."],
  "research_development_expense": {"value": null, "unit": "..."},
  "research_development_policy": ["..."],
  "ESG_policy": ["..."]
}
"""

# --- Fonction pour traiter une entreprise ---
def process_company(company, max_block):
    company_folder = os.path.join(root_dir, company)
    if not os.path.isdir(company_folder):
        return None

    # print(f"\nAnalyse de l‚Äôentreprise : {company}")
    merged_data = {}

    # Try to merge extracted sections
    if os.path.isdir(company_folder):
        for filename in os.listdir(company_folder):
            if not filename.lower().endswith(".json"):
                continue
    
            # Nettoyage de base
            clean_filename = filename.replace("\xa0", " ").replace("\t", " ").strip()
            clean_filename = " ".join(clean_filename.split())
            if clean_filename != filename:
                os.rename(os.path.join(company_folder, filename), os.path.join(company_folder, clean_filename))
                filename = clean_filename
    
            # Extraction du "item X" ou "item XA"
            name = os.path.splitext(filename)[0].lower().strip()
            match = re.match(r'(item\s\d+[a-z]?)', name)  # capture "item 1", "item 1a", etc.
            if not match:
                continue
    
            main_item = match.group(1)  # ex: "item 1a"
    
            # V√©rifie inclusion/exclusion
            if main_item in selected_items and main_item not in not_selected_items:
                file_path = os.path.join(company_folder, filename)
                try:
                    with open(file_path, "r", encoding="utf-8") as f:
                        data = json.load(f)
                    merged_data[filename] = data
                except FileNotFoundError:
                    print(f"‚ö†Ô∏è Fichier introuvable m√™me apr√®s nettoyage : {file_path}")



    # --- Fallback to full 10-K if any selected item is missing ---
    if abs(len(merged_data) - len(selected_items)) > 3:
        filing_path = os.path.join(fillings_dir, company)
        if os.path.isdir(filing_path):
            for file in os.listdir(filing_path):
                if file.endswith(".html") or file.endswith(".txt"):
                    with open(os.path.join(filing_path, file), "r", encoding="utf-8") as f:
                        content = f.read()
                    merged_data = {"full_10k": content}
                    break

    merged_text = "\n\n".join(
        f"--- {section} ---\n{json.dumps(content, ensure_ascii=False)}"
        for section, content in merged_data.items()
    )

    chunks = chunk_text(merged_text)

    if len(chunks) > max_block:
        print(f"Trop volumineux {company}, ajout√© √† la file d‚Äôattente.")
        return company  # mettre dans pending_companies

    partial_results = []
    print(f"Total blocs {company} {len(chunks)}: {len(merged_text)}")
    for i, chunk in enumerate(chunks, 1):
        prompt_chunk = f"{base_prompt}\n\nSource text (part {i}/{len(chunks)}):\n{chunk}"
        try:
            json_text = call_claude(prompt_chunk).strip()
            match = re.search(r"\{.*\}", json_text, re.DOTALL)
            if match:
                parsed = json.loads(match.group(0))
                partial_results.append(parsed)
                #print(f"Chunk {i} trait√© avec succ√®s.")
            else:
                print(f"Chunk {i}: pas de JSON d√©tect√© {company}.")
        except Exception as e:
            print(f"{company} Erreur sur chunk {i}: {e}")

        # ‚úÖ Limiter √† 1 requ√™te par seconde
        time.sleep(1.1)

    if not partial_results:
        print(f"Aucun r√©sultat pour {company}")
        return None

    final_json = merge_partial_jsons(partial_results)

    output_path = os.path.join(output_root, company, f"{company}.json")
    os.makedirs(output_root, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(final_json, f, indent=2, ensure_ascii=False)
    print(f"Entreprise {company} fini")
    return None

In [None]:
# --- Multi-process ---
if __name__ == "__main__":
    companies = sorted(os.listdir(root_dir))
    pending_companies = []

    with Pool(processes=10) as pool:  # Ajuste le nombre de processes selon ton instance
        results = pool.starmap(process_company, [(c, max_block) for c in companies])

    # R√©cup√©rer les entreprises volumineuses
    pending_companies = [c for c in results if c is not None]

    if pending_companies:
        print("\nTraitement diff√©r√© des entreprises volumineuses :")
        print(", ".join(pending_companies))

In [None]:
from multiprocessing import Pool

# Exemple : max_block r√©duit pour retraitement des volumineuses
max_block = 100  # tu peux ajuster selon ce que tu veux autoriser

if pending_companies:
    print(f"Relance du traitement pour {len(pending_companies)} entreprises volumineuses...")

    # pool.map ne passe qu'un seul argument, on utilise starmap pour passer max_block
    with Pool(processes=5) as pool:
        results = pool.starmap(process_company, [(c, max_block) for c in pending_companies])

    # V√©rifier s'il reste encore des entreprises volumineuses
    still_pending = [c for c in results if c is not None]

    if still_pending:
        print("\nCertaines entreprises restent volumineuses apr√®s cette relance :")
        print(", ".join(still_pending))
    else:
        print("Toutes les entreprises volumineuses ont √©t√© trait√©es avec succ√®s !")
else:
    print("Aucune entreprise volumineuse √† retraiter.")


In [11]:
import os
import pandas as pd

# --- Param√®tres ---
csv_path = "/home/sagemaker-user/shared/2025-08-15_composition_sp500.csv"  # ton CSV S&P500
root_dir = "/home/sagemaker-user/shared/company_profile/profiles"
top_n = 50

# --- Lire le CSV ---
df = pd.read_csv(csv_path)

# --- Extraire les 100 premiers symboles selon weight d√©croissant ---
df_sorted = df.sort_values("Weight", ascending=False)
top_symbols = df_sorted.head(top_n)["Symbol"].tolist()

# --- V√©rifier l‚Äôexistence des dossiers et pr√©sence d‚Äôun fichier CSV ---
missing_companies = []

for symbol in top_symbols:
    company_folder = os.path.join(root_dir, symbol)
    if not os.path.isdir(company_folder):
        missing_companies.append(symbol)
        continue

    # V√©rifie s'il y a au moins un fichier .csv dans le dossier
    has_json = any(f.lower().endswith(".json") for f in os.listdir(company_folder))
    if not has_json:
        missing_companies.append(symbol)

print(f"Parmi les {top_n} entreprises les plus importantes :")
print(f"- {len(top_symbols)} entreprises v√©rifi√©es")
print(f"- {len(missing_companies)} dossiers ou JSON manquants")
print("Dossiers/JSON manquants :", missing_companies)


Parmi les 50 entreprises les plus importantes :
- 50 entreprises v√©rifi√©es
- 6 dossiers ou JSON manquants
Dossiers/JSON manquants : ['BRK,B', 'HD', 'GE', 'TMUS', 'MS', 'AXP']


In [19]:
import os

input_folder = "/home/sagemaker-user/shared/company_profile/profiles"
missing_json = []

# Parcourt chaque entreprise dans le dossier
for company in sorted(os.listdir(input_folder)):
    company_path = os.path.join(input_folder, company)
    if not os.path.isdir(company_path):
        continue

    # V√©rifie s'il y a au moins un fichier JSON
    has_json = any(f.lower().endswith(".json") for f in os.listdir(company_path))

    if not has_json:
        missing_json.append(company)

# R√©sum√©
print(f"\nTotal entreprises sans JSON : {len(missing_json)}")
print(missing_json)



Total entreprises sans JSON : 77
['.ipynb_checkpoints', 'ALL', 'AME', 'AMP', 'APA', 'ARE', 'AXP', 'BA', 'C', 'CAH', 'CARR', 'CCI', 'CHD', 'CI', 'COIN', 'CPRT', 'DAL', 'DD', 'DECK', 'DGX', 'DHR', 'DLR', 'DOW', 'DUK', 'EBAY', 'EIX', 'ELV', 'EMN', 'EQIX', 'EQR', 'EXPD', 'EXPE', 'FCX', 'FDS', 'FDX', 'FI', 'GD', 'GE', 'HBAN', 'HD', 'HON', 'HST', 'HWM', 'IDXX', 'INTC', 'IP', 'KIM', 'LEN', 'LUV', 'MET', 'MS', 'NI', 'O', 'OTIS', 'PLD', 'PODD', 'PRU', 'RL', 'RMD', 'SBAC', 'SCHW', 'SNA', 'SPGI', 'STZ', 'SWK', 'SYF', 'TFC', 'TJX', 'TMUS', 'TPL', 'TT', 'TXT', 'TYL', 'VLTO', 'WDC', 'WST', 'WY']


In [None]:
import os
import json
import re
import time
import boto3
from botocore.config import Config

# --- Configuration Bedrock / r√©gion ---
config = Config(region_name="us-west-2", connect_timeout=30, read_timeout=300)
bedrock = boto3.client("bedrock-runtime", config=config)

# --- Liste des symboles √† g√©n√©rer (remplace/compl√®te si besoin) ---
symbols = [
    "ALL","AME","AMP","APA","ARE","AXP","BA","C","CAH","CARR","CCI","CHD","CI","COIN","CPRT",
    "DAL","DD","DECK","DGX","DHR","DLR","DOW","DUK","EBAY","EIX","ELV","EMN","EQIX","EQR",
    "EXPD","EXPE","FCX","FDS","FDX","FI","GD","GE","HBAN","HD","HON","HST","HWM","IDXX",
    "INTC","IP","KIM","LEN","LUV","MET","MS","NI","O","OTIS","PLD","PODD","PRU","RL","RMD",
    "SBAC","SCHW","SNA","SPGI","STZ","SWK","SYF","TFC","TJX","TMUS","TPL","TT","TXT","TYL",
    "VLTO","WDC","WST","WY"
]

# --- Dossiers ---
profiles_root = "/home/sagemaker-user/shared/company_profile/profiles"

# --- Prompt template (FR) ---
prompt_template = """
Tu es un analyste financier expert. √Ä partir de ta connaissance publique (sans acc√®s √† un 10-K local), remplis STRICTEMENT,
et ne retourne RIEN d'autre qu'un objet JSON, au format exact suivant (respecte les cl√©s et types):

{{
  "date": "...",
  "name": "...",
  "industry": "...",
  "sub_industry": "...",
  "customer_segmentation": ["..."],
  "products": ["..."],
  "supplier_countries": ["..."],
  "supply_chain": "...",
  "geographic_market_segment": ["..."],
  "related_companies": [{{"company_name": "...", "relationship_type": "..."}}],
  "competitors": ["..."],
  "substitute_products": ["..."],
  "revenue": {{"value": null, "unit": "...", "variation": null}},
  "net_income": {{"value": null, "unit": "..."}},
  "gross_margin": {{"value": null, "unit": "%"}},
  "income_tax_expense": {{"value": null, "unit": "..."}},
  "share_buybacks": {{"value": null, "unit": "..."}},
  "dividends": {{"value": null, "unit": "..."}},
  "debt": {{"value": null, "unit": "..."}},
  "interest_expense": {{"value": null, "unit": "..."}},
  "depreciation": {{"value": null, "unit": "..."}},
  "free_cash_flow": {{"value": null, "unit": "..."}},
  "total_assets": {{"value": null, "unit": "..."}},
  "shareholders_equity": {{"value": null, "unit": "..."}},
  "ongoing_litigation": ["..."],
  "research_development_expense": {{"value": null, "unit": "..."}},
  "research_development_policy": ["..."],
  "ESG_policy": ["..."]
}}

Remplis les champs avec les valeurs les plus plausibles √† partir de ta connaissance publique pour l'entreprise dont le symbole boursier est: "{symbol}".
Si tu n'as pas d'information pour un champ, laisse `"value": null` ou `[]` ou `""` selon le type.
Retourne uniquement le JSON (aucune explication).
"""

# --- Fonction d'appel Bedrock / Claude ---
def call_claude(prompt_text, model_id="anthropic.claude-3-sonnet-20240229-v1:0"):
    body = {
        "anthropic_version": "bedrock-2023-05-31",
        "max_tokens": 2500,
        "temperature": 0.0,
        "messages": [{"role": "user", "content": [{"type": "text", "text": prompt_text}]}],
    }
    resp = bedrock.invoke_model(modelId=model_id, body=json.dumps(body))
    resp_body = json.loads(resp["body"].read())
    # The model text often at ["content"][0]["text"]
    return resp_body["content"][0]["text"]

def extract_first_json(s: str):
    """
    Extrait le premier objet JSON complet trouv√© dans une cha√Æne.
    Ignore le reste du texte et g√®re les accolades imbriqu√©es.
    """
    start = s.find('{')
    if start == -1:
        return None

    stack = 0
    for i in range(start, len(s)):
        if s[i] == '{':
            stack += 1
        elif s[i] == '}':
            stack -= 1
            if stack == 0:
                return s[start:i+1]
    return None


# --- Boucle principale ---
os.makedirs(profiles_root, exist_ok=True)
errors = []
generated = 0

for sym in symbols:
    out_dir = os.path.join(profiles_root, sym)
    os.makedirs(out_dir, exist_ok=True)
    out_path = os.path.join(out_dir, f"{sym}.json")

    # Skip si d√©j√† pr√©sent
    if os.path.exists(out_path):
        print("D√©j√† existant, saut.")
        continue

    prompt = prompt_template.format(symbol=sym)

    try:
        raw = call_claude(prompt)
        json_text = extract_first_json(raw)

        if not json_text:
            print(f"Aucun JSON trouv√© pour {sym}")
            errors.append((sym, "no_json_in_response"))
            continue
        # Validation et sauvegarde
        parsed = json.loads(json_text)
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(parsed, f, ensure_ascii=False, indent=2)
        generated += 1
        print(f"{sym} sauvegard√© avec succ√®s")

    except Exception as e:
        print(f"Erreur pour {sym}: {e}")
        errors.append((sym, str(e)))

    time.sleep(1.1)

# --- R√©sum√© ---
print(f"Generated profiles: {generated}")
print(f"Errors: {len(errors)}")
if errors:
    for e in errors[:50]:
        print(e)
