In [None]:
import os
import json
import time
import requests
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

#### Authentification INPI

In [None]:
# --- INPI authentication ---
# Populate the process environment from a local .env file, then read the
# two credentials the login request needs.
load_dotenv()

USERNAME = os.environ.get("INPI_USERNAME")
PASSWORD = os.environ.get("INPI_PASSWORD")

# Fail fast: both values must be present and non-empty before any request is sent.
if not (USERNAME and PASSWORD):
    raise RuntimeError("INPI_USERNAME ou INPI_PASSWORD manquant dans le .env")

# SSO endpoint that login_inpi() posts the credentials to.
LOGIN_URL = "https://registre-national-entreprises.inpi.fr/api/sso/login"

def login_inpi():
    """Log in to the INPI SSO endpoint and return the authorization header.

    Posts USERNAME / PASSWORD as a JSON body to LOGIN_URL (10 s timeout) and
    reads the ``token`` field of the JSON reply.

    Returns:
        dict: ``{"Authorization": "Bearer <token>"}``, ready to be passed as
        ``headers=`` to ``requests``.

    Raises:
        requests.exceptions.HTTPError: if the login reply has an error status.
        KeyError: if the JSON reply carries no ``"token"`` key.
    """
    credentials = dict(username=USERNAME, password=PASSWORD)
    reply = requests.post(LOGIN_URL, json=credentials, timeout=10)
    reply.raise_for_status()
    bearer = reply.json()["token"]
    print("üîê Token INPI obtenu")
    return dict(Authorization=f"Bearer {bearer}")

# Log in only when no HEADERS variable exists yet, so re-running this cell in
# a live kernel keeps the headers that are already defined.
try:
    HEADERS
except NameError:
    HEADERS = login_inpi()

In [None]:
RESULTS_FILE = "siren_attachments.json"
# --- Load previously saved results ---
# `results` maps each SIREN to the attachment data saved by an earlier run;
# it starts empty when no results file exists yet.
if not os.path.exists(RESULTS_FILE):
    results = {}
    print("üìÇ Aucun r√©sultat existant")
else:
    with open(RESULTS_FILE, "r") as f:
        results = json.load(f)
    print(f"üìÇ {len(results)} SIREN d√©j√† pr√©sents")


In [None]:
# --- Load the list of SIREN to process ---
# JSON text is UTF-8; do not depend on the platform's default encoding.
with open("siren_list.json", "r", encoding="utf-8") as f:
    siren_list = json.load(f)

print(f"‚úÖ {len(siren_list)} SIREN charg√©s")

# A SIREN is a 9-digit identifier. When the JSON file stores it as a number,
# leading zeros are lost and a bare str() yields a shorter, invalid identifier
# (and therefore a wrong API URL), so pad back to 9 digits.
# dict.fromkeys() drops duplicates while keeping the original order, so the
# same SIREN is never requested twice in one run.
siren_list = list(dict.fromkeys(str(s).strip().zfill(9) for s in siren_list))

# Make sure the keys of `results` are strings, like the entries of `siren_list`,
# so that membership tests between the two are consistent.
results = {str(k): v for k, v in results.items()}

In [None]:
# --- Calcul des SIREN restants à traiter ---
def has_bilan_exploitable(data):
    """Tell whether a SIREN entry holds at least one bilan with a truthy ``bilanSaisi``.

    Args:
        data: the entry stored for one SIREN; a dict that may contain a
            ``"bilans"`` list of dicts. ``None``, a missing ``"bilans"`` key
            and a ``None`` value for ``"bilans"`` (a JSON ``null``) are all
            treated as "no bilan" instead of raising.

    Returns:
        bool: True if any bilan has a truthy ``"bilanSaisi"`` value.
    """
    bilans = (data or {}).get("bilans") or []
    return any(b.get("bilanSaisi") for b in bilans)

# A SIREN still has to be fetched unless it is already in `results` AND its
# stored entry contains a usable bilan.
remaining_siren = [
    candidate
    for candidate in siren_list
    if not (candidate in results and has_bilan_exploitable(results[candidate]))
]

print(f"‚ÑπÔ∏è {len(remaining_siren)} SIREN √† r√©cup√©rer")

---

#### Boucle de récupération

In [None]:
# --- Parameters ---
RESULTS_FILE = "siren_attachments.json"
BATCH_SIZE = 100               # SIREN processed between two saves
PAUSE_BETWEEN_REQUESTS = 5.0   # seconds slept after each SIREN
PAUSE_BETWEEN_BATCHES = 60     # seconds slept between two batches
MAX_RETRIES = 3                # attempts per SIREN before giving up
TIMEOUT = 10                   # timeout passed to requests.get (seconds)
BACKOFF_BASE = 10              # wait after the first 429 (seconds), doubled on each further attempt


def _save_results():
    """Write `results` to RESULTS_FILE through a temporary file.

    The JSON is first written next to the target and then moved over it with
    os.replace(), so an interruption during the write cannot leave a truncated
    RESULTS_FILE that a later json.load() would fail to parse.
    """
    tmp_path = f"{RESULTS_FILE}.tmp"
    with open(tmp_path, "w") as tmp_file:
        json.dump(results, tmp_file, indent=2)
    os.replace(tmp_path, RESULTS_FILE)


# --- Batched retrieval loop ---
for i in range(0, len(remaining_siren), BATCH_SIZE):
    batch = remaining_siren[i:i + BATCH_SIZE]
    is_last_batch = i + BATCH_SIZE >= len(remaining_siren)
    print(f"\nüîπ Batch {i//BATCH_SIZE + 1} / {(len(remaining_siren)-1)//BATCH_SIZE + 1}")

    try:
        for siren in tqdm(batch, desc="R√©cup√©ration SIREN"):
            success = False
            attempts = 0

            while not success and attempts < MAX_RETRIES:
                try:
                    url = f"https://registre-national-entreprises.inpi.fr/api/companies/{siren}/attachments"
                    response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
                    response.raise_for_status()
                    attachments = response.json()

                    # Keep only the keys that are used downstream.
                    filtered = {
                        k: attachments[k]
                        for k in ["bilans", "comptesResultats"]
                        if k in attachments
                    }

                    # Merge into the existing entry: keys absent from this
                    # response keep whatever an earlier run stored.
                    existing = results.get(siren, {})
                    existing.update(filtered)
                    results[siren] = existing

                    success = True

                except requests.exceptions.HTTPError as e:
                    code = e.response.status_code
                    attempts += 1

                    if code == 429:
                        # Rate limited: exponential backoff (BACKOFF_BASE, x2, x4, ...).
                        wait = BACKOFF_BASE * (2 ** (attempts - 1))
                        print(f"‚ö†Ô∏è 429 pour {siren}, pause {wait}s")
                        time.sleep(wait)

                    elif code == 401:
                        # Token rejected: log in again, then retry.
                        # If the login itself fails the error propagates and
                        # stops the run; the `finally` below still saves what
                        # has been fetched so far.
                        print(f"üîê 401 pour {siren} ‚Üí refresh token")
                        HEADERS = login_inpi()
                        time.sleep(2)

                    elif code == 404:
                        # Mark as processed, with an empty entry.
                        results.setdefault(siren, {})
                        success = True

                    else:
                        print(f"‚ùå {siren} erreur HTTP {code}")
                        time.sleep(5)

                except Exception as e:
                    # Network errors, invalid JSON, unexpected payload shape...
                    attempts += 1
                    print(f"‚ùå {siren} erreur {e}")
                    time.sleep(5)

            if not success:
                # Every attempt failed: keep a (possibly empty) entry as a trace.
                # Without a usable bilan the SIREN is selected again when
                # `remaining_siren` is recomputed.
                results.setdefault(siren, {})

            time.sleep(PAUSE_BETWEEN_REQUESTS)
    finally:
        # Intermediate save. Running it in `finally` means an interruption
        # (Ctrl-C, failed re-login, ...) in the middle of a batch does not
        # throw away the SIREN already fetched in that batch.
        _save_results()

    # Post-batch consistency check: distinct (siren, numChrono) pairs whose
    # bilan has a truthy "bilanSaisi". `or []` tolerates a null "bilans".
    bilans_exploitables = {
        (s, b.get("numChrono"))
        for s, data in results.items()
        for b in (data.get("bilans") or [])
        if b.get("bilanSaisi")
    }
    print("üìä Bilans exploitables actuels :", len(bilans_exploitables))

    # No point in waiting after the final batch: nothing is requested next.
    if not is_last_batch:
        print(f"‚úÖ Batch termin√© ‚Äî pause {PAUSE_BETWEEN_BATCHES}s")
        time.sleep(PAUSE_BETWEEN_BATCHES)

print("üéâ R√©cup√©ration termin√©e")

In [None]:
# Connection test: send one authenticated request for the SIREN stored in
# `siren_peugeot` and show the HTTP status code of the reply.
siren_peugeot = "552100554"
url = "https://registre-national-entreprises.inpi.fr/api/companies/{}/attachments".format(siren_peugeot)

r = requests.get(url, timeout=10, headers=HEADERS)

print("HTTP status :", r.status_code)

---

#### Vérification des SIREN touchés par le 429 éventuel pour aller les chercher plus tard

In [None]:
# --- 1. Load the saved results ---
with open("siren_attachments.json", "r") as f:
    results = json.load(f)

# --- 2. Build an audit DataFrame: one row per SIREN with its number of bilans ---
# The columns are passed explicitly so that the frame still has "siren" and
# "nb_bilans" when `results` is empty; without them the column lookups below
# raise KeyError. `or` guards against a null entry or a null "bilans" value.
audit = pd.DataFrame(
    [
        {"siren": siren, "nb_bilans": len((data or {}).get("bilans") or [])}
        for siren, data in results.items()
    ],
    columns=["siren", "nb_bilans"],
)

# --- 3. Flag the SIREN with 0 bilan ---
# An empty entry is stored both when every attempt failed (e.g. repeated 429)
# and when the API answered 404, so the flag only means "probably" rate limited.
audit["probable_429"] = audit["nb_bilans"] == 0

# --- 4. Extract the list of SIREN to fetch again ---
siren_a_relancer = audit.loc[audit["probable_429"], "siren"].tolist()

print(f"Nombre de SIREN touch√©s par 429 ou r√©sultats vides : {len(siren_a_relancer)}")
print("Exemple des 10 premiers SIREN √† relancer :")
print(siren_a_relancer[:10])