In [None]:
import pandas as pd
import requests
from dotenv import load_dotenv
import os
import psycopg2
import json
from tqdm import tqdm
import time

#### Définition de l'accès

In [None]:
# Load the INPI credentials from the environment (load_dotenv() reads the .env file).
load_dotenv()

USERNAME = os.getenv("INPI_USERNAME")
PASSWORD = os.getenv("INPI_PASSWORD")

# Fail fast when either credential is missing or empty.
if not (USERNAME and PASSWORD):
    raise RuntimeError("INPI_USERNAME ou INPI_PASSWORD manquant dans le .env")

LOGIN_URL = "https://registre-national-entreprises.inpi.fr/api/sso/login"
payload = {"username": USERNAME, "password": PASSWORD}

# Authenticate; raise_for_status() turns an HTTP error status into an exception.
response = requests.post(LOGIN_URL, json=payload, timeout=10)
response.raise_for_status()

# The JSON reply is expected to carry the token under the "token" key.
token = response.json()["token"]

# Authorization header reused by the request cells of this notebook.
HEADERS = {"Authorization": f"Bearer {token}"}

---

#### Vérification de l'accès

In [None]:
# Confirm that a token was obtained without echoing any part of the credential
# into the saved notebook output: only its length is displayed.
print("Token OK :", len(token), "caractères")

---

##### Import du json des bilans à récupérer

In [None]:
# Read the JSON file holding the SIREN identifiers to process.
with open("siren_list.json", "r") as siren_file:
    siren_list = json.load(siren_file)

# Sanity check: number of entries loaded, then a preview of the first ten.
# NOTE(review): the slice below assumes the JSON root is a list — confirm.
preview = siren_list[:10]
print(f"‚úÖ {len(siren_list)} SIREN charg√©s")
print(preview)

---

##### Lancement de la récupération des bilans

In [None]:
# Smoke test: request the attachments of one known SIREN to check that the
# authorization header and the endpoint respond as expected.
siren_test = "552100554"
url = f"https://registre-national-entreprises.inpi.fr/api/companies/{siren_test}/attachments"

try:
    response = requests.get(url, headers=HEADERS, timeout=10)
    response.raise_for_status()
    data = response.json()
except requests.exceptions.RequestException as err:
    # Any requests-level failure is reported instead of stopping the notebook.
    print("‚ùå Erreur :", err)
else:
    # On success, show only the top-level keys of the JSON reply.
    print("‚úÖ OK :", list(data.keys()))

##### Lancement de la récupération

In [None]:
# # --- Param√®tres ---
# BATCH_SIZE = 100
# PAUSE_BETWEEN_REQUESTS = 5.0
# PAUSE_BETWEEN_BATCHES = 60
# MAX_RETRIES = 3
# TIMEOUT = 10

# # --- Charger la liste de SIREN ---
# with open("siren_list.json", "r") as f:
#     siren_list = json.load(f)

# # --- Charger r√©sultats existants pour reprise ---
# try:
#     with open("siren_attachments.json", "r") as f:
#         results = json.load(f)
# except FileNotFoundError:
#     results = {}

# # --- D√©terminer les SIREN manquants ---
# remaining_siren = [s for s in siren_list if s not in results]
# print(f"‚ÑπÔ∏è {len(remaining_siren)} SIREN √† r√©cup√©rer")

# # --- Compteurs ---
# total_bilans = 0
# total_comptes = 0

# # --- Boucle par batch ---
# for i in range(0, len(remaining_siren), BATCH_SIZE):
#     batch = remaining_siren[i:i + BATCH_SIZE]
#     print(f"\nüîπ Batch {i//BATCH_SIZE + 1} / {len(remaining_siren)//BATCH_SIZE + 1} : {len(batch)} SIREN")

#     for siren in tqdm(batch, desc="R√©cup√©ration SIREN"):
#         success = False
#         attempts = 0

#         while not success and attempts < MAX_RETRIES:
#             try:
#                 url = f"https://registre-national-entreprises.inpi.fr/api/companies/{siren}/attachments"
#                 response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
#                 response.raise_for_status()
#                 attachments = response.json()

#                 # --- Ne conserver que les documents utiles ---
#                 bilans = attachments.get("bilans", []) + attachments.get("bilansSaisis", [])
#                 comptes = attachments.get("comptesResultats", [])

#                 results[siren] = {
#                     "bilans": bilans,
#                     "comptesResultats": comptes
#                 }

#                 # --- Mettre √† jour les compteurs ---
#                 total_bilans += len(bilans)
#                 total_comptes += len(comptes)

#                 success = True
#             except Exception as e:
#                 attempts += 1
#                 print(f"‚ùå {siren} erreur {e} (tentative {attempts}/{MAX_RETRIES})")
#                 time.sleep(5)

#         if not success:
#             results[siren] = {"bilans": [], "comptesResultats": []}

#         time.sleep(PAUSE_BETWEEN_REQUESTS)

#     # --- Sauvegarde interm√©diaire ---
#     with open("siren_attachments.json", "w") as f:
#         json.dump(results, f, indent=2)

#     print(f"‚úÖ Batch {i//BATCH_SIZE + 1} termin√©, pause {PAUSE_BETWEEN_BATCHES}s")
#     time.sleep(PAUSE_BETWEEN_BATCHES)

# # --- Sauvegarde finale ---
# with open("siren_attachments.json", "w") as f:
#     json.dump(results, f, indent=2)

# print(f"üéâ R√©cup√©ration termin√©e !")
# print(f"üìä Total bilans r√©cup√©r√©s : {total_bilans}")
# print(f"üìä Total comptes de r√©sultats r√©cup√©r√©s : {total_comptes}")

#### Relance de l'API pour documents manquants suite crash

---

In [None]:
def login_inpi():
    """Authenticate against the INPI SSO endpoint and build the request headers.

    Posts the notebook-level USERNAME / PASSWORD as JSON to the login URL
    (10-second timeout), reads the "token" field of the JSON reply and wraps
    it in a Bearer Authorization header.

    Returns:
        dict: ``{"Authorization": "Bearer <token>"}``.

    Raises:
        requests.exceptions.HTTPError: raised by ``raise_for_status()`` when
            the login request returns an error status.
        KeyError: when the decoded JSON object has no "token" key.
    """
    credentials = {"username": USERNAME, "password": PASSWORD}
    reply = requests.post(
        "https://registre-national-entreprises.inpi.fr/api/sso/login",
        json=credentials,
        timeout=10,
    )
    reply.raise_for_status()

    bearer = reply.json()["token"]
    auth_headers = {"Authorization": f"Bearer {bearer}"}

    print("üîê Nouveau token INPI obtenu")
    return auth_headers


In [None]:
# Log in again and rebind the notebook-level HEADERS that the request cells
# pass to requests.get(..., headers=HEADERS).
HEADERS = login_inpi()


---

#### Boucle de récupération automatique token et liste

In [None]:
# Resume support: start from the previously saved results when the file
# exists, otherwise from an empty mapping.
if not os.path.exists("siren_attachments.json"):
    results = {}
else:
    with open("siren_attachments.json", "r") as saved_file:
        results = json.load(saved_file)

In [None]:
# --- Parameters ---
BATCH_SIZE = 100              # number of SIREN per batch (results are saved after each batch)
PAUSE_BETWEEN_REQUESTS = 5.0  # seconds slept after each SIREN
PAUSE_BETWEEN_BATCHES = 60    # seconds slept after each batch
MAX_RETRIES = 3               # attempts per SIREN before giving up
TIMEOUT = 10                  # per-request timeout, in seconds
BACKOFF_BASE = 10             # wait after a first 429, in seconds; doubles with each attempt

# `results` is intentionally NOT reset here: it comes from the previous cell,
# which reloads siren_attachments.json when the file exists. Resetting it to {}
# made every SIREN "remaining" again and overwrote the saved file with only the
# newly fetched entries, defeating the resume logic.
remaining_siren = [s for s in siren_list if s not in results]
print(f"‚ÑπÔ∏è {len(remaining_siren)} SIREN √† r√©cup√©rer")

for i in range(0, len(remaining_siren), BATCH_SIZE):
    batch = remaining_siren[i:i + BATCH_SIZE]
    print(f"\nüîπ Batch {i//BATCH_SIZE + 1} / {(len(remaining_siren)-1)//BATCH_SIZE + 1}")

    for siren in tqdm(batch, desc="R√©cup√©ration SIREN"):
        success = False
        attempts = 0

        while not success and attempts < MAX_RETRIES:
            try:
                url = f"https://registre-national-entreprises.inpi.fr/api/companies/{siren}/attachments"
                response = requests.get(url, headers=HEADERS, timeout=TIMEOUT)
                response.raise_for_status()

                attachments = response.json()

                # Keep ONLY the useful keys of the reply (may end up empty).
                results[siren] = {
                    k: v for k, v in attachments.items()
                    if k in ("bilans", "bilansSaisis")
                }
                success = True

            except requests.exceptions.HTTPError as e:
                code = e.response.status_code
                attempts += 1

                if code == 429:
                    # Rate limited: exponential backoff (BACKOFF_BASE, x2, x4, ...).
                    wait = BACKOFF_BASE * (2 ** (attempts - 1))
                    print(f"‚ö†Ô∏è 429 pour {siren}, pause {wait}s")
                    time.sleep(wait)

                elif code == 401:
                    # Unauthorized: log in again, then retry with the new headers.
                    print(f"üîê 401 pour {siren} ‚Üí refresh token")
                    HEADERS = login_inpi()
                    time.sleep(2)

                elif code == 404:
                    # Not found: record an empty entry and stop retrying.
                    results[siren] = {}
                    success = True

                else:
                    print(f"‚ùå {siren} erreur HTTP {code}")
                    time.sleep(5)

            except Exception as e:
                # Any other failure (timeout, connection, JSON decoding, ...) uses up one attempt.
                attempts += 1
                print(f"‚ùå {siren} erreur {e}")
                time.sleep(5)

        if not success:
            # All attempts failed: store an empty entry so the SIREN is not left undefined.
            results[siren] = {}

        time.sleep(PAUSE_BETWEEN_REQUESTS)

    # Intermediate save after each batch.
    with open("siren_attachments.json", "w") as f:
        json.dump(results, f, indent=2)

    print(f"‚úÖ Batch termin√© ‚Äî pause {PAUSE_BETWEEN_BATCHES}s")
    time.sleep(PAUSE_BETWEEN_BATCHES)

# Final save (also rewrites the file when there was nothing left to fetch).
with open("siren_attachments.json", "w") as f:
    json.dump(results, f, indent=2)

print("üéâ R√©cup√©ration termin√©e")

In [None]:
# Ad-hoc check of a single SIREN: display the raw HTTP status and body
# returned by the attachments endpoint.
url = "https://registre-national-entreprises.inpi.fr/api/companies/799255351/attachments"
# Explicit timeout, like the other request cells, so a stalled connection
# cannot hang the kernel indefinitely.
response = requests.get(url, headers=HEADERS, timeout=10)

print(response.status_code)
print(response.text)

---

#### Vérification des 429 potentiels

In [None]:
# --- 1. Load the saved results ---
with open("siren_attachments.json", "r") as f:
    results = json.load(f)

# --- 2. Build an audit DataFrame: one row per SIREN with its number of "bilans" ---
# The columns are declared explicitly so that an empty results file still
# yields a frame with the expected columns (no KeyError in step 3), and
# `or []` tolerates a null value stored under "bilans".
# NOTE(review): entries under "bilansSaisis" are not counted here — confirm
# that this is the intended definition of an "empty" result.
audit = pd.DataFrame(
    [
        {"siren": siren, "nb_bilans": len(data.get("bilans") or [])}
        for siren, data in results.items()
    ],
    columns=["siren", "nb_bilans"],
)

# --- 3. Flag the SIREN with 0 bilans, which were therefore probably hit by a 429 ---
audit["probable_429"] = audit["nb_bilans"] == 0

# --- 4. Extract the list of SIREN to retry ---
siren_a_relancer = audit.loc[audit["probable_429"], "siren"].tolist()

print(f"Nombre de SIREN touch√©s par 429 ou r√©sultats vides : {len(siren_a_relancer)}")
print("Exemple des 10 premiers SIREN √† relancer :")
print(siren_a_relancer[:10])

---

#### Alimentation des infos

In [None]:
# # R√©cup√©rer toutes les pages du bilan
# pages = bilan_saisi["bilan"]["detail"]["pages"]

# # Parcourir toutes les pages et toutes les liasses pour afficher les montants
# for page in pages:
#     print(f"--- Page {page['numero']} ---")
#     for liasse in page["liasses"]:
#         code = liasse["code"]
#         m1 = liasse.get("m1")
#         m2 = liasse.get("m2")
#         m3 = liasse.get("m3")
#         m4 = liasse.get("m4")
#         print(code, m1, m2, m3, m4)

In [None]:
# # Remplacer les cha√Ænes vides par 0 et convertir en int
# for col in ["m1", "m2", "m3", "m4"]:
#     df[col] = df[col].replace("", "0").astype(int)

# print(df.head(20))

In [None]:
# print("Codes disponibles dans ce bilan :", df["code"].unique())


In [None]:
# # Total actif exercice N
# total_actif = df[df["code"] == "AT"]["m3"].iloc[0]
# print("Total actif exercice N :", total_actif)

# # Capitaux propres exercice N
# if not df[df["code"] == "CS"].empty:
#     capitaux_propres = df[df["code"] == "CS"]["m3"].iloc[0]
#     print("Capitaux propres exercice N :", capitaux_propres)
# else:
#     print("Code CS introuvable dans ce bilan")

---

#### Connexion BDD

In [None]:
# # Charger .env
# load_dotenv()

# # R√©cup√©rer l'URL depuis les variables d'environnement
# DATABASE_URL = os.getenv("NEON_DATABASE_URL")
# if not DATABASE_URL:
#     raise ValueError("La variable NEON_DATABASE_URL n'est pas d√©finie dans .env")

# # Connexion √† la BDD
# conn = psycopg2.connect(DATABASE_URL)
# cur = conn.cursor()
# print("‚úÖ Connexion √† N√©on r√©ussie !")

#### Création Tables BDD

In [None]:
# # Soci√©t√©
# cur.execute("""
# CREATE TABLE IF NOT EXISTS societes (
#     siren VARCHAR PRIMARY KEY,
#     denomination VARCHAR,
#     adresse VARCHAR,
#     date_cloture DATE,
#     type_bilan VARCHAR,
#     num_depot VARCHAR,
#     code_activite VARCHAR
# )
# """)

# # Bilan
# cur.execute("""
# CREATE TABLE IF NOT EXISTS lignes_bilan (
#     id SERIAL PRIMARY KEY,
#     siren VARCHAR REFERENCES societes(siren),
#     page INT,
#     code VARCHAR,
#     libelle VARCHAR,
#     m1 BIGINT,
#     m2 BIGINT,
#     m3 BIGINT,
#     m4 BIGINT
# )
# """)

# conn.commit()

#### Alimentation BDD

In [None]:
# # soci√©t√©
# cur.execute("""
# INSERT INTO societes (siren, denomination, adresse, date_cloture, type_bilan, num_depot, code_activite)
# VALUES (%s, %s, %s, %s, %s, %s, %s)
# ON CONFLICT (siren) DO UPDATE
# SET denomination = EXCLUDED.denomination,
#     adresse = EXCLUDED.adresse,
#     date_cloture = EXCLUDED.date_cloture,
#     type_bilan = EXCLUDED.type_bilan
# """, (
#     identite["siren"],
#     identite["denomination"],
#     identite["adresse"],
#     identite["dateClotureExercice"],
#     identite["codeTypeBilan"],
#     identite.get("numDepot"),
#     identite.get("codeActivite")
# ))

# conn.commit()

In [None]:
# pages = bilan_saisi["bilan"]["detail"]["pages"]

# # Exemple de mapping codes ‚Üí libell√©s
# code_map = {
#     "AJ": "Immobilisations incorporelles",
#     "A1": "Terrains",
#     "AT": "Total Actif",
#     "CS": "Capitaux propres",
#     "CO": "R√©sultat net",
#     "BJ": "Stocks",
#     "BT": "Avances et acomptes",
#     "BX": "Cr√©ances clients",
#     "BZ": "Autres cr√©ances",
#     "CJ": "Disponibilit√©s",
#     # ajouter le reste si n√©cessaire
# }

# for page in pages:
#     page_num = page["numero"]
#     for liasse in page["liasses"]:
#         code = liasse["code"]
#         libelle = code_map.get(code, "")
#         m1 = int(liasse.get("m1") or 0)
#         m2 = int(liasse.get("m2") or 0)
#         m3 = int(liasse.get("m3") or 0)
#         m4 = int(liasse.get("m4") or 0)

#         cur.execute("""
#         INSERT INTO lignes_bilan (siren, page, code, libelle, m1, m2, m3, m4)
#         VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
#         """, (
#             identite["siren"],
#             page_num,
#             code,
#             libelle,
#             m1, m2, m3, m4
#         ))

# conn.commit()

#### Vérification insertions

In [None]:
# cur.execute("SELECT * FROM lignes_bilan WHERE siren = %s LIMIT 10", (identite["siren"],))
# rows = cur.fetchall()
# for row in rows:
#     print(row)