In [None]:
import pandas as pd

In [None]:
# Extract the distinct, non-empty buyer identifiers (SIRET) from the DECP data.
decp_acheteurs_df = pd.read_parquet("dist/decp.parquet", columns=["acheteur.id"])
# Filtering out the empty ids first and de-duplicating afterwards keeps the
# first occurrence of every non-empty id, with its original index label.
decp_acheteurs_df = decp_acheteurs_df[
    decp_acheteurs_df["acheteur.id"] != ""
].drop_duplicates()
# Displayed as the cell output: number of distinct buyers kept.
len(decp_acheteurs_df.index)

In [None]:
# Look up every buyer id in the establishments stock file. The CSV is read in
# chunks of one million rows; each chunk is inner-joined with the buyers and
# only the non-empty join results are kept.
merged_chunks_list = []

with pd.read_csv(
    "data/StockEtablissement_utf8_geo.csv.gz",
    usecols=[
        "siret",
        "siren",
        # "denominationUsuelleEtablissement" is left out: empty
        "enseigne1Etablissement",
    ],
    dtype="object",
    index_col=None,
    chunksize=1_000_000,
) as reader:
    for df_chunk in reader:
        merge = decp_acheteurs_df.merge(
            df_chunk,
            how="inner",
            left_on="acheteur.id",
            right_on="siret",
        )
        if len(merge.index) == 0:
            continue
        merged_chunks_list.append(merge)

# After the inner join "siret" always equals "acheteur.id", so it is dropped.
decp_acheteurs_df = pd.concat(merged_chunks_list).drop(columns=["siret"])

# Release the last chunk (the reader itself was never bound to its own name).
del df_chunk

In [None]:
# Fetch the SIREN (legal unit) data.
# The source URLs could be used in production instead of a local file.
merged_chunks_list = []

with pd.read_csv(
    "./data/StockUniteLegale_utf8.zip",
    usecols=[
        "siren",
        "denominationUniteLegale",
        # "sigleUniteLegale" is left out: too inconsistent, sometimes long
    ],
    dtype="object",
    index_col=None,
    sep=",",
    chunksize=1_000_000,
) as reader:
    for df_chunk in reader:
        merge = decp_acheteurs_df.merge(df_chunk, how="inner", on="siren")
        # Skip a join result that is empty or holds no non-null value at all.
        if merge.empty or not merge.notnull().any().any():
            continue
        merged_chunks_list.append(merge)

# Release the last chunk before building the combined frame.
del df_chunk

decp_acheteurs_df = pd.concat(merged_chunks_list)

del merged_chunks_list

In [None]:
# Construction du champ acheteur.nom

# NumPy 2.0 removed the `numpy.NaN` alias, so `from numpy import NaN` fails
# there. Importing `nan` under the old name works on every NumPy version and
# keeps `NaN` available to any other cell that may rely on it.
from numpy import nan as NaN


def construct_nom(row):
    """Build the display name of a buyer from its Sirene fields.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by the keys "denominationUniteLegale" and
        "enseigne1Etablissement" (a DataFrame row, a dict, ...).

    Returns
    -------
    "<denomination> - <enseigne>" when both values are present, the one that
    is present when the other is missing, and the (missing) denomination
    unchanged when both are missing.
    """
    denomination = row["denominationUniteLegale"]
    enseigne = row["enseigne1Etablissement"]
    # pd.isna recognises every missing-value marker (None, any float NaN,
    # pd.NA), whereas an identity test only matches one specific NaN object.
    if pd.isna(enseigne):
        return denomination
    if pd.isna(denomination):
        # Avoids producing a name such as "nan - <enseigne>".
        return enseigne
    return f"{denomination} - {enseigne}"


# Call construct_nom once per row and store the result as "acheteur.nom".
decp_acheteurs_df["acheteur.nom"] = decp_acheteurs_df.apply(
    construct_nom, axis="columns"
)

In [None]:
# Replace the "acheteur.nom" column of the DECP dataset with the one held in
# decp_acheteurs_df. The left join keeps every DECP row; the name is left
# missing when the buyer id has no match.
final_columns = ["acheteur.id", "acheteur.nom"]

decp_df = (
    pd.read_parquet("dist/decp.parquet")
    .drop(columns=["acheteur.nom"])
    .merge(decp_acheteurs_df[final_columns], how="left", on="acheteur.id")
)

In [None]:
# Save the result as CSV and as Parquet.
# NOTE(review): the input is read from dist/ while the output goes to data/ —
# confirm this is intended.
# The CSV is written without the index column.
decp_df.to_csv("data/decp.csv", index=False)
# to_parquet is called with its default index handling (index=None).
decp_df.to_parquet("data/decp.parquet")