In [2]:
import duckdb
import textwrap


def set_attr(con):
    """Apply the DuckDB session settings used by this notebook.

    Every setting is sent as its own ``SET`` statement, in a fixed order:
    memory limit, thread count, progress bar, temp directory, and the two
    Parquet-related options.

    Args:
        con: Object exposing ``execute(sql)``; in this notebook, a DuckDB
            connection.
    """
    settings = (
        "SET max_memory='8GB';",
        "SET threads=4;",
        "SET enable_progress_bar=1;",
        "SET temp_directory = 'C:/Projet/tmp/';",
        "SET disable_parquet_prefetching = TRUE;",
        "SET parquet_metadata_cache = TRUE;",
    )
    for statement in settings:
        con.execute(statement)


In [None]:
def transform_csv(con):
    """Convert the twelve monthly ``A2024MM.csv.gz`` files to Parquet.

    Ensures the ``raw`` schema exists, then for each month 01..12 runs a
    ``COPY (SELECT * FROM read_csv_auto(...)) TO ... (FORMAT PARQUET)``
    statement that writes ``A2024MM.parquet`` next to the source file.
    Errors raised by ``con.execute`` are not caught here.

    Args:
        con: Object exposing ``execute(sql)``; in this notebook, a DuckDB
            connection.
    """
    con.execute("CREATE SCHEMA IF NOT EXISTS raw;")

    # Source and destination share the same stem and differ only by extension.
    month_stems = [
        f"C:/DUCK/Projet_Data/Raw_data/A2024{month:02d}" for month in range(1, 13)
    ]
    for stem in month_stems:
        copy_sql = f"""
            COPY (
                SELECT * FROM read_csv_auto('{stem}.csv.gz')
            ) TO '{stem}.parquet' (FORMAT PARQUET);
        """
        con.execute(copy_sql)


def load_parquet(con):
    """Load each monthly Parquet file into its own ``raw.A2024MM`` table.

    Ensures the ``raw`` schema exists, then for months 01..12 issues a
    ``CREATE TABLE IF NOT EXISTS ... AS SELECT * FROM read_parquet(...)``.
    Loading is best-effort per month: an exception for one file is reported
    on stdout and the loop moves on to the next month.

    Args:
        con: Object exposing ``execute(sql)``; in this notebook, a DuckDB
            connection.
    """
    con.execute("CREATE SCHEMA IF NOT EXISTS raw;")
    for name in (f"A2024{month:02d}" for month in range(1, 13)):
        try:
            con.execute(f"""
                CREATE TABLE IF NOT EXISTS raw.{name} AS
                SELECT * FROM read_parquet('C:/DUCK/Projet_Data/Raw_data/{name}.parquet');
            """)
            print(f"[✔] Successfully loaded {name}.parquet")
        except Exception as err:
            # Keep going so that one bad month does not block the others.
            print(f"[✘] Failed to load {name}.parquet: {err}")


# Load stage: open projet.db, apply the session settings, then load the
# monthly Parquet files into the raw schema.
with duckdb.connect("projet.db") as con:
    set_attr(con)
    # The CSV -> Parquet conversion is left disabled here
    # (presumably it was already run once -- confirm before re-enabling).
    # transform_csv(con)
    load_parquet(con)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202401.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202402.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202403.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202404.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202405.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202406.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202407.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202408.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202409.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202410.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202411.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202412.parquet


In [55]:
def stage(column_map, tables):
    """Build, print and return the staging query over several tables.

    One ``SELECT ... FROM <table> GROUP BY ALL`` is produced per table, each
    projecting every ``expression AS alias`` pair of ``column_map``; the
    per-table queries are combined with ``UNION ALL`` and terminated by ``;``.

    Args:
        column_map: Mapping of SQL expression -> output alias, emitted in
            iteration order.
        tables: Iterable of table names to select from.

    Returns:
        str: The full SQL text (also echoed to stdout).
    """
    select_lines = []
    for expression, alias in column_map.items():
        select_lines.append("    " + f"{expression} AS {alias}")
    projection = ",\n".join(select_lines)

    queries = []
    for table in tables:
        query_lines = ("SELECT", projection, f"FROM {table}", "GROUP BY ALL")
        queries.append("\n".join(query_lines))

    sql = "\nUNION ALL\n".join(queries) + ";"
    print(sql)
    return sql


# Source expression -> staging column name.
# Named `staging_columns` rather than `dict` so the builtin `dict` type stays
# usable in the rest of the kernel session.
# stage() appends GROUP BY ALL, so the SUM(...) entries are aggregated over
# the remaining (non-aggregate) columns.
staging_columns = {
    "SUM(FLT_DEP_MNT)": "mnt_depassement",
    "SUM(FLT_PAI_MNT)": "mnt_pay",
    "SUM(FLT_REM_MNT)": "mnt_rem",
    "SOI_ANN": "ann_soin",
    "SOI_MOI": "mois_soin",
    "AGE_BEN_SNDS": "age",
    "BEN_RES_REG": "region_ben",
    "BEN_SEX_COD": "sexe",
    "ASU_NAT": "nat_assurance",
    "CPT_ENV_TYP": "type_envlp",
    "DRG_AFF_NAT": "nat_destinataire",
    "PRS_FJH_TYP": "forfait_journalier",
    "PRS_PPU_SEC": "code_secteur",
    "PRS_REM_TYP": "type_remb",
}

# One raw table per month of 2024 (raw.A202401 .. raw.A202412).
tables = [f"raw.A2024{i:02d}" for i in range(1, 13)]

# The file is only written, never read back, so plain "w" is enough; the
# explicit encoding keeps the generated script independent of the OS default.
with open("2. staging.sql", "w", encoding="utf-8") as f:
    f.write(
        "CREATE SCHEMA IF NOT EXISTS stg;\nCREATE OR REPLACE TABLE stg.transactions AS\n"
    )
    f.write(stage(staging_columns, tables))

SELECT
    SUM(FLT_DEP_MNT) AS mnt_depassement,
    SUM(FLT_PAI_MNT) AS mnt_pay,
    SUM(FLT_REM_MNT) AS mnt_rem,
    SOI_ANN AS ann_soin,
    SOI_MOI AS mois_soin,
    AGE_BEN_SNDS AS age,
    BEN_RES_REG AS region_ben,
    BEN_SEX_COD AS sexe,
    ASU_NAT AS nat_assurance,
    CPT_ENV_TYP AS type_envlp,
    DRG_AFF_NAT AS nat_destinataire,
    PRS_FJH_TYP AS forfait_journalier,
    PRS_PPU_SEC AS code_secteur,
    PRS_REM_TYP AS type_remb
FROM raw.A202401
GROUP BY ALL
UNION ALL
SELECT
    SUM(FLT_DEP_MNT) AS mnt_depassement,
    SUM(FLT_PAI_MNT) AS mnt_pay,
    SUM(FLT_REM_MNT) AS mnt_rem,
    SOI_ANN AS ann_soin,
    SOI_MOI AS mois_soin,
    AGE_BEN_SNDS AS age,
    BEN_RES_REG AS region_ben,
    BEN_SEX_COD AS sexe,
    ASU_NAT AS nat_assurance,
    CPT_ENV_TYP AS type_envlp,
    DRG_AFF_NAT AS nat_destinataire,
    PRS_FJH_TYP AS forfait_journalier,
    PRS_PPU_SEC AS code_secteur,
    PRS_REM_TYP AS type_remb
FROM raw.A202402
GROUP BY ALL
UNION ALL
SELECT
    SUM(FLT_DEP_MNT) AS

In [54]:
def _sql_literal(value):
    """Render a Python scalar as a SQL literal (strings quoted, quotes doubled)."""
    if isinstance(value, str):
        return "'" + value.replace("'", "''") + "'"
    return str(value)


def _unusable_predicate(column, value):
    """Return the SQL condition matching the unusable value(s) of ``column``."""
    if isinstance(value, (tuple, list)):
        if not value:
            # An empty IN () list is not valid SQL; nothing can match anyway.
            return "FALSE"
        literals = ", ".join(_sql_literal(item) for item in value)
        return f"{column} IN ({literals})"
    return f"{column} = {_sql_literal(value)}"


def exploratoire(dict, stg_table):
    """Build, print and return a per-column "unusable value" profiling query.

    For every column in the mapping, one SELECT counts the total rows of
    ``stg_table``, the rows whose column equals (or is among) the declared
    unusable value(s), and their ratio rounded to 6 decimals. The per-column
    SELECTs are combined with ``UNION ALL`` and ordered by ``column_name``.

    The predicate is rendered explicitly instead of relying on Python's
    ``repr``, so one-element tuples (``(5,)``), lists and bare strings now
    yield valid SQL. Output for ints and multi-element tuples is unchanged.

    Args:
        dict: Mapping of column name -> unusable value. A scalar produces
            ``column = value``; a tuple or list produces ``column IN (...)``.
            (The parameter name shadows the builtin inside this function; it
            is kept so existing callers are unaffected.)
        stg_table: Name of the table to profile.

    Returns:
        str: The SQL text, stripped of surrounding whitespace (also echoed to
        stdout).
    """
    queries = [
        "\n".join(
            (
                "SELECT",
                f"    '{column}' AS column_name,",
                "    COUNT(*) AS nb_total,",
                f"    COUNT(*) FILTER (WHERE {_unusable_predicate(column, value)}) AS nb_unutilisable,",
                "    ROUND(nb_unutilisable / nb_total, 6) AS pct_unutilisable",
                f"FROM {stg_table}",
            )
        )
        for column, value in dict.items()
    ]
    sql = ("\nUNION ALL\n".join(queries) + "\nORDER BY column_name;").strip()
    print(sql)
    return sql


# Staging column -> value(s) treated as unusable in the profiling query.
# Named `unusable_values` rather than `dict` so the builtin `dict` type stays
# usable in the rest of the kernel session. Single codes are plain ints
# (the former `(99)` spelling was an int too, not a tuple); several codes are
# given as a tuple.
unusable_values = {
    # NOTE(review): mixes an int and a string code -- confirm against the
    # actual type of ann_soin.
    "ann_soin": (0, "0001"),
    "mois_soin": 0,
    "age": 99,
    "region_ben": 99,
    "sexe": 99,
    "nat_assurance": (0, 99),
    "type_envlp": (9, 98),
    "nat_destinataire": (0, 99),
    "code_secteur": 9,
    "type_remb": 99,
    "forfait_journalier": (8, 9),
}
# The file is only written, never read back, so plain "w" is enough; the
# explicit encoding keeps the generated script independent of the OS default.
with open("2. exploratoire.sql", "w", encoding="utf-8") as f:
    f.write(exploratoire(unusable_values, "stg.transactions"))

SELECT
    'ann_soin' AS column_name,
    COUNT(*) AS nb_total,
    COUNT(*) FILTER (WHERE ann_soin IN (0, '0001')) AS nb_unutilisable,
    ROUND(nb_unutilisable / nb_total, 6) AS pct_unutilisable
FROM stg.transactions
UNION ALL
SELECT
    'mois_soin' AS column_name,
    COUNT(*) AS nb_total,
    COUNT(*) FILTER (WHERE mois_soin = 0) AS nb_unutilisable,
    ROUND(nb_unutilisable / nb_total, 6) AS pct_unutilisable
FROM stg.transactions
UNION ALL
SELECT
    'age' AS column_name,
    COUNT(*) AS nb_total,
    COUNT(*) FILTER (WHERE age = 99) AS nb_unutilisable,
    ROUND(nb_unutilisable / nb_total, 6) AS pct_unutilisable
FROM stg.transactions
UNION ALL
SELECT
    'region_ben' AS column_name,
    COUNT(*) AS nb_total,
    COUNT(*) FILTER (WHERE region_ben = 99) AS nb_unutilisable,
    ROUND(nb_unutilisable / nb_total, 6) AS pct_unutilisable
FROM stg.transactions
UNION ALL
SELECT
    'sexe' AS column_name,
    COUNT(*) AS nb_total,
    COUNT(*) FILTER (WHERE sexe = 99) AS nb_unutilisable,
 

In [None]:
def transform(con):
    """Run the ``type_donnee.sql`` script on ``con`` and report the outcome.

    The script is read from the current working directory and sent to
    ``con.execute`` in a single call. Execution errors are caught and
    printed rather than raised; failures to open or read the file are not
    caught.

    Args:
        con: Object exposing ``execute(sql)``; in this notebook, a DuckDB
            connection.
    """
    target = "stg.transactions_libelles"
    with open("type_donnee.sql", "r") as handle:
        script = handle.read()
        try:
            con.execute(script)
            print(f"[✔] Successfully created {target}")
        except Exception as err:
            print(f"[✘] Failed to create {target}: {err}")


# Transform stage: open projet.db, apply the session settings, then run the
# type_donnee.sql script through transform().
with duckdb.connect("projet.db") as con:
    set_attr(con)
    transform(con)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully created stg.transactions_libelles


In [None]:
def dwh(con):
    """Run the ``3. dwh.sql`` script on ``con`` and report the outcome.

    The script is read from the current working directory and sent to
    ``con.execute`` in a single call. Execution errors are caught and
    printed rather than raised; failures to open or read the file are not
    caught.

    Args:
        con: Object exposing ``execute(sql)``; in this notebook, a DuckDB
            connection.
    """
    with open("3. dwh.sql", "r") as handle:
        script = handle.read()
        try:
            con.execute(script)
            print("[✔] Successfully created dwh")
        except Exception as err:
            print(f"[✘] Failed to create dwh: {err}")


# Warehouse stage: open projet.db, apply the session settings, then run the
# "3. dwh.sql" script through dwh().
with duckdb.connect("projet.db") as con:
    set_attr(con)
    dwh(con)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [3]:
# Open a connection to projet.db and close it straight away.
# NOTE(review): presumably used to release a lingering handle/lock on the
# database file after an interrupted cell -- confirm, or delete this cell.
con = duckdb.connect("projet.db")
con.close()