In [None]:
import duckdb


def transform_csv(con, data_dir="C:/Projet/Raw_data", year=2024):
    """Convert the twelve monthly gzipped CSV files of ``year`` to Parquet.

    For each month ``MM`` in 01..12 this runs a DuckDB ``COPY ... TO`` that
    reads ``<data_dir>/A<year>MM.csv.gz`` with ``read_csv_auto`` and writes
    ``<data_dir>/A<year>MM.parquet``.

    Args:
        con: Open connection exposing ``execute(sql)`` (a DuckDB connection
            in this notebook).
        data_dir: Directory holding the ``.csv.gz`` inputs; the Parquet
            outputs are written next to them. Defaults to the location that
            was previously hard-coded.
        year: Year used in the ``A<year>MM`` file stem. Defaults to 2024.

    Returns:
        None.

    Raises:
        Whatever ``con.execute`` raises (e.g. when an input file is missing);
        errors are not caught here, so the conversion stops at the first
        failing month.
    """
    # Kept from the original implementation: the ``raw`` schema is created
    # here even though this function only writes files.
    con.execute("CREATE SCHEMA IF NOT EXISTS raw;")

    # The directory is embedded in SQL string literals, so a single quote in
    # the path must be doubled to keep the statement well-formed.
    sql_dir = str(data_dir).replace("'", "''")

    for month in range(1, 13):
        stem = f"{sql_dir}/A{year}{month:02d}"
        con.execute(f"""
            COPY (
                SELECT * FROM read_csv_auto('{stem}.csv.gz')
            ) TO '{stem}.parquet' (FORMAT PARQUET);
        """)


def load_parquet(con):
    """Load the twelve 2024 monthly Parquet files into tables of the ``raw`` schema.

    The ``raw`` schema is created if needed, then for every month ``MM`` in
    01..12 a ``CREATE TABLE IF NOT EXISTS raw.A2024MM AS SELECT * FROM
    read_parquet(...)`` statement is executed against
    ``C:/Projet/Raw_data/A2024MM.parquet``.

    Loading is best-effort per month: an exception raised for one month is
    printed and the loop continues with the next one.

    Args:
        con: Open connection exposing ``execute(sql)`` (a DuckDB connection
            in this notebook).

    Returns:
        None. One status line per month is written to stdout.
    """
    con.execute("CREATE SCHEMA IF NOT EXISTS raw;")

    monthly_names = [f"A2024{month:02d}" for month in range(1, 13)]
    for name in monthly_names:
        source_file = f"{name}.parquet"
        try:
            # Because of IF NOT EXISTS, an already existing table is left as
            # is and still reported as loaded.
            ctas = f"""
                CREATE TABLE IF NOT EXISTS raw.{name} AS
                SELECT * FROM read_parquet('C:/Projet/Raw_data/{source_file}');
            """
            con.execute(ctas)
            print(f"[✔] Successfully loaded {source_file}")
        except Exception as err:
            print(f"[✘] Failed to load {source_file}: {err}")


def stage(con):
    """Build ``stg.transactions`` from the monthly ``raw.A2024MM`` tables.

    Steps:
      1. Ensure the ``stg`` schema exists.
      2. ``CREATE OR REPLACE TABLE stg.transactions`` as the ``UNION ALL`` of
         ``raw.A202401`` .. ``raw.A202412``, selecting a fixed list of source
         columns under their staging aliases.
      3. Only if step 2 succeeded, drop the twelve raw tables (best-effort,
         one status line per table).

    A failure in step 2 is printed, not re-raised, and the function returns
    without dropping anything: the raw tables are the only input of the
    staging query, so they must survive a failed build to allow a retry.

    Args:
        con: Open connection exposing ``execute(sql)`` (a DuckDB connection
            in this notebook).

    Returns:
        None. Progress and errors are written to stdout.

    Raises:
        Whatever ``con.execute`` raises while creating the ``stg`` schema;
        that first statement is not guarded.
    """
    con.execute("CREATE SCHEMA IF NOT EXISTS stg;")

    # (source column, staging alias) pairs projected from every monthly table.
    column_aliases = (
        ("FLT_DEP_MNT", "mnt_depassement"),
        ("FLT_PAI_MNT", "mnt_pay"),
        ("FLT_REM_MNT", "mnt_rem"),
        ("SOI_ANN", "ann_soin"),
        ("SOI_MOI", "mois_soin"),
        ("AGE_BEN_SNDS", "age"),
        ("BEN_RES_REG", "region_ben"),
        ("BEN_SEX_COD", "sexe"),
        ("MTM_NAT", "ticket_mod"),
        ("ASU_NAT", "nat_assurance"),
        ("CPT_ENV_TYP", "type_envlp"),
        ("DRG_AFF_NAT", "nat_destinataire"),
        ("EXO_MTF", "motif_exo"),
        ("PRS_NAT", "nat_prestation"),
        ("PRS_PPU_SEC", "code_secteur"),
        ("PRS_REM_TAU", "taux_remb"),
        ("PRS_REM_TYP", "type_remb"),
    )
    columns = ",\n\t".join(f"{source} {alias}" for source, alias in column_aliases)

    raw_tables = [f"raw.A2024{month:02d}" for month in range(1, 13)]

    union_sql = "\nUNION ALL\n".join(
        f"""
            SELECT {columns}
            FROM {table}
        """
        for table in raw_tables
    )

    try:
        con.execute(f"""
            CREATE OR REPLACE TABLE stg.transactions AS
            {union_sql};
        """)
    except Exception as e:
        print(f"[✘] Failed to create stg.transactions: {e}")
        # Do not fall through to the cleanup below: dropping the raw tables
        # after a failed build would delete the data needed to retry.
        print("[!] Raw tables were kept because staging failed")
        return
    print("[✔] Successfully created stg.transactions")

    # Cleanup is best-effort per table: one failed drop does not stop the rest.
    for table in raw_tables:
        try:
            con.execute(f"DROP TABLE IF EXISTS {table};")
            print(f"[✔] Successfully deleted {table}")
        except Exception as e:
            print(f"[✘] Failed to delete {table}: {e}")


In [3]:
# Load step: open projet.db, apply the session settings one statement at a
# time (same order as before), then materialise the monthly Parquet files.
with duckdb.connect("projet.db") as con:
    for _setting in (
        "SET max_memory='8GB';",
        "SET threads=4;",
        "SET enable_progress_bar=1;",
        "SET temp_directory = 'C:/Projet/tmp/';",
        "SET disable_parquet_prefetching = TRUE;",
        "SET parquet_metadata_cache = TRUE;",
    ):
        con.execute(_setting)
    # The CSV -> Parquet conversion is left disabled; load_parquet reads the
    # .parquet files that transform_csv writes, so re-enable the next line
    # when those files have not been generated yet.
    # transform_csv(con)
    load_parquet(con)

[✘] Failed to load A202401.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202401.parquet"
[✘] Failed to load A202402.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202402.parquet"
[✘] Failed to load A202403.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202403.parquet"
[✘] Failed to load A202404.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202404.parquet"
[✘] Failed to load A202405.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202405.parquet"
[✘] Failed to load A202406.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202406.parquet"
[✘] Failed to load A202407.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202407.parquet"
[✘] Failed to load A202408.parquet: IO Error: No files found that match the pattern "C:/Projet/Raw_data/A202408.parquet"
[✔] Successfully loaded A202409.

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202410.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202411.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully loaded A202412.parquet


In [None]:
# Staging step: open projet.db, apply the session settings one statement at a
# time (same order as before), then build stg.transactions.
with duckdb.connect("projet.db") as con:
    for _setting in (
        "SET max_memory='8GB';",
        "SET threads=4;",
        "SET enable_progress_bar=1;",
        "SET temp_directory = 'C:/Projet/tmp/';",
        "SET disable_parquet_prefetching = TRUE;",
        "SET parquet_metadata_cache = TRUE;",
    ):
        con.execute(_setting)
    stage(con)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

[✔] Successfully created stg.transactions
[✔] Successfully deleted raw.A202401
[✔] Successfully deleted raw.A202402
[✔] Successfully deleted raw.A202403
[✔] Successfully deleted raw.A202404
[✔] Successfully deleted raw.A202405
[✔] Successfully deleted raw.A202406
[✔] Successfully deleted raw.A202407
[✔] Successfully deleted raw.A202408
[✔] Successfully deleted raw.A202409
[✔] Successfully deleted raw.A202410
[✔] Successfully deleted raw.A202411
[✔] Successfully deleted raw.A202412


In [3]:
# Open projet.db and close the connection again right away; no statement is
# executed in between. `con` stays bound to the (now closed) connection.
# NOTE(review): presumably a check that the file opens, or a way to release a
# lingering handle — confirm the intent.
con = duckdb.connect("projet.db")
con.close()