In [3]:
import duckdb

In [4]:
duckdb.sql("""

    SELECT count(*) 
    FROM '../db/VCF_annovar/entete_variant.parquet' AS ENTETE

""")

┌──────────────┐
│ count_star() │
│    int64     │
├──────────────┤
│       274999 │
└──────────────┘

In [5]:
duckdb.sql("""
    SELECT * 
    FROM '../db/VCF_annovar/entete_variant.parquet' AS ENTETE 
    --LEFT JOIN '../db/VCF_annovar/sample_variant.parquet' AS SAMPLE ON ENTETE.HASH = SAMPLE.HASH
    LEFT JOIN '../db/VCF_annovar/info_variant.parquet' AS INFO ON ENTETE.HASH = INFO.HASH
    LEFT JOIN '../db/VCF_annovar/info_variant.parquet' AS INFO2 ON ENTETE.HASH = INFO2.HASH
    ORDER BY ENTETE.HASH
""")

┌──────────────────┬──────────────────────┬───────────┬─────────────┬─────────┬─────────┬─────────┬────────────┬──────────────────────┬──────────────────────┬──────────────────────┬─────────┬─────────┬─────────┬──────────────┬─────────┬───────────┬─────────┬─────────┬─────────┬─────────┬───────────┬─────────┬────────────────┬─────────┬──────────────┬─────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬──────────────────────┬─────────────┬─────────────┬─────────────────┬──────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬──────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬─────────────────

In [11]:
result = duckdb.sql("""
    SELECT HASH, SAMPLE, SAMPLE_GT, SAMPLE_AD, SAMPLE_DP, SAMPLE_GQ, SAMPLE_PL, SAMPLE_PGT, SAMPLE_PID, SAMPLE_PS FROM '../db/VCF_annovar/sample_variant.parquet' 
""")

print(result.columns)
print(result.fetchmany(5))


['HASH', 'SAMPLE', 'SAMPLE_GT', 'SAMPLE_AD', 'SAMPLE_DP', 'SAMPLE_GQ', 'SAMPLE_PL', 'SAMPLE_PGT', 'SAMPLE_PID', 'SAMPLE_PS']
[('03002d43bd273be064f9549c99dc5c38139bc5f56ec54832f6347de488ff3a81', '1BD4I', '0/1', '8,21', '29', '99', '610,0,207', None, None, None), ('f3fba87a45dc231d63adde9e0a28142a1ba6928b1694c68b8cce2707803a47b9', '1BD4I', '0/0', '32,0', '32', '93', '0,93,1395', None, None, None), ('fa35f50e416fc9592961fdd2dc6c43bc519a6e90bf32577ef367425d49351c19', '1BD4I', '0/1', '21,11', '32', '99', '395,0,1238', '.', '.', '.'), ('b82c05d3267d14e48267af1dc98ade791f686bec3ec0475458f2d63ac99cd14a', '1BD4I', '0/0', '34,0', '34', '48', '0,48,1076', '.', '.', '.'), ('5f91798762573292c8d9c250ae289fdc1e06f49ebdb1951d4be047632ff38859', '1BD4I', '0/0', '33,0', '33', '99', '0,99,1236', None, None, None)]


In [7]:
##################################################
# Version généralisée avec DuckDB (nombre variable d'échantillons)
##################################################
def generalized_pivot_with_duckdb(file):
    # Obtenir les échantillons uniques
    samples = []
    for sample in duckdb.sql(f"SELECT DISTINCT SAMPLE FROM '{file}'").fetchall():
        samples.append(sample[0])

    # Créer les parties de la requête SQL dynamiquement
    select_clauses = ["HASH"]
    excluded_columns = ["FORMAT", "VALEUR"]
    pivot_column = ["SAMPLE",]

    columns_to_pivot = []
    for column in duckdb.sql(f"SELECT * FROM '{file}' LIMIT 1").columns:
        if column not in select_clauses + pivot_column + excluded_columns:
            columns_to_pivot.append(column)

    print(columns_to_pivot)
    
    for sample in samples:
        for col in columns_to_pivot:
            col_name = f"SAMPLE_{sample}_{col.replace('SAMPLE_', '')}"
            select_clauses.append(f"MAX(CASE WHEN SAMPLE = '{sample}' THEN {col} END) AS {col_name}")
    
    print(select_clauses)
    # Construire la requête complète
    query = f"""
    SELECT
        {','.join(select_clauses)}
    FROM '{file}'
    GROUP BY HASH
    """
    duckdb.sql(
        f"COPY ({query}) TO '{'../db/sample_pivot_variant.parquet'}' (FORMAT PARQUET)"
    )
    return duckdb.sql(query)

In [8]:
generalized_pivot_with_duckdb('../db/VCF_annovar/sample_variant.parquet')

['SAMPLE_GT', 'SAMPLE_AD', 'SAMPLE_DP', 'SAMPLE_GQ', 'SAMPLE_PL', 'SAMPLE_PGT', 'SAMPLE_PID', 'SAMPLE_PS']
['HASH', "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_GT END) AS SAMPLE_1BD4P_GT", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_AD END) AS SAMPLE_1BD4P_AD", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_DP END) AS SAMPLE_1BD4P_DP", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_GQ END) AS SAMPLE_1BD4P_GQ", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_PL END) AS SAMPLE_1BD4P_PL", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_PGT END) AS SAMPLE_1BD4P_PGT", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_PID END) AS SAMPLE_1BD4P_PID", "MAX(CASE WHEN SAMPLE = '1BD4P' THEN SAMPLE_PS END) AS SAMPLE_1BD4P_PS", "MAX(CASE WHEN SAMPLE = '1BD4I' THEN SAMPLE_GT END) AS SAMPLE_1BD4I_GT", "MAX(CASE WHEN SAMPLE = '1BD4I' THEN SAMPLE_AD END) AS SAMPLE_1BD4I_AD", "MAX(CASE WHEN SAMPLE = '1BD4I' THEN SAMPLE_DP END) AS SAMPLE_1BD4I_DP", "MAX(CASE WHEN SAMPLE = '1BD4I' THEN SAMPLE_GQ END) AS SAMPLE_1BD4I_GQ", "MAX

┌──────────────────────────────────────────────────────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬──────────────────┬──────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬──────────────────┬──────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬─────────────────┬──────────────────┬──────────────────┬─────────────────┐
│                               HASH                               │ SAMPLE_1BD4P_GT │ SAMPLE_1BD4P_AD │ SAMPLE_1BD4P_DP │ SAMPLE_1BD4P_GQ │ SAMPLE_1BD4P_PL │ SAMPLE_1BD4P_PGT │ SAMPLE_1BD4P_PID │ SAMPLE_1BD4P_PS │ SAMPLE_1BD4I_GT │ SAMPLE_1BD4I_AD │ SAMPLE_1BD4I_DP │ SAMPLE_1BD4I_GQ │ SAMPLE_1BD4I_PL │ SAMPLE_1BD4I_PGT │ SAMPLE_1BD4I_PID │ SAMPLE_1BD4I_PS │ SAMPLE_1BD4M_GT │ SAMPLE_1BD4M_AD │ SAMPLE_1BD4M_DP │ SAMPLE_1BD4M_GQ │ SAMPLE_1BD4M_PL │ SAMPLE_1BD4M_PGT │ SAMPLE_1BD4M_PID │ SAMP

In [None]:

    def pivot_parquet_sample(self, file, export_path):

        # Obtenir les samples uniques
        samples = []
        for sample in duckdb.sql(f"SELECT DISTINCT SAMPLE FROM '{file}'").fetchall():
            samples.append(sample[0])

        select_clauses = ["HASH"]
        excluded_columns = ["FORMAT", "VALEUR"]
        pivot_column = ["SAMPLE",]

        columns_to_pivot = []
        for column in duckdb.sql(f"SELECT * FROM '{file}' LIMIT 1").columns:
            if column not in select_clauses + pivot_column + excluded_columns:
                columns_to_pivot.append(column)

        # Créer les parties de la requête SQL dynamiquement
        for sample in samples:
            for col in columns_to_pivot:
                col_name = f"SAMPLE_{sample}_{col.replace('SAMPLE_', '')}"
                select_clauses.append(f"MAX(CASE WHEN SAMPLE = '{sample}' THEN {col} END) AS {col_name}")

        # Construire la requête complète
        query = f"""
        SELECT
            {','.join(select_clauses)}
        FROM '{file}'
        GROUP BY HASH
        """

        duckdb.sql(
            f"COPY ({query}) TO '{export_path}' (FORMAT PARQUET)"
        )

        self._reference_file(export_path)
