### Traitement du fichier clinvar

In [3]:
import pandas as pd
import gzip

vcf_file = "../../data/public_db/clinvar/clinvar_20250915.vcf.gz"

### Chargement du fichier VCF et extraction du header

In [4]:
def get_vcf_names(vcf_path):
    with gzip.open(vcf_path, "rt") as ifile:
        for line in ifile:
            if line.startswith("#CHROM"):
                vcf_names = line.strip().lstrip("#").split("\t")
                break
    return vcf_names

names = get_vcf_names(vcf_file)

### Lire le VCF avec pandas

In [5]:
vcf_chunks = pd.read_csv(
    vcf_file,
    comment='#',
    chunksize=10000,
    delim_whitespace=True,
    header=None,
    names=names,
    dtype=str
)

  vcf_chunks = pd.read_csv(


### Parser le champ INFO

In [6]:
import re

def parse_info(info_str):
    """
    Transforme le champ INFO d'un VCF en dictionnaire clé=valeur.
    Ne suppose pas de clés fixes, récupère tout.
    """
    info_dict = {}
    if pd.isna(info_str):
        return info_dict
    items = info_str.split(';')
    for item in items:
        if '=' in item:
            key, value = item.split('=', 1)
            info_dict[key] = value
    return info_dict


### Chargement des gènes mitochondriaux

In [7]:
import json

json_path = "../../data/mitocarta/mitocarta_pivot_full.json"
with open(json_path, "r", encoding="utf8") as f:
    mitocarta_pivot = json.load(f)

# Récupérer tous les HumanGeneID
mito_hgnc_ids = set(mitocarta_pivot.keys())
print("Nombre de gènes mitochondriaux :", len(mito_hgnc_ids))
print(mito_hgnc_ids)

Nombre de gènes mitochondriaux : 1136
{'26520', '4723', '9463', '64902', '84680', '10650', '27165', '374882', '113115', '57505', '58510', '47', '65003', '5164', '26589', '1757', '63931', '160428', '84340', '1622', '7417', '92399', '5106', '9517', '84902', '441024', '55066', '384', '115817', '60528', '10632', '1375', '874', '1727', '167681', '26027', '80273', '1337', '212', '91942', '64978', '79896', '80324', '292', '4697', '29078', '100130890', '11022', '399512', '51250', '84269', '64757', '1629', '223', '440574', '10249', '84896', '29090', '4536', '89874', '130916', '6182', '84705', '10965', '3338', '219402', '51205', '57546', '22907', '11162', '8846', '91137', '84908', '64756', '57678', '56945', '4528', '4129', '26355', '10166', '9588', '92106', '51023', '2936', '25828', '128308', '80724', '7381', '4128', '81034', '221154', '211', '2820', '54704', '841', '1329', '374986', '51566', '4967', '51067', '51095', '283130', '84331', '92170', '3032', '836', '6742', '54677', '50865', '204', '5

### Extraire les champs clés et filtrer mitochondriaux

In [8]:
first_chunk = next(vcf_chunks)
print("Colonnes du chunk :", first_chunk.columns.tolist())
print("Aperçu des premières lignes :")
print(first_chunk.head())

Colonnes du chunk : ['CHROM', 'POS', 'ID', 'REF', 'ALT', 'QUAL', 'FILTER', 'INFO']
Aperçu des premières lignes :
  CHROM    POS       ID REF ALT QUAL FILTER  \
0     1  66926  3385321  AG   A    .      .   
1     1  69134  2205837   A   G    .      .   
2     1  69308  3925305   A   G    .      .   
3     1  69314  3205580   T   G    .      .   
4     1  69404  3925306   T   C    .      .   

                                                INFO  
0  ALLELEID=3544463;CLNDISDB=Human_Phenotype_Onto...  
1  ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLND...  
2  ALLELEID=4039319;CLNDISDB=MedGen:CN169374;CLND...  
3  ALLELEID=3374047;CLNDISDB=MedGen:CN169374;CLND...  
4  ALLELEID=4039320;CLNDISDB=MedGen:CN169374;CLND...  


In [9]:
print("Exemples de champ INFO :")
print(first_chunk['INFO'].head(10).tolist())

info_str = first_chunk["INFO"].iloc[0]

items = info_str.split(';')
print("Items séparés :", items)


info_dict = {}
for item in items:
    if '=' in item:
        key, value = item.split('=', 1)
        info_dict[key] = value

print("Dictionnaire INFO :", info_dict)


Exemples de champ INFO :
['ALLELEID=3544463;CLNDISDB=Human_Phenotype_Ontology:HP:0000547,MONDO:MONDO:0019200,MeSH:D012174,MedGen:C0035334,OMIM:268000,OMIM:PS268000,Orphanet:791;CLNDN=Retinitis_pigmentosa;CLNHGVS=NC_000001.11:g.66927del;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGSCV=SCV005419006;CLNVC=Deletion;CLNVCSO=SO:0000159;GENEINFO=OR4F5:79501;MC=SO:0001627|intron_variant;ORIGIN=0', 'ALLELEID=2193183;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69134A>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Likely_benign;CLNSIGSCV=SCV003526545;CLNVC=single_nucleotide_variant;CLNVCSO=SO:0001483;CLNVI=ClinGen:CA502008;GENEINFO=OR4F5:79501;MC=SO:0001583|missense_variant;ORIGIN=1;RS=781394307', 'ALLELEID=4039319;CLNDISDB=MedGen:CN169374;CLNDN=not_specified;CLNHGVS=NC_000001.11:g.69308A>G;CLNREVSTAT=criteria_provided,_single_submitter;CLNSIG=Uncertain_significance;CLNSIGSCV=SCV006120601;CLNVC=single_nucleotide_variant;CLNVCS

### Decoupe et filtrage des gènes mitochondriaux

Etape très lente

In [10]:
clinvar_mito_records = {}

for chunk in vcf_chunks:  # vcf_chunks : iterator pandas avec chunksize
    # parser INFO
    chunk['info_dict'] = chunk['INFO'].apply(parse_info)
    
    # Extraire gene_symbol et gene_id depuis GENEINFO
    def extract_geneinfo(info):
        geneinfo = info.get('GENEINFO', None)
        if geneinfo:
            parts = geneinfo.split(':')
            symbol = parts[0]
            gene_id = parts[1] if len(parts) > 1 else None
            return pd.Series([symbol, gene_id])
        else:
            return pd.Series([None, None])
    
    chunk[['gene_symbol', 'gene_id']] = chunk['info_dict'].apply(extract_geneinfo)
    
    # Filtrer uniquement variants mitochondriaux sur gene_id
    chunk_mito = chunk[chunk['gene_id'].isin(mito_hgnc_ids)]

In [11]:
print(chunk_mito)

        CHROM        POS       ID REF ALT QUAL FILTER  \
3680435     X  155026961  2276085   C   A    .      .   
3680436     X  155026975  2496211   G   A    .      .   
3680437     X  155026993  2352260   C   T    .      .   
3680438     X  155027003  3517726   C   G    .      .   
3680439     X  155027008  3852109   C   G    .      .   
...       ...        ...      ...  ..  ..  ...    ...   
3683832    MT      15884   252455   G   C    .      .   
3683833    MT      15885   693973   C   T    .      .   
3683917    MT      16230  1525975   A   G    .      .   
3683918    MT      16274  1525974   G   A    .      .   
3683919    MT      16319  1526264   G   A    .      .   

                                                      INFO  \
3680435  AF_EXAC=0.00010;ALLELEID=2265907;CLNDISDB=MedG...   
3680436  ALLELEID=2469986;CLNDISDB=MedGen:CN169374;CLND...   
3680437  ALLELEID=2340246;CLNDISDB=MedGen:CN169374;CLND...   
3680438  ALLELEID=3676721;CLNDISDB=MedGen:CN169374;CLND...   
36804

### Créaation du record

In [12]:
# Créer les records
for _, row in chunk_mito.iterrows():
    variant_id = f"{row['CHROM']}:{row['POS']}:{row['REF']}:{row['ALT']}"
    record = {
        "variant_id": variant_id,
        "gene_id": row['gene_id'],
        "gene_symbol": row['gene_symbol'],
        "chromosome": row['CHROM'],
        "position": row['POS'],
        "ref": row['REF'],
        "alt": row['ALT'],
        "info": row['info_dict'],            # toutes les clés INFO
        "clinical_significance": row['info_dict'].get('CLNSIG', None),
        "disease_name": row['info_dict'].get('CLNDN', None),
        "rsid": row['info_dict'].get('RS', None)
    }
    clinvar_mito_records[variant_id] = record

print("Nombre total de variants mitochondriaux :", len(clinvar_mito_records))

Nombre total de variants mitochondriaux : 2348


In [14]:
import json

# Chemin de sortie
json_file = "../../data/clinvar/clinvar_mito_records.json"
jsonl_file = "../../data/clinvar/clinvar_mito_records.jsonl"

# 1️⃣ Sauvegarde en JSON classique
with open(json_file, "w", encoding="utf-8") as f:
    json.dump(clinvar_mito_records, f, ensure_ascii=False, indent=2)

print("Fichier JSON sauvegardé :", json_file)

# 2️⃣ Sauvegarde en JSONL
with open(jsonl_file, "w", encoding="utf-8") as f:
    for record in clinvar_mito_records.values():
        f.write(json.dumps(record, ensure_ascii=False) + "\n")

print("Fichier JSONL sauvegardé :", jsonl_file)


Fichier JSON sauvegardé : ../../data/clinvar/clinvar_mito_records.json
Fichier JSONL sauvegardé : ../../data/clinvar/clinvar_mito_records.jsonl
