# Writing questions about Clinvar

Goal: Evaluate RAG.

## Data preparation

In [1]:
import pandas as pd
import json
from pprint import pprint
import polars as pl
import random

with open("../../data/clinvar/clinvar_mito_records.json","r") as f:
    df = json.load(f)


print("classe :", type(df), "taille", len(df))
print("\nStructure :")
print("Key : MT:10083:A:G")
pprint(df["MT:10083:A:G"])

df = pd.DataFrame(df) # Convert into table
df.columns = df.iloc[-1]
df = df.drop(df.index[-1])
df = df.T


print("Final structure \n", df.head())

classe : <class 'dict'> taille 2348

Structure :
Key : MT:10083:A:G
{'alt': 'G',
 'chromosome': 'MT',
 'clinical_significance': 'Likely_benign',
 'disease_name': 'Leigh_syndrome',
 'gene_id': '4537',
 'gene_symbol': 'MT-ND3',
 'info': {'ALLELEID': '680149',
          'CLNDISDB': 'MONDO:MONDO:0009723,MedGen:C2931891,OMIM:256000,Orphanet:506',
          'CLNDN': 'Leigh_syndrome',
          'CLNHGVS': 'NC_012920.1:m.10083A>G',
          'CLNREVSTAT': 'criteria_provided,_single_submitter',
          'CLNSIG': 'Likely_benign',
          'CLNSIGSCV': 'SCV000997653',
          'CLNVC': 'single_nucleotide_variant',
          'CLNVCSO': 'SO:0001483',
          'CLNVI': 'ClinGen:CA414804238',
          'GENEINFO': 'MT-ND3:4537',
          'ORIGIN': '1',
          'RS': '1556423760'},
 'position': '10083',
 'ref': 'A',
 'rsid': '1556423760',
 'variant_id': 'MT:10083:A:G'}
Final structure 
                  variant_id gene_id gene_symbol chromosome   position ref alt  \
rsid                       

In [2]:
import pandas as pd

def expand_info_column(df: pd.DataFrame, info_col: str = "info") -> pd.DataFrame:
    """
    Aplati la colonne 'info' d'un DataFrame contenant des dictionnaires variables.
    Chaque clé rencontrée dans 'info' devient une colonne distincte.
    """
    # Étape 1 : Identifier toutes les clés présentes dans 'info'
    all_info_keys = set()
    for entry in df[info_col]:
        if isinstance(entry, dict):
            all_info_keys.update(entry.keys())
    
    # Étape 2 : Créer un DataFrame à partir des dicts
    info_expanded = pd.DataFrame([
        {k: entry.get(k, None) if isinstance(entry, dict) else None for k in all_info_keys}
        for entry in df[info_col]
    ])
    
    # Étape 3 : Préfixer les colonnes pour clarté
    info_expanded = info_expanded.add_prefix(f"{info_col}.")
    
    # 💡 Étape 4 : Réinitialiser les index pour éviter l’erreur de concaténation
    df_reset = df.reset_index(drop=True)
    info_expanded_reset = info_expanded.reset_index(drop=True)
    
    # Étape 5 : Combiner avec le DataFrame d’origine (sans la colonne 'info')
    df_flat = pd.concat([df_reset.drop(columns=[info_col]), info_expanded_reset], axis=1)
    
    return df_flat


df_flat = expand_info_column(df, info_col="info")
df_flat.head()
df_flat.columns

df = pl.from_pandas(df_flat)

In [3]:
for col in df:
    print(col)

shape: (2_348,)
Series: 'variant_id' [str]
[
	"X:155026961:C:A"
	"X:155026975:G:A"
	"X:155026993:C:T"
	"X:155027003:C:G"
	"X:155027008:C:G"
	…
	"MT:15884:G:C"
	"MT:15885:C:T"
	"MT:16230:A:G"
	"MT:16274:G:A"
	"MT:16319:G:A"
]
shape: (2_348,)
Series: 'gene_id' [str]
[
	"65991"
	"65991"
	"65991"
	"65991"
	"65991"
	…
	"4519"
	"4519"
	"4519"
	"4519"
	"4519"
]
shape: (2_348,)
Series: 'gene_symbol' [str]
[
	"FUNDC2"
	"FUNDC2"
	"FUNDC2"
	"FUNDC2"
	"FUNDC2"
	…
	"MT-CYB"
	"MT-CYB"
	"MT-CYB"
	"MT-CYB"
	"MT-CYB"
]
shape: (2_348,)
Series: 'chromosome' [str]
[
	"X"
	"X"
	"X"
	"X"
	"X"
	…
	"MT"
	"MT"
	"MT"
	"MT"
	"MT"
]
shape: (2_348,)
Series: 'position' [str]
[
	"155026961"
	"155026975"
	"155026993"
	"155027003"
	"155027008"
	…
	"15884"
	"15885"
	"16230"
	"16274"
	"16319"
]
shape: (2_348,)
Series: 'ref' [str]
[
	"C"
	"G"
	"C"
	"C"
	"C"
	…
	"G"
	"C"
	"A"
	"G"
	"G"
]
shape: (2_348,)
Series: 'alt' [str]
[
	"A"
	"A"
	"T"
	"G"
	"G"
	…
	"C"
	"T"
	"G"
	"A"
	"A"
]
shape: (2_348,)
Series: 'clinical_significa

In [5]:
print(df.head(n=2))
df.columns

shape: (2, 27)
┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ variant_id ┆ gene_id ┆ gene_symb ┆ chromosom ┆ … ┆ info.CLNR ┆ info.CLNV ┆ info.CLNS ┆ info.AF_E │
│ ---        ┆ ---     ┆ ol        ┆ e         ┆   ┆ EVSTAT    ┆ C         ┆ IGCONF    ┆ XAC       │
│ str        ┆ str     ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│            ┆         ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str       │
╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ X:15502696 ┆ 65991   ┆ FUNDC2    ┆ X         ┆ … ┆ criteria_ ┆ single_nu ┆ null      ┆ 0.00010   │
│ 1:C:A      ┆         ┆           ┆           ┆   ┆ provided, ┆ cleotide_ ┆           ┆           │
│            ┆         ┆           ┆           ┆   ┆ _single_s ┆ variant   ┆           ┆           │
│            ┆         ┆           ┆           ┆   ┆ ubm…      ┆           ┆

['variant_id',
 'gene_id',
 'gene_symbol',
 'chromosome',
 'position',
 'ref',
 'alt',
 'clinical_significance',
 'disease_name',
 'info.CLNVCSO',
 'info.CLNDISDB',
 'info.CLNVI',
 'info.ORIGIN',
 'info.AF_ESP',
 'info.CLNHGVS',
 'info.CLNSIG',
 'info.RS',
 'info.CLNSIGSCV',
 'info.GENEINFO',
 'info.ALLELEID',
 'info.MC',
 'info.AF_TGP',
 'info.CLNDN',
 'info.CLNREVSTAT',
 'info.CLNVC',
 'info.CLNSIGCONF',
 'info.AF_EXAC']

## Questions

### Quel est le nombre de variants dans ce subset de Clinvar ? 

In [54]:
print("Variant ID:", df["variant_id"][0:3])
print("Number of unique variants in clinvar subset:", len(df.select("variant_id").unique()))
print("Shape of clinvar subset:", df.shape)

Variant ID: shape: (3,)
Series: 'variant_id' [str]
[
	"X:155026961:C:A"
	"X:155026975:G:A"
	"X:155026993:C:T"
]
Number of unique variants in clinvar subset: 2348
Shape of clinvar subset: (2348, 27)


#### Which variants are associated with the disease “Epsilon-trimethyllysine hydroxylase deficiency”? (Example question)

In [55]:
variants = (
    df.filter(pl.col("info.CLNDN") == "Epsilon-trimethyllysine_hydroxylase_deficiency")
    .select("variant_id")
    .unique()
)

variant_list = variants["variant_id"].to_list
print(variant_list)

<bound method Series.to_list of shape: (7,)
Series: 'variant_id' [str]
[
	"X:155504770:AAACTGGAAAGAGTCCAG…
	"X:155506930:GAT:G"
	"X:155506896:A:G"
	"X:155513983:TA:T"
	"X:155511662:AATCCACCTACCCACAGG…
	"X:155545271:C:A"
	"X:155514133:G:A"
]>


### Which variants are associated with the disease {disease}? (Template question)

In [16]:
import re
diseases_in_clinvar = (
    df
    .select(pl.col("info.CLNDN"))
    .drop_nulls()
    .unique()
)

def extract_clean_diseases(df: pl.DataFrame, column: str = "info.CLNDN") -> list[str]:
    """
    Extrait une liste unique de noms de maladies propres à partir de ClinVar.
    Gère les séparateurs | , / et remplace les underscores.
    """
    if column not in df.columns:
        print(f"⚠️ Colonne {column} absente du DataFrame ClinVar")
        return []

    diseases_raw = df.select(pl.col(column)).drop_nulls().to_series().to_list()

    all_diseases = set()
    for entry in diseases_raw:
        if not isinstance(entry, str):
            continue

        # Multi-split sur | , /
        parts = re.split(r"[|,\/]", entry)

        for disease in parts:
            clean = (
                disease.strip()
                .replace("_", " ")
                .replace("  ", " ")
            )
            if clean and clean.lower() not in {"not provided", "not specified", "see cases"}:
                all_diseases.add(clean)

    return sorted(all_diseases)


all_diseases = extract_clean_diseases(df, "info.CLNDN")
n= 5
sample_diseases = random.sample(all_diseases, min(n, len(all_diseases)))

print(all_diseases)

[' MITOCHONDRIAL TYPE 3', ' adult-onset', ' agenesis of', ' and septooptic dysplasia', ' and sideroblastic anemia 3', ' bilateral', ' cardiomyopathy', ' episodic', ' infantile', ' lactic acidosis', ' late-onset', ' mitochondrial', ' mitochondrial type 1', ' nuclear type 1', ' nuclear type 5', ' recurrent', ' resistance to', ' reversible', ' susceptibility to', ' type IA', ' valproate-induced', ' with dystonia', ' with optic atrophy and reversible leukoencephalopathy', 'Abnormal CSF lactate concentration', 'Abnormal aortic valve physiology', 'Abnormal basal ganglia MRI signal intensity', 'Abnormal brain morphology', 'Abnormal electroretinogram', 'Abnormal mitochondria in muscle tissue', 'Abnormal mitral valve physiology', 'Abnormality of eye movement', 'Acute liver failure', 'Acute megakaryoblastic leukemia', 'Alzheimer disease', 'Ataxia', 'Ataxia and polyneuropathy', 'Atrophy of the brain and cerebellum', 'Attention deficit hyperactivity disorder', 'Auditory neuropathy spectrum disorde

In [None]:
n = 10

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications
diseases_in_clinvar = (
    df
    .select(pl.col("info.CLNDN"))
    .drop_nulls()
    .unique()
)

# 2. Flat diseases lists separated by "|", excluding also not wanted terms
disease_set_in_clinvar = (
    diseases_in_clinvar
    .with_columns(
        pl.col("info.CLNDN")
        .str.split("|")
    )
    .explode("info.CLNDN")
    .filter(~pl.col("info.CLNDN").is_in(["not_provided", "not_specified", "See_cases"]))
    .select("info.CLNDN")
    .unique()
)

# Convert to python list
disease_list = disease_set_in_clinvar["info.CLNDN"].to_list()

# 3. Random draw of n diseases
random_diseases = random.sample(disease_list, n)


# ========= VARIANTS FOR N DISEASE ==============

results = []

for disease in random_diseases:
    variants = (
        df
        .filter(pl.col("info.CLNDN").str.contains(disease))
        .select("variant_id")
        .unique()
    )

    variant_list = variants["variant_id"].to_list()
    results.append({"disease": disease, "variants": variant_list})

df_results = pl.DataFrame(results)

print(df_results)

shape: (10, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ disease                         ┆ variants                        │
│ ---                             ┆ ---                             │
│ str                             ┆ list[str]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ Leber_optic_atrophy             ┆ ["MT:12923:G:A", "MT:14831:G:A… │
│ Leigh_syndrome,_mitochondrial   ┆ ["MT:9176:T:C"]                 │
│ Subcutaneous_lipoma             ┆ ["MT:9355:A:T"]                 │
│ Venous_thromboembolism          ┆ ["MT:9540:T:C", "MT:11719:G:A"… │
│ Mediastinal_germ_cell_tumor     ┆ ["MT:14372:C:A"]                │
│ Mitochondrial_complex_I_defici… ┆ ["MT:10158:T:C", "MT:10197:G:A… │
│ Mitochondrial_complex_IV_defic… ┆ ["MT:9479:TTTTTTCTTCGCAGGA:T"]  │
│ Carcinoma_of_colon              ┆ ["MT:3308:T:C"]                 │
│ Moderate_sensorineural_hearing… ┆ ["MT:9355:A:T"]                 │
│ Par

In [174]:
## Queis

for row in df_results.iter_rows():
    disease = row[0]
    var = row[1]
    print(f"Question: Which variants are associated with the disease {disease}?")
    print(f"Answer: Variant(s) associated with {disease.replace("_", " ")} is/are {", ".join(v for v in var)}.")
    print()


Question: Which variants are associated with the disease Leber_optic_atrophy?
Answer: Variant(s) associated with Leber optic atrophy is/are MT:12923:G:A, MT:14831:G:A, MT:11360:A:G, MT:9176:T:G, MT:9237:G:A, MT:12811:T:C, MT:14495:A:G, MT:14325:T:C, MT:12706:T:C, MT:4216:T:C, MT:9804:G:A, MT:8969:G:A, MT:4136:A:G, MT:8686:T:C, MT:11777:C:A, MT:14482:C:G, MT:14568:C:T, MT:3394:T:C, MT:3733:G:A, MT:14484:T:C, MT:8851:T:C, MT:4640:C:A, MT:13760:C:T, MT:5244:G:A, MT:3635:G:A, MT:15257:G:A, MT:4025:C:T, MT:11696:G:A, MT:13051:G:A, MT:14795:T:C, MT:14482:C:A, MT:10663:T:C, MT:13042:G:A, MT:13637:A:G, MT:4171:C:A, MT:14279:G:A, MT:13094:T:C, MT:11778:G:A, MT:247:GA:G, MT:235:A:G, MT:14487:T:C, MT:14668:C:T, MT:15579:A:G, MT:8783:G:A, MT:9438:G:A, MT:9176:T:C, MT:13730:G:A, MT:13045:A:C, MT:3700:G:A, MT:12848:C:T, MT:15437:G:A, MT:9035:T:C, MT:9101:T:C, MT:15812:G:A, MT:8603:T:G, MT:12417:CA:C, MT:10197:G:A, MT:15485:C:T, MT:14596:A:T, MT:13763:C:T, MT:11253:T:C, MT:3697:G:A, MT:9185:T:C, MT:8

#### Which gene is linked to the variant {variant_id}? (Template question)

In [175]:
n = 10

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications
variantid_in_clinvar_list = (
    df
    .select(pl.col("variant_id"))
    .drop_nulls()
    .unique()
    .to_series()
    .to_list()
)

random_variants = random.sample(variantid_in_clinvar_list, n)

# ========= GENE FOR N VARIANTS ==============
results = []

print(random_variants[0])
print(df.filter(pl.col("variant_id") == random_variants[0]))

for variant in random_variants:
        
    gene = (
            df
            .filter(pl.col("variant_id") == variant)
            .select("gene_symbol","gene_id")
        )

    # Security
    if gene.height > 0:
        gene_symbol = gene["gene_symbol"][0]
        gene_id = gene["gene_id"][0]
    else:
        gene_symbol = None
        gene_id = None

    results.append({
        "variant_id": variant, 
        "gene_symbol": gene_symbol, 
        "gene_id": gene_id})

df_results = pl.DataFrame(results)
print(df_results)

MT:3688:G:A
shape: (1, 27)
┌────────────┬─────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬───────────┐
│ variant_id ┆ gene_id ┆ gene_symb ┆ chromosom ┆ … ┆ info.CLND ┆ info.CLNV ┆ info.ORIG ┆ info.GENE │
│ ---        ┆ ---     ┆ ol        ┆ e         ┆   ┆ ISDB      ┆ C         ┆ IN        ┆ INFO      │
│ str        ┆ str     ┆ ---       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ ---       │
│            ┆         ┆ str       ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆ str       │
╞════════════╪═════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪═══════════╡
│ MT:3688:G: ┆ 4535    ┆ MT-ND1    ┆ MT        ┆ … ┆ MONDO:MON ┆ single_nu ┆ 1         ┆ MT-ND1:45 │
│ A          ┆         ┆           ┆           ┆   ┆ DO:004497 ┆ cleotide_ ┆           ┆ 35        │
│            ┆         ┆           ┆           ┆   ┆ 0,MeSH:D0 ┆ variant   ┆           ┆           │
│            ┆         ┆           ┆           ┆   ┆ 283…      ┆

In [176]:
## Queis
for row in df_results.iter_rows():
    variant_id = row[0]
    gene_symbol = row[1]
    gene_id = row[2]

    print(f"Question: Which gene is linked to the variant {variant_id}?")
    print(f"Answer: The variant {variant_id} is linked to the gene {gene_symbol} (Gene ID: {gene_id}).")
    print()

Question: Which gene is linked to the variant MT:3688:G:A?
Answer: The variant MT:3688:G:A is linked to the gene MT-ND1 (Gene ID: 4535).

Question: Which gene is linked to the variant MT:14873:C:T?
Answer: The variant MT:14873:C:T is linked to the gene MT-CYB (Gene ID: 4519).

Question: Which gene is linked to the variant MT:4513:C:T?
Answer: The variant MT:4513:C:T is linked to the gene MT-ND2 (Gene ID: 4536).

Question: Which gene is linked to the variant MT:6353:A:G?
Answer: The variant MT:6353:A:G is linked to the gene MT-CO1 (Gene ID: 4512).

Question: Which gene is linked to the variant MT:6626:T:C?
Answer: The variant MT:6626:T:C is linked to the gene MT-CO1 (Gene ID: 4512).

Question: Which gene is linked to the variant X:155061862:C:T?
Answer: The variant X:155061862:C:T is linked to the gene CMC4 (Gene ID: 100272147).

Question: Which gene is linked to the variant MT:13434:A:G?
Answer: The variant MT:13434:A:G is linked to the gene MT-ND5 (Gene ID: 4540).

Question: Which gen

### Which genes are associated with the disease {disease}? 

In [7]:
n = 10

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications
diseases_in_clinvar = (
    df
    .select(pl.col("info.CLNDN"))
    .drop_nulls()
    .unique()
)

# 2. Flat diseases lists separated by "|", excluding also not wanted terms
disease_set_in_clinvar = (
    diseases_in_clinvar
    .with_columns(
        pl.col("info.CLNDN")
        .str.split("|")
    )
    .explode("info.CLNDN")
    .filter(~pl.col("info.CLNDN").is_in(["not_provided", "not_specified", "See_cases"]))
    .select("info.CLNDN")
    .unique()
)

# Convert to python list
disease_list = disease_set_in_clinvar["info.CLNDN"].to_list()

# 3. Random draw of n diseases
random_diseases = random.sample(disease_list, n)


# ========= VARIANTS FOR N DISEASE ==============

results = []

for disease in random_diseases:
    genes = (
        df
        .filter(pl.col("info.CLNDN").str.contains(disease))
        .select("gene_symbol")
        .unique()
    )

    genes_list = genes["gene_symbol"].to_list()
    results.append({"disease": disease, "genes": genes_list})

df_results = pl.DataFrame(results)

print(df_results)

shape: (10, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ disease                         ┆ genes                           │
│ ---                             ┆ ---                             │
│ str                             ┆ list[str]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ Visual_impairment               ┆ ["MT-ND4"]                      │
│ Ovarian_neoplasm                ┆ ["MT-CYB"]                      │
│ Cerebellar_vermis_hypoplasia    ┆ ["TMLHE"]                       │
│ Epilepsy                        ┆ ["MT-CYB", "MT-CO3", … "MT-ND2… │
│ Calcification_of_extrapyramida… ┆ ["MT-ND4"]                      │
│ Acute_megakaryoblastic_leukemi… ┆ ["MT-ND6"]                      │
│ Increased_circulating_lactate_… ┆ ["MT-ND1"]                      │
│ Progressive_spastic_paraparesi… ┆ ["MT-ATP6"]                     │
│ Abnormal_brain_morphology       ┆ ["MT-ND6"]                      │
│ Epe

In [8]:
for row in df_results.iter_rows():
    disease = row[0]
    genes_list = row[1]

    print(f"Question:  Which gene(s) is/are associated with the disease {disease}? ")
    print(f"Answer: The gene(s) {", ".join(gene for gene in genes_list)} is/are associated with the disease {disease}.")
    print()

Question:  Which gene(s) is/are associated with the disease Visual_impairment? 
Answer: The gene(s) MT-ND4 is/are associated with the disease Visual_impairment.

Question:  Which gene(s) is/are associated with the disease Ovarian_neoplasm? 
Answer: The gene(s) MT-CYB is/are associated with the disease Ovarian_neoplasm.

Question:  Which gene(s) is/are associated with the disease Cerebellar_vermis_hypoplasia? 
Answer: The gene(s) TMLHE is/are associated with the disease Cerebellar_vermis_hypoplasia.

Question:  Which gene(s) is/are associated with the disease Epilepsy? 
Answer: The gene(s) MT-CYB, MT-CO3, MT-ND4, MT-ND2 is/are associated with the disease Epilepsy.

Question:  Which gene(s) is/are associated with the disease Calcification_of_extrapyramidal_basal_ganglia? 
Answer: The gene(s) MT-ND4 is/are associated with the disease Calcification_of_extrapyramidal_basal_ganglia.

Question:  Which gene(s) is/are associated with the disease Acute_megakaryoblastic_leukemia? 
Answer: The gen

### What is the clinical significance of variant {variant_id}?

In [116]:
df["clinical_significance"]
print(df["clinical_significance"].unique())

shape: (8,)
Series: 'clinical_significance' [str]
[
	"Likely_pathogenic"
	"not_provided"
	"Benign"
	"Benign/Likely_benign"
	"Conflicting_classifications_of…
	"Likely_benign"
	"Pathogenic"
	"Uncertain_significance"
]


In [177]:
n = 10

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications
variantid_in_clinvar_list = (
    df
    .select(pl.col("variant_id"))
    .drop_nulls()
    .unique()
    .to_series()
    .to_list()
)

random_variants = random.sample(variantid_in_clinvar_list, n)

# ========= GENE FOR N VARIANTS ==============
results = []

for variant in random_variants:
        
    clinical_sign = (
            df
            .filter(pl.col("variant_id") == variant)
            .select("clinical_significance")
        )
    
    clinical_sign = clinical_sign["clinical_significance"][0]
    
    results.append({
        "variant_id": variant, 
        "clinical_significance": clinical_sign})

df_results = pl.DataFrame(results)
print(df_results)

shape: (10, 2)
┌───────────────┬────────────────────────┐
│ variant_id    ┆ clinical_significance  │
│ ---           ┆ ---                    │
│ str           ┆ str                    │
╞═══════════════╪════════════════════════╡
│ MT:13711:G:A  ┆ Benign                 │
│ MT:8448:T:C   ┆ Benign                 │
│ MT:13438:C:A  ┆ Uncertain_significance │
│ MT:4094:C:T   ┆ Uncertain_significance │
│ MT:7922:T:C   ┆ Uncertain_significance │
│ MT:13834:A:G  ┆ Benign                 │
│ MT:13770:C:A  ┆ Likely_benign          │
│ MT:5131:TAA:T ┆ Uncertain_significance │
│ MT:8387:G:A   ┆ Benign                 │
│ MT:9861:T:C   ┆ Benign/Likely_benign   │
└───────────────┴────────────────────────┘


In [178]:
for row in df_results.iter_rows():
    variant_id = row[0]
    clinical_significance = row[1]

    print(f"Question: What is the clinical significance of variant {variant_id}?")
    print(f"Answer: The variant {variant_id} is classified as {clinical_significance}.")
    print()

Question: What is the clinical significance of variant MT:13711:G:A?
Answer: The variant MT:13711:G:A is classified as Benign.

Question: What is the clinical significance of variant MT:8448:T:C?
Answer: The variant MT:8448:T:C is classified as Benign.

Question: What is the clinical significance of variant MT:13438:C:A?
Answer: The variant MT:13438:C:A is classified as Uncertain_significance.

Question: What is the clinical significance of variant MT:4094:C:T?
Answer: The variant MT:4094:C:T is classified as Uncertain_significance.

Question: What is the clinical significance of variant MT:7922:T:C?
Answer: The variant MT:7922:T:C is classified as Uncertain_significance.

Question: What is the clinical significance of variant MT:13834:A:G?
Answer: The variant MT:13834:A:G is classified as Benign.

Question: What is the clinical significance of variant MT:13770:C:A?
Answer: The variant MT:13770:C:A is classified as Likely_benign.

Question: What is the clinical significance of variant 

### List all variants classified as {info.CLNSIG}?

In [None]:
n = 1 

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications
acmg_class_in_clinvar_list = (
    df
    .select(pl.col("clinical_significance"))
    .drop_nulls()
    .unique()
    .to_series()
    .to_list()
)

random_acmg_class = random.sample(acmg_class_in_clinvar_list, n)
print(random_acmg_class)

results = []

for acmg_class in random_acmg_class:
    every_var_in_one_class = (
        df
        .filter(pl.col("clinical_significance") == acmg_class)
        .select("variant_id")
    )

    variant_list_in_acmg_class = every_var_in_one_class["variant_id"].to_list()
    results.append({
        "acmg_class": acmg_class,
        "variant_list": variant_list_in_acmg_class
    })

df_results = pl.DataFrame(results)
print(df_results)

['Conflicting_classifications_of_pathogenicity']
shape: (1, 2)
┌─────────────────────────────────┬─────────────────────────────────┐
│ acmg_class                      ┆ variant_list                    │
│ ---                             ┆ ---                             │
│ str                             ┆ list[str]                       │
╞═════════════════════════════════╪═════════════════════════════════╡
│ Conflicting_classifications_of… ┆ ["X:155514267:T:C", "MT:3565:A… │
└─────────────────────────────────┴─────────────────────────────────┘


### Which variants in gene {gene_symbol} are classified as Uncertain significance?

In [203]:
n = 3 

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications
acmg_class_in_clinvar_list = (
    df
    .select(pl.col("clinical_significance"))
    .drop_nulls()
    .unique()
    .to_series()
    .to_list()
)

gene_symbol_in_clinvar_list = (
    df
    .select(pl.col("gene_symbol"))
    .drop_nulls()
    .unique()
    .to_series()
    .to_list()
)


random_acmg_class = random.sample(acmg_class_in_clinvar_list, 1)[0]
random_genes = random.sample(gene_symbol_in_clinvar_list, n)
print(random_acmg_class)


##### EXTRACTION
results = []

for gene in random_genes:
    every_var_in_one_class_for_one_gene = (
        df
        .filter(
            (pl.col("clinical_significance") == random_acmg_class) 
            & (pl.col("gene_symbol") == gene)
        )
        .select("variant_id")
        .unique()
    )

    variant_list_in_acmg_class = every_var_in_one_class_for_one_gene["variant_id"].to_list()
    results.append({
        "acmg_class": random_acmg_class,
        "gene": gene,
        "variant_list": variant_list_in_acmg_class
    })

df_results = pl.DataFrame(results)
print(df_results)

Conflicting_classifications_of_pathogenicity
shape: (3, 3)
┌─────────────────────────────────┬─────────┬──────────────────┐
│ acmg_class                      ┆ gene    ┆ variant_list     │
│ ---                             ┆ ---     ┆ ---              │
│ str                             ┆ str     ┆ list[str]        │
╞═════════════════════════════════╪═════════╪══════════════════╡
│ Conflicting_classifications_of… ┆ MT-ND5  ┆ ["MT:13112:T:C"] │
│ Conflicting_classifications_of… ┆ MT-CO1  ┆ ["MT:6480:G:A"]  │
│ Conflicting_classifications_of… ┆ MT-ATP6 ┆ ["MT:8609:C:T"]  │
└─────────────────────────────────┴─────────┴──────────────────┘


In [201]:
for row in df_results.iter_rows():
    clinical_significance = row[0]
    gene_symbol = row[1]
    variant_list = row[2]
    

    print(f"Question: Which variants in gene {gene_symbol} are classified as {clinical_significance}?")
    print(f"Answer: For the gene {gene_symbol}, the variants {", ".join(var for var in variant_list)} is/are classified as {clinical_significance}")
    print()

Question: Which variants in gene MT-CO1 are classified as Pathogenic?
Answer: For the gene MT-CO1, the variants MT:7275:T:C, MT:6691:GA:G, MT:6899:GA:G, MT:5953:CA:C, MT:6924:GC:G, MT:6716:A:AGGG, MT:6277:G:A, MT:6935:CA:C, MT:6687:T:TCAC, MT:6938:CT:C, MT:6814:AT:A, MT:6748:TC:T, MT:6743:T:TTGG, MT:6857:CA:C, MT:6607:TC:T is/are classified as Pathogenic

Question: Which variants in gene MT-ND6 are classified as Pathogenic?
Answer: For the gene MT-ND6, the variants MT:14487:T:C, MT:14243:G:GC, MT:14484:T:C is/are classified as Pathogenic

Question: Which variants in gene CMC4 are classified as Pathogenic?
Answer: For the gene CMC4, the variants  is/are classified as Pathogenic



### What is the difference in clinical significance between {variant_A} and {variant_B} of {gene_symbol}?

In [None]:
n = 3 

##### DATA PREPARATION
# 1. Extract the disease column, removes nulls, duplications

gene_symbol_in_clinvar_list = (
    df
    .select(pl.col("gene_symbol"))
    .drop_nulls()
    .unique()
    .to_series()
    .to_list()
)
random_genes = random.sample(gene_symbol_in_clinvar_list, n)


##### EXTRACTION
results = []

for gene in random_genes:
    variantid_in_clinvar_list = (
        df
        .filter(pl.col("gene_symvol") == random_genes)
        .select(pl.col("variant_id"))
        .drop_nulls()
        .unique()
        .to_series()
        .to_list()
    )
    random_variants = random.sample(variantid_in_clinvar_list, 2)

    every_var_in_one_class_for_one_gene = (
        df
        .filter(
            (pl.col("clinical_significance") == random_acmg_class) 
            & (pl.col("gene_symbol") == gene)
        )
        .select("variant_id")
        .unique()
    )

    variant_list_in_acmg_class = every_var_in_one_class_for_one_gene["variant_id"].to_list()
    results.append({
        "acmg_class": random_acmg_class,
        "gene": gene,
        "variant_list": variant_list_in_acmg_class
    })

df_results = pl.DataFrame(results)
print(df_results)

Template questions :

- [x] Which variants are associated with the disease {disease}? 
- [ ] Which genes are associated with the disease {disease}? 
- [x] Which gene is linked to the variant {variant_id}?
- [x] What is the clinical significance of variant {variant_id}?
- [x] List all variants classified as {clinical_significance}?
- [x] List all diseases associated with gene {gene_symbol}?
- [x] Which variants in gene {gene_symbol} are classified as {clinical_significance}?
- Which variants of gene {gene_symbol} are associated with multiple diseases
- What is the difference in clinical significance between variant {variant_A} and {variant_B} of {gene_symbol}?




In [24]:
def template_disease_to_variants(df: pl.DataFrame, chunks: list, n: int = 10):
    df_valid = df.drop_nulls(subset=["info.CLNDN", "variant_id"])
    all_diseases = extract_clean_diseases(df_valid, "info.CLNDN")
    if not all_diseases:
        return []

    sample_diseases = random.sample(all_diseases, min(n, len(all_diseases)))
    print(sample_diseases)
    questions = []
    for disease in sample_diseases:
        variants = (
            df.filter(pl.col("info.CLNDN").str.contains(disease))
            .select("variant_id")
            .unique()
        )
        print(variants)
        variant_list = [v for v in variants["variant_id"]]
        if not variant_list:
            continue
        # chunk_ids = match_chunks_by_term(chunks, disease)
        questions.append({
            # "question_id": random_hex(16),
            # "created_at": utc_now_iso(),
            "query": f"Quels variants sont associés à la maladie {disease} ?",
            "ground_truth": f"Les variants {', '.join(variant_list)} sont associés à {disease}.",
            "type_question": "faits-directs",
            # "difficulty": DIFFICULTY,
            # "profile": PROFILE,
            # "status": STATUS,
            # "source_dataset": SOURCE_DATASET,
            "tags": ["disease_to_variants"],
            "chunk_metadata": {
                # "chunk_ids": chunk_ids,
                # "source": SOURCE_DATASET
            }
        })
    return questions


template_disease_to_variants(df, chunks=[], n=1)

['Striatal necrosis']
shape: (0, 1)
┌────────────┐
│ variant_id │
│ ---        │
│ str        │
╞════════════╡
└────────────┘


[]

In [22]:
print(df.columns)
print(df["variant_id"])

['variant_id', 'gene_id', 'gene_symbol', 'chromosome', 'position', 'ref', 'alt', 'clinical_significance', 'disease_name', 'info.AF_EXAC', 'info.CLNSIG', 'info.ALLELEID', 'info.CLNSIGSCV', 'info.CLNHGVS', 'info.CLNVCSO', 'info.CLNVI', 'info.ORIGIN', 'info.GENEINFO', 'info.RS', 'info.AF_ESP', 'info.AF_TGP', 'info.CLNVC', 'info.CLNDISDB', 'info.MC', 'info.CLNREVSTAT', 'info.CLNSIGCONF', 'info.CLNDN']
shape: (2_348,)
Series: 'variant_id' [str]
[
	"X:155026961:C:A"
	"X:155026975:G:A"
	"X:155026993:C:T"
	"X:155027003:C:G"
	"X:155027008:C:G"
	…
	"MT:15884:G:C"
	"MT:15885:C:T"
	"MT:16230:A:G"
	"MT:16274:G:A"
	"MT:16319:G:A"
]
