In [None]:
# pandas is imported here as well so that this cell runs on a fresh kernel:
# it sits before the notebook's main import cell.
import pandas as pd

# Lift the row limit for rendered frames. NOTE: a later cell sets
# 'display.max_rows' back to 100, so this only holds until that cell runs.
pd.set_option('display.max_rows', None)

In [None]:
import numpy as np
import swifter
import pandas as pd
# Cap rendered frames at 100 columns and 100 rows.
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)

# Load the gzipped, tab-separated ClinVar variant summary export.
clinvar_variant_summary = pd.read_csv(
    "/mnt/d/mave_calibration/cache/variant_summary_2024-08.txt.gz",
    sep="\t",
    compression='gzip',
)

In [None]:
# Persist the loaded table as a pickle in the same cache directory.
# NOTE(review): presumably this is the file getClinvar reads further down
# (it logs "loading variant summary pickle") - confirm.
clinvar_variant_summary.to_pickle("/mnt/d/mave_calibration/cache/variant_summary_2024-08.pkl")

In [None]:
clinvar_variant_summary.shape

# Filter to single nucleotide variants only


In [None]:
# Keep only the rows whose Type is exactly 'single nucleotide variant'.
clinvar_snv = clinvar_variant_summary.loc[
    clinvar_variant_summary['Type'].eq('single nucleotide variant')
]

In [None]:
clinvar_snv.shape

In [None]:
import re
from Bio.PDB.Polypeptide import protein_letters_3to1, protein_letters_3to1_extended
# Title-cased three-letter residue codes (e.g. "Ala") taken from Biopython's table keys.
residues = {code.title() for code in protein_letters_3to1.keys()}

def parse_protein_variant(s):
    """
    Parse the protein consequence of a variant.

    The string is split around its first run of digits into the reference
    residue, the position, and the text that follows the position.

    Parameters
    ----------
    s : str
        Protein-level description such as ``"p.Ala123Thr"``. Any ``"p."`` in
        the part before the position and any ``")"`` in the part after it are
        removed, so fragments like ``"Ala123Thr)"`` are accepted as well.

    Returns
    -------
    wt_aa : str
        Reference residue as written before the position (a member of
        ``residues`` or one of ``"Ter"``, ``"Sec"``, ``"Pyl"``).
    number : int
        Residue position.
    after_number : str
        Non-empty text following the position (e.g. ``"Thr"``, ``"Ter"``,
        ``"*"``, ``"="``, ``"?"``).

    Raises
    ------
    ValueError
        If ``s`` is ``"p.0"``, contains no position, has an unrecognised
        reference residue, has nothing after the position, or describes a
        form this parser rejects (text after the position starting with
        ``"_"``, or containing ``"ext"``, ``"^"`` or ``"/"``).
    """
    # SPECIAL CASE: no protein
    if s == "p.0":
        raise ValueError("No protein")

    # Split around the first run of digits: <prefix><position><suffix>.
    match = re.search(r'(\D*)(\d+)(\D*)', s)
    if match is None:
        raise ValueError("Not a supported protein variant")

    before_number, number, after_number = match.groups()
    wt_aa = before_number.replace("p.", "")
    after_number = after_number.replace(")", "")

    if wt_aa not in residues and wt_aa not in {"Ter", "Sec", "Pyl"}:
        raise ValueError(f"Expecting the value before the number to be a residue, not '{wt_aa}', as found in '{s}'")
    try:
        number = int(number)
    except ValueError as err:
        raise ValueError(f"Cannot convert {number} to an integer") from err

    # An empty remainder used to hit after_number[0] and raise IndexError,
    # which callers that only catch ValueError would not handle.
    if not after_number:
        raise ValueError(f"No consequence found after the position in '{s}'")

    # A leading underscore means the description continues with a second
    # residue (e.g. "Leu2_Met124del"); those descriptions are rejected.
    if after_number[0] == "_":
        raise ValueError("Looks like variant causes a new translation initiation site")
    if "ext" in after_number:
        raise ValueError("Looks like an extension")
    if "^" in after_number:
        raise ValueError("Uncertain")
    if "/" in after_number:
        raise ValueError("Mosaic")
    return wt_aa, number, after_number

def is_protein_variant(name):
    """Return True when ``name`` contains a parenthesised protein term, i.e. the text ``"(p."``."""
    protein_marker = "(p."
    return protein_marker in name

def is_silent(name):
    """
    Return True when the protein term of ``name`` parses and its alternate is ``"="``.

    Names without a ``"(p."`` term, and names whose protein term makes
    ``parse_protein_variant`` raise ``ValueError``, yield False.
    """
    if is_protein_variant(name):
        # Text that follows the first "p." in the name.
        hgvs_p = name.split("p.")[1]
        try:
            _, _, alt = parse_protein_variant(hgvs_p)
        except ValueError:
            return False
        else:
            return alt == "="
    return False

def is_nonsense(name):
    """
    Return True when the protein term of ``name`` parses and its alternate is ``"*"`` or ``"Ter"``.

    Names without a ``"(p."`` term, and names whose protein term makes
    ``parse_protein_variant`` raise ``ValueError``, yield False.
    """
    if is_protein_variant(name):
        # Text that follows the first "p." in the name.
        hgvs_p = name.split("p.")[1]
        try:
            _, _, alt = parse_protein_variant(hgvs_p)
        except ValueError:
            return False
        else:
            return alt in ("*", "Ter")
    return False

def is_unknown(name):
    """
    Return True when the protein term of ``name`` parses and its alternate is ``"?"`` or ``"Xaa"``.

    Names without a ``"(p."`` term, and names whose protein term makes
    ``parse_protein_variant`` raise ``ValueError``, yield False.
    """
    if is_protein_variant(name):
        # Text that follows the first "p." in the name.
        hgvs_p = name.split("p.")[1]
        try:
            _, _, alt = parse_protein_variant(hgvs_p)
        except ValueError:
            return False
        else:
            return alt in ("?", "Xaa")
    return False

def is_missense(name):
    """
    Return True when the protein term of ``name`` parses and its alternate is a member of ``residues``.

    Names without a ``"(p."`` term, and names whose protein term makes
    ``parse_protein_variant`` raise ``ValueError``, yield False.
    """
    if is_protein_variant(name):
        # Text that follows the first "p." in the name.
        hgvs_p = name.split("p.")[1]
        try:
            _, _, alt = parse_protein_variant(hgvs_p)
        except ValueError:
            return False
        else:
            return alt in residues
    return False

def is_other_protein_variant(name):
    """
    Return True for protein variants that fit none of the other categories.

    That is: the name has a ``"(p."`` term and either ``parse_protein_variant``
    raises ``ValueError`` on it, or the parsed alternate is neither one of
    ``"="``, ``"*"``, ``"Ter"``, ``"?"``, ``"Xaa"`` nor a member of ``residues``.
    Names without a ``"(p."`` term yield False.
    """
    if is_protein_variant(name):
        # Text that follows the first "p." in the name.
        hgvs_p = name.split("p.")[1]
        try:
            _, _, alt = parse_protein_variant(hgvs_p)
        except ValueError:
            # Unparseable protein terms are counted as "other".
            return True
        else:
            return not (alt in {"=", "*", "Ter", "?", "Xaa"} or alt in residues)
    return False


A few tests of `parse_protein_variant`

In [None]:
parse_protein_variant("p.Ala123Thr")

In [None]:
parse_protein_variant("p.Ala123Ter"), parse_protein_variant("p.Ala123*")

In [None]:
# Both descriptions continue with "_" right after the first position, which
# parse_protein_variant rejects with ValueError. Catch and show the error so
# the cell demonstrates the rejection without aborting a full notebook run.
for example in ("p.Leu2_Met124del", "p.Met1_Leu2insArgSerThrVal"):
    try:
        print(example, "->", parse_protein_variant(example))
    except ValueError as err:
        print(example, "-> ValueError:", err)

In [None]:
parse_protein_variant("p.Ter327Arg")

# Non-protein variants

In [None]:
# True for SNV names that carry no "(p." protein term.
nonprotein_variant_mask = ~clinvar_snv['Name'].apply(is_protein_variant)

In [None]:
nonprotein_variant_mask.sum()

In [None]:
clinvar_snv[nonprotein_variant_mask].iloc[0].Name

# Missense Variants

In [None]:
missense_mask = clinvar_snv.Name.swifter.apply(is_missense)

In [None]:
clinvar_missense = clinvar_snv[missense_mask]

In [None]:
clinvar_missense.shape

In [None]:
clinvar_missense.iloc[0].Name

# Silent Variants

In [None]:
silent_mask = clinvar_snv.Name.swifter.apply(is_silent)
clinvar_silent = clinvar_snv[silent_mask]

In [None]:
silent_mask.sum()

In [None]:
clinvar_snv[silent_mask].iloc[0].Name

In [None]:
# Shape of the silent variants classified P/LP, excluding records whose review
# status carries no usable assertion. The first status was previously spelled
# 'no clasification provided', so it never matched and those rows slipped through.
clinvar_silent[clinvar_silent.ClinicalSignificance.isin({"Pathogenic", "Likely pathogenic", "Pathogenic/Likely pathogenic"}) &
               ~clinvar_silent.ReviewStatus.isin({'no classification provided',
                                                  'no assertion criteria provided',
                                                  'no classification for the single variant'})].shape

# Nonsense

In [None]:
nonsense_mask = clinvar_snv.Name.swifter.apply(is_nonsense)
clinvar_snv[nonsense_mask].shape

In [None]:
clinvar_snv[nonsense_mask].iloc[0].Name

# Unknown Alt-AA

In [None]:
unknown_mask = clinvar_snv.Name.swifter.apply(is_unknown)
clinvar_snv[unknown_mask].shape

In [None]:
clinvar_snv[unknown_mask].iloc[0].Name

# Other cases (e.g. extensions)

In [None]:
other_protein_variant_mask = clinvar_snv.Name.swifter.apply(is_other_protein_variant)
clinvar_snv[other_protein_variant_mask].shape

In [None]:
clinvar_snv[other_protein_variant_mask].iloc[0].Name

# Check that all SNVs are accounted for and all categorizations are mutually exclusive

In [None]:
clinvar_snv.shape[0], (missense_mask | silent_mask | nonsense_mask | unknown_mask | other_protein_variant_mask | nonprotein_variant_mask).sum().item()

In [None]:
# Mutual-exclusivity check: stack the six masks as columns, count how many are
# set per row, and require that count to be exactly 1 for every SNV.
(np.stack([missense_mask, silent_mask, nonsense_mask, unknown_mask, other_protein_variant_mask, nonprotein_variant_mask], axis=1).sum(axis=1) == 1).all()

# Summary

In [None]:
# Number of SNVs per category, sorted from the smallest to the largest count.
categorization = pd.Series({
    "missense": missense_mask.sum(),
    "silent": silent_mask.sum(),
    "nonsense": nonsense_mask.sum(),
    "unknown": unknown_mask.sum(),
    "other": other_protein_variant_mask.sum(),
    "non_protein_variant": nonprotein_variant_mask.sum(),
}).sort_values()

In [None]:
categorization

In [None]:
def is_valid_refseq_accession(refseq_string):
    """
    Check if a string is a versioned RefSeq-style accession.

    The accepted form is ``<PREFIX>_<6-10 digits>.<version digits>``, e.g.
    ``"NM_000038.5"``. The version is required: ``"NM_000038"`` and
    ``"NM_000038."`` are both rejected, as is any leading or trailing text
    (including a trailing newline).

    NOTE(review): the prefix list also contains NP/XP, which are protein
    rather than nucleotide prefixes; it is kept as-is so existing callers
    see the same set of accepted prefixes.

    Args:
        refseq_string (str): The string to verify.

    Returns:
        bool: True if the whole string is a versioned accession, False otherwise.

    Raises:
        TypeError: If ``refseq_string`` is not a string.
    """
    # Accession with a mandatory ".<digits>" version suffix. The previous
    # pattern made only the dot mandatory, so "NM_000038." was accepted.
    pattern = re.compile(r'(NM|NP|NR|NG|NC|NT|XM|XP|XR|XT)_\d{6,10}\.\d+')

    # fullmatch anchors both ends without the trailing-newline leniency of "$".
    try:
        return bool(pattern.fullmatch(refseq_string))
    except TypeError:
        raise TypeError(f"Expected a string, but got {type(refseq_string)}")

In [None]:
import re

def parse_clinvar_record(clinvar_string):
    """
    Extract transcript, gene symbol (if exists), DNA sequence substitution, and protein substitution (if exists) from a ClinVar record name.

    The name must contain a ``":c."`` coding-DNA term; everything else is
    optional. An accession keeps its version suffix whatever its prefix, so
    ``"NC_000023.10:c.33038255C>A"`` yields transcript ``"NC_000023.10"``
    (previously the version was cut off and reported as the gene symbol).

    Args:
        clinvar_string (str): The ClinVar record name (e.g., "NM_015697.9(COQ2):c.30G>A (p.Arg10_Lys11=)" or "NM_015697.9:c.30G>A" or "NM_015697.9(COQ2):c.30G>A").

    Returns:
        dict: A dictionary with the keys 'transcript', 'gene_symbol',
        'dna_substitution' and 'protein_substitution'. Parts that are absent
        are empty strings. When the name cannot be parsed at all, every value
        is an empty string, so an empty 'dna_substitution' identifies a
        failed parse.
    """
    # Optional accession (with optional ".version"), optional gene symbol
    # (parenthesised or bare), a mandatory ":c.<change>", optional "(p.<change>)".
    pattern = re.compile(
        r'(?:(?P<transcript>\w+_\d+(?:\.\d+)?))?(?:\s*(?:\((?P<gene_symbol>[^\)]+)\)|(?P<gene_symbol_no_paren>[^\s:]+)))?:(?P<dna_substitution>c\.[^\s]+)(?:\s*\((?P<protein_substitution>p\.[^\)]+)\))?'
    )

    # Search rather than match: the pattern may start anywhere in the name.
    match = pattern.search(clinvar_string)

    if match is None:
        # Unparseable name: same keys, all values empty (no exception is raised).
        return {
            'transcript': '',
            'gene_symbol': '',
            'dna_substitution': '',
            'protein_substitution': ''}

    # Groups that did not take part in the match are None; normalise to ''.
    return {
        'transcript': match.group('transcript') or '',
        'gene_symbol': match.group('gene_symbol') or match.group('gene_symbol_no_paren') or '',
        'dna_substitution': match.group('dna_substitution'),
        'protein_substitution': match.group('protein_substitution') or ''
    }

# Example usage: run parse_clinvar_record over a handful of record-name shapes.
# Entries marked "X" lack the ":c." term the pattern requires, so they come
# back as the all-empty dict.
clinvar_strings = [
    "NM_015697.9(COQ2):c.30G>A (p.Arg10_Lys11=)",
    "NM_015697.9:c.30G>A (p.Arg10_Lys11=)",
    "c.30G>A (p.Arg10_Lys11=)", # X: no ':' before the c. term
    "NM_000038.5(GENE):c.123A>T (p.Asn41=)",
    "NM_000038.5:c.123A>T (p.Asn41=)",
    "c.123A>T (p.Asn41=)", # X: no ':' before the c. term
    "NM_015697.9:c.30G>A (p.Arg10_Lys11=)",
    "FOXRED1: (p.Arg10_Lys11=)", # X: no c. term at all
    "NM_015697.9(COQ2):c.30G>A",  # Protein substitution missing
    "NM_015697.9(ABC):c.30G>A",         # Protein substitution missing (gene symbol is present)
    "NC_000023.10:c.33038255C>A",
    "NG_023416.2:g.(88604_176551)_(176714_576894)del" # X: g. term, not c.
]

for clinvar_string in clinvar_strings:
    try:
        parsed_result = parse_clinvar_record(clinvar_string)
        print(parsed_result)
    except ValueError as e:
        # parse_clinvar_record as defined in this notebook returns an all-empty
        # dict on failure instead of raising, so this handler is not expected to fire.
        print(e)

In [None]:
# Reuse the missense mask computed in the "Missense Variants" section instead
# of re-running is_missense over every SNV name (same rows, one pass fewer);
# this also mirrors how the silent variants are parsed below.
missense_parse = clinvar_snv[missense_mask].Name.swifter.apply(parse_clinvar_record)

In [None]:
# One row per parsed missense name; the dict keys become the columns.
missense_parse_df = pd.DataFrame(missense_parse.tolist())

In [None]:
missense_parse_df

In [None]:
clinvar_silent_parse = clinvar_snv[silent_mask].Name.swifter.apply(parse_clinvar_record)

In [None]:
# One row per parsed silent name; the dict keys become the columns.
silent_parse_df = pd.DataFrame(clinvar_silent_parse.tolist())

In [None]:
silent_parse_df

In [None]:
# SNV rows whose Name could not be parsed. parse_clinvar_record always returns
# a dict with four keys, so its length is never 0; a failed parse is instead
# recognisable by an empty 'dna_substitution'.
fail_cases = clinvar_variant_summary[
    (clinvar_variant_summary.Type == "single nucleotide variant")
    & clinvar_variant_summary.Name.swifter.apply(
        lambda name: not parse_clinvar_record(name)['dna_substitution'])
]

In [None]:
fail_cases.Name.shape

In [None]:
# Keep only successful parses. The result dict always has four keys, so
# `len(d)` was always truthy; an empty 'dna_substitution' marks a failure.
parse_res = [d for d in clinvar_snv.Name.swifter.apply(parse_clinvar_record) if d['dna_substitution']]

In [None]:
parse_res = pd.DataFrame.from_records(parse_res)

In [1]:
from data_processing.clinvar import getClinvar, parse_clinvar_name
import pandas as pd
pd.set_option('display.max_columns', None)

In [2]:
out = getClinvar(use_cached_processed_file=False, cache_dir="/mnt/d/mave_calibration/cache",debug=True)

loading variant summary pickle
loaded


In [5]:
out[0]

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),nsv/esv (dbVar),RCVaccession,PhenotypeIDS,PhenotypeList,Origin,OriginSimple,Assembly,ChromosomeAccession,Chromosome,Start,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity,is_pathogenic,is_benign,HGVSp,RefSeq_nuc,is_missense,is_nonsense,is_unknown,is_silent,is_other_protein_variant,p_lp,b_lb,vus,conflicting,Name_name,Name_transcript,Name_geneSymbol,Name_dnaVariant,Name_proteinVariant,CHROM,POS,REF,ALT,hgvs_pro
4,15043.0,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0.0,"Jun 29, 2015",150829393.0,-,RCV000000014,"MONDO:MONDO:0033005,MedGen:C4551772,OMIM:25130...",Galloway-Mowat syndrome 1,germline,germline,GRCh37,NC_000015.9,15,85342440.0,85342440.0,na,na,15q25.3,no assertion criteria provided,1.0,-,N,"ClinGen:CA210674,UniProtKB:Q92610#VAR_064583,O...",1.0,4.0,85342440.0,G,A,-,-,-,-,-,-,False,False,p.Gly1046Arg,NM_014630.3,True,False,False,False,False,False,False,True,False,NM_017547.4(FOXRED1):c.1289A>G (p.Asn430Ser),NM_017547.4,FOXRED1,c.1289A>G,p.Asn430Ser,15,85342440.0,G,A,p.Gly1046Arg
5,15043.0,single nucleotide variant,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),9640,ZNF592,HGNC:28986,Uncertain significance,0.0,"Jun 29, 2015",150829393.0,-,RCV000000014,"MONDO:MONDO:0033005,MedGen:C4551772,OMIM:25130...",Galloway-Mowat syndrome 1,germline,germline,GRCh38,NC_000015.10,15,84799209.0,84799209.0,na,na,15q25.3,no assertion criteria provided,1.0,-,N,"ClinGen:CA210674,UniProtKB:Q92610#VAR_064583,O...",1.0,4.0,84799209.0,G,A,-,-,-,-,-,-,False,False,p.Gly1046Arg,NM_014630.3,True,False,False,False,False,False,False,True,False,NM_017547.4(FOXRED1):c.1289A>G (p.Asn430Ser),NM_017547.4,FOXRED1,c.1289A>G,p.Asn430Ser,15,84799209.0,G,A,p.Gly1046Arg
6,15044.0,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,0.0,"Dec 01, 2023",267606829.0,-,RCV000000015|RCV000578659|RCV001194045|RCV0033...,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:61824...","Mitochondrial complex 1 deficiency, nuclear ty...",germline;unknown,germline,GRCh37,NC_000011.9,11,126145284.0,126145284.0,na,na,11q24.2,"criteria provided, multiple submitters, no con...",6.0,-,N,"ClinGen:CA113792,OMIM:613622.0001",3.0,5.0,126145284.0,C,T,-,-,-,-,-,-,True,False,p.Gln232Ter,NM_017547.4,False,True,False,False,False,False,False,False,False,NM_025152.3(NUBPL):c.166G>A (p.Gly56Arg),NM_025152.3,NUBPL,c.166G>A,p.Gly56Arg,11,126145284.0,C,T,p.Gln232Ter
7,15044.0,single nucleotide variant,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),55572,FOXRED1,HGNC:26927,Pathogenic,0.0,"Dec 01, 2023",267606829.0,-,RCV000000015|RCV000578659|RCV001194045|RCV0033...,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:61824...","Mitochondrial complex 1 deficiency, nuclear ty...",germline;unknown,germline,GRCh38,NC_000011.10,11,126275389.0,126275389.0,na,na,11q24.2,"criteria provided, multiple submitters, no con...",6.0,-,N,"ClinGen:CA113792,OMIM:613622.0001",3.0,5.0,126275389.0,C,T,-,-,-,-,-,-,True,False,p.Gln232Ter,NM_017547.4,False,True,False,False,False,False,False,False,False,NM_025152.3(NUBPL):c.166G>A (p.Gly56Arg),NM_025152.3,NUBPL,c.166G>A,p.Gly56Arg,11,126275389.0,C,T,p.Gln232Ter
8,15045.0,single nucleotide variant,NM_017547.4(FOXRED1):c.1289A>G (p.Asn430Ser),55572,FOXRED1,HGNC:26927,Pathogenic,0.0,"Oct 01, 2010",267606830.0,-,RCV000000016,"MONDO:MONDO:0032624,MedGen:C4748791,OMIM:618241","Mitochondrial complex 1 deficiency, nuclear ty...",germline,germline,GRCh37,NC_000011.9,11,126147412.0,126147412.0,na,na,11q24.2,no assertion criteria provided,1.0,-,N,"ClinGen:CA113794,UniProtKB:Q96CU9#VAR_064571,O...",1.0,6.0,126147412.0,A,G,-,-,-,-,-,-,False,False,p.Asn430Ser,NM_017547.4,True,False,False,False,False,False,False,False,False,NM_000410.4(HFE):c.845G>A (p.Cys282Tyr),NM_000410.4,HFE,c.845G>A,p.Cys282Tyr,11,126147412.0,A,G,p.Asn430Ser
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4236947,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_133433.4(NIPBL):c.874A>G (p.Arg292Gly),NM_133433.4,NIPBL,c.874A>G,p.Arg292Gly,,,,,
4236994,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_005359.6(SMAD4):c.1259G>T (p.Arg420Leu),NM_005359.6,SMAD4,c.1259G>T,p.Arg420Leu,,,,,
4236995,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_005359.6(SMAD4):c.1259G>T (p.Arg420Leu),NM_005359.6,SMAD4,c.1259G>T,p.Arg420Leu,,,,,
4237374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_001082971.2(DDC):c.782G>T (p.Cys261Phe),NM_001082971.2,DDC,c.782G>T,p.Cys261Phe,,,,,


In [7]:
# Rows of the first result whose original Name column is missing.
# NOTE(review): in the recorded output these rows have every source column
# empty while the Name_* columns are filled, and in the previous output the
# Name_* values describe a different variant than Name on the same row. That
# looks like the parsed-name columns were joined on a misaligned index -
# confirm inside getClinvar.
out[0][out[0].Name.isna()]

Unnamed: 0,#AlleleID,Type,Name,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),nsv/esv (dbVar),RCVaccession,PhenotypeIDS,PhenotypeList,Origin,OriginSimple,Assembly,ChromosomeAccession,Chromosome,Start,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,VariationID,PositionVCF,ReferenceAlleleVCF,AlternateAlleleVCF,SomaticClinicalImpact,SomaticClinicalImpactLastEvaluated,ReviewStatusClinicalImpact,Oncogenicity,OncogenicityLastEvaluated,ReviewStatusOncogenicity,is_pathogenic,is_benign,HGVSp,RefSeq_nuc,is_missense,is_nonsense,is_unknown,is_silent,is_other_protein_variant,p_lp,b_lb,vus,conflicting,Name_name,Name_transcript,Name_geneSymbol,Name_dnaVariant,Name_proteinVariant,CHROM,POS,REF,ALT,hgvs_pro
0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),NM_014630.3,ZNF592,c.3136G>A,p.Gly1046Arg,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg),NM_014630.3,ZNF592,c.3136G>A,p.Gly1046Arg,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),NM_017547.4,FOXRED1,c.694C>T,p.Gln232Ter,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter),NM_017547.4,FOXRED1,c.694C>T,p.Gln232Ter,,,,,
22,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_000410.4(HFE):c.381A>C (p.Gln127His),NM_000410.4,HFE,c.381A>C,p.Gln127His,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4236947,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_133433.4(NIPBL):c.874A>G (p.Arg292Gly),NM_133433.4,NIPBL,c.874A>G,p.Arg292Gly,,,,,
4236994,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_005359.6(SMAD4):c.1259G>T (p.Arg420Leu),NM_005359.6,SMAD4,c.1259G>T,p.Arg420Leu,,,,,
4236995,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_005359.6(SMAD4):c.1259G>T (p.Arg420Leu),NM_005359.6,SMAD4,c.1259G>T,p.Arg420Leu,,,,,
4237374,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,NM_001082971.2(DDC):c.782G>T (p.Cys261Phe),NM_001082971.2,DDC,c.782G>T,p.Cys261Phe,,,,,


In [None]:
# NOTE(review): `out` is indexed as `out[0]` in the cells above and this cell
# has no recorded execution; confirm that `out` really exposes a `.values`
# attribute usable here (a tuple or list would not).
pd.DataFrame.from_records(out.values)

In [None]:
# Rows where the RefSeq_nuc column disagrees with the transcript parsed from Name.
# NOTE(review): `clinvar` is not defined in any visible cell - presumably one
# of the frames returned by getClinvar; confirm and assign it explicitly.
clinvar.loc[clinvar.RefSeq_nuc != clinvar.Name_transcript, ["Name",'RefSeq_nuc','Name_transcript']]

In [None]:
parse_clinvar_name("NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg)")

In [None]:
clinvar.shape

In [None]:
# NOTE(review): `paresed_names` is not defined in any visible cell (possibly a
# misspelling of a `parsed_names` variable); on a fresh kernel this raises
# NameError - confirm the intended name or remove the cell.
paresed_names

In [None]:
clinvar.iloc[0]