In [263]:
pd.set_option('display.max_rows', None)

In [264]:
import numpy as np
import swifter
import pandas as pd
pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 100)
clinvar_variant_summary = pd.read_csv("/mnt/d/mave_calibration/cache/variant_summary_2024-08.txt.gz",delimiter="\t",compression='gzip')

  clinvar_variant_summary = pd.read_csv("/mnt/d/mave_calibration/cache/variant_summary_2024-08.txt.gz",delimiter="\t",compression='gzip')


In [265]:
clinvar_variant_summary.to_pickle("/mnt/d/mave_calibration/cache/variant_summary_2024-08.pkl")

In [266]:
clinvar_variant_summary.shape

(5911112, 40)

# Filter only single nucleotide variant


In [267]:
clinvar_snv = clinvar_variant_summary[clinvar_variant_summary['Type']=='single nucleotide variant']

In [268]:
clinvar_snv.shape

(5336466, 40)

In [269]:
import re
from Bio.PDB.Polypeptide import protein_letters_3to1, protein_letters_3to1_extended
residues = set(list(map(lambda k : k.title(), protein_letters_3to1.keys())))

def parse_protein_variant(s):
    """
    Parse protein conseequence of a variant

    Parameters
    ----------
    s : str
        The input string to parse
    
    Returns
    -------
    reference_aa : str
    """
    # SPECIAL CASE: no protein
    if s == "p.0":
        raise ValueError("No protein")
    
    # Define a regular expression pattern to match the number in the string
    pattern = re.compile(r'(\D*)(\d+)(\D*)')
    
    # Search for the pattern in the input string
    match = pattern.search(s)
    
    if match:
        # Extract the three parts from the match groups
        before_number = match.group(1)
        number = match.group(2)
        after_number = match.group(3)
        wt_aa = before_number.replace("p.","")
        after_number = after_number.replace(")","")
        if wt_aa not in residues and wt_aa not in set(("Ter","Sec","Pyl")):
            raise ValueError(f"Expecting the value before the number to be a residue, not '{wt_aa}', as found in '{s}'")
        try:
            number = int(number)
        except:
            raise ValueError(f"Cannot convert {number} to an integer")
        if after_number[0] == "_":
            raise ValueError("Looks like variant causes a new translation initiation site")
        if "ext" in after_number:
            raise ValueError("Looks like an extension")
        if "^" in after_number:
            raise ValueError("Uncertain")
        if "/" in after_number:
            raise ValueError("Mosaic")
        return wt_aa, number, after_number
    else:
        # Return the original string if no number is found
        raise ValueError("Not a supported protein variant")

def is_protein_variant(name):
    return "(p." in name

def is_silent(name):
    if not is_protein_variant(name):
        return False
    protein_variant = name.split("p.")[1]
    try:
        wt_aa, position, variant = parse_protein_variant(protein_variant)
    except ValueError:
        return False
    return variant == "="

def is_nonsense(name):
    if not is_protein_variant(name):
        return False
    protein_variant = name.split("p.")[1]
    try:
        wt_aa, position, variant = parse_protein_variant(protein_variant)
    except ValueError:
        return False
    if variant == "*":
        return True
    if variant == "Ter":
        return True
    return False

def is_unknown(name):
    if not is_protein_variant(name):
        return False
    protein_variant = name.split("p.")[1]
    try:
        wt_aa, position, variant = parse_protein_variant(protein_variant)
    except ValueError:
        return False
    return variant == "?" or variant == "Xaa"

def is_missense(name):
    if not is_protein_variant(name):
        return False
    protein_variant = name.split("p.")[1]
    try:
        wt_aa, position, variant = parse_protein_variant(protein_variant)
    except ValueError:
        return False
    return variant in residues

def is_other_protein_variant(name):
    if not is_protein_variant(name):
        return False
    protein_variant = name.split("p.")[1]
    try:
        wt_aa, position, variant = parse_protein_variant(protein_variant)
    except ValueError:
        return True
    return variant not in {"=", "*", "Ter", "?", "Xaa"} and variant not in residues


a few tests of parse protein variant

In [270]:
parse_protein_variant("p.Ala123Thr")

('Ala', 123, 'Thr')

In [271]:
parse_protein_variant("p.Ala123Ter"), parse_protein_variant("p.Ala123*")

(('Ala', 123, 'Ter'), ('Ala', 123, '*'))

In [272]:
parse_protein_variant("p.Leu2_Met124del"), parse_protein_variant("p.Met1_Leu2insArgSerThrVal")

ValueError: Looks like variant causes a new translation initiation site

In [None]:
parse_protein_variant("p.Ter327Arg")

('Ter', 327, 'Arg')

# Non-protein variants

In [273]:
nonprotein_variant_mask = clinvar_snv['Name'].apply(lambda x: not is_protein_variant(x))

In [274]:
nonprotein_variant_mask.sum()

np.int64(1099086)

In [275]:
clinvar_snv[nonprotein_variant_mask].iloc[0].Name

'NM_000410.4(HFE):c.892+48G>A'

# Missense Variants

In [276]:
missense_mask = clinvar_snv.Name.swifter.apply(is_missense)

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

In [277]:
clinvar_missense = clinvar_snv[missense_mask]

In [278]:
clinvar_missense.shape

(2889704, 40)

In [279]:
clinvar_missense.iloc[0].Name

'NM_014630.3(ZNF592):c.3136G>A (p.Gly1046Arg)'

# Silent Variants

In [280]:
silent_mask = clinvar_snv.Name.swifter.apply(is_silent)
clinvar_silent = clinvar_snv[silent_mask]

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

In [281]:
silent_mask.sum()

np.int64(1213559)

In [282]:
clinvar_snv[silent_mask].iloc[0].Name

'NM_000374.5(UROD):c.942G>A (p.Glu314=)'

In [283]:
clinvar_silent[clinvar_silent.ClinicalSignificance.isin(set(("Pathogenic", "Likely pathogenic", "Pathogenic/Likely pathogenic"))) & \
               ~clinvar_silent.ReviewStatus.isin({'no clasification provided',
                                                  'no assertion criteria provided',
                                                  'no classification for the single variant'})].shape

(1092, 40)

# Nonsense

In [284]:
nonsense_mask = clinvar_snv.Name.swifter.apply(is_nonsense)
clinvar_snv[nonsense_mask].shape

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

(134097, 40)

In [285]:
clinvar_snv[nonsense_mask].iloc[0].Name

'NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter)'

# Unknown Alt-AA

In [286]:
unknown_mask = clinvar_snv.Name.swifter.apply(is_unknown)
clinvar_snv[unknown_mask].shape

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

(18, 40)

In [287]:
clinvar_snv[unknown_mask].iloc[0].Name

'NM_000558.5(HBA1):c.43T>Y (p.Trp15Xaa)'

# Other cases (e.g. extensions)

In [288]:
other_protein_variant_mask = clinvar_snv.Name.swifter.apply(is_other_protein_variant)
clinvar_snv[other_protein_variant_mask].shape

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

(2, 40)

In [289]:
clinvar_snv[other_protein_variant_mask].iloc[0].Name

'NM_015697.9(COQ2):c.30G>A (p.Arg10_Lys11=)'

# Check that all SNVs are accounted for and all categorizations are mutually exclusive

In [290]:
clinvar_snv.shape[0], (missense_mask | silent_mask | nonsense_mask | unknown_mask | other_protein_variant_mask | nonprotein_variant_mask).sum().item()

(5336466, 5336466)

In [291]:
(np.stack([missense_mask, silent_mask, nonsense_mask, unknown_mask, other_protein_variant_mask, nonprotein_variant_mask], axis=1).sum(axis=1) == 1).all()

np.True_

# Summary

In [292]:
categorization = pd.Series(dict(missense=missense_mask.sum(),
                silent=silent_mask.sum(),
                nonsense=nonsense_mask.sum(),
                unknown=unknown_mask.sum(),
                other=other_protein_variant_mask.sum(),
                non_protein_variant=nonprotein_variant_mask.sum())).sort_values()

In [293]:
categorization

other                        2
unknown                     18
nonsense                134097
non_protein_variant    1099086
silent                 1213559
missense               2889704
dtype: int64

In [294]:
def is_valid_refseq_accession(refseq_string):
    """
    Check if a string is a valid RefSeq nucleotide accession number.

    Args:
        refseq_string (str): The string to verify.

    Returns:
        bool: True if the string is a valid RefSeq nucleotide accession, False otherwise.
    """
    # Regular expression pattern for RefSeq nucleotide accession (with optional version)
    # pattern = re.compile(r'^(NM|NP|NR|NG|NC|NT|XM|XP|XR|XT|NC|NG|NT|NR)_\d{6,10}(\.\d+)?$')
    
    # Regular expression pattern for RefSeq nucleotide accession (with required version)
    pattern = re.compile(r'^(NM|NP|NR|NG|NC|NT|XM|XP|XR|XT|NC|NG|NT|NR)_\d{6,10}(\.)(\d+)?$')
    
    # Check if the string matches the pattern
    try:
        return bool(pattern.match(refseq_string))
    except TypeError:
        raise TypeError(f"Expected a string, but got {type(refseq_string)}")

In [306]:
import re

def parse_clinvar_record(clinvar_string):
    """
    Extract transcript, gene symbol (if exists), DNA sequence substitution, and protein substitution (if exists) from a ClinVar record name.
    
    Args:
        clinvar_string (str): The ClinVar record name (e.g., "NM_015697.9(COQ2):c.30G>A (p.Arg10_Lys11=)" or "c.30G>A (p.Arg10_Lys11=)" or "NM_015697.9:c.30G>A" or "NM_015697.9(COQ2):c.30G>A").
    
    Returns:
        dict: A dictionary containing 'transcript', 'gene_symbol', 'dna_substitution', and 'protein_substitution'.
    """
    # Regular expression pattern to match the ClinVar record format with optional transcript, gene symbol, and protein substitution
    pattern = re.compile(
        r'(?:(?P<transcript>NM_\d+\.\d+|\w+_\d+))?(?:\s*(?:\((?P<gene_symbol>[^\)]+)\)|(?P<gene_symbol_no_paren>[^\s:]+)))?:(?P<dna_substitution>c\.[^\s]+)(?:\s*\((?P<protein_substitution>p\.[^\)]+)\))?'
    )
    
    # Search for the pattern in the input string
    match = pattern.search(clinvar_string)
    
    if match:
        # Extract the parts from the match groups
        transcript = match.group('transcript') or ''
        gene_symbol = match.group('gene_symbol') or match.group('gene_symbol_no_paren') or ''
        dna_substitution = match.group('dna_substitution')
        protein_substitution = match.group('protein_substitution') or ''
        
        return {
            'transcript': transcript,
            'gene_symbol': gene_symbol,
            'dna_substitution': dna_substitution,
            'protein_substitution': protein_substitution
        }
    else:
        # Return None or raise an error if the format is incorrect
        return {
            'transcript': '',
            'gene_symbol': '',
            'dna_substitution': '',
            'protein_substitution': ''}

# Example usage
clinvar_strings = [
    "NM_015697.9(COQ2):c.30G>A (p.Arg10_Lys11=)",
    "NM_015697.9:c.30G>A (p.Arg10_Lys11=)",
    "c.30G>A (p.Arg10_Lys11=)", # X
    "NM_000038.5(GENE):c.123A>T (p.Asn41=)",
    "NM_000038.5:c.123A>T (p.Asn41=)",
    "c.123A>T (p.Asn41=)", # X
    "NM_015697.9:c.30G>A (p.Arg10_Lys11=)",
    "FOXRED1: (p.Arg10_Lys11=)", # X
    "NM_015697.9(COQ2):c.30G>A",  # Protein substitution missing
    "NM_015697.9(ABC):c.30G>A",         # Protein substitution and gene symbol missing
    "NC_000023.10:c.33038255C>A",
    "NG_023416.2:g.(88604_176551)_(176714_576894)del" # X
]

for clinvar_string in clinvar_strings:
    try:
        parsed_result = parse_clinvar_record(clinvar_string)
        print(parsed_result)
    except ValueError as e:
        print(e)

{'transcript': 'NM_015697.9', 'gene_symbol': 'COQ2', 'dna_substitution': 'c.30G>A', 'protein_substitution': 'p.Arg10_Lys11='}
{'transcript': 'NM_015697.9', 'gene_symbol': '', 'dna_substitution': 'c.30G>A', 'protein_substitution': 'p.Arg10_Lys11='}
{'transcript': '', 'gene_symbol': '', 'dna_substitution': '', 'protein_substitution': ''}
{'transcript': 'NM_000038.5', 'gene_symbol': 'GENE', 'dna_substitution': 'c.123A>T', 'protein_substitution': 'p.Asn41='}
{'transcript': 'NM_000038.5', 'gene_symbol': '', 'dna_substitution': 'c.123A>T', 'protein_substitution': 'p.Asn41='}
{'transcript': '', 'gene_symbol': '', 'dna_substitution': '', 'protein_substitution': ''}
{'transcript': 'NM_015697.9', 'gene_symbol': '', 'dna_substitution': 'c.30G>A', 'protein_substitution': 'p.Arg10_Lys11='}
{'transcript': '', 'gene_symbol': '', 'dna_substitution': '', 'protein_substitution': ''}
{'transcript': 'NM_015697.9', 'gene_symbol': 'COQ2', 'dna_substitution': 'c.30G>A', 'protein_substitution': ''}
{'transcri

In [307]:
missense_parse = clinvar_snv[clinvar_snv.Name.swifter.apply(is_missense)].Name.swifter.apply(parse_clinvar_record)

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2889704 [00:00<?, ?it/s]

In [308]:
missense_parse_df = pd.DataFrame([d for d in missense_parse],)

In [309]:
missense_parse_df

Unnamed: 0,transcript,gene_symbol,dna_substitution,protein_substitution
0,NM_014630.3,ZNF592,c.3136G>A,p.Gly1046Arg
1,NM_014630.3,ZNF592,c.3136G>A,p.Gly1046Arg
2,NM_017547.4,FOXRED1,c.1289A>G,p.Asn430Ser
3,NM_017547.4,FOXRED1,c.1289A>G,p.Asn430Ser
4,NM_025152.3,NUBPL,c.166G>A,p.Gly56Arg
...,...,...,...,...
2889699,NM_001082971.2,DDC,c.782G>T,p.Cys261Phe
2889700,NM_001082971.2,DDC,c.1060G>A,p.Gly354Ser
2889701,NM_001082971.2,DDC,c.1060G>A,p.Gly354Ser
2889702,NM_001165967.2,HES7,c.113T>C,p.Leu38Pro


In [310]:
clinvar_silent_parse = clinvar_snv[silent_mask].Name.swifter.apply(parse_clinvar_record)

Pandas Apply:   0%|          | 0/1213559 [00:00<?, ?it/s]

In [313]:
silent_parse_df = pd.DataFrame([d for d in clinvar_silent_parse],)

In [314]:
silent_parse_df

Unnamed: 0,transcript,gene_symbol,dna_substitution,protein_substitution
0,NM_000374.5,UROD,c.942G>A,p.Glu314=
1,NM_000374.5,UROD,c.942G>A,p.Glu314=
2,NM_000274.4,OAT,c.1134C>T,p.Asn378=
3,NM_000274.4,OAT,c.1134C>T,p.Asn378=
4,NM_003730.6,RNASET2,c.567G>A,p.Gln189=
...,...,...,...,...
1213554,NM_182925.5,FLT4,c.1548G>A,p.Lys516=
1213555,NM_022893.4,BCL11A,c.384A>G,p.Ala128=
1213556,NM_022893.4,BCL11A,c.384A>G,p.Ala128=
1213557,NM_000533.5,PLP1,c.384C>T,p.Gly128=


In [None]:
fail_cases = clinvar_variant_summary[(clinvar_variant_summary.Type == "single nucleotide variant") & (clinvar_variant_summary.Name.swifter.apply(parse_clinvar_record).str.len() == 0)]

Pandas Apply:   0%|          | 0/5911112 [00:00<?, ?it/s]

In [None]:
fail_cases.Name.shape

(18037,)

In [None]:
parse_res = [d for d in clinvar_snv.Name.swifter.apply(parse_clinvar_record) if len(d)]

Pandas Apply:   0%|          | 0/5336466 [00:00<?, ?it/s]

In [None]:
parse_res = pd.DataFrame.from_records(parse_res)