# UniProt Parser

从 UniProt 检索并下载包含 Ensembl 在内的所有感兴趣的 UniProt 数据，获取每个 UniProt Accession 对应的 RefSeq ID（存在一对多的情况）。

In [350]:
import re 
from typing import Any, List
import pandas as pd  
import sys
from proture.download.utils import download_url
from proture.utils import explode
from functools import reduce
# Let pandas display up to 100 columns before truncating wide tables.
pd.options.display.max_columns = 100

def parse_alternative_text(text:str):
    """Return the IsoId of the isoform marked ``Sequence=Displayed``.

    The text is cut at every ``Name=`` marker so that each piece describes a
    single isoform; the first piece containing ``Sequence=Displayed`` supplies
    the ``IsoId=`` value.  Parsing piece by piece (instead of one greedy match
    over the whole text) keeps the result limited to one isoform even when the
    displayed isoform is not the first one listed.

    Args:
        text: Content of an "Alternative products (isoforms)" cell, or a
            missing value.

    Returns:
        The text between ``IsoId=`` and the next ``;`` of the displayed
        isoform, or ``"default"`` when ``text`` is missing or no displayed
        isoform with an IsoId is found.
    """
    if pd.isna(text):
        return "default"

    # Everything before the first "Name=" is header text, not an isoform.
    for isoform_text in text.split("Name=")[1:]:
        if "Sequence=Displayed" not in isoform_text:
            continue
        iso_id = re.search(r"IsoId=([^;]+);", isoform_text)
        if iso_id:
            return iso_id.group(1).strip()

    return "default"
        
def dict_RefSeq(x:str):
    """Group the RefSeq IDs of one "RefSeq" cell by isoform tag.

    The cell is a ``;``-separated list of entries, each either ``ID`` or
    ``ID [ISOFORM]``, e.g. ``"NP_001165886.1 [Q99933-4];NP_004314.5 [Q99933-1];"``.
    Every entry is parsed on its own, so an ID without a tag is filed under
    ``"default"`` rather than being attached to the tag of a later entry.

    Args:
        x: Content of a "RefSeq" cell, or a missing value.

    Returns:
        dict mapping an isoform tag (or ``"default"`` for untagged IDs) to the
        list of RefSeq IDs carrying that tag, in order of appearance.  A
        missing or empty cell yields ``{"default": []}``.
    """
    if pd.isna(x):
        return {"default": []}

    # "<id>" optionally followed by "[<isoform>]".
    entry_pattern = re.compile(r"^([^\s\[\]]+)\s*(?:\[([^\]]*)\])?$")

    isoform_RefSeq_mapping_dict = {}
    for raw_entry in x.split(";"):
        entry = raw_entry.strip()
        if not entry:
            # Produced by the trailing ";" or by empty fields.
            continue
        parsed = entry_pattern.match(entry)
        if parsed:
            refseq_id = parsed.group(1)
            isoform_id = parsed.group(2) or "default"
        else:
            # Unexpected layout: keep the text instead of failing the whole cell.
            refseq_id, isoform_id = entry, "default"
        isoform_RefSeq_mapping_dict.setdefault(isoform_id, []).append(refseq_id)

    if not isoform_RefSeq_mapping_dict:
        return {"default": []}
    return isoform_RefSeq_mapping_dict

def load_LRG_RefSeqGene(local_dir:str = None, save_dir:str = "./LRG_RefSeqGene"):
    """Read the LRG_RefSeqGene table as a tab-separated DataFrame.

    Args:
        local_dir: Path of an existing LRG_RefSeqGene file.  When ``None``,
            ``download_url`` is called first with ``save_dir`` as its
            ``output_file`` and the table is read from ``save_dir``.
        save_dir: Output path handed to ``download_url`` when ``local_dir``
            is ``None``.

    Returns:
        pd.DataFrame parsed from the file with ``sep="\\t"`` and
        ``index_col=False``.
    """
    table_path = local_dir
    if table_path is None:
        # see https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/
        download_url(
            url = "https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/LRG_RefSeqGene",
            output_file = save_dir,
        )
        table_path = save_dir

    return pd.read_csv(table_path, sep="\t", index_col=False)

def add_lists(two_dimension_list:List[List[Any]]):
    """Concatenate a list of lists into one flat list.

    Args:
        two_dimension_list: The lists to join, in order.

    Returns:
        A new list holding every element of every inner list; ``[]`` when
        ``two_dimension_list`` is empty (a bare ``reduce`` would raise
        ``TypeError`` in that case).
    """
    # A single pass avoids the repeated copying of chained ``+`` operations.
    return [item for inner in two_dimension_list for item in inner]


In [374]:
def UniProt2NP(homo_sapians_uniprot_df:pd.DataFrame, contain_all_uniprot_isoform:bool = False, column:str = "Alternative products (isoforms)", lrg_refseqgene_dir:str = "/p300s/wangmx_group/xutingfeng/statistic/proture/data/RefSeq/LRG_RefSeqGene"):
    """Map UniProt entries to RefSeq protein/transcript pairs via LRG_RefSeqGene.

    See https://ftp.ncbi.nlm.nih.gov/refseq/H_sapiens/RefSeqGene/ for the LRG
    file.

    Args:
        homo_sapians_uniprot_df (pd.DataFrame): One row per UniProt entry.  Must
            contain a "RefSeq" column, the isoform column named by ``column``
            and a "Gene Names" column (used for the summary print).  The frame
            is copied; the caller's object is not modified.
        contain_all_uniprot_isoform (bool, optional): If True, keep every
            RefSeq ID listed for the entry, whatever isoform it is tagged
            with.  If False, keep only the RefSeq IDs tagged with the isoform
            returned by ``parse_alternative_text``.  Defaults to False.
        column (str, optional): Name of the column holding the alternative
            products text.  Defaults to "Alternative products (isoforms)".
        lrg_refseqgene_dir (str, optional): Path passed to
            ``load_LRG_RefSeqGene`` as ``local_dir``.

    Returns:
        pd.DataFrame: The input columns plus "canonical RefSeq protein" (one
        RefSeq ID per row after exploding) and the "RNA"/"Protein" columns of
        the LRG table, inner-joined on the protein ID.
    """
    # Use the frame that was passed in; copy it so added columns stay local.
    uniprot_df = homo_sapians_uniprot_df.copy()

    # step1 parse the displayed isoform of each entry and build its isoform -> RefSeq mapping
    canonical_isoforms = uniprot_df[column].apply(parse_alternative_text)
    isoform_to_refseq = uniprot_df["RefSeq"].apply(dict_RefSeq)

    # step2 pick the RefSeq IDs of the displayed isoform, or of all isoforms
    if contain_all_uniprot_isoform:
        selected_refseq = [add_lists(list(mapping.values())) for mapping in isoform_to_refseq]
    else:
        # ``None`` marks entries without a RefSeq ID for the displayed isoform;
        # those rows are dropped before the merge below.
        selected_refseq = [
            mapping.get(isoform_id, None)
            for mapping, isoform_id in zip(isoform_to_refseq, canonical_isoforms)
        ]
    uniprot_df["canonical RefSeq protein"] = pd.Series(selected_refseq, index=uniprot_df.index, dtype=object)

    # step3 explode multi-mapping relationship between UniProt entry and RefSeq to multi-row.
    uniprot_df = uniprot_df.explode("canonical RefSeq protein")

    # step 4 Convert NP to NM by LRG_RefSeqGene file
    LRG_RefSeqGene = load_LRG_RefSeqGene(local_dir = lrg_refseqgene_dir)
    NM2NP_mapping = LRG_RefSeqGene.loc[:, ["RNA", "Protein"]].drop_duplicates().dropna(how="any")

    hsapians_uniprot_NM = pd.merge(
        left = uniprot_df.dropna(subset=["canonical RefSeq protein"], axis=0),
        right = NM2NP_mapping,
        left_on = "canonical RefSeq protein",
        right_on = "Protein",
        how = "inner",
    )

    # NOTE(review): ``explode`` is a project helper; presumably it splits
    # "Gene Names" on ``sep`` into one row per name - confirm that ";" is the
    # separator actually used in this column.
    print(f'now have gene:{len(explode(hsapians_uniprot_NM, column="Gene Names", sep =";")["Gene Names"].unique())} in the dataset')
    return hsapians_uniprot_NM

In [375]:
# TSV exported from a UniProt query (the query included the Ensembl field).
homo_sapians_uniprot_dir = "/p300s/wangmx_group/xutingfeng/statistic/proture/data/UniProt/uniprot_homo_sapians.tsv"
homo_sapians_uniprot_df = pd.read_csv(
    homo_sapians_uniprot_dir,
    sep="\t",
    index_col=False,
)

In [381]:
# Keep the result: the ClinVar merge further down reads ``hsapians_uniprot_NM``.
# The isoform column is left at the function's built-in default
# ("Alternative products (isoforms)").
hsapians_uniprot_NM = UniProt2NP(homo_sapians_uniprot_df = homo_sapians_uniprot_df, contain_all_uniprot_isoform=True)
hsapians_uniprot_NM

now have gene:3105 in the dataset


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,RefSeq,Alternative products (isoforms),canonical RefSeq protein,RNA,Protein
0,Q99933,Q99933,reviewed,BAG1_HUMAN,BAG family molecular chaperone regulator 1 (BA...,BAG1 HAP,NP_001165886.1 [Q99933-4];NP_004314.5 [Q99933-1];,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001165886.1,NM_001172415.2,NP_001165886.1
1,Q9NZK5,Q9NZK5,reviewed,ADA2_HUMAN,Adenosine deaminase 2 (EC 3.5.4.4) (Cat eye sy...,ADA2 ADGF CECR1 IDGFL,NP_001269154.1 [Q9NZK5-1];NP_001269155.1 [Q9NZ...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001269154.1,NM_001282225.2,NP_001269154.1
2,Q9NZK5,Q9NZK5,reviewed,ADA2_HUMAN,Adenosine deaminase 2 (EC 3.5.4.4) (Cat eye sy...,ADA2 ADGF CECR1 IDGFL,NP_001269154.1 [Q9NZK5-1];NP_001269155.1 [Q9NZ...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001269155.1,NM_001282226.2,NP_001269155.1
3,Q9NZK5,Q9NZK5,reviewed,ADA2_HUMAN,Adenosine deaminase 2 (EC 3.5.4.4) (Cat eye sy...,ADA2 ADGF CECR1 IDGFL,NP_001269154.1 [Q9NZK5-1];NP_001269155.1 [Q9NZ...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001269156.1,NM_001282227.2,NP_001269156.1
4,Q9NZK5,Q9NZK5,reviewed,ADA2_HUMAN,Adenosine deaminase 2 (EC 3.5.4.4) (Cat eye sy...,ADA2 ADGF CECR1 IDGFL,NP_001269154.1 [Q9NZK5-1];NP_001269155.1 [Q9NZ...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001269157.1,NM_001282228.2,NP_001269157.1
...,...,...,...,...,...,...,...,...,...,...,...
8754,Q9NXR8,Q9NXR8,reviewed,ING3_HUMAN,Inhibitor of growth protein 3 (p47ING3),ING3 HSPC301,NP_061944.2 [Q9NXR8-1];NP_938008.1 [Q9NXR8-2];,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_061944.2,NM_019071.3,NP_061944.2
8755,Q9NXR8,Q9NXR8,reviewed,ING3_HUMAN,Inhibitor of growth protein 3 (p47ING3),ING3 HSPC301,NP_061944.2 [Q9NXR8-1];NP_938008.1 [Q9NXR8-2];,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_938008.1,NM_198267.2,NP_938008.1
8756,Q9HBX9,Q9HBX9,reviewed,RXFP1_HUMAN,Relaxin receptor 1 (Leucine-rich repeat-contai...,RXFP1 LGR7,NP_001240657.1 [Q9HBX9-2];NP_001240658.1 [Q9HB...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001240657.1,NM_001253728.2,NP_001240657.1
8757,Q9HBX9,Q9HBX9,reviewed,RXFP1_HUMAN,Relaxin receptor 1 (Leucine-rich repeat-contai...,RXFP1 LGR7,NP_001240657.1 [Q9HBX9-2];NP_001240658.1 [Q9HB...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,NP_001240658.1,NM_001253729.2,NP_001240658.1


In [325]:
import re 
def splitName2Attribution(x):
    """Split an HGVS-style variant name into accession, protein change and coding change.

    HGVS nomenclature: https://varnomen.hgvs.org/recommendations/general/
    RefSeq: https://en.wikipedia.org/wiki/RefSeq

    Args:
        x: Variant name such as "NM_017547.4(FOXRED1):c.694C>T (p.Gln232Ter)".

    Returns:
        Tuple ``(ref, AAS, SNP)``, e.g. ``("NM_017547.4", "Gln232Ter", "694C>T")``:
        the accession (letters, underscore, digits, optional ".version"), the
        text after a literal "p." and the text after a literal "c.", the last
        two running up to the next whitespace or parenthesis.  A part that is
        not found is ``None``; a non-string ``x`` yields three ``None`` values.
    """
    if not isinstance(x, str):
        return None, None, None

    # The dots are escaped so that only a literal "p." / "c." prefix matches,
    # not an arbitrary "p" or "c" followed by any character (e.g. in a gene symbol).
    ref_match = re.search(r"[A-Za-z]+_[0-9]+(?:\.[0-9]+)?", x)
    AAS_match = re.search(r"(?<=p\.)[^\s()]*", x)
    SNP_match = re.search(r"(?<=c\.)[^\s()]*", x)

    ref = ref_match.group() if ref_match else None
    AAS = AAS_match.group() if AAS_match else None
    SNP = SNP_match.group() if SNP_match else None
    return ref, AAS, SNP

In [326]:
# Gzip-compressed, tab-separated ClinVar variant summary.
clinvar_variant_summary_dir = "/p300s/wangmx_group/xutingfeng/statistic/proture/data/clinvar/variant_summary_2018-02_filtertxt.gz"
clinvar_variant_summary = pd.read_csv(
    clinvar_variant_summary_dir,
    compression="gzip",
    sep="\t",
)
# Put the accession parsed from "Name" into a new column at position 3.
clinvar_variant_summary.insert(
    loc=3,
    column="transcript_accession",
    value=clinvar_variant_summary["Name"].apply(lambda name: splitName2Attribution(name)[0]),
)
clinvar_variant_summary

Unnamed: 0,AlleleID,Type,Name,transcript_accession,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),nsv/esv (dbVar),RCVaccession,PhenotypeIDS,PhenotypeList,Origin,OriginSimple,Assembly,ChromosomeAccession,Chromosome,Start,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories
0,15044,single nucleotide variant,NM_017547.3(FOXRED1):c.694C>T (p.Gln232Ter),NM_017547.3,55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606829,-,RCV000000015,"MedGen:C1838979,OMIM:252010",Mitochondrial complex I deficiency,germline,germline,GRCh37,NC_000011.9,11,126145284,126145284,C,T,11q24,no assertion criteria provided,1,,N,OMIM Allelic Variant:613622.0001,1
1,15045,single nucleotide variant,NM_017547.3(FOXRED1):c.1289A>G (p.Asn430Ser),NM_017547.3,55572,FOXRED1,HGNC:26927,Pathogenic,1,"Oct 01, 2010",267606830,-,RCV000000016,"MedGen:C1838979,OMIM:252010",Mitochondrial complex I deficiency,germline,germline,GRCh37,NC_000011.9,11,126147412,126147412,A,G,11q24,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613622.0002,UniProtKB (pr...",1
2,15051,single nucleotide variant,NM_000410.3(HFE):c.314T>C (p.Ile105Thr),NM_000410.3,3077,HFE,HGNC:4886,Pathogenic,1,"Jun 01, 1999",28934596,-,RCV000000029,"MedGen:C3469186,OMIM:235200",Hemochromatosis type 1,germline,germline,GRCh37,NC_000006.11,6,26091306,26091306,T,C,6p22.2,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613609.0009,UniProtKB (pr...",1
3,15052,single nucleotide variant,NM_000410.3(HFE):c.277G>C (p.Gly93Arg),NM_000410.3,3077,HFE,HGNC:4886,Pathogenic,1,"Jun 01, 1999",28934597,-,RCV000000030,"MedGen:C3469186,OMIM:235200",Hemochromatosis type 1,germline,germline,GRCh37,NC_000006.11,6,26091269,26091269,G,C,6p22.2,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613609.0010,UniProtKB (pr...",1
4,15056,single nucleotide variant,NM_000410.3(HFE):c.381A>C (p.Gln127His),NM_000410.3,3077,HFE,HGNC:4886,Pathogenic,1,"Aug 01, 1999",28934595,-,RCV000000034,"MedGen:C3469186,OMIM:235200",Hemochromatosis type 1,germline,germline,GRCh37,NC_000006.11,6,26091582,26091582,A,C,6p22.2,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613609.0007,UniProtKB (pr...",1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37299,481081,single nucleotide variant,NM_001110556.1(FLNA):c.853C>T (p.Arg285Cys),NM_001110556.1,2316,FLNA,HGNC:3754,Pathogenic,1,-,-1,-,RCV000577903,"MedGen:C1848213,OMIM:300049,OMIM:300537,SNOMED...",Periventricular nodular heterotopia 1,inherited,germline,GRCh37,NC_000023.10,X,153595780,153595780,G,A,Xq28,no assertion criteria provided,1,,N,-,2
37300,481082,single nucleotide variant,NM_001110556.1(FLNA):c.82A>G (p.Met28Val),NM_001110556.1,2316,FLNA,HGNC:3754,Pathogenic,1,-,-1,-,RCV000577884,"MedGen:C1848213,OMIM:300049,OMIM:300537,SNOMED...",Periventricular nodular heterotopia 1,de novo,germline,GRCh37,NC_000023.10,X,153599532,153599532,T,C,Xq28,no assertion criteria provided,1,,N,-,2
37301,481084,single nucleotide variant,NM_000049.2(ASPA):c.604G>C (p.Ala202Pro),NM_000049.2,443,ASPA,HGNC:756,Likely pathogenic,1,"Sep 28, 2017",147763700,-,RCV000577927,"MedGen:C0206307,OMIM:271900,Orphanet:ORPHA141,...",Spongy degeneration of central nervous system,germline,germline,GRCh37,NC_000017.10,17,3392606,3392606,G,C,17p13.2,"criteria provided, single submitter",1,,N,-,2
37302,481124,single nucleotide variant,NM_015120.4(ALMS1):c.5146A>T (p.Arg1716Ter),NM_015120.4,7840,ALMS1,HGNC:428,Likely pathogenic,1,"Aug 01, 2017",773513360,-,RCV000578023,"MedGen:C0268425,OMIM:203800,Orphanet:ORPHA64,S...",Alstrom syndrome,germline,germline,GRCh37,NC_000002.11,2,73678797,73678797,A,T,2p13.1,"criteria provided, single submitter",1,,N,-,2


In [371]:
# Keep only ClinVar rows whose transcript accession equals an "RNA" value of
# the UniProt/RefSeq table (inner join).
clinvar_on_canonical = clinvar_variant_summary.merge(
    hsapians_uniprot_NM,
    how="inner",
    left_on="transcript_accession",
    right_on="RNA",
)
clinvar_on_canonical

Unnamed: 0,AlleleID,Type,Name,transcript_accession,GeneID,GeneSymbol,HGNC_ID,ClinicalSignificance,ClinSigSimple,LastEvaluated,RS# (dbSNP),nsv/esv (dbVar),RCVaccession,PhenotypeIDS,PhenotypeList,Origin,OriginSimple,Assembly,ChromosomeAccession,Chromosome,Start,Stop,ReferenceAllele,AlternateAllele,Cytogenetic,ReviewStatus,NumberSubmitters,Guidelines,TestedInGTR,OtherIDs,SubmitterCategories,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,RefSeq,Alternative products (isoforms),canonical isoforms,all_canonical_refseq,RNA,Protein
0,15156,single nucleotide variant,NM_000071.2(CBS):c.919G>A (p.Gly307Ser),NM_000071.2,875,CBS,HGNC:1550,Pathogenic,1,"Jul 19, 2017",121964962,-,RCV000000138;RCV000366433;RCV000173641;RCV0000...,"na;MedGen:C0019880;MedGen:C3150344,OMIM:236200...","HYPERHOMOCYSTEINEMIA, THROMBOTIC, CBS-RELATED;...",germline;unknown,germline,GRCh37,NC_000021.8,21,44483098,44483098,C,T,21q22.3,"criteria provided, multiple submitters, no con...",6,,N,"HGMD:CM930082,OMIM Allelic Variant:613381.0001...",3,P35520,P35520,reviewed,CBS_HUMAN,Cystathionine beta-synthase (EC 4.2.1.22) (Bet...,CBS,NP_000062.1 [P35520-1];NP_001171479.1 [P35520-...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P35520-1,NP_000062.1,NM_000071.2,NP_000062.1
1,15157,single nucleotide variant,NM_000071.2(CBS):c.434C>T (p.Pro145Leu),NM_000071.2,875,CBS,HGNC:1550,Pathogenic,1,"Jun 01, 1993",121964963,-,RCV000000139,MedGen:CN068394,"Homocystinuria, pyridoxine-responsive",germline,germline,GRCh37,NC_000021.8,21,44486370,44486370,G,A,21q22.3,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613381.0002,UniProtKB (pr...",1,P35520,P35520,reviewed,CBS_HUMAN,Cystathionine beta-synthase (EC 4.2.1.22) (Bet...,CBS,NP_000062.1 [P35520-1];NP_001171479.1 [P35520-...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P35520-1,NP_000062.1,NM_000071.2,NP_000062.1
2,15159,single nucleotide variant,NM_000071.2(CBS):c.833T>C (p.Ile278Thr),NM_000071.2,875,CBS,HGNC:1550,Pathogenic,1,"Oct 31, 2017",5742905,-,RCV000000142;RCV000379069;RCV000173640;RCV0000...,"na;MedGen:C0019880;MedGen:C3150344,OMIM:236200...","HYPERHOMOCYSTEINEMIA, THROMBOTIC, CBS-RELATED;...",germline;unknown,germline,GRCh37,NC_000021.8,21,44483184,44483184,A,G,21q22.3,"criteria provided, multiple submitters, no con...",9,,N,"HGMD:CM920136,OMIM Allelic Variant:613381.0004...",3,P35520,P35520,reviewed,CBS_HUMAN,Cystathionine beta-synthase (EC 4.2.1.22) (Bet...,CBS,NP_000062.1 [P35520-1];NP_001171479.1 [P35520-...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P35520-1,NP_000062.1,NM_000071.2,NP_000062.1
3,15160,single nucleotide variant,NM_000071.2(CBS):c.415G>A (p.Gly139Arg),NM_000071.2,875,CBS,HGNC:1550,Pathogenic,1,"Jul 01, 1995",121964965,-,RCV000000143,MedGen:CN068394,"Homocystinuria, pyridoxine-responsive",germline,germline,GRCh37,NC_000021.8,21,44486389,44486389,C,T,21q22.3,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613381.0005,UniProtKB (pr...",1,P35520,P35520,reviewed,CBS_HUMAN,Cystathionine beta-synthase (EC 4.2.1.22) (Bet...,CBS,NP_000062.1 [P35520-1];NP_001171479.1 [P35520-...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P35520-1,NP_000062.1,NM_000071.2,NP_000062.1
4,15162,single nucleotide variant,NM_000071.2(CBS):c.1150A>G (p.Lys384Glu),NM_000071.2,875,CBS,HGNC:1550,Pathogenic,1,"Jan 01, 1997",121964967,-,RCV000000145,MedGen:CN068394,"Homocystinuria, pyridoxine-responsive",germline,germline,GRCh37,NC_000021.8,21,44479409,44479409,T,C,21q22.3,no assertion criteria provided,1,,N,"OMIM Allelic Variant:613381.0007,UniProtKB (pr...",1,P35520,P35520,reviewed,CBS_HUMAN,Cystathionine beta-synthase (EC 4.2.1.22) (Bet...,CBS,NP_000062.1 [P35520-1];NP_001171479.1 [P35520-...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P35520-1,NP_000062.1,NM_000071.2,NP_000062.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10269,432279,single nucleotide variant,NM_002389.4(CD46):c.685C>T (p.Arg229Ter),NM_002389.4,4179,CD46,HGNC:6953,Likely pathogenic,1,"Apr 20, 2017",-1,-,RCV000505668,"MedGen:C2752040,OMIM:612922,Orphanet:ORPHA93576",Atypical hemolytic-uremic syndrome 2,germline,germline,GRCh37,NC_000001.10,1,207940369,207940369,C,T,1q32.2,no assertion criteria provided,1,,N,-,2,P15529,P15529,reviewed,MCP_HUMAN,Membrane cofactor protein (TLX) (Trophoblast l...,CD46 MCP MIC10,NP_002380.3 [P15529-1];NP_722548.1 [P15529-3];...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P15529-1,NP_002380.3,NM_002389.4,NP_002380.3
10270,437833,single nucleotide variant,NM_002389.4(CD46):c.1148C>T (p.Thr383Ile),NM_002389.4,4179,CD46,HGNC:6953,Likely pathogenic,1,"Feb 28, 2017",146803767,-,RCV000513625,MedGen:CN517202,not provided,germline,germline,GRCh37,NC_000001.10,1,207963618,207963618,C,T,1q32.2,"criteria provided, single submitter",1,,N,-,2,P15529,P15529,reviewed,MCP_HUMAN,Membrane cofactor protein (TLX) (Trophoblast l...,CD46 MCP MIC10,NP_002380.3 [P15529-1];NP_722548.1 [P15529-3];...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P15529-1,NP_002380.3,NM_002389.4,NP_002380.3
10271,437890,single nucleotide variant,NM_001198551.1(WT1):c.48C>A (p.Tyr16Ter),NM_001198551.1,7490,WT1,HGNC:12796,Likely pathogenic,1,"Jun 30, 2017",-1,-,RCV000512713,MedGen:CN517202,not provided,germline,germline,GRCh37,NC_000011.9,11,32450128,32450128,G,T,11p13,"criteria provided, single submitter",1,"ACMG2013,ACMG2016",N,-,2,P19544,P19544,reviewed,WT1_HUMAN,Wilms tumor protein (WT33),WT1,NP_000369.3;NP_001185480.1 [P19544-6];NP_00118...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P19544-1,NP_001185480.1,NM_001198551.1,NP_001185480.1
10272,438464,single nucleotide variant,NM_013995.2(LAMP2):c.1A>T (p.Met1Leu),NM_013995.2,3920,LAMP2,HGNC:6501,Likely pathogenic,1,"Mar 31, 2017",-1,-,RCV000512988,MedGen:CN517202,not provided,germline,germline,GRCh37,NC_000023.10,X,119603024,119603024,T,A,Xq24,"criteria provided, single submitter",1,,N,-,2,P13473,P13473,reviewed,LAMP2_HUMAN,Lysosome-associated membrane glycoprotein 2 (L...,LAMP2,NP_001116078.1 [P13473-3];NP_002285.1 [P13473-...,ALTERNATIVE PRODUCTS: Event=Alternative splic...,P13473-1,NP_054701.1,NM_013995.2,NP_054701.1


In [372]:
# Print the number of distinct "Gene Names" values in the merged table after it
# has gone through the project helper ``explode``.
# NOTE(review): presumably ``explode`` splits the column on ``sep``; the
# "Gene Names" values shown in this notebook's outputs are space-separated, so
# confirm that ";" is the intended separator.
print(len(explode(clinvar_on_canonical, column="Gene Names", sep=";")["Gene Names"].unique()))

370
