In [1]:
import pandas as pd
import numpy as np
import yaml
import re
from tqdm.autonotebook import tqdm
from pathlib import Path
pd.set_option('display.max_columns', 500)


  from tqdm.autonotebook import tqdm


In [2]:
evidenceDF = pd.read_csv("/data/dzeiberg/mave_calibration/clingenEvidenceRepo/erepo.tabbed.txt",delimiter='\t')
evidenceDF = evidenceDF.assign(GeneSymbol=evidenceDF.loc[:,'HGVS Expressions'].apply(lambda x: re.search(r"\(([^)]+)\)", x).group(1) if re.search(r"\(([^)]+)\)", x) else None))
evidenceDF = evidenceDF.assign(metEvidence=evidenceDF.loc[:,'Applied Evidence Codes (Met)'].astype(str).str.split(',').apply(lambda li: [str(x).strip() for x in li]))

In [3]:
with open("datasets.yaml") as f:
    datasets = yaml.safe_load(f)

In [4]:
genes = set([ds['gene'].upper() for ds in datasets.values() if 'gene' in ds])

In [5]:
evidenceDF = evidenceDF[evidenceDF.GeneSymbol.str.upper().isin(genes)]

In [6]:
datadir = Path("/data/dzeiberg/mave_calibration/processed_datasets")
scoresets = {}
for dataset_name, dsInfo in datasets.items():
    if "pipeline" in dsInfo:
        pipeline = dsInfo["pipeline"]
        scoreset_file = datadir / f"{dataset_name}_pipeline_{pipeline}.json"
        if scoreset_file.exists():
            scoresets[dataset_name] = pd.read_json(scoreset_file)

In [7]:
len(scoresets)

25

In [8]:
evidenceDF.head()

Unnamed: 0,#Variation,ClinVar Variation Id,Allele Registry Id,HGVS Expressions,HGNC Gene Symbol,Disease,Mondo Id,Mode of Inheritance,Assertion,Applied Evidence Codes (Met),Applied Evidence Codes (Not Met),Summary of interpretation,PubMed Articles,Expert Panel,Guideline,Approval Date,Published Date,Retracted,Evidence Repo Link,Uuid,GeneSymbol,metEvidence
82,NM_000314.6(PTEN):c.737C>T (p.Pro246Leu),142269,CA000559,"NM_000314.6:c.737C>T, NM_000314.5:c.737C>T, NM...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Pathogenic,"PM2, PS2_Very Strong, PS4, PP2",,PTEN c.737C>T (p.P246L) meets criteria to be c...,,PTEN VCEP,,2017-10-18,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,835a14e6-4a41-47df-a1ff-271c872adf39,PTEN,"[PM2, PS2_Very Strong, PS4, PP2]"
83,NM_000314.6(PTEN):c.-1170C>T,127662,CA151481,"NM_000314.6:c.-1170C>T, NM_001126049.1:c.-812G...",KLLN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Uncertain Significance,BP5,,PTEN c.-1170C>T (NC_000010.10: g.89623056C>T) ...,,PTEN VCEP,,2017-11-08,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,8c50af60-fe53-4c9b-b60a-3735d21c8645,PTEN,[BP5]
84,NM_000314.6(PTEN):c.209+3A>T,185989,CA000135,"NM_000314.6:c.209+3A>T, NM_000314.5:c.209+3A>T...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Uncertain Significance,PP3,"PM2, PM6, PVS1, BA1, BP4, BP2, PS3, PS1, PS2, ...",PTEN c.209+3A>T (IVS3+3A>T) is currently class...,,PTEN VCEP,,2020-03-23,2020-03-30,False,https://erepo.genome.network/evrepo/ui/classif...,049d3e03-b5ad-4a47-92e9-ee077179049a,PTEN,[PP3]
85,NM_000314.6(PTEN):c.235G>A (p.Ala79Thr),41682,CA000358,"NM_000314.6:c.235G>A, NM_000314.5:c.235G>A, NM...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Likely Benign,"BS2_Supporting, BS1, PP2",,PTEN c.235G>A (p.A79T) meets criteria to be cl...,,PTEN VCEP,,2018-04-06,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,0340272b-ab4c-4672-94b8-40e0525d8eda,PTEN,"[BS2_Supporting, BS1, PP2]"
86,NM_000314.6(PTEN):c.304_306dupAAA (p.Lys102_Pr...,142681,CA169101,"NM_000314.6:c.304_306dupAAA, NM_000314.5:c.304...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Uncertain Significance,PM2,,PTEN c.304_306dupAAA (p.K102_P103insK) is curr...,,PTEN VCEP,,2017-11-08,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,742e08c9-7bda-4006-95f1-22f34ec0d9e1,PTEN,[PM2]


In [9]:
evidenceDF.shape

(381, 22)

In [10]:
evidenceDF = evidenceDF.assign(Expressions=evidenceDF.loc[:,'HGVS Expressions'].str.split(', ').apply(tuple))

In [11]:
import re

def extract_transcript_and_variant(coding_string):
    # Regex pattern to match the transcript, allowing for optional parentheses (gene name) and capturing the variant
    pattern = r"(NM_\d+\.\d+)(?:\([A-Za-z0-9_]+\))?:?(c\.\d+[-]?\d*[A-Z]>[A-Z])"
    match = re.search(pattern, coding_string)
    if match:
        transcript = match.group(1)
        variant = match.group(2)
        return f"{transcript}:{variant}"
    return None

def extract_hgvs_nuc(coding_string):
    # Updated regex pattern to match both "c.1233T>G" and "c.135-1G>T" formats
    pattern = r"(c\.\d+[-]?\d*[A-Z]>[A-Z])"
    match = re.search(pattern, coding_string)
    if match:
        return match.group(1)
    return None

# Example usage
string1 = "NM_007294.4(BRCA1):c.1233T>G (p.Asp411Glu)"
string2 = "NM_007294.4:c.135-1G>T"

# Example usage
string1 = "NM_007294.4(BRCA1):c.1233T>G (p.Asp411Glu)"
string2 = "NM_007294.4:c.135-1G>T"
result1 = extract_transcript_and_variant(string1)
result2 = extract_transcript_and_variant(string2)

print(result1)  # Output: NM_007294.4:c.1233T>G
print(result2)  # Output: NM_007294.4:c.135-1G>T

def extract_amino_acid_substitution(coding_string):
    # Regex pattern to match the amino acid substitution in the form "p.Asp411Glu"
    pattern = r"(p\.[A-Z][a-z]{2}\d+[A-Z][a-z]{2})"
    match = re.search(pattern, coding_string)
    if match:
        return match.group(1)
    return None

# Example usage
string1 = "NM_007294.4(BRCA1):c.1233T>G (p.Asp411Glu)"
string2 = "NM_007294.4:c.135-1G>T (p.Gly12Ser)"
result1 = extract_amino_acid_substitution(string1)
result2 = extract_amino_acid_substitution(string2)

print(result1)  # Output: p.Asp411Glu
print(result2)  # Output: p.Gly12Ser

NM_007294.4:c.1233T>G
NM_007294.4:c.135-1G>T
p.Asp411Glu
p.Gly12Ser


In [12]:
evidenceDF = evidenceDF.assign(hgvs_nuc_full=evidenceDF.Expressions.apply(lambda li: tuple(list(set((extract_transcript_and_variant(x) for x in li)) - {None,}))),
                               hgvs_nuc=evidenceDF.Expressions.apply(lambda li: tuple(list(set((extract_hgvs_nuc(x) for x in li)) - {None,}))),
                               hgvs_pro=evidenceDF.Expressions.apply(lambda li: tuple(list(set((extract_amino_acid_substitution(x) for x in li)) - {None,}))))
evidenceDF = evidenceDF.assign(hgvs_pro=evidenceDF.hgvs_pro.apply(lambda x: x[0] if len(x) else ""))

In [13]:
scoreset_with_evidence = {}
for scoresetName, ss in scoresets.items():
    ss= ss.dropna(subset=['author_transcript','hgvs_pro'])
    ss = ss[ss.hgvs_pro.str.len() > 0]
    if 'author_transcript' not in ss.columns:
        ss = ss.assign(author_transcript=datasets[scoresetName]['author_transcript'])
    ss = ss.assign(avg_score=ss.scores.apply(lambda x: sum(x)/len(x)))
    sse = pd.merge(ss, evidenceDF[evidenceDF.GeneSymbol == datasets[scoresetName]['gene']], on=['hgvs_pro'],suffixes=('_mave', '_errepo'),how='inner')
    scoreset_with_evidence[scoresetName] = sse

In [14]:
for scoresetName, sse in scoreset_with_evidence.items():
    if sse.shape[0]:
        print(f"{scoresetName}: {sse.shape[0]:,d}")

Adamovich_BRCA1_Cisplatin: 2
Adamovich_BRCA1_HDR: 2
Findlay_BRCA1_SGE: 4
Hu_BRCA2_HDR: 5
Kato_TP53: 108
Giacomelli_1: 108
Giacomelli_2: 108
Giacomelli_3: 108
Mighell_PTEN: 114
Matreyek_PTEN_VampSeq: 73
Buckley_VHL_SGE: 22
Starita_BRCA1_HDR: 1


In [15]:
df = pd.concat([scoreset_with_evidence[dataset_name].loc[:,['hgvs_pro','avg_score','#Variation', 'ClinVar Variation Id', 'Allele Registry Id',
       'HGVS Expressions', 'HGNC Gene Symbol', 'Disease', 'Mondo Id',
       'Mode of Inheritance', 'Assertion', 'Applied Evidence Codes (Met)',
       'Applied Evidence Codes (Not Met)', 'Summary of interpretation',
       'PubMed Articles', 'Expert Panel', 'Guideline', 'Approval Date',
       'Published Date', 'Retracted', 'Evidence Repo Link', 'Uuid',
       'GeneSymbol', 'metEvidence', 'Expressions', 'hgvs_pro']].assign(scoreset_source=dataset_name) for dataset_name in scoreset_with_evidence.keys()])

  df = pd.concat([scoreset_with_evidence[dataset_name].loc[:,['hgvs_pro','avg_score','#Variation', 'ClinVar Variation Id', 'Allele Registry Id',


In [21]:
df.loc[:,'#Variation'].unique().shape

(253,)

In [29]:
evidenceDF[(evidenceDF.loc[:,'#Variation'].isin(set(df.loc[:,'#Variation'].values))) & \
           np.logical_not(evidenceDF.loc[:,'Applied Evidence Codes (Met)'].str.contains('BS3').values) & \
            np.logical_not(evidenceDF.loc[:,'Applied Evidence Codes (Met)'].str.contains('PS3').values)]

Unnamed: 0,#Variation,ClinVar Variation Id,Allele Registry Id,HGVS Expressions,HGNC Gene Symbol,Disease,Mondo Id,Mode of Inheritance,Assertion,Applied Evidence Codes (Met),Applied Evidence Codes (Not Met),Summary of interpretation,PubMed Articles,Expert Panel,Guideline,Approval Date,Published Date,Retracted,Evidence Repo Link,Uuid,GeneSymbol,metEvidence,Expressions,hgvs_nuc_full,hgvs_nuc,hgvs_pro
82,NM_000314.6(PTEN):c.737C>T (p.Pro246Leu),142269,CA000559,"NM_000314.6:c.737C>T, NM_000314.5:c.737C>T, NM...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Pathogenic,"PM2, PS2_Very Strong, PS4, PP2",,PTEN c.737C>T (p.P246L) meets criteria to be c...,,PTEN VCEP,,2017-10-18,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,835a14e6-4a41-47df-a1ff-271c872adf39,PTEN,"[PM2, PS2_Very Strong, PS4, PP2]","(NM_000314.6:c.737C>T, NM_000314.5:c.737C>T, N...","(NM_001304717.2:c.1256C>T, NM_000314.5:c.737C>...","(c.692C>T, c.641C>T, c.1256C>T, c.737C>T, c.14...",p.Pro246Leu
85,NM_000314.6(PTEN):c.235G>A (p.Ala79Thr),41682,CA000358,"NM_000314.6:c.235G>A, NM_000314.5:c.235G>A, NM...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Likely Benign,"BS2_Supporting, BS1, PP2",,PTEN c.235G>A (p.A79T) meets criteria to be cl...,,PTEN VCEP,,2018-04-06,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,0340272b-ab4c-4672-94b8-40e0525d8eda,PTEN,"[BS2_Supporting, BS1, PP2]","(NM_000314.6:c.235G>A, NM_000314.5:c.235G>A, N...","(NM_001304717.5:c.754G>A, NM_000314.7:c.235G>A...","(c.235G>A, c.133G>A, c.190G>A, c.754G>A, c.139...",p.Ala79Thr
94,NM_000314.6(PTEN):c.987_990delTAAA (p.Asn329Ly...,189441,CA000656,"NM_000314.6:c.987_990delTAAA, NM_000314.5:c.98...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Pathogenic,"PM2, PVS1, PS4_Supporting",,PTEN c.987_990delTAAA (p.N329KfsX14) meets cri...,,PTEN VCEP,,2017-10-18,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,e41ad782-4c9c-4ae6-ba02-ec2e11c967fc,PTEN,"[PM2, PVS1, PS4_Supporting]","(NM_000314.6:c.987_990delTAAA, NM_000314.5:c.9...",(),(),p.Asn329Lys
95,NM_000314.6(PTEN):c.892C>T (p.Gln298Ter),187657,CA000219,"NM_000314.6:c.892C>T, NC_000010.11:g.87960984C...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Pathogenic,"PM2, PVS1, PS4_Supporting",,PTEN c.892C>T (p.Q298X) meets criteria to be c...,,PTEN VCEP,,2017-10-18,2018-12-10,False,https://erepo.genome.network/evrepo/ui/classif...,f890d81d-90ed-4411-a0fd-a89669dd616a,PTEN,"[PM2, PVS1, PS4_Supporting]","(NM_000314.6:c.892C>T, NC_000010.11:g.87960984...","(NM_001304718.2:c.301C>T, NM_001304718.1:c.301...","(c.796C>T, c.1411C>T, c.892C>T, c.301C>T, c.84...",p.Gln298Ter
96,NM_000314.6(PTEN):c.964A>T (p.Lys322Ter),185213,CA000235,"NM_000314.6:c.964A>T, NC_000010.11:g.87961056A...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Likely Pathogenic,"PVS1, PM2","PM6, BP2, BP4, BA1, PS4, PS2, PS1, PS3, BS4, B...",PTEN c.964A>T (p.Lys322Ter) meets criteria to ...,,PTEN VCEP,,2020-03-23,2020-03-26,False,https://erepo.genome.network/evrepo/ui/classif...,bfdfbf81-6f26-43f8-aa4b-bbb17ebaf54a,PTEN,"[PVS1, PM2]","(NM_000314.6:c.964A>T, NC_000010.11:g.87961056...","(NM_001304717.5:c.1483A>T, NM_001304717.2:c.14...","(c.919A>T, c.964A>T, c.868A>T, c.1483A>T, c.37...",p.Lys322Ter
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7226,NM_000551.4(VHL):c.257C>T (p.Pro86Leu),182977,CA020186,"NM_000551.4:c.257C>T, NC_000003.12:g.10142104C...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Pathogenic,"PM2_Supporting, PS4, PP1_Strong",,The variant NM_000551.4(VHL):c.257C>T (p.Pro86...,,VHL VCEP,,2024-06-25,2024-06-25,False,https://erepo.genome.network/evrepo/ui/classif...,318690ae-2743-4195-8d9b-e2e84ff94494,VHL,"[PM2_Supporting, PS4, PP1_Strong]","(NM_000551.4:c.257C>T, NC_000003.12:g.10142104...","(NM_198156.3:c.257C>T, NM_198156.2:c.257C>T, N...","(c.257C>T,)",p.Pro86Leu
7228,NM_000551.4(VHL):c.241C>T (p.Pro81Ser),2233,CA020148,"NM_000551.4:c.241C>T, NC_000003.12:g.10142088C...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Benign,BA1,,The variant NM_000551.4(VHL):c.241C>T (p.Pro81...,,VHL VCEP,,2024-06-25,2024-06-25,False,https://erepo.genome.network/evrepo/ui/classif...,416afbe6-3dd6-40d9-9e61-d3c54b017b5e,VHL,[BA1],"(NM_000551.4:c.241C>T, NC_000003.12:g.10142088...","(NM_000551.3:c.241C>T, NM_001354723.2:c.241C>T...","(c.241C>T,)",p.Pro81Ser
7231,NM_000551.4(VHL):c.154G>A (p.Glu52Lys),161402,CA020056,"NM_000551.4:c.154G>A, NC_000003.12:g.10142001G...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Benign,BA1,PP3,The variant NM_000551.3(VHL):c.154G>A (p.Glu52...,,VHL VCEP,,2024-06-25,2024-06-25,False,https://erepo.genome.network/evrepo/ui/classif...,9f120c06-bb61-4115-9c9c-187ef481f769,VHL,[BA1],"(NM_000551.4:c.154G>A, NC_000003.12:g.10142001...","(NM_198156.3:c.154G>A, NM_000551.4:c.154G>A, N...","(c.154G>A,)",p.Glu52Lys
7418,NM_000551.4(VHL):c.263G>A (p.Trp88Ter),182978,CA020197,"NM_000551.4:c.263G>A, NC_000003.12:g.10142110G...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Pathogenic,"PVS1, PM2_Supporting, PS4",,The variant NM_000551.4(VHL):c.263G>A (p.Trp88...,,VHL VCEP,,2024-06-25,2024-07-17,False,https://erepo.genome.network/evrepo/ui/classif...,eb3883a6-80f5-45f8-9c0f-a2928a4e24ca,VHL,"[PVS1, PM2_Supporting, PS4]","(NM_000551.4:c.263G>A, NC_000003.12:g.10142110...","(NM_198156.2:c.263G>A, NM_001354723.1:c.263G>A...","(c.263G>A,)",p.Trp88Ter


In [None]:
assign_assay_evidence_strength(-2.2, thresholdsP['Kato_TP53'], thresholdsB['Kato_TP53'])

6

In [195]:
def assign_assay_evidence_strength(score, pathogenic_score_thresholds, benign_score_thresholds):
    is_inverted = (pathogenic_score_thresholds[0] > benign_score_thresholds[0]) or (pathogenic_score_thresholds[0] < pathogenic_score_thresholds[1])
    if np.isnan(score):
        return 0
    for threshold,points in list(zip(pathogenic_score_thresholds,[1,2,3,4,5,6,7,8]))[::-1]:
        if np.isnan(threshold):
            continue
        if is_inverted and score >= threshold:
            return points
        if (not is_inverted) and score <= threshold:
            return points
    for threshold,points in list(zip(benign_score_thresholds,[-1,-2,-3,-4,-5,-6,-7,-8]))[::-1]:
        if np.isnan(threshold):
            continue
        if is_inverted and score <= threshold:
            return points
        if (not is_inverted) and score >= threshold:
            return points
    return 0

thresholdsP = {}
thresholdsB = {}
for dsname in df.scoreset_source.unique():
    pipeline = datasets[dsname]['pipeline']
    tauP = np.load(f"/data/dzeiberg/mave_calibration/figs_10_09_24/{dsname}_pipeline_{pipeline}/Tau_p.npy")
    tauB = np.load(f"/data/dzeiberg/mave_calibration/figs_10_09_24/{dsname}_pipeline_{pipeline}/Tau_b.npy")
    thresholdsP[dsname] = tauP
    thresholdsB[dsname] = tauB

In [198]:
strengths = []
for idx,r in df.iterrows():
    strength = assign_assay_evidence_strength(r.avg_score, thresholdsP[r.scoreset_source], thresholdsB[r.scoreset_source])
    strengths.append(strength)
df = df.assign(strength=strengths)

In [216]:
df = df.assign(strengthAbs=df.strength.abs()).sort_values(by='strengthAbs',ascending=False)
dfMax = df.drop_duplicates(subset=['hgvs_pro','GeneSymbol'],keep='first')

In [229]:
dfMax.shape

(251, 29)

In [230]:
dfMax.GeneSymbol.value_counts()

GeneSymbol
PTEN     114
TP53     109
VHL       18
BRCA2      5
BRCA1      5
Name: count, dtype: int64

In [231]:
dfMax.scoreset_source.value_counts()

scoreset_source
Kato_TP53                95
Mighell_PTEN             75
Matreyek_PTEN_VampSeq    39
Buckley_VHL_SGE          18
Giacomelli_3              9
Hu_BRCA2_HDR              5
Findlay_BRCA1_SGE         4
Giacomelli_1              4
Giacomelli_2              1
Starita_BRCA1_HDR         1
Name: count, dtype: int64

In [218]:
dfMax.groupby("Assertion").strength.value_counts().unstack().fillna(0).astype(int)

strength,-6,-5,-4,0,1,2,5,6,7,8
Assertion,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Benign,15,0,0,3,0,0,0,0,0,0
Likely Benign,44,3,0,5,0,0,0,1,0,0
Likely Pathogenic,0,0,0,22,0,1,1,5,12,2
Pathogenic,1,1,0,33,1,0,1,21,11,13
Uncertain Significance,12,2,1,32,0,0,0,6,2,0


In [251]:
dfMax[(dfMax.strength >= 1) & (dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("PS3"))].strength.value_counts().sort_index()

strength
2     1
5     1
6    25
7    16
8    10
Name: count, dtype: int64

In [242]:
dfMax[(dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("PS3"))].shape

(91, 29)

In [259]:
dfMax[(dfMax.strength < 1) & (dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("PS3")) & \
    (dfMax.scoreset_source != "Mighell_PTEN")]

Unnamed: 0,hgvs_pro,avg_score,#Variation,ClinVar Variation Id,Allele Registry Id,HGVS Expressions,HGNC Gene Symbol,Disease,Mondo Id,Mode of Inheritance,Assertion,Applied Evidence Codes (Met),Applied Evidence Codes (Not Met),Summary of interpretation,PubMed Articles,Expert Panel,Guideline,Approval Date,Published Date,Retracted,Evidence Repo Link,Uuid,GeneSymbol,metEvidence,Expressions,hgvs_pro.1,scoreset_source,strength,strengthAbs
51,p.Tyr27Ser,0.508829,NM_000314.8(PTEN):c.80A>C (p.Tyr27Ser),280724,CA10603162,"NM_000314.8:c.80A>C, NC_000010.11:g.87894025A>...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Likely Pathogenic,"PS2, PS3_Supporting, PP2, PP3, PS4_Supporting,...",BS3,NM_000314.8(PTEN):c.80A>C (p.Tyr27Ser) meets c...,10866302,PTEN VCEP,,2023-08-04,2023-10-19,False,https://erepo.genome.network/evrepo/ui/classif...,1d6c53d1-6cb2-4e76-a5e8-dcc135e646ff,PTEN,"[PS2, PS3_Supporting, PP2, PP3, PS4_Supporting...","(NM_000314.8:c.80A>C, NC_000010.11:g.87894025A...",p.Tyr27Ser,Matreyek_PTEN_VampSeq,0,0
17,p.Leu188Val,-0.391271,NM_000551.4(VHL):c.562C>G (p.Leu188Val),2225,CA020488,"NM_000551.4:c.562C>G, NC_000003.12:g.10149885C...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Uncertain Significance,"PS3_Supporting, PP3, BS2, PM1, BS1",,The variant NM_000551.4(VHL):c.562C>G (p.Leu18...,"15574766, 17700531, 16452184, 12097293, 190302...",VHL VCEP,,2024-06-25,2024-06-25,False,https://erepo.genome.network/evrepo/ui/classif...,03093ff9-64cb-4fd3-a97b-d60308334b8c,VHL,"[PS3_Supporting, PP3, BS2, PM1, BS1]","(NM_000551.4:c.562C>G, NC_000003.12:g.10149885...",p.Leu188Val,Buckley_VHL_SGE,0,0
12,p.Phe91Leu,-0.137526,NM_000551.4(VHL):c.273C>A (p.Phe91Leu),411978,CA16611270,"NM_000551.4:c.273C>A, NC_000003.12:g.10142120C...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Uncertain Significance,"PS3_Supporting, PP3, PM2_Supporting, PM1",PS4,The NM_000551.3(VHL):c.273C>A (p.Phe91Leu) var...,31337753,VHL VCEP,,2024-06-25,2024-06-25,False,https://erepo.genome.network/evrepo/ui/classif...,d6ad76b6-e4e0-490b-8d57-e1a11cc12139,VHL,"[PS3_Supporting, PP3, PM2_Supporting, PM1]","(NM_000551.4:c.273C>A, NC_000003.12:g.10142120...",p.Phe91Leu,Buckley_VHL_SGE,0,0
62,p.Met35Val,1.061209,NM_000314.6(PTEN):c.103A>G (p.Met35Val),231916,CA10578906,"NM_000314.6:c.103A>G, NC_000010.11:g.87894048A...",PTEN,PTEN hamartoma tumor syndrome,MONDO:0017623,Autosomal dominant inheritance,Pathogenic,"PS2_Very Strong, PS3_Supporting, PP3, PP2, PM2...","PM6, PVS1, BA1, BP4, BP1, BP2, BP3, PS1, BP5, ...",NM_000314.8(PTEN):c.103A>G (p.Met35Val) meets ...,,PTEN VCEP,,2024-02-09,2024-03-04,False,https://erepo.genome.network/evrepo/ui/classif...,4f6345e3-25b1-48cd-9697-1a5624715071,PTEN,"[PS2_Very Strong, PS3_Supporting, PP3, PP2, PM...","(NM_000314.6:c.103A>G, NC_000010.11:g.87894048...",p.Met35Val,Matreyek_PTEN_VampSeq,0,0
1,p.Arg64Pro,-0.477015,NM_000551.4(VHL):c.191G>C (p.Arg64Pro),2226,CA020089,"NM_000551.4:c.191G>C, NC_000003.12:g.10142038G...",VHL,von Hippel-Lindau disease,MONDO:0008667,Autosomal dominant inheritance,Pathogenic,"PS3_Supporting, PP3, PM2_Supporting, PS4, PM6_...",PP1,The NM_000551.3(VHL):c.191G>C (p.Arg64Pro) var...,"16452184, 11331612, 15611064",VHL VCEP,,2024-06-25,2024-07-17,False,https://erepo.genome.network/evrepo/ui/classif...,7122665e-2b75-4e58-a96a-965f11cf866f,VHL,"[PS3_Supporting, PP3, PM2_Supporting, PS4, PM6...","(NM_000551.4:c.191G>C, NC_000003.12:g.10142038...",p.Arg64Pro,Buckley_VHL_SGE,0,0


In [266]:
dfMax[(dfMax.strength < 1) & \
    (dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("PS3")) & \
        (dfMax.GeneSymbol == "PTEN")].loc[:,["Summary of interpretation",'Applied Evidence Codes (Met)']]

Unnamed: 0,Summary of interpretation,Applied Evidence Codes (Met)
51,NM_000314.8(PTEN):c.80A>C (p.Tyr27Ser) meets c...,"PS2, PS3_Supporting, PP2, PP3, PS4_Supporting,..."
2,PTEN c.16A>G (p.Lys6Glu) meets criteria to be ...,"PS2_Very Strong, PM2, PS4_Moderate, PS3_Suppor..."
4,NM_000314.8(PTEN):c.40A>G (p.Arg14Gly) is curr...,"PS3_Supporting, PP2, PP3, PM2_Supporting, PS4_..."
6,NM_000314.8(PTEN):c.44G>C (p.Arg15Thr) meets c...,"PS2, PP2, PP3, PS3_Moderate, PM2_Supporting"
7,NM_000314.8(PTEN):c.44G>A (p.Arg15Lys) meets c...,"PP2, PS3_Moderate, PM6_Strong, PM2_Supporting"
10,NM_000314.8(PTEN):c.49C>G (p.Gln17Glu) is curr...,"PM6, BP4, PS3_Supporting, PP2, PM2_Supporting"
19,NM_000314.8(PTEN):c.106G>A (p.Gly36Arg) meets ...,"PS4_Moderate, PP1, PP2, PP3, PM2_Supporting, P..."
21,NM_000314.8(PTEN):c.107G>A (p.Gly36Glu) meets ...,"PS2, PP2, PP3, PM2_Supporting, PS3_Moderate, P..."
22,NM_000314.8(PTEN):c.113C>T (p.Pro38Leu) meets ...,"PS2_Very Strong, PP2, PP3, PS3_Moderate, PS4_S..."
23,NM_000314.8(PTEN):c.112C>T (p.Pro38Ser) is cur...,"PP2, PP3, PM2_Supporting, PS3_Moderate"


In [270]:
import textwrap

In [298]:
dfMax.loc[(dfMax.strength > 0) & (dfMax.loc[:,'Applied Evidence Codes (Not Met)'].str.contains("PS3")),['Summary of interpretation','scoreset_source','strength']]

Unnamed: 0,Summary of interpretation,scoreset_source,strength
0,The c.5509T>G variant in BRCA1 is a missense v...,Findlay_BRCA1_SGE,8
32,PTEN c.564T>A (p.Y188X) meets criteria to be c...,Matreyek_PTEN_VampSeq,7
52,PTEN c.964A>T (p.Lys322Ter) meets criteria to ...,Matreyek_PTEN_VampSeq,7
21,This variant has a BayesDel score > 0.16 and A...,Kato_TP53,6
104,This variant is absent in the gnomAD cohort (P...,Kato_TP53,6
25,NM_000314.8(PTEN):c.424C>T (p.Arg142Trp) is cu...,Matreyek_PTEN_VampSeq,6
51,This variant is absent from the gnomAD non-can...,Giacomelli_1,6
68,The NM_000546.6: c.524G>T variant in TP53 is a...,Giacomelli_1,6
10,This variant has a BayesDel score < 0.16 and A...,Kato_TP53,6
52,This variant has a BayesDel score < 0.16 and A...,Kato_TP53,6


In [None]:
dfMax[(dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("BS3"))].shape

(95, 29)

In [309]:
dfMax[(dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("BS3")) & \
     (dfMax.loc[:,'strength'] < 0)].shape

(74, 29)

In [303]:
pd.set_option('display.max_colwidth', None)

In [324]:
dfMax.loc[(dfMax.loc[:,'Applied Evidence Codes (Met)'].str.contains("BS3")) & \
     (dfMax.loc[:,'strength'] >= 0) & \
          (dfMax.loc[:,'GeneSymbol'] != "PTEN")]

Unnamed: 0,hgvs_pro,avg_score,#Variation,ClinVar Variation Id,Allele Registry Id,HGVS Expressions,HGNC Gene Symbol,Disease,Mondo Id,Mode of Inheritance,Assertion,Applied Evidence Codes (Met),Applied Evidence Codes (Not Met),Summary of interpretation,PubMed Articles,Expert Panel,Guideline,Approval Date,Published Date,Retracted,Evidence Repo Link,Uuid,GeneSymbol,metEvidence,Expressions,hgvs_pro.1,scoreset_source,strength,strengthAbs
72,p.Arg267Gln,-2.240292,NM_000546.5(TP53):c.800G>A (p.Arg267Gln),127823,CA000424,"NM_000546.5:c.800G>A, NC_000017.11:g.7673820C>T, CM000679.2:g.7673820C>T, NC_000017.10:g.7577138C>T, CM000679.1:g.7577138C>T, NC_000017.9:g.7517863C>T, NG_017013.2:g.18731G>A, LRG_321:g.18731G>A, LRG_321t1:c.800G>A, NM_001126112.2:c.800G>A, LRG_321t2:c.800G>A, NM_001126113.2:c.800G>A, LRG_321t4:c.800G>A, NM_001126114.2:c.800G>A, LRG_321t3:c.800G>A, NM_001126115.1:c.404G>A, LRG_321t5:c.404G>A, NM_001126116.1:c.404G>A, LRG_321t6:c.404G>A, NM_001126117.1:c.404G>A, LRG_321t7:c.404G>A, NM_001126118.1:c.683G>A, LRG_321t8:c.683G>A, NM_001276695.1:c.683G>A, NM_001276696.1:c.683G>A, NM_001276697.1:c.323G>A, NM_001276698.1:c.323G>A, NM_001276699.1:c.323G>A, NM_001276760.1:c.683G>A, NM_001276761.1:c.683G>A, NM_001276695.2:c.683G>A, NM_001276696.2:c.683G>A, NM_001276697.2:c.323G>A, NM_001276698.2:c.323G>A, NM_001276699.2:c.323G>A, NM_001276760.2:c.683G>A, NM_001276761.2:c.683G>A, ENST00000269305.8:c.800G>A, ENST00000359597.8:n.800G>A, ENST00000413465.6:n.782+361G>A, ENST00000420246.6:c.800G>A, ENST00000445888.6:c.800G>A, ENST00000455263.6:c.800G>A, ENST00000504290.5:c.404G>A, ENST00000504937.5:c.404G>A, ENST00000509690.5:c.404G>A, ENST00000510385.5:c.404G>A, ENST00000610292.4:c.683G>A, ENST00000610538.4:c.683G>A, ENST00000610623.4:c.323G>A, ENST00000615910.4:n.767G>A, ENST00000617185.4:c.800G>A, ENST00000618944.4:c.323G>A, ENST00000619186.4:c.323G>A, ENST00000619485.4:c.683G>A, ENST00000620739.4:c.683G>A, ENST00000622645.4:c.683G>A, ENST00000635293.1:c.683G>A, NM_000546.5(TP53):c.800G>A (p.Arg267Gln)",TP53,Li-Fraumeni syndrome,MONDO:0018875,Autosomal dominant inheritance,Uncertain Significance,"PP3, PS4_Supporting, BS3_Supporting","PM1, PM5, BA1, BP4, BS1","This variant has a BayesDel score > 0.16 and Align GVGD (Zebrafish) is Class 15 or higher (PP3). This variant has been reported in 2 probands meeting Chrompret criteria (PS4_Supporting; PMID:25584008, internal laboratory contributor). Transactivation assays show partially functional variant according to Kato, et al. and there is no evidence of a dominant negative effect or loss of function according to Giacomelli, et al. (BS3_Supporting; PMID: 12826609, 30224644). In summary, the clinical significance of TP53 c.800G>A (p.Arg267Gln) is uncertain for Li-Fraumeni syndrome. ACMG/AMP criteria applied, as specified by the TP53 Expert Panel: PP3, PS4_Supporting, BS3_Supporting.","1562462, 25584008, 27210295, 10435620, 21343334, 25584008, 30224644, 12826609, 29979965",TP53 VCEP,,2020-08-11,2020-08-14,False,https://erepo.genome.network/evrepo/ui/classification/CA000424/MONDO:0018875/009,ca16e21b-853c-433e-88e5-601b2de6c525,TP53,"[PP3, PS4_Supporting, BS3_Supporting]","(NM_000546.5:c.800G>A, NC_000017.11:g.7673820C>T, CM000679.2:g.7673820C>T, NC_000017.10:g.7577138C>T, CM000679.1:g.7577138C>T, NC_000017.9:g.7517863C>T, NG_017013.2:g.18731G>A, LRG_321:g.18731G>A, LRG_321t1:c.800G>A, NM_001126112.2:c.800G>A, LRG_321t2:c.800G>A, NM_001126113.2:c.800G>A, LRG_321t4:c.800G>A, NM_001126114.2:c.800G>A, LRG_321t3:c.800G>A, NM_001126115.1:c.404G>A, LRG_321t5:c.404G>A, NM_001126116.1:c.404G>A, LRG_321t6:c.404G>A, NM_001126117.1:c.404G>A, LRG_321t7:c.404G>A, NM_001126118.1:c.683G>A, LRG_321t8:c.683G>A, NM_001276695.1:c.683G>A, NM_001276696.1:c.683G>A, NM_001276697.1:c.323G>A, NM_001276698.1:c.323G>A, NM_001276699.1:c.323G>A, NM_001276760.1:c.683G>A, NM_001276761.1:c.683G>A, NM_001276695.2:c.683G>A, NM_001276696.2:c.683G>A, NM_001276697.2:c.323G>A, NM_001276698.2:c.323G>A, NM_001276699.2:c.323G>A, NM_001276760.2:c.683G>A, NM_001276761.2:c.683G>A, ENST00000269305.8:c.800G>A, ENST00000359597.8:n.800G>A, ENST00000413465.6:n.782+361G>A, ENST00000420246.6:c.800G>A, ENST00000445888.6:c.800G>A, ENST00000455263.6:c.800G>A, ENST00000504290.5:c.404G>A, ENST00000504937.5:c.404G>A, ENST00000509690.5:c.404G>A, ENST00000510385.5:c.404G>A, ENST00000610292.4:c.683G>A, ENST00000610538.4:c.683G>A, ENST00000610623.4:c.323G>A, ENST00000615910.4:n.767G>A, ENST00000617185.4:c.800G>A, ENST00000618944.4:c.323G>A, ENST00000619186.4:c.323G>A, ENST00000619485.4:c.683G>A, ENST00000620739.4:c.683G>A, ENST00000622645.4:c.683G>A, ENST00000635293.1:c.683G>A, NM_000546.5(TP53):c.800G>A (p.Arg267Gln))",p.Arg267Gln,Kato_TP53,6,6
68,p.Arg175Leu,0.940932,NM_000546.6(TP53):c.524G>T (p.Arg175Leu),182963,CA000252,"NM_000546.6:c.524G>T, NC_000017.11:g.7675088C>A, CM000679.2:g.7675088C>A, NC_000017.10:g.7578406C>A, CM000679.1:g.7578406C>A, NC_000017.9:g.7519131C>A, NG_017013.2:g.17463G>T, LRG_321:g.17463G>T, ENST00000503591.2:c.524G>T, ENST00000508793.6:c.524G>T, ENST00000509690.6:c.128G>T, ENST00000514944.6:c.245G>T, ENST00000604348.6:c.503G>T, ENST00000269305.9:c.524G>T, ENST00000269305.8:c.524G>T, ENST00000359597.8:c.524G>T, ENST00000413465.6:c.524G>T, ENST00000420246.6:c.524G>T, ENST00000445888.6:c.524G>T, ENST00000455263.6:c.524G>T, ENST00000504290.5:c.128G>T, ENST00000504937.5:c.128G>T, ENST00000505014.5:n.780G>T, ENST00000509690.5:c.128G>T, ENST00000510385.5:c.128G>T, ENST00000514944.5:c.245G>T, ENST00000574684.1:n.32G>T, ENST00000610292.4:c.407G>T, ENST00000610538.4:c.407G>T, ENST00000610623.4:c.47G>T, ENST00000615910.4:c.491G>T, ENST00000617185.4:c.524G>T, ENST00000618944.4:c.47G>T, ENST00000619186.4:c.47G>T, ENST00000619485.4:c.407G>T, ENST00000620739.4:c.407G>T, ENST00000622645.4:c.407G>T, ENST00000635293.1:c.407G>T, NM_000546.5:c.524G>T, LRG_321t1:c.524G>T, NM_001126112.2:c.524G>T, LRG_321t2:c.524G>T, NM_001126113.2:c.524G>T, LRG_321t4:c.524G>T, NM_001126114.2:c.524G>T, LRG_321t3:c.524G>T, NM_001126115.1:c.128G>T, LRG_321t5:c.128G>T, NM_001126116.1:c.128G>T, LRG_321t6:c.128G>T, NM_001126117.1:c.128G>T, LRG_321t7:c.128G>T, NM_001126118.1:c.407G>T, LRG_321t8:c.407G>T, NM_001276695.1:c.407G>T, NM_001276696.1:c.407G>T, NM_001276697.1:c.47G>T, NM_001276698.1:c.47G>T, NM_001276699.1:c.47G>T, NM_001276760.1:c.407G>T, NM_001276761.1:c.407G>T, NM_001276695.2:c.407G>T, NM_001276696.2:c.407G>T, NM_001276697.2:c.47G>T, NM_001276698.2:c.47G>T, NM_001276699.2:c.47G>T, NM_001276760.2:c.407G>T, NM_001276761.2:c.407G>T, NM_001126112.3:c.524G>T, NM_001126113.3:c.524G>T, NM_001126114.3:c.524G>T, NM_001126115.2:c.128G>T, NM_001126116.2:c.128G>T, NM_001126117.2:c.128G>T, NM_001126118.2:c.407G>T, NM_001276695.3:c.407G>T, NM_001276696.3:c.407G>T, NM_001276697.3:c.47G>T, NM_001276698.3:c.47G>T, NM_001276699.3:c.47G>T, NM_001276760.3:c.407G>T, NM_001276761.3:c.407G>T, NM_000546.6(TP53):c.524G>T (p.Arg175Leu)",TP53,Li-Fraumeni syndrome,MONDO:0018875,Autosomal dominant inheritance,Likely Pathogenic,"PM1, PM5, PP4, PM2_Supporting, BS3_Supporting, PS4_Moderate, PP3_Moderate","PP1, BS2, BA1, BP4, PS3, PS2, PS1, BS1","The NM_000546.6: c.524G>T variant in TP53 is a missense variant predicted to cause substitution of arginine by leucine at amino acid 175 (p.Arg175Leu). This variant is absent from gnomAD v4.1.0 (PM2_Supporting). This variant has been reported in 4 unrelated probands meeting Revised Chompret criteria. Based on this evidence, this variant scores 2 total points meeting the TP53 VCEP phenotype scoring criteria of 2-3.5 points. (PS4_Moderate; PMID: 16707427; Internal lab contributors: Invitae, Ambry). In vitro assays performed in yeast and/or human cell lines showed partially functional transactivation, and retained growth suppression activity indicating that this variant does not impact protein function (BS3_Supporting; PMIDs: 12826609, 29979965, 30224644).Computational predictor scores (BayesDel = 0.57303; Align GVGD = Class C65) are above recommended thresholds (BayesDel > 0.16 and an Align GVGD Class of 65), evidence that correlates with impact to TP53 via protein change (PP3_Moderate). At least one individual with this variant was found to have a variant allele fraction of ≤35%, which is a significant predictor of variant pathogenicity (PP4_Moderate, PMID: 34906512, Internal lab contributor: Invitae). This variant resides within a codon (NM_00546.4: 175, 245, 248, 249, 273, 282) of TP53 that is defined as a mutational hotspot by the ClinGen TP53 VCEP (PM1; PMID: 8023157). Another missense variant(c.524G>A, p.Arg175His) (ClinVar Variation ID: 12374), in the same codon has been classified as pathogenic for Li-Fraumeni syndrome by the ClinGen TP53 VCEP’s specifications. (PM5). In summary, this variant meets the criteria to be classified as Likely Pathogenic for Li Fraumeni syndrome based on the ACMG/AMP criteria applied, as specified by the ClinGen TP53 VCEP: PS4_Moderate, PM2_Supporting, BS3_Supporting, PP3_Moderate, PP4, PM1, PM5 (Bayesian Points: 9; VCEP specifications version 2.0; 9/6/2024)",,TP53 VCEP,,2024-09-06,2024-09-06,False,https://erepo.genome.network/evrepo/ui/classification/CA000252/MONDO:0018875/009,6aa7e5db-edd6-4608-a046-e82aa9ea21df,TP53,"[PM1, PM5, PP4, PM2_Supporting, BS3_Supporting, PS4_Moderate, PP3_Moderate]","(NM_000546.6:c.524G>T, NC_000017.11:g.7675088C>A, CM000679.2:g.7675088C>A, NC_000017.10:g.7578406C>A, CM000679.1:g.7578406C>A, NC_000017.9:g.7519131C>A, NG_017013.2:g.17463G>T, LRG_321:g.17463G>T, ENST00000503591.2:c.524G>T, ENST00000508793.6:c.524G>T, ENST00000509690.6:c.128G>T, ENST00000514944.6:c.245G>T, ENST00000604348.6:c.503G>T, ENST00000269305.9:c.524G>T, ENST00000269305.8:c.524G>T, ENST00000359597.8:c.524G>T, ENST00000413465.6:c.524G>T, ENST00000420246.6:c.524G>T, ENST00000445888.6:c.524G>T, ENST00000455263.6:c.524G>T, ENST00000504290.5:c.128G>T, ENST00000504937.5:c.128G>T, ENST00000505014.5:n.780G>T, ENST00000509690.5:c.128G>T, ENST00000510385.5:c.128G>T, ENST00000514944.5:c.245G>T, ENST00000574684.1:n.32G>T, ENST00000610292.4:c.407G>T, ENST00000610538.4:c.407G>T, ENST00000610623.4:c.47G>T, ENST00000615910.4:c.491G>T, ENST00000617185.4:c.524G>T, ENST00000618944.4:c.47G>T, ENST00000619186.4:c.47G>T, ENST00000619485.4:c.407G>T, ENST00000620739.4:c.407G>T, ENST00000622645.4:c.407G>T, ENST00000635293.1:c.407G>T, NM_000546.5:c.524G>T, LRG_321t1:c.524G>T, NM_001126112.2:c.524G>T, LRG_321t2:c.524G>T, NM_001126113.2:c.524G>T, LRG_321t4:c.524G>T, NM_001126114.2:c.524G>T, LRG_321t3:c.524G>T, NM_001126115.1:c.128G>T, LRG_321t5:c.128G>T, NM_001126116.1:c.128G>T, LRG_321t6:c.128G>T, NM_001126117.1:c.128G>T, LRG_321t7:c.128G>T, NM_001126118.1:c.407G>T, LRG_321t8:c.407G>T, NM_001276695.1:c.407G>T, NM_001276696.1:c.407G>T, NM_001276697.1:c.47G>T, NM_001276698.1:c.47G>T, NM_001276699.1:c.47G>T, NM_001276760.1:c.407G>T, NM_001276761.1:c.407G>T, NM_001276695.2:c.407G>T, NM_001276696.2:c.407G>T, NM_001276697.2:c.47G>T, NM_001276698.2:c.47G>T, NM_001276699.2:c.47G>T, NM_001276760.2:c.407G>T, NM_001276761.2:c.407G>T, NM_001126112.3:c.524G>T, NM_001126113.3:c.524G>T, NM_001126114.3:c.524G>T, NM_001126115.2:c.128G>T, NM_001126116.2:c.128G>T, NM_001126117.2:c.128G>T, NM_001126118.2:c.407G>T, NM_001276695.3:c.407G>T, NM_001276696.3:c.407G>T, NM_001276697.3:c.47G>T, NM_001276698.3:c.47G>T, NM_001276699.3:c.47G>T, NM_001276760.3:c.407G>T, NM_001276761.3:c.407G>T, NM_000546.6(TP53):c.524G>T (p.Arg175Leu))",p.Arg175Leu,Giacomelli_1,6,6
0,p.Ala102Gly,-0.046066,NM_007294.4(BRCA1):c.305C>G (p.Ala102Gly),37505,CA002009,"NM_007294.4:c.305C>G, NC_000017.11:g.43104258G>C, CM000679.2:g.43104258G>C, NC_000017.10:g.41256275G>C, CM000679.1:g.41256275G>C, NC_000017.9:g.38509801G>C, NG_005905.2:g.113726C>G, LRG_292:g.113726C>G, ENST00000354071.8:n.369C>G, ENST00000461574.2:c.305C>G, ENST00000470026.6:c.305C>G, ENST00000473961.6:c.305C>G, ENST00000476777.6:c.305C>G, ENST00000477152.6:c.227C>G, ENST00000478531.6:c.305C>G, ENST00000489037.2:c.227C>G, ENST00000493919.6:c.164C>G, ENST00000494123.6:c.305C>G, ENST00000497488.2:c.-218-9398C>G, ENST00000618469.2:c.305C>G, ENST00000634433.2:c.305C>G, ENST00000644379.2:c.305C>G, ENST00000644555.2:c.164C>G, ENST00000652672.2:c.164C>G, ENST00000484087.6:c.305C>G, ENST00000700083.1:n.1276C>G, ENST00000700182.1:c.227C>G, ENST00000700183.1:c.*219C>G, ENST00000700184.1:n.548C>G, ENST00000357654.9:c.305C>G, ENST00000471181.7:c.305C>G, ENST00000642945.1:c.*179C>G, ENST00000644555.1:c.164C>G, ENST00000652672.1:c.164C>G, ENST00000352993.7:c.305C>G, ENST00000354071.7:c.305C>G, ENST00000357654.7:c.305C>G, ENST00000461221.5:c.*91C>G, ENST00000461798.5:c.*91C>G, ENST00000468300.5:c.305C>G, ENST00000470026.5:c.305C>G, ENST00000471181.6:c.305C>G, ENST00000473961.5:c.28C>G, ENST00000476777.5:c.305C>G, ENST00000477152.5:c.227C>G, ENST00000478531.5:c.305C>G, ENST00000484087.5:c.53C>G, ENST00000487825.5:c.53C>G, ENST00000489037.1:c.227C>G, ENST00000491747.6:c.305C>G, ENST00000492859.5:c.*241C>G, ENST00000493795.5:c.164C>G, ENST00000493919.5:c.164C>G, ENST00000494123.5:c.305C>G, ENST00000497488.1:c.-218-9398C>G, ENST00000586385.5:c.4+20924C>G, ENST00000591534.5:c.-44+21013C>G, ENST00000591849.5:c.-99+21013C>G, ENST00000634433.1:c.305C>G, NM_007294.3:c.305C>G, LRG_292t1:c.305C>G, NM_007297.3:c.164C>G, NM_007298.3:c.305C>G, NM_007299.3:c.305C>G, NM_007300.3:c.305C>G, NR_027676.1:n.444C>G, NM_007297.4:c.164C>G, NM_007299.4:c.305C>G, NM_007300.4:c.305C>G, NR_027676.2:n.485C>G, NM_007294.4(BRCA1):c.305C>G (p.Ala102Gly)",BRCA1,BRCA1-related cancer predisposition,MONDO:0011450,Autosomal dominant inheritance,Benign,"PM2_Supporting, BP1_Strong, BS3, BP5_Strong",,"The c.305C>G variant in BRCA1 is a missense variant predicted to cause substitution of Alanine by Glycine at amino acid 102 (p.Ala102Gly). This variant is absent from gnomAD v2.1 (exomes only, non-cancer subset, read depth ≥25) and gnomAD v3.1 (non-cancer subset, read depth ≥25) (PM2_Supporting met). This missense variant is located outside of a key functional domain and was not predicted to alter mRNA splicing using the SpliceAI predictor (score 0.00, score threshold <0.1) (BP1_Strong met). Reported by one calibrated study to exhibit protein function similar to benign control variants (PMID: 30219179) (BS3 met). Multifactorial likelihood ratio analysis using clinically calibrated data produced a combined LR for this variant of 0.015 (based on Co-occurrence LR=1.177; Family History LR=0.0127), below the threshold for strong benign evidence (LR <0.05) (BP5_Strong met; PMID: 31131967, 31853058). In summary, this variant meets the criteria to be classified as a Benign variant for BRCA1-related cancer predisposition based on the ACMG/AMP criteria applied as specified by the ENIGMA BRCA1/2 VCEP (PM2_Supporting, BP1_Strong, BS3, BP5_Strong).",30219179,ENIGMA BRCA1 and BRCA2 VCEP,,2024-06-12,2024-06-11,False,https://erepo.genome.network/evrepo/ui/classification/CA002009/MONDO:0011450/092,de5081dd-8e3e-444d-9826-44632cbee0f3,BRCA1,"[PM2_Supporting, BP1_Strong, BS3, BP5_Strong]","(NM_007294.4:c.305C>G, NC_000017.11:g.43104258G>C, CM000679.2:g.43104258G>C, NC_000017.10:g.41256275G>C, CM000679.1:g.41256275G>C, NC_000017.9:g.38509801G>C, NG_005905.2:g.113726C>G, LRG_292:g.113726C>G, ENST00000354071.8:n.369C>G, ENST00000461574.2:c.305C>G, ENST00000470026.6:c.305C>G, ENST00000473961.6:c.305C>G, ENST00000476777.6:c.305C>G, ENST00000477152.6:c.227C>G, ENST00000478531.6:c.305C>G, ENST00000489037.2:c.227C>G, ENST00000493919.6:c.164C>G, ENST00000494123.6:c.305C>G, ENST00000497488.2:c.-218-9398C>G, ENST00000618469.2:c.305C>G, ENST00000634433.2:c.305C>G, ENST00000644379.2:c.305C>G, ENST00000644555.2:c.164C>G, ENST00000652672.2:c.164C>G, ENST00000484087.6:c.305C>G, ENST00000700083.1:n.1276C>G, ENST00000700182.1:c.227C>G, ENST00000700183.1:c.*219C>G, ENST00000700184.1:n.548C>G, ENST00000357654.9:c.305C>G, ENST00000471181.7:c.305C>G, ENST00000642945.1:c.*179C>G, ENST00000644555.1:c.164C>G, ENST00000652672.1:c.164C>G, ENST00000352993.7:c.305C>G, ENST00000354071.7:c.305C>G, ENST00000357654.7:c.305C>G, ENST00000461221.5:c.*91C>G, ENST00000461798.5:c.*91C>G, ENST00000468300.5:c.305C>G, ENST00000470026.5:c.305C>G, ENST00000471181.6:c.305C>G, ENST00000473961.5:c.28C>G, ENST00000476777.5:c.305C>G, ENST00000477152.5:c.227C>G, ENST00000478531.5:c.305C>G, ENST00000484087.5:c.53C>G, ENST00000487825.5:c.53C>G, ENST00000489037.1:c.227C>G, ENST00000491747.6:c.305C>G, ENST00000492859.5:c.*241C>G, ENST00000493795.5:c.164C>G, ENST00000493919.5:c.164C>G, ENST00000494123.5:c.305C>G, ENST00000497488.1:c.-218-9398C>G, ENST00000586385.5:c.4+20924C>G, ENST00000591534.5:c.-44+21013C>G, ENST00000591849.5:c.-99+21013C>G, ENST00000634433.1:c.305C>G, NM_007294.3:c.305C>G, LRG_292t1:c.305C>G, NM_007297.3:c.164C>G, NM_007298.3:c.305C>G, NM_007299.3:c.305C>G, NM_007300.3:c.305C>G, NR_027676.1:n.444C>G, NM_007297.4:c.164C>G, NM_007299.4:c.305C>G, NM_007300.4:c.305C>G, NR_027676.2:n.485C>G, NM_007294.4(BRCA1):c.305C>G (p.Ala102Gly))",p.Ala102Gly,Starita_BRCA1_HDR,0,0


In [320]:
thresholdsP['Giacomelli_2']

array([0.52617947, 0.54530177, 0.57726334, 0.621791  , 0.70374374,
              nan,        nan,        nan])

In [321]:
thresholdsB['Giacomelli_2']

array([ 0.07543938,  0.0481218 ,  0.00441367, -0.06388029,         nan,
               nan,         nan,         nan])

In [318]:
scoresets['Giacomelli_2'][scoresets['Giacomelli_2'].hgvs_pro == "p.Ala69Gly"]

Unnamed: 0,index,hgvs_pro,accession,hgvs_nt,hgvs_splice,score,author_transcript,is_synonymous,scores,num_p_lp,num_b_lb,num_conflicting,num_VUS,clinvar_alleleIDs,clinvar_records,clinvar_spliceAI_max,AF,gnomAD_spliceAI_score,gnomAD_variants_VCF_INFO,labels,author_labels
6733,6733,p.Ala69Gly,urn:mavedb:00000068-b-1#6750,,,1.012334,NM_000546,False,[1.0123342207],0.0,0.0,0.0,14.0,230112|230112|230112|230112|230112|230112|230112|230112|230112|230112|230112|230112|230112|230112,14.0,0.13,,,,"[VUS, non-synonymous]",


In [327]:
scoresets['Kato_TP53'][scoresets['Kato_TP53'].hgvs_pro == "p.Arg267Gln"]

Unnamed: 0,index,hgvs_pro,ProtDescription,AAchange,Codon_Number,WAF1nWT,MDM2nWT,BAXnWT,h1433snWT,AIP1nWT,GADD45nWT,NOXAnWT,P53R2nWT,WAF1nWT_Saos2,MDM2nWT_Saos2,BAXnWT_Saos2,h1433snWT_Saos2,AIP1nWT_Saos2,PUMAnWT_Saos2,SubG1nWT_Saos2,Oligomerisation_yeast,author_transcript,is_synonymous,scores,num_p_lp,num_b_lb,num_conflicting,num_VUS,clinvar_alleleIDs,clinvar_records,clinvar_spliceAI_max,AF,gnomAD_spliceAI_score,gnomAD_variants_VCF_INFO,labels,author_labels,scores_pre_normalize
1734,1734,p.Arg267Gln,p.R267Q,R267Q,267,16.1,23.6,18.3,16.9,19.3,78.3,61.8,123.0,,,,,,,,,NM_000546,False,[-2.2402917724],0.0,0.0,7.0,12.0,458574|458574|458574|458574|458574|458574|458574|127823|127823|127823|127823|127823|127823|127823|127823|127823|127823|127823|127823,19.0,0.08,4e-06,0.08,17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673703:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T|17:7673820:C:T,"[gnomAD, VUS, conflicting, non-synonymous]",Intermediate,[21.45]


In [329]:
kato_ss = scoresets["Kato_TP53"]

In [334]:
kato_ss.columns

Index(['index', 'hgvs_pro', 'ProtDescription', 'AAchange', 'Codon_Number',
       'WAF1nWT', 'MDM2nWT', 'BAXnWT', 'h1433snWT', 'AIP1nWT', 'GADD45nWT',
       'NOXAnWT', 'P53R2nWT', 'WAF1nWT_Saos2', 'MDM2nWT_Saos2', 'BAXnWT_Saos2',
       'h1433snWT_Saos2', 'AIP1nWT_Saos2', 'PUMAnWT_Saos2', 'SubG1nWT_Saos2',
       'Oligomerisation_yeast', 'author_transcript', 'is_synonymous', 'scores',
       'num_p_lp', 'num_b_lb', 'num_conflicting', 'num_VUS',
       'clinvar_alleleIDs', 'clinvar_records', 'clinvar_spliceAI_max', 'AF',
       'gnomAD_spliceAI_score', 'gnomAD_variants_VCF_INFO', 'labels',
       'author_labels', 'scores_pre_normalize'],
      dtype='object')

In [335]:
katoblbscores = np.concatenate(kato_ss.loc[kato_ss.num_b_lb > 0,'scores_pre_normalize'].values)

In [336]:
katoblbscores.mean(), katoblbscores.std()

(np.float64(92.41856060606061), np.float64(31.678266857736443))

In [338]:
thresholdsP['Kato_TP53'] * katoblbscores.std() + katoblbscores.mean()

array([27.19976659, 26.91420474, 26.55725242, 25.98612871, 25.05805268,
       23.05911971,         nan,         nan])

In [339]:
thresholdsB['Kato_TP53'] * katoblbscores.std() + katoblbscores.mean()

array([47.61743915, 48.04578193, 48.6882961 , 49.54498166, 50.90140047,
       54.0175942 ,         nan,         nan])

In [342]:
vampseq = scoresets['Matreyek_PTEN_VampSeq']

In [345]:
thresholdsP['Matreyek_PTEN_VampSeq']

array([0.49045362, 0.48483205, 0.477524  , 0.46796733, 0.45335123,
       0.42749199, 0.37464919,        nan])

In [348]:
vampseq[(vampseq.author_labels == "Intermediate") & (vampseq.scores.apply(lambda s: s[0] > thresholdsP['Matreyek_PTEN_VampSeq'][0]))].shape

(671, 45)

In [350]:
vampseq[vampseq.author_labels == "Intermediate"].shape

(1013, 45)

In [352]:
vampseq[vampseq.author_labels == "Functionally Abnormal"]

Unnamed: 0,index,hgvs_pro,accession,hgvs_nt,hgvs_splice,score,sd,expts,se,lower_ci,upper_ci,score1,score2,score3,score4,score5,score6,score7,score8,median_w_ave,exp1_w_ave,exp2_w_ave,exp3_w_ave,exp4_w_ave,exp5_w_ave,exp6_w_ave,exp7_w_ave,exp8_w_ave,snv,abundance_class,author_transcript,is_synonymous,scores,num_p_lp,num_b_lb,num_conflicting,num_VUS,clinvar_alleleIDs,clinvar_records,clinvar_spliceAI_max,AF,gnomAD_spliceAI_score,gnomAD_variants_VCF_INFO,labels,author_labels


In [353]:
vampseq.author_labels.value_counts()

author_labels
Functionally_Normal      2478
Intermediate             1013
Functionally_Abnormal     918
Name: count, dtype: int64