- Collects uniprot features
- NSEuronet, ClinVar, gnomAD datasets (missense variants)
- REVEL & BayesDelx2 datasets
- Genomic coordinates <--> protein position functions
- df for stat score analysis

In [1]:
import pandas as pd
import os
#from Bio import SeqIO
#from io import StringIO
import requests
import time
import json
import re 

import seaborn as sns

In [4]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_rows', None) 

In [2]:
# Set gene
gene = 'PTPN11'

# Dictionary of Uniprot IDs for each gene
uniprot_ids = {
    "PTPN11": "Q06124",    
}

#### Processing functions

In [21]:
# Universal tools for dealing with proteins

aaconv = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
     'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 
     'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 
     'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M',
     'H1S': 'H', 'H2S': 'H'}

aacharge = {
         'C': 0, 'D': -1, 'S': 0, 'Q': 0, 'K': 1,
         'I': 0, 'P': 0, 'T': 0, 'F': 0, 'N': 0, 
         'G': 0, 'H': 0, 'L': 0, 'R': 1, 'W': 0, 
         'A': 0, 'V':0, 'E': -1, 'Y': 0, 'M': 0 }

aachargeHP = {
         'C': 0, 'D': -1, 'S': 0, 'Q': 0, 'K': 1,
         'I': 0, 'P': 0, 'T': 0, 'F': 0, 'N': 0, 
         'G': 0, 'H': 1, 'L': 0, 'R': 1, 'W': 0, 
         'A': 0, 'V':0, 'E': -1, 'Y': 0, 'M': 0 }

In [None]:
"""

# Functions to test tables processing for any unexpected results

def threetoone(x):
    if len(x) % 3 != 0: 
        raise ValueError('Input length should be a multiple of three')

    y = ''
    for i in range(len(x) // 3):
            y += aaconv[x[3 * i : 3 * i + 3]]
    return y

def testTable(table,mutation="Protein_desc",threeLetter=False):
    mutation = table[mutation].apply(lambda x: str(x).split(".")[-1] if pd.notna(x) else "")
    locations=mutation.apply(lambda x: int(re.findall('\d+', x)[0]))
    if threeLetter:
        fromAA = mutation.apply(lambda x: aaconv[re.split('\d+',x)[0].upper()])
        toAA = mutation.apply(lambda x: aaconv[re.split('\d+',x)[1].upper()])
    else:
        fromAA = mutation.apply(lambda x: re.findall('[A-Z]+', x.upper())[0])
        toAA = mutation.apply(lambda x: re.findall('[A-Z]+', x.upper())[1])
    problem_idx = set()
    problem_idx.update(locations[locations > protein_length].index.tolist())
    problem_idx.update(fromAA[~fromAA.isin(aacharge.keys())].index.tolist())
    problem_idx.update(toAA[~toAA.isin(aacharge.keys())].index.tolist())

    return problem_idx

"""

In [17]:
# Function to filter table for missense mutations
def filterTable_missense(df, gene, mutation_col="Protein_desc", gene_col="Gene", threeLetter=False):

    # Handle gene column logic
    if gene_col in df.columns:
        df = df.loc[df[gene_col].astype(str).str.contains(gene, na=False)]
    else:
        df.loc[:, gene_col] = gene
        
    # Safe extraction of protein change
    mutation = df[mutation_col].apply(lambda x: str(x).split(".")[-1] if pd.notna(x) else "")

    # Extract positions (integer)
    locations = mutation.apply(lambda x: int(re.findall(r'\d+', x)[0]) if re.findall(r'\d+', x) else -1)

    # Extract AA from/to
    if threeLetter:
        fromAA = mutation.apply(lambda x: aaconv[re.split(r'\d+', x)[0].upper()] if re.split(r'\d+', x)[0].upper() in aaconv else "")
        toAA   = mutation.apply(lambda x: aaconv[re.split(r'\d+', x)[1].upper()] if len(re.split(r'\d+', x)) > 1 and re.split(r'\d+', x)[1].upper() in aaconv else "")
    else:
        fromAA = mutation.apply(lambda x: re.findall(r'[A-Z]+', x.upper())[0] if re.findall(r'[A-Z]+', x.upper()) else "")
        toAA   = mutation.apply(lambda x: re.findall(r'[A-Z]+', x.upper())[1] if len(re.findall(r'[A-Z]+', x.upper())) > 1 else "")

    # Identify problematic rows
    problem_idx = set()
    problem_idx.update(locations[locations > protein_length].index.tolist())
    problem_idx.update(fromAA[~fromAA.isin(aacharge.keys())].index.tolist())
    problem_idx.update(toAA[~toAA.isin(aacharge.keys())].index.tolist())
    problem_idx.update(df.index[fromAA == toAA]) # to check no synonomous variants in form A1A

    missense_df = df.drop(index=problem_idx, errors='ignore')

    # Report mismatched rows
    notmissense_df = df.loc[df.index.isin(problem_idx)]
    #check_notmissense_rows = notmissense_df.loc[notmissense_df[mutation_col].notna()]

    """
    if verbose:
        print(notmissense_df)
    else:
        check_notmissense_rows = notmissense_df.loc[notmissense_df[mutation_col].notna()]
        print(check_notmissense_rows)"""
    
    return missense_df, notmissense_df #, check_notmissense_rows


In [18]:
# Function to extend tables

def extendTableProtein(table,mutation="Protein_desc",threeLetter=False, DCharge=False):
    table["Protein_change"] = table[mutation].astype(str).apply(lambda x: x.split(".")[-1])
    table["AApos"] = table["Protein_change"].apply(lambda x:int( re.findall('\d+', x)[0])) # extract amino acid position
    if threeLetter:
        table["Protein_change"] = table["Protein_change"].str.upper()
        table["AAfrom"] = table["Protein_change"].apply(lambda x: aaconv[re.split('\d+',x)[0].upper()])
        table["AAto"] = table["Protein_change"].apply(lambda x: aaconv[re.split('\d+',x)[1].upper()])
        table["Protein_change"] = table["AAfrom"] + table["AApos"].astype(str) + table["AAto"]
    else:
        table["AAfrom"] = table["Protein_change"].apply(lambda x: re.findall('[A-Z]+', x.upper())[0])
        table["AAto"] = table["Protein_change"].apply(lambda x: re.findall('[A-Z]+', x.upper())[1])
    if DCharge:
        table["DCharge"] = table["Protein_change"].apply(lambda x: aacharge[re.findall('[A-Z]+', x.upper())[1]] - aacharge[re.findall('[A-Z]+', x.upper())[0]]) 
        table["DChargeHP"] = table["Protein_change"].apply(lambda x: aachargeHP[re.findall('[A-Z]+', x.upper())[1]] - aachargeHP[re.findall('[A-Z]+', x.upper())[0]]) 
    return table

def extendTableDNA(table,mutation="cDNA"):
    table["chr"] = chrom # extracted from ClinVar
    table["codon"] = table[mutation].apply(lambda x: re.sub("\D+","",x))
    table["ref"] = table[mutation].apply(lambda x: re.split("\d",re.split('>',x)[0])[-1])
    table["alt"] = table[mutation].apply(lambda x: re.split('>',x)[-1])
    return table

### Read in data

#### Uniprot

In [10]:
# Fetch sequence length, domains and sites from UniProt

def get_uniprot_features(gene):
    uniprot_id = uniprot_ids[gene]
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()

    sequence_length = len(data['sequence']['value'])
    
    domains_list = []
    sites_list = []
    other_features = []
    
    # Assign colors for each feature type
    feature_colours = {
        'Domain': "#66c2a5",
        'Region': "#fc8d62",
        'Active site': "#e78ac3",
        'Binding site': "#a6d854"
    }
    
    for feature in data.get('features', []):
        ftype = feature['type']
        start = feature['location']['start']['value']
        end = feature['location']['end']['value']
        name = feature.get('description', ftype)
        
        # Collect domains
        if ftype in ['Domain', 'Region']:
            domains_list.append([name, start, end, feature_colours[ftype]])
        # Collect sites
        elif ftype in ['Active site', 'Binding site']:
            sites_list.append([name, start, end, feature_colours[ftype]])
        
        else:
            other_features.append({
                "type": ftype,
                "name": name,
                "start": start,
                "end": end
            })

    """
    # Assign distinct colors for domains
    palette = sns.color_palette("Set2", n_colors=len(domains_list))
    for i, domain in enumerate(domains_list):
        domain[3] = palette[i]
    """
    
    return sequence_length, domains_list, sites_list, other_features

In [19]:
protein_length, domains, sites, others = get_uniprot_features(gene)

print("Gene:", gene)

print("Sequence Length:", protein_length)

print("Domains / regions:")
for domain in domains:
    print (domain)

print("\nActive / binding sites:")
for site in sites:
    print (site)

print("\nOther features:")
for other in others:
    print (other)

# Save features dictionaries
uniprot_features = {
    "domains": domains,
    "sites": sites,
    "other features": others
}

with open(f"{gene}_uniprotfeatures.json", "w") as f:
    json.dump(uniprot_features, f, indent=2)

Gene: PTPN11
Sequence Length: 593
Domains / regions:
['SH2 1', 6, 102, '#66c2a5']
['SH2 2', 112, 216, '#66c2a5']
['Tyrosine-protein phosphatase', 247, 517, '#66c2a5']
['Disordered', 548, 571, '#fc8d62']

Active / binding sites:
['Phosphocysteine intermediate', 459, 459, '#e78ac3']
['', 425, 425, '#a6d854']
['', 459, 465, '#a6d854']
['', 506, 506, '#a6d854']

Other features:
{'type': 'Initiator methionine', 'name': 'Removed', 'start': 1, 'end': 1}
{'type': 'Chain', 'name': 'Tyrosine-protein phosphatase non-receptor type 11', 'start': 2, 'end': 593}
{'type': 'Compositional bias', 'name': 'Pro residues', 'start': 559, 'end': 568}
{'type': 'Modified residue', 'name': 'N-acetylthreonine', 'start': 2, 'end': 2}
{'type': 'Modified residue', 'name': 'Phosphotyrosine', 'start': 62, 'end': 62}
{'type': 'Modified residue', 'name': 'Phosphotyrosine', 'start': 66, 'end': 66}
{'type': 'Modified residue', 'name': 'Phosphotyrosine; by PDGFR', 'start': 542, 'end': 542}
{'type': 'Modified residue', 'nam

#### NSEuronet

In [141]:
# Manually load the NSEuronetData.csv (for all RAS genes)

df1 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\NSEuroNetData.csv", sep=';', header=None)
df1_headers = ["Gene", "cDNA", "Protein_desc", "Disease"] #headers to add to df
df1.columns = df1_headers

print(df1.head())

     Gene       cDNA Protein_desc          Disease
0    KRAS    c.65A>G       p.Q22R  Noonan syndrome
1  PTPN11   c.922A>G      p.N308D  Noonan syndrome
2  PTPN11   c.184T>G       p.Y62D  Noonan syndrome
3  PTPN11   c.922A>G      p.N308D  Noonan syndrome
4  PTPN11  c.1232C>T      p.T411M  Noonan syndrome


In [142]:
# Filter NSEuronet df for missense variants
df1_missense, df1_notmissense = filterTable_missense(df1, gene, mutation_col="Protein_desc", gene_col="Gene", threeLetter=False)


In [143]:
# Extend NSEuronet df to create Protein_change, AA and charge columns
NSEuronet_df = extendTableProtein(df1_missense,mutation="Protein_desc",threeLetter=False,DCharge=True)
NSEuronet_df

Unnamed: 0,Gene,cDNA,Protein_desc,Disease,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP
1,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1
2,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1
3,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1
4,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0
5,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0
6,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0
7,PTPN11,c.1529A>G,p.Q510R,NF1-Noonan syndrome,Q510R,510,Q,R,1,1
8,PTPN11,c.179G>C,p.G60A,Noonan syndrome,G60A,60,G,A,0,0
9,PTPN11,c.182A>G,p.D61G,Noonan syndrome,D61G,61,D,G,1,1
10,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1


#### ClinVar

NCBI E-utilities rate is limited to 3 requests per second (without an API key). Can request an NCBI API key to increase limit to ~10 per second.

Try again teXML release (gzipped) and parse for gene of interest

ClinVar maintains a complete set of variant data on an FTP server. Weekly update cycle but only the realease on the 1st Thursday of the month is archived. Download this below. https://pmc.ncbi.nlm.nih.gov/tools/ftp/

In [30]:
# Manually load the clinvar_result.txt download (for single gene e.g. PTPN11)

df2 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\clinvar_result.txt", sep="\t", low_memory=False)
print(df2.head())


                  Name              Gene(s) Protein change  \
0  GRCh38/hg38 12p1...  LOC130008261|LOC...            NaN   
1  GRCh38/hg38 12q2...  HECTD4|LOC130008...            NaN   
2  GRCh38/hg38 12q2...  HECTD4|LOC130008...            NaN   
3  GRCh38/hg38 12q2...          PTPN11|RPL6            NaN   
4  NM_002834.3(PTPN...          PTPN11|RPL6            NaN   

          Condition(s)     Accession  GRCh37Chromosome       GRCh37Location  \
0            See cases  VCV000150740              12.0   282465 - 133773393   
1            See cases  VCV000059818              12.0  112741234 - 1131...   
2            See cases  VCV000059819              12.0  112745336 - 1131...   
3            See cases  VCV000145960              12.0  112854667 - 1128...   
4  Noonan syndrome ...  VCV000882155              12.0            112856599   

   GRCh38Chromosome       GRCh38Location  VariationID  AlleleID(s)  \
0              12.0   121271 - 133196807       150740       160491   
1           

In [31]:
# Filter ClinVar df for missense variants

# But first extract Protein_desc from 'Name' column
df2['Protein_desc'] = (df2['Name'].str.extract(r'(p\.[A-Za-z0-9]+)').iloc[:, 0].str.replace(')', '', regex=False).str.strip())
df2['cDNA_desc'] = df2['Name'].str.extract(r'(c\.[0-9]+[ACGT]>[ACGT])')[0]


with pd.option_context('display.max_colwidth', 15):
    df2_missense, df2_notmissense = filterTable_missense(df2, gene, mutation_col="Protein_desc",  gene_col="Gene(s)", threeLetter=True)


In [None]:
print(df2["Molecular consequence"].unique())


In [None]:
with pd.option_context('display.max_colwidth', None):
    print(df2_missense.head())
    #print(df2_missense[df2_missense["Molecular consequence"].str.lower() != "missense variant"])


In [32]:
# Extend ClinVar df to create Protein_change, AA and charge columns
ClinVar_df = extendTableProtein(df2_missense,mutation="Protein_desc",threeLetter=True,DCharge=True)
ClinVar_df

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc,cDNA_desc,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,p.Met1Arg,c.2T>G,M1R,1,M,R,1,1
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,p.Thr2Ile,c.5C>T,T2I,2,T,I,0,0
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,p.Arg4Gly,c.10C>G,R4G,4,R,G,-1,-1
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,p.Arg4Gln,c.11G>A,R4Q,4,R,Q,-1,-1
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,p.Trp6Cys,c.18G>T,W6C,6,W,C,0,0
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,p.Trp6Cys,c.18G>C,W6C,6,W,C,0,0
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,p.Asn10Tyr,c.28A>T,N10Y,10,N,Y,0,0
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,p.Asn10His,c.28A>C,N10H,10,N,H,0,1
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,p.Asn10Asp,c.28A>G,N10D,10,N,D,-1,-1
63,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,p.Asn10Thr,c.29A>C,N10T,10,N,T,0,0


In [46]:
print(ClinVar_df[ClinVar_df['Germline classification'].str.contains('enign')])

                     Name Gene(s)       Protein change         Condition(s)  \
73    NM_002834.5(PTPN...  PTPN11                 N18S            RASopathy   
280   NM_002834.5(PTPN...  PTPN11         K131R, K130R            RASopathy   
623   NM_002834.5(PTPN...  PTPN11         I309V, I308V            RASopathy   
626   NM_002834.5(PTPN...  PTPN11         M310T, M311T            RASopathy   
678   NM_002834.5(PTPN...  PTPN11         R343Q, R342Q            RASopathy   
690   NM_002834.5(PTPN...  PTPN11         R351Q, R350Q            RASopathy   
874   NM_002834.5(PTPN...  PTPN11  T467S, T468S, T472S      Noonan syndrome   
886   NM_002834.5(PTPN...  PTPN11  I473V, I474V, I478V    Noonan syndrome 1   
1053  NM_002834.5(PTPN...  PTPN11  T553M, T557M, T552M            RASopathy   
1069  NM_002834.5(PTPN...  PTPN11  L560F, L564F, L559F            RASopathy   
1116  NM_002834.5(PTPN...  PTPN11  L584M, L589M, L585M  Cardiovascular p...   

         Accession  GRCh37Chromosome GRCh37Location

#### gnomAD

In [14]:
df3 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\gnomAD_v4.1.0_ENSG00000179295_2025_11_07_16_02_00.csv")

In [61]:
df3.head()

Unnamed: 0,gnomAD ID,Chromosome,Position,rsIDs,Reference,Alternate,Source,Filters - exomes,Filters - genomes,Transcript,HGVS Consequence,Protein Consequence,Transcript Consequence,VEP Annotation,ClinVar Germline Classification,ClinVar Variation ID,Flags,Allele Count,Allele Number,Allele Frequency,Homozygote Count,Hemizygote Count,Filters - joint,GroupMax FAF group,GroupMax FAF frequency,cadd,revel_max,spliceai_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,Allele Count African/African American,Allele Number African/African American,Homozygote Count African/African American,Hemizygote Count African/African American,Allele Count Admixed American,Allele Number Admixed American,Homozygote Count Admixed American,Hemizygote Count Admixed American,Allele Count Ashkenazi Jewish,Allele Number Ashkenazi Jewish,Homozygote Count Ashkenazi Jewish,Hemizygote Count Ashkenazi Jewish,Allele Count East Asian,Allele Number East Asian,Homozygote Count East Asian,Hemizygote Count East Asian,Allele Count European (Finnish),Allele Number European (Finnish),Homozygote Count European (Finnish),Hemizygote Count European (Finnish),Allele Count Middle Eastern,Allele Number Middle Eastern,Homozygote Count Middle Eastern,Hemizygote Count Middle Eastern,Allele Count European (non-Finnish),Allele Number European (non-Finnish),Homozygote Count European (non-Finnish),Hemizygote Count European (non-Finnish),Allele Count Amish,Allele Number Amish,Homozygote Count Amish,Hemizygote Count Amish,Allele Count South Asian,Allele Number South Asian,Homozygote Count South Asian,Hemizygote Count South Asian,Allele Count Remaining,Allele Number Remaining,Homozygote Count Remaining,Hemizygote Count Remaining
0,12-112419037-C-T,12,112419037,,C,T,gnomAD Exomes,PASS,,ENST00000351677.7,c.-75C>T,,c.-75C>T,5_prime_UTR_variant,,,,1,1525952,6.553286e-07,0,0,PASS,,,20.6,,0.0,0.01,5.52,,,0,71260,0,0,0,50442,0,0,0,28274,0,0,0,39958,0,0,0,50866,0,0,0,4318,0,0,1,1137830,0,0,0,912,0,0,0,82804,0,0,0,59288,0,0
1,12-112419039-G-A,12,112419039,,G,A,gnomAD Exomes,PASS,,ENST00000351677.7,c.-73G>A,,c.-73G>A,5_prime_UTR_variant,,,,1,1530186,6.535153e-07,0,0,PASS,,,20.7,,0.0,0.01,4.56,,,0,71576,0,0,0,50554,0,0,0,28322,0,0,0,40160,0,0,0,51070,0,0,0,4332,0,0,0,1140800,0,0,0,912,0,0,0,83048,0,0,1,59412,0,0
2,12-112419039-G-GGT,12,112419039,rs2037471954,G,GGT,"gnomAD Exomes,gn...",PASS,PASS,ENST00000351677.7,c.-73_-72insGT,,c.-73_-72insGT,5_prime_UTR_variant,,,,2,1530078,1.307123e-06,0,0,PASS,eas,8e-06,20.4,,0.0,0.08,4.56,,,0,71458,0,0,0,50534,0,0,0,28322,0,0,2,40176,0,0,0,51070,0,0,0,4354,0,0,0,1140810,0,0,0,912,0,0,0,83052,0,0,0,59390,0,0
3,12-112419041-C-T,12,112419041,rs1463949594,C,T,"gnomAD Exomes,gn...",PASS,PASS,ENST00000351677.7,c.-71C>T,,c.-71C>T,5_prime_UTR_variant,Uncertain signif...,882209.0,,24,1531396,1.567198e-05,0,0,PASS,nfe,1.3e-05,21.1,,0.0,0.01,4.57,,,0,71484,0,0,0,50526,0,0,0,28330,0,0,0,40194,0,0,0,51302,0,0,0,4348,0,0,22,1141780,0,0,0,912,0,0,0,83096,0,0,2,59424,0,0
4,12-112419042-C-T,12,112419042,,C,T,gnomAD Exomes,PASS,,ENST00000351677.7,c.-70C>T,,c.-70C>T,5_prime_UTR_variant,,,,1,1531496,6.529563e-07,0,0,PASS,,,20.5,,0.0,-0.04,0.995,,,0,71606,0,0,0,50556,0,0,0,28330,0,0,0,40168,0,0,0,51386,0,0,0,4330,0,0,1,1141678,0,0,0,912,0,0,0,83062,0,0,0,59468,0,0


In [15]:
len(df3)

2189

In [22]:
df3_missense, df3_notmissense = filterTable_missense(df3, gene, mutation_col="Protein Consequence", threeLetter=True)


In [25]:
gnomAD_df = extendTableProtein(df3_missense,mutation="Protein Consequence",threeLetter=True,DCharge=True)

In [26]:
gnomAD_df

Unnamed: 0,gnomAD ID,Chromosome,Position,rsIDs,Reference,Alternate,Source,Filters - exomes,Filters - genomes,Transcript,HGVS Consequence,Protein Consequence,Transcript Consequence,VEP Annotation,ClinVar Germline Classification,ClinVar Variation ID,Flags,Allele Count,Allele Number,Allele Frequency,Homozygote Count,Hemizygote Count,Filters - joint,GroupMax FAF group,GroupMax FAF frequency,cadd,revel_max,spliceai_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,Allele Count African/African American,Allele Number African/African American,Homozygote Count African/African American,Hemizygote Count African/African American,Allele Count Admixed American,Allele Number Admixed American,Homozygote Count Admixed American,Hemizygote Count Admixed American,Allele Count Ashkenazi Jewish,Allele Number Ashkenazi Jewish,Homozygote Count Ashkenazi Jewish,Hemizygote Count Ashkenazi Jewish,Allele Count East Asian,Allele Number East Asian,Homozygote Count East Asian,Hemizygote Count East Asian,Allele Count European (Finnish),Allele Number European (Finnish),Homozygote Count European (Finnish),Hemizygote Count European (Finnish),Allele Count Middle Eastern,Allele Number Middle Eastern,Homozygote Count Middle Eastern,Hemizygote Count Middle Eastern,Allele Count European (non-Finnish),Allele Number European (non-Finnish),Homozygote Count European (non-Finnish),Hemizygote Count European (non-Finnish),Allele Count Amish,Allele Number Amish,Homozygote Count Amish,Hemizygote Count Amish,Allele Count South Asian,Allele Number South Asian,Homozygote Count South Asian,Hemizygote Count South Asian,Allele Count Remaining,Allele Number Remaining,Homozygote Count Remaining,Hemizygote Count Remaining,Gene,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP
54,12-112419116-C-T,12,112419116,rs267606990,C,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Thr2Ile,p.Thr2Ile,c.5C>T,missense_variant,Pathogenic/Likel...,13349.0,,4,1528266,2.617345e-06,0,0,PASS,nfe,7e-07,31.0,0.213,0.01,-0.02,8.35,0.03,0.068,0,70690,0,0,0,49938,0,0,1,28078,0,0,0,39198,0,0,0,53300,0,0,0,4318,0,0,3,1140260,0,0,0,912,0,0,0,82348,0,0,0,59224,0,0,PTPN11,T2I,2,T,I,0,0
58,12-112419121-C-G,12,112419121,rs886041517,C,G,gnomAD Exomes,PASS,,ENST00000351677.7,p.Arg4Gly,p.Arg4Gly,c.10C>G,missense_variant,Conflicting clas...,280283.0,,2,1526086,1.310542e-06,0,0,PASS,nfe,2.9e-07,33.0,0.518,0.05,-0.05,8.35,0.01,0.289,0,70428,0,0,0,49746,0,0,0,27996,0,0,0,38958,0,0,0,53024,0,0,0,4312,0,0,2,1139370,0,0,0,910,0,0,0,82190,0,0,0,59152,0,0,PTPN11,R4G,4,R,G,-1,-1
59,12-112419122-G-A,12,112419122,,G,A,gnomAD Exomes,PASS,,ENST00000351677.7,p.Arg4Gln,p.Arg4Gln,c.11G>A,missense_variant,Likely pathogenic,2729546.0,,1,1523540,6.563661e-07,0,0,PASS,,,28.4,0.393,0.0,0.0,8.25,0.01,0.344,0,70172,0,0,0,49524,0,0,0,27924,0,0,0,38626,0,0,0,52776,0,0,0,4304,0,0,1,1138308,0,0,0,912,0,0,0,81938,0,0,0,59056,0,0,PTPN11,R4Q,4,R,Q,-1,-1
195,12-112446278-G-T,12,112446278,,G,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Trp6Leu,p.Trp6Leu,c.17G>T,missense_variant,,,,1,1613914,6.196117e-07,0,0,PASS,,,32.0,0.897,0.0,0.0,6.27,0.0,0.999,0,75018,0,0,0,59984,0,0,0,29602,0,0,0,44880,0,0,0,64026,0,0,0,6048,0,0,1,1179880,0,0,0,912,0,0,0,91074,0,0,0,62490,0,0,PTPN11,W6L,6,W,L,0,0
196,12-112446279-G-T,12,112446279,rs79203122,G,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Trp6Cys,p.Trp6Cys,c.18G>T,missense_variant,Uncertain signif...,2785856.0,,3,1613968,1.858773e-06,0,0,PASS,nfe,2.8e-07,32.0,0.92,0.0,-0.01,6.27,0.02,0.987,0,75030,0,0,0,59986,0,0,0,29604,0,0,0,44876,0,0,0,64030,0,0,0,6048,0,0,2,1179906,0,0,0,912,0,0,0,91082,0,0,1,62494,0,0,PTPN11,W6C,6,W,C,0,0
197,12-112446286-C-T,12,112446286,rs566068139,C,T,gnomAD Genomes,,PASS,ENST00000351677.7,p.Pro9Ser,p.Pro9Ser,c.25C>T,missense_variant,,,,1,1613686,6.196992e-07,0,0,PASS,,,22.6,0.662,0.0,0.0,6.35,,,0,74830,0,0,0,59952,0,0,0,29606,0,0,0,44886,0,0,1,63988,0,0,0,6072,0,0,0,1179914,0,0,0,910,0,0,0,91056,0,0,0,62472,0,0,PTPN11,P9S,9,P,S,0,0
198,12-112446286-C-G,12,112446286,rs566068139,C,G,gnomAD Genomes,,PASS,ENST00000351677.7,p.Pro9Ala,p.Pro9Ala,c.25C>G,missense_variant,,,,1,1613804,6.196539e-07,0,0,PASS,,,21.8,0.54,0.0,0.0,6.35,0.09,0.145,1,74952,0,0,0,59972,0,0,0,29606,0,0,0,44874,0,0,0,63988,0,0,0,6050,0,0,0,1179906,0,0,0,910,0,0,0,91052,0,0,0,62494,0,0,PTPN11,P9A,9,P,A,0,0
199,12-112446287-C-G,12,112446287,rs536503257,C,G,gnomAD Exomes,PASS,,ENST00000351677.7,p.Pro9Arg,p.Pro9Arg,c.26C>G,missense_variant,,,,2,1613982,1.239171e-06,0,0,PASS,,,22.5,0.591,0.0,0.0,6.35,0.31,0.206,0,75034,0,0,0,59996,0,0,0,29606,0,0,0,44878,0,0,2,63992,0,0,0,6050,0,0,0,1179950,0,0,0,912,0,0,0,91068,0,0,0,62496,0,0,PTPN11,P9R,9,P,R,1,1
200,12-112446287-C-A,12,112446287,rs536503257,C,A,gnomAD Genomes,,PASS,ENST00000351677.7,p.Pro9Gln,p.Pro9Gln,c.26C>A,missense_variant,,,,1,1613982,6.195856e-07,0,0,PASS,,,23.8,0.68,0.0,0.0,6.35,0.07,0.418,0,75034,0,0,0,59996,0,0,0,29606,0,0,0,44878,0,0,0,63992,0,0,0,6050,0,0,0,1179950,0,0,0,912,0,0,1,91068,0,0,0,62496,0,0,PTPN11,P9Q,9,P,Q,0,0
201,12-112446289-A-G,12,112446289,rs368633510,A,G,gnomAD Exomes,PASS,,ENST00000351677.7,p.Asn10Asp,p.Asn10Asp,c.28A>G,missense_variant,Uncertain signif...,838860.0,,5,1614078,3.097744e-06,0,0,PASS,nfe,7.9e-07,22.7,0.372,0.03,0.01,4.55,0.26,0.012,0,75058,0,0,1,60006,0,0,0,29602,0,0,0,44882,0,0,0,64032,0,0,0,6052,0,0,4,1179960,0,0,0,912,0,0,0,91082,0,0,0,62492,0,0,PTPN11,N10D,10,N,D,-1,-1


In [27]:
len(gnomAD_df)

446

In [47]:
print(gnomAD_df[gnomAD_df['ClinVar Germline Classification'].str.contains('enign', na=False)])

             gnomAD ID  Chromosome   Position         rsIDs Reference  \
212   12-112446314-A-G          12  112446314   rs587778635         A   
516   12-112453254-A-G          12  112453254   rs397516805         A   
1132  12-112477722-A-G          12  112477722   rs201787206         A   
1135  12-112477729-T-C          12  112477729   rs201226824         T   
1238  12-112477951-G-A          12  112477951   rs535800148         G   
1251  12-112477975-G-A          12  112477975   rs397507534         G   
1494  12-112486466-C-G          12  112486466   rs377257792         C   
1639  12-112488483-A-G          12  112488483  rs2135915117         A   
1905  12-112502202-C-T          12  112502202   rs148176616         C   
1918  12-112502222-C-T          12  112502222   rs397516797         C   
2116  12-112504735-C-A          12  112504735           NaN         C   

     Alternate               Source Filters - exomes Filters - genomes  \
212          G  gnomAD Exomes,gn...             P

# ^^ CHROMOSONE 2

#### Meta predictors

## ?? get chromosone range from uniprot but it only has GRCH38 >>

In [20]:
def get_chr_range(df, gene, gene_col="Gene(s)", loc_col="GRCh38Location", chrom_col="GRCh38Chromosome"):
    # Filter for specified gene
    gene_df = df[df[gene_col].astype(str).str.contains(gene, na=False)].copy()

    # Convert location column to numeric (ignore errors)
    gene_df[loc_col] = pd.to_numeric(gene_df[loc_col], errors='coerce')

    # Drop missing positions
    #gene_df = gene_df.dropna(subset=[loc_col])

    # Find chromosome, min, and max positions
    chrom = gene_df[chrom_col].dropna().unique()[0]
    min_pos = gene_df[loc_col].min()
    max_pos = gene_df[loc_col].max()

    print(f" Gene: {gene}")
    print(f" Chromosome: {chrom}")
    print(f" Position range: {int(min_pos)} - {int(max_pos)}")

    return chrom, min_pos, max_pos


##### REVEL

Downloaded from https://sites.google.com/site/revelgenomics/downloads

Try again to curl from dfNSFP

In [16]:
# Manually download the REVEL data (for all genes)

# Read just the first 10 rows (6GB REVEL file)
dfa = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\revel-v1.3_all_chromosomes\revel_with_transcript_ids", nrows=10)
dfa

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid
0,1,35142,35142,G,A,T,M,0.027,ENST00000417324
1,1,35142,35142,G,C,T,R,0.035,ENST00000417324
2,1,35142,35142,G,T,T,K,0.043,ENST00000417324
3,1,35143,35143,T,A,T,S,0.018,ENST00000417324
4,1,35143,35143,T,C,T,A,0.034,ENST00000417324
5,1,35143,35143,T,G,T,P,0.039,ENST00000417324
6,1,35144,35144,A,C,C,W,0.012,ENST00000417324
7,1,35145,35145,C,A,C,F,0.023,ENST00000417324
8,1,35145,35145,C,G,C,S,0.029,ENST00000417324
9,1,35145,35145,C,T,C,Y,0.016,ENST00000417324


In [73]:
chrom, min_pos, max_pos = get_chr_range(ClinVar_df, gene, gene_col="Gene(s)", loc_col="GRCh38Location", chrom_col="GRCh38Chromosome")


 Gene: PTPN11
 Chromosome: 12.0
 Position range: 112419113 - 112504754


In [97]:


revel_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\revel-v1.3_all_chromosomes\revel_with_transcript_ids"

chunksize = 1_000_000
ptpn11_revel_chunks = []

for chunk in pd.read_csv(revel_file, chunksize=chunksize, low_memory=False):
    # Convert positions in revel_file to numeric
    chunk['grch38_pos'] = pd.to_numeric(chunk['grch38_pos'], errors='coerce')
    
    # Only keep PTPN11 chromosome
    chunk = chunk[chunk['chr'] == chrom]
    
    # Keep only positions within PTPN11 range
    filtered_chunk = chunk[
        (chunk['grch38_pos'] >= min_pos) &
        (chunk['grch38_pos'] <= max_pos)
    ]
    
    if not filtered_chunk.empty:
        ptpn11_revel_chunks.append(filtered_chunk)
    
    # Early stop: REVEL is sorted by grch38_pos
    if chunk['grch38_pos'].max() > max_pos:
        break

# Concatenate all filtered chunks
if ptpn11_revel_chunks:
    ptpn11_revel = pd.concat(ptpn11_revel_chunks)
    ptpn11_revel.to_csv( r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv", index=False)


##### BayesDel

Downloaded from https://drive.google.com/drive/folders/1K4LI6ZSsUGBhHoChUtegC8bgCt7hbQlA (2017! version)

Need to install and run VICTOR locally (Linux)


In [74]:
chrom37, min_pos37, max_pos37 = get_chr_range(ClinVar_df, gene, gene_col="Gene(s)", loc_col="GRCh37Location", chrom_col="GRCh37Chromosome")


 Gene: PTPN11
 Chromosome: 12.0
 Position range: 112856917 - 112942558


In [98]:
# Unzipped file path for gene specific chrom and GRCH37 location
bayesdel_noAF_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\BayesDel_170824_noAF\BayesDel_170824_noAF\BayesDel_170824_noAF_chr12\BayesDel_170824_noAF_chr12"

with open(bayesdel_noAF_file, "r") as f:
    for _ in range(10):
        print(f.readline().strip())


#Chr	Start	ref	alt	BayesDel_nsfp33a_noAF
12	176049	A	C	-0.217265
12	176049	A	G	-0.171535
12	176049	A	T	-0.21727
12	176050	T	A	-0.288296
12	176050	T	C	-0.265607
12	176050	T	G	-0.301534
12	176051	G	A	-0.0488746
12	176051	G	C	-0.0488725
12	176051	G	T	-0.048873


In [99]:
def load_bayesdel_region(file_path, min_pos, max_pos, output_csv=None, chunksize=100_000, columns=None):
    chunks = []
    if columns is None:
        columns = ["Chr", "Start", "Ref", "Alt", "Score"]

    for chunk in pd.read_csv(file_path, sep="\t", names=columns, chunksize=chunksize, comment="#"):
        chunk["Start"] = pd.to_numeric(chunk["Start"], errors="coerce")
        # Filter by genomic position
        filtered = chunk[(chunk["Start"] >= min_pos) & (chunk["Start"] <= max_pos)]

        if not filtered.empty:
            chunks.append(filtered)

    if chunks:
        df_region = pd.concat(chunks, ignore_index=True)

    # Save to CSV 
    if output_csv:
        df_region.to_csv(output_csv, index=False)
        
    return df_region





In [100]:
bayesdel_noAF_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\BayesDel_170824_noAF\BayesDel_170824_noAF\BayesDel_170824_noAF_chr12\BayesDel_170824_noAF_chr12"
bayesdel_addAF_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\BayesDel_170824_noAF\BayesDel_170824_noAF\BayesDel_170824_noAF_chr12\BayesDel_170824_noAF_chr12"

# Load noAF region and save output
PTPN11_bayesdel_noAF = load_bayesdel_region(bayesdel_noAF_file, min_pos37, max_pos37, output_csv=r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_noAF.csv")

# Load addAF region and save output
PTPN11_bayesdel_addAF = load_bayesdel_region(bayesdel_addAF_file, min_pos37, max_pos37, output_csv=r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_addAF.csv")

#print("\nnoAF columns:", df_noAF.columns.tolist())
#print("\naddAF columns:", df_addAF.columns.tolist())

#### Merge Revel & BayesDel

In [103]:
print(revel_df["Ensembl_transcriptid"].unique())


['ENST00000392597;ENST00000351677;ENST00000392596' 'ENST00000530818'
 'ENST00000392596' 'ENST00000392597;ENST00000351677' 'ENST00000351677'
 'ENST00000392597']


In [10]:
revel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv")


In [11]:
revel_df.head()

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid
0,12,112856917,112419113.0,T,A,M,K,0.294,ENST00000392597;...
1,12,112856917,112419113.0,T,C,M,T,0.315,ENST00000392597;...
2,12,112856917,112419113.0,T,G,M,R,0.202,ENST00000392597;...
3,12,112856918,112419114.0,G,A,M,I,0.222,ENST00000392597;...
4,12,112856918,112419114.0,G,C,M,I,0.222,ENST00000392597;...


In [12]:
# Load PTPN11_REVEL and BayesDel
revel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv")
bayesdel_noAF_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_noAF.csv")
bayesdel_addAF_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_addAF.csv")

# Merge PTPN11_REVEL scores ?????on ClinVar
bayesdel_noAF_df = bayesdel_noAF_df.rename(columns={"Score": "BayesDel_noAF"})
bayesdel_addAF_df = bayesdel_addAF_df.rename(columns={"Score": "BayesDel_addAF"})

revel_bayesdel_noAF_df = pd.merge(
    revel_df,
    bayesdel_noAF_df,
    left_on=['hg19_pos', 'ref', 'alt'],
    right_on=['Start', 'Ref', 'Alt'],
    how='left',
)

revel_bayesdel_noAF_df = revel_bayesdel_noAF_df.drop(columns=['Chr','Start', 'Ref', 'Alt'])

revel_bayesdel_df = pd.merge(
    revel_bayesdel_noAF_df,
    bayesdel_addAF_df,
    left_on=['hg19_pos', 'ref', 'alt'],
    right_on=['Start', 'Ref', 'Alt'],
    how='left',
)

revel_bayesdel_df = revel_bayesdel_df.drop(columns=['Chr','Start', 'Ref', 'Alt'])



In [13]:
revel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv")


In [17]:
revel_df.head(30)

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid
0,12,112856917,112419113.0,T,A,M,K,0.294,ENST00000392597;...
1,12,112856917,112419113.0,T,C,M,T,0.315,ENST00000392597;...
2,12,112856917,112419113.0,T,G,M,R,0.202,ENST00000392597;...
3,12,112856918,112419114.0,G,A,M,I,0.222,ENST00000392597;...
4,12,112856918,112419114.0,G,C,M,I,0.222,ENST00000392597;...
5,12,112856918,112419114.0,G,T,M,I,0.222,ENST00000392597;...
6,12,112856919,112419115.0,A,C,T,P,0.194,ENST00000392597;...
7,12,112856919,112419115.0,A,G,T,A,0.054,ENST00000392597;...
8,12,112856919,112419115.0,A,T,T,S,0.048,ENST00000392597;...
9,12,112856920,112419116.0,C,A,T,K,0.155,ENST00000392597;...


#### Convert protein to genomic coordinates

In [18]:
# ENST00000635625 ENST00000351677 ENST00000392597

def getPTPN11GC(position, delay = 0.2):
    server = "https://rest.ensembl.org"
    adjusted_position = position + 165
    ext = "/map/cdna/ENST00000351677/"+ str(adjusted_position) + ".." + str(adjusted_position) + "?"
    
    time.sleep(delay)
    r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
    if not r.ok:
      r.raise_for_status()
      sys.exit()
 
    decoded = r.json()
    return(decoded['mappings'][0]['start'])

In [144]:
NSEuronet_df = extendTableDNA(NSEuronet_df,mutation="cDNA")

In [145]:
NSEuronet_df

Unnamed: 0,Gene,cDNA,Protein_desc,Disease,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,chr,codon,ref,alt
1,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1,12.0,922,A,G
2,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1,12.0,184,T,G
3,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1,12.0,922,A,G
4,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0,12.0,1232,C,T
5,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0,12.0,1232,C,T
6,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0,12.0,1232,C,T
7,PTPN11,c.1529A>G,p.Q510R,NF1-Noonan syndrome,Q510R,510,Q,R,1,1,12.0,1529,A,G
8,PTPN11,c.179G>C,p.G60A,Noonan syndrome,G60A,60,G,A,0,0,12.0,179,G,C
9,PTPN11,c.182A>G,p.D61G,Noonan syndrome,D61G,61,D,G,1,1,12.0,182,A,G
10,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1,12.0,184,T,G


In [146]:
NSEuronet_df["pos"] = NSEuronet_df["codon"].apply(lambda x: getPTPN11GC(int(x)))


In [153]:
NSEuronet_df

Unnamed: 0,Gene,cDNA,Protein_desc,Disease,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,chr,codon,ref,alt,pos
1,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1,12.0,922,A,G,112477719
2,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1,12.0,184,T,G,112450364
3,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1,12.0,922,A,G,112477719
4,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0,12.0,1232,C,T,112486482
5,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0,12.0,1232,C,T,112486482
6,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0,12.0,1232,C,T,112486482
7,PTPN11,c.1529A>G,p.Q510R,NF1-Noonan syndrome,Q510R,510,Q,R,1,1,12.0,1529,A,G,112489105
8,PTPN11,c.179G>C,p.G60A,Noonan syndrome,G60A,60,G,A,0,0,12.0,179,G,C,112450359
9,PTPN11,c.182A>G,p.D61G,Noonan syndrome,D61G,61,D,G,1,1,12.0,182,A,G,112450362
10,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1,12.0,184,T,G,112450364


##### Check with gnomAD and ClinVar

In [135]:
checkgnomAD = extendTableDNA(gnomAD_df,mutation="Transcript Consequence")

In [136]:
checkgnomAD["pos"] = checkgnomAD["codon"].apply(lambda x: getPTPN11GC(int(x)))


In [45]:
checkgnomAD.head()

Unnamed: 0,gnomAD ID,Chromosome,Position,rsIDs,Reference,Alternate,Source,Filters - exomes,Filters - genomes,Transcript,HGVS Consequence,Protein Consequence,Transcript Consequence,VEP Annotation,ClinVar Germline Classification,ClinVar Variation ID,Flags,Allele Count,Allele Number,Allele Frequency,Homozygote Count,Hemizygote Count,Filters - joint,GroupMax FAF group,GroupMax FAF frequency,cadd,revel_max,spliceai_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,Allele Count African/African American,Allele Number African/African American,Homozygote Count African/African American,Hemizygote Count African/African American,Allele Count Admixed American,Allele Number Admixed American,Homozygote Count Admixed American,Hemizygote Count Admixed American,Allele Count Ashkenazi Jewish,Allele Number Ashkenazi Jewish,Homozygote Count Ashkenazi Jewish,Hemizygote Count Ashkenazi Jewish,Allele Count East Asian,Allele Number East Asian,Homozygote Count East Asian,Hemizygote Count East Asian,Allele Count European (Finnish),Allele Number European (Finnish),Homozygote Count European (Finnish),Hemizygote Count European (Finnish),Allele Count Middle Eastern,Allele Number Middle Eastern,Homozygote Count Middle Eastern,Hemizygote Count Middle Eastern,Allele Count European (non-Finnish),Allele Number European (non-Finnish),Homozygote Count European (non-Finnish),Hemizygote Count European (non-Finnish),Allele Count Amish,Allele Number Amish,Homozygote Count Amish,Hemizygote Count Amish,Allele Count South Asian,Allele Number South Asian,Homozygote Count South Asian,Hemizygote Count South Asian,Allele Count Remaining,Allele Number Remaining,Homozygote Count Remaining,Hemizygote Count Remaining,Gene,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,chr,codon,ref,alt,pos
54,12-112419116-C-T,12,112419116,rs267606990,C,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Thr2Ile,p.Thr2Ile,c.5C>T,missense_variant,Pathogenic/Likel...,13349.0,,4,1528266,2.617345e-06,0,0,PASS,nfe,7e-07,31.0,0.213,0.01,-0.02,8.35,0.03,0.068,0,70690,0,0,0,49938,0,0,1,28078,0,0,0,39198,0,0,0,53300,0,0,0,4318,0,0,3,1140260,0,0,0,912,0,0,0,82348,0,0,0,59224,0,0,PTPN11,T2I,2,T,I,0,0,12.0,5,C,T,112419116
58,12-112419121-C-G,12,112419121,rs886041517,C,G,gnomAD Exomes,PASS,,ENST00000351677.7,p.Arg4Gly,p.Arg4Gly,c.10C>G,missense_variant,Conflicting clas...,280283.0,,2,1526086,1.310542e-06,0,0,PASS,nfe,2.9e-07,33.0,0.518,0.05,-0.05,8.35,0.01,0.289,0,70428,0,0,0,49746,0,0,0,27996,0,0,0,38958,0,0,0,53024,0,0,0,4312,0,0,2,1139370,0,0,0,910,0,0,0,82190,0,0,0,59152,0,0,PTPN11,R4G,4,R,G,-1,-1,12.0,10,C,G,112419121
59,12-112419122-G-A,12,112419122,,G,A,gnomAD Exomes,PASS,,ENST00000351677.7,p.Arg4Gln,p.Arg4Gln,c.11G>A,missense_variant,Likely pathogenic,2729546.0,,1,1523540,6.563661e-07,0,0,PASS,,,28.4,0.393,0.0,0.0,8.25,0.01,0.344,0,70172,0,0,0,49524,0,0,0,27924,0,0,0,38626,0,0,0,52776,0,0,0,4304,0,0,1,1138308,0,0,0,912,0,0,0,81938,0,0,0,59056,0,0,PTPN11,R4Q,4,R,Q,-1,-1,12.0,11,G,A,112419122
195,12-112446278-G-T,12,112446278,,G,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Trp6Leu,p.Trp6Leu,c.17G>T,missense_variant,,,,1,1613914,6.196117e-07,0,0,PASS,,,32.0,0.897,0.0,0.0,6.27,0.0,0.999,0,75018,0,0,0,59984,0,0,0,29602,0,0,0,44880,0,0,0,64026,0,0,0,6048,0,0,1,1179880,0,0,0,912,0,0,0,91074,0,0,0,62490,0,0,PTPN11,W6L,6,W,L,0,0,12.0,17,G,T,112446278
196,12-112446279-G-T,12,112446279,rs79203122,G,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Trp6Cys,p.Trp6Cys,c.18G>T,missense_variant,Uncertain signif...,2785856.0,,3,1613968,1.858773e-06,0,0,PASS,nfe,2.8e-07,32.0,0.92,0.0,-0.01,6.27,0.02,0.987,0,75030,0,0,0,59986,0,0,0,29604,0,0,0,44876,0,0,0,64030,0,0,0,6048,0,0,2,1179906,0,0,0,912,0,0,0,91082,0,0,1,62494,0,0,PTPN11,W6C,6,W,C,0,0,12.0,18,G,T,112446279


In [139]:
filteredG = checkgnomAD[checkgnomAD["Position"] != checkgnomAD["pos"]]
#print(filteredG[["Transcript Consequence", "Position", "pos"]])
filteredG

Unnamed: 0,gnomAD ID,Chromosome,Position,rsIDs,Reference,Alternate,Source,Filters - exomes,Filters - genomes,Transcript,HGVS Consequence,Protein Consequence,Transcript Consequence,VEP Annotation,ClinVar Germline Classification,ClinVar Variation ID,Flags,Allele Count,Allele Number,Allele Frequency,Homozygote Count,Hemizygote Count,Filters - joint,GroupMax FAF group,GroupMax FAF frequency,cadd,revel_max,spliceai_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,Allele Count African/African American,Allele Number African/African American,Homozygote Count African/African American,Hemizygote Count African/African American,Allele Count Admixed American,Allele Number Admixed American,Homozygote Count Admixed American,Hemizygote Count Admixed American,Allele Count Ashkenazi Jewish,Allele Number Ashkenazi Jewish,Homozygote Count Ashkenazi Jewish,Hemizygote Count Ashkenazi Jewish,Allele Count East Asian,Allele Number East Asian,Homozygote Count East Asian,Hemizygote Count East Asian,Allele Count European (Finnish),Allele Number European (Finnish),Homozygote Count European (Finnish),Hemizygote Count European (Finnish),Allele Count Middle Eastern,Allele Number Middle Eastern,Homozygote Count Middle Eastern,Hemizygote Count Middle Eastern,Allele Count European (non-Finnish),Allele Number European (non-Finnish),Homozygote Count European (non-Finnish),Hemizygote Count European (non-Finnish),Allele Count Amish,Allele Number Amish,Homozygote Count Amish,Hemizygote Count Amish,Allele Count South Asian,Allele Number South Asian,Homozygote Count South Asian,Hemizygote Count South Asian,Allele Count Remaining,Allele Number Remaining,Homozygote Count Remaining,Hemizygote Count Remaining,Gene,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,chr,codon,ref,alt,pos
879,12-112457295-C-A,12,112457295,,C,A,gnomAD Exomes,PASS,,ENST00000530818.1,p.Ala99Asp,p.Ala99Asp,c.296C>A,missense_variant,,,,1,362304,2.760113e-06,0,0,PASS,,,2.21,,0.0,0.0,-0.177,,,0,45290,0,0,0,28910,0,0,0,10074,0,0,0,8140,0,0,0,21214,0,0,0,1116,0,0,0,184172,0,0,0,908,0,0,1,50396,0,0,0,12084,0,0,PTPN11,A99D,99,A,D,-1,-1,12.0,296,C,A,112450476
880,12-112457299-G-T,12,112457299,rs60920694,G,T,"gnomAD Exomes,gn...",PASS,PASS,ENST00000530818.1,p.Glu100Asp,p.Glu100Asp,c.300G>T,missense_variant,,,,4,354554,1.128178e-05,0,0,PASS,nfe,4e-06,2.29,,0.0,0.0,0.581,,,0,44920,0,0,0,28138,0,0,0,9764,0,0,0,7874,0,0,0,21066,0,0,1,1104,0,0,3,179942,0,0,0,912,0,0,0,49130,0,0,0,11704,0,0,PTPN11,E100D,100,E,D,0,0,12.0,300,G,T,112450480
881,12-112457304-A-T,12,112457304,rs1050233131,A,T,"gnomAD Exomes,gn...",PASS,PASS,ENST00000530818.1,p.Asp102Val,p.Asp102Val,c.305A>T,missense_variant,,,,13,350510,3.708881e-05,0,0,PASS,nfe,3e-05,4.71,,0.0,0.0,0.448,,,1,44826,0,0,2,27930,0,0,0,9514,0,0,0,7694,0,0,0,20938,0,0,0,1096,0,0,10,177662,0,0,0,912,0,0,0,48374,0,0,0,11564,0,0,PTPN11,D102V,102,D,V,1,1,12.0,305,A,T,112450485
882,12-112457305-T-G,12,112457305,rs945278667,T,G,gnomAD Genomes,,PASS,ENST00000530818.1,p.Asp102Glu,p.Asp102Glu,c.306T>G,missense_variant,,,,1,350772,2.850855e-06,0,0,PASS,,,6.04,,0.0,0.0,0.449,,,1,44954,0,0,0,28246,0,0,0,9520,0,0,0,7702,0,0,0,20944,0,0,0,1098,0,0,0,177312,0,0,0,912,0,0,0,48518,0,0,0,11566,0,0,PTPN11,D102E,102,D,E,0,0,12.0,306,T,G,112450486
884,12-112457313-A-G,12,112457313,rs1432432570,A,G,gnomAD Genomes,,PASS,ENST00000530818.1,p.Gln105Arg,p.Gln105Arg,c.314A>G,missense_variant,,,,1,342850,2.916727e-06,0,0,AC0,,,6.45,,0.18,0.07,-0.339,,,1,44770,0,0,0,27948,0,0,0,9214,0,0,0,7486,0,0,0,20680,0,0,0,1064,0,0,0,172350,0,0,0,912,0,0,0,47142,0,0,0,11284,0,0,PTPN11,Q105R,105,Q,R,1,1,12.0,314,A,G,112450494
885,12-112457314-G-C,12,112457314,rs1479284129,G,C,gnomAD Exomes,PASS,,ENST00000530818.1,p.Gln105His,p.Gln105His,c.315G>C,missense_variant,,,,1,341174,2.931056e-06,0,0,PASS,,,6.09,,0.0,0.0,0.581,,,0,44802,0,0,1,27810,0,0,0,9146,0,0,0,7418,0,0,0,20622,0,0,0,1036,0,0,0,171304,0,0,0,912,0,0,0,46886,0,0,0,11238,0,0,PTPN11,Q105H,105,Q,H,0,1,12.0,315,G,C,112450495
1491,12-112486463-G-T,12,112486463,rs779291780,G,T,gnomAD Exomes,PASS,,ENST00000635625.1,p.Ala409Ser,p.Ala409Ser,c.1225G>T,missense_variant,,,,1,1611062,6.207086e-07,0,0,PASS,,,22.7,,0.0,0.0,8.61,,,0,74986,0,0,0,60000,0,0,0,29594,0,0,0,44858,0,0,0,64032,0,0,0,6056,0,0,1,1177214,0,0,0,912,0,0,0,91022,0,0,0,62388,0,0,PTPN11,A409S,409,A,S,0,0,12.0,1225,G,T,112486475
1492,12-112486464-C-T,12,112486464,,C,T,gnomAD Exomes,PASS,,ENST00000635625.1,p.Ala409Val,p.Ala409Val,c.1226C>T,missense_variant,,,,1,1611848,6.204059e-07,0,0,PASS,,,21.9,,0.0,0.0,5.81,,,0,75008,0,0,0,60000,0,0,0,29594,0,0,0,44866,0,0,0,64032,0,0,0,6056,0,0,1,1177928,0,0,0,912,0,0,0,91040,0,0,0,62412,0,0,PTPN11,A409V,409,A,V,0,0,12.0,1226,C,T,112486476
1494,12-112486466-C-G,12,112486466,rs377257792,C,G,gnomAD Exomes,PASS,,ENST00000635625.1,p.Leu410Val,p.Leu410Val,c.1228C>G,missense_variant,Likely benign,2869426.0,,1,1612686,6.200835e-07,0,0,PASS,,,13.2,,0.0,-0.01,1.92,,,0,75020,0,0,0,60012,0,0,0,29598,0,0,0,44872,0,0,0,64036,0,0,0,6056,0,0,1,1178690,0,0,0,912,0,0,0,91050,0,0,0,62440,0,0,PTPN11,L410V,410,L,V,0,0,12.0,1228,C,G,112486478
1498,12-112486469-C-T,12,112486469,,C,T,gnomAD Exomes,PASS,,ENST00000635625.1,p.Leu411Phe,p.Leu411Phe,c.1231C>T,missense_variant,,,,1,1613320,6.198398e-07,0,0,PASS,,,19.6,,0.0,0.0,8.71,,,0,75046,0,0,0,60016,0,0,0,29602,0,0,0,44880,0,0,0,64040,0,0,0,6058,0,0,0,1179242,0,0,0,912,0,0,1,91068,0,0,0,62456,0,0,PTPN11,L411F,411,L,F,0,0,12.0,1231,C,T,112486481


In [148]:
unique_consequences = filteredG['Transcript Consequence'].unique()


In [151]:
# Convert NSEuronet cDNA column to a set for faster lookup
cDNA_set = set(NSEuronet_df['cDNA'].dropna())

# Check which unique_consequences are in NSEuronet cDNA
consequences_in_NSEuronet = [cons for cons in unique_consequences if cons in cDNA_set]

print(consequences_in_NSEuronet)

[]


In [64]:
len(ClinVar_df)

557

In [68]:
with pd.option_context('display.max_colwidth', None):
    print(ClinVar_df[ClinVar_df["cDNA_desc"].isna()])

# Though missense, drop for the case of checking genomic coordinate mapping
checkClinVar = ClinVar_df.dropna(subset=["cDNA_desc"])


                                                      Name Gene(s)  \
144     NM_002834.5(PTPN11):c.180_181delinsAA (p.Asp61Asn)  PTPN11   
154     NM_002834.5(PTPN11):c.184_185delinsAT (p.Tyr62Ile)  PTPN11   
160     NM_002834.5(PTPN11):c.188_189delinsGC (p.Tyr63Cys)  PTPN11   
500    NM_002834.5(PTPN11):c.771_772delinsAA (p.Glu258Lys)  PTPN11   
508    NM_002834.5(PTPN11):c.781_782delinsTA (p.Leu261Tyr)  PTPN11   
875  NM_002834.5(PTPN11):c.1402_1403delinsGA (p.Thr468Glu)  PTPN11   
956  NM_002834.5(PTPN11):c.1506_1507delinsCC (p.Gly503Arg)  PTPN11   
967  NM_002834.5(PTPN11):c.1517_1518delinsCC (p.Gln506Pro)  PTPN11   

          Protein change              Condition(s)     Accession  \
144           D61N, D60N              not provided  VCV000372703   
154           Y62I, Y61I                 RASopathy  VCV002814413   
160           Y63C, Y62C              not provided  VCV000372674   
500         E257K, E258K  Cardiovascular phenotype  VCV004147357   
508         L260Y, L261Y     

In [69]:
checkClinVar = extendTableDNA(checkClinVar,mutation="cDNA_desc")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["chr"] = chrom # extracted from ClinVar
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["codon"] = table[mutation].apply(lambda x: re.sub("\D+","",x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  table["ref"] = table[mutation].apply(lambda x: re.split("\d",re.split('>',x)[0])[-1])
A va

In [None]:
checkClinVar["pos"] = checkClinVar["codon"].apply(lambda x: getPTPN11GC(int(x)))


In [50]:
checkClinVar.head()

NameError: name 'checkClinVar' is not defined

In [124]:
# ENST00000635625

filteredC = checkClinVar[checkClinVar["GRCh38Location"] != checkClinVar["pos"]]
print(filteredC[["Protein_change","cDNA_desc", "GRCh38Location", "pos"]])

     Protein_change  cDNA_desc  GRCh38Location        pos
778           G409A  c.1226G>C       112486476  112486464
780           N410H  c.1228A>C       112486478  112486466
781           T411K  c.1232C>A       112486482  112486470
782           T411M  c.1232C>T       112486482  112486470
784           E412G  c.1235A>G       112486485  112486473
785           T414M  c.1241C>T       112486491  112486479
789           V415F  c.1243G>T       112486493  112486481
791           R421W  c.1261C>T       112486511  112486499
792           R421Q  c.1262G>A       112486512  112486500
796           P424L  c.1271C>T       112486521  112486509
798           H426N  c.1276C>A       112486526  112486514
799           H426P  c.1277A>C       112486527  112486515
802           V428L  c.1282G>C       112486532  112486520
803           V428L  c.1282G>T       112486532  112486520
804           V428M  c.1282G>A       112486532  112486520
805           S430T  c.1289G>C       112486539  112486527
807           

In [118]:
filteredC["pos"].dtype


dtype('int64')

In [134]:
# ENST00000351677 with adjustment 


checkClinVar["GRCh38Location"] = pd.to_numeric(checkClinVar["GRCh38Location"], errors="coerce")
filteredC = checkClinVar[checkClinVar["GRCh38Location"] != checkClinVar["pos"]]
print(filteredC[["Protein_change","cDNA_desc", "GRCh38Location", "pos"]])

Empty DataFrame
Columns: [Protein_change, cDNA_desc, GRCh38Location, pos]
Index: []


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  checkClinVar["GRCh38Location"] = pd.to_numeric(checkClinVar["GRCh38Location"], errors="coerce")


In [138]:
print(checkClinVar[["Protein_change","cDNA_desc", "GRCh38Location", "pos"]])

     Protein_change  cDNA_desc  GRCh38Location        pos
24              M1R     c.2T>G       112419113  112419113
25              T2I     c.5C>T       112419116  112419116
30              R4G    c.10C>G       112419121  112419121
31              R4Q    c.11G>A       112419122  112419122
58              W6C    c.18G>T       112446279  112446279
59              W6C    c.18G>C       112446279  112446279
60             N10Y    c.28A>T       112446289  112446289
61             N10H    c.28A>C       112446289  112446289
62             N10D    c.28A>G       112446289  112446289
63             N10T    c.29A>C       112446290  112446290
64             N10S    c.29A>G       112446290  112446290
65             I11T    c.32T>C       112446293  112446293
66             T12A    c.34A>G       112446295  112446295
69             V14L    c.40G>T       112446301  112446301
72             N18D    c.52A>G       112446313  112446313
73             N18S    c.53A>G       112446314  112446314
75            

#### Convert genomic coordinates to protein positions

In [44]:
print(revel_bayesdel_df[revel_bayesdel_df.isna().any(axis=1)])


Empty DataFrame
Columns: [chr, hg19_pos, grch38_pos, ref, alt, aaref, aaalt, REVEL, Ensembl_transcriptid, BayesDel_noAF, BayesDel_addAF]
Index: []


In [43]:
# Filter out suspect transcript IDs
revel_bayesdel_df = revel_bayesdel_df[revel_bayesdel_df['Ensembl_transcriptid'].str.contains("ENST00000392597|ENST00000351677", na=False)]


In [47]:
def annotatePP(df, transcript_id="ENST00000351677", batch_size=200, delay=0.2):
    server = "https://rest.ensembl.org"
    ext = "/vep/human/region"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    protein_positions = [None] * len(df)  # pre-allocate list

    # Split dataframe into batches
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start+batch_size]
        variants = [
            f"{row['chr']} {int(row['grch38_pos'])} {int(row['grch38_pos'])} {row['ref']}/{row['alt']}"
            for idx, row in batch.iterrows()
        ]
        payload = {"variants": variants}
        if transcript_id:
            payload["transcript_id"] = transcript_id

        try:
            time.sleep(delay)
            r = requests.post(server + ext, headers=headers, json=payload)
            r.raise_for_status()
            decoded = r.json()
        except Exception as e:
            print(f"Error querying VEP batch {start}-{start+batch_size}: {e}")
            continue

        # Extract protein positions for each variant in the batch
        for i, result in enumerate(decoded):
            consequences = result.get("transcript_consequences", [])
            protein_pos = None
            if transcript_id:
                for t in consequences:
                    if t["transcript_id"] == transcript_id:
                        protein_pos = t.get("protein_start")
                        break
            elif consequences:
                protein_pos = consequences[0].get("protein_start")
            protein_positions[start + i] = protein_pos

    df['protein_pos'] = protein_positions
    return df


In [48]:
revel_bayesdel_df = annotatePP(revel_bayesdel_df)


In [82]:
revel_bayesdel_df.head()

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid,BayesDel_noAF,BayesDel_addAF,protein_pos
0,12,112856917,112419113,T,A,M,K,0.294,ENST00000392597;...,0.66,0.66,1.0
1,12,112856917,112419113,T,C,M,T,0.315,ENST00000392597;...,0.66,0.66,1.0
2,12,112856917,112419113,T,G,M,R,0.202,ENST00000392597;...,0.66,0.66,1.0
3,12,112856918,112419114,G,A,M,I,0.222,ENST00000392597;...,0.66,0.66,1.0
4,12,112856918,112419114,G,C,M,I,0.222,ENST00000392597;...,0.66,0.66,1.0


In [92]:
revel_bayesdel_df.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel.csv", index=False)


In [93]:
revel_bayesdel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel.csv")
revel_bayesdel_df.head()

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid,BayesDel_noAF,BayesDel_addAF,protein_pos
0,12,112856917,112419113,T,A,M,K,0.294,ENST00000392597;...,0.66,0.66,1.0
1,12,112856917,112419113,T,C,M,T,0.315,ENST00000392597;...,0.66,0.66,1.0
2,12,112856917,112419113,T,G,M,R,0.202,ENST00000392597;...,0.66,0.66,1.0
3,12,112856918,112419114,G,A,M,I,0.222,ENST00000392597;...,0.66,0.66,1.0
4,12,112856918,112419114,G,C,M,I,0.222,ENST00000392597;...,0.66,0.66,1.0


In [70]:
checkClinVar.head()

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc,cDNA_desc,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,chr,codon,ref,alt
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,p.Met1Arg,c.2T>G,M1R,1,M,R,1,1,12,2,T,G
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,p.Thr2Ile,c.5C>T,T2I,2,T,I,0,0,12,5,C,T
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,p.Arg4Gly,c.10C>G,R4G,4,R,G,-1,-1,12,10,C,G
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,p.Arg4Gln,c.11G>A,R4Q,4,R,Q,-1,-1,12,11,G,A
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,p.Trp6Cys,c.18G>T,W6C,6,W,C,0,0,12,18,G,T


In [76]:
def annotatePP_2(df, transcript_id="ENST00000351677", batch_size=200, delay=0.2):
    server = "https://rest.ensembl.org"
    ext = "/vep/human/region"
    headers = {"Content-Type": "application/json", "Accept": "application/json"}

    protein_positions = [None] * len(df)  # pre-allocate list

    # Split dataframe into batches
    for start in range(0, len(df), batch_size):
        batch = df.iloc[start:start+batch_size]
        variants = [
            f"{int(row['GRCh38Chromosome'])} {int(row['GRCh38Location'])} {int(row['GRCh38Location'])} {row['ref']}/{row['alt']}"
            for idx, row in batch.iterrows()
        ]
        payload = {"variants": variants}
        if transcript_id:
            payload["transcript_id"] = transcript_id

        try:
            time.sleep(delay)
            r = requests.post(server + ext, headers=headers, json=payload)
            r.raise_for_status()
            decoded = r.json()
        except Exception as e:
            print(f"Error querying VEP batch {start}-{start+batch_size}: {e}")
            continue

        # Extract protein positions for each variant in the batch
        for i, result in enumerate(decoded):
            consequences = result.get("transcript_consequences", [])
            protein_pos = None
            if transcript_id:
                for t in consequences:
                    if t["transcript_id"] == transcript_id:
                        protein_pos = t.get("protein_start")
                        break
            elif consequences:
                protein_pos = consequences[0].get("protein_start")
            protein_positions[start + i] = protein_pos

    df['protein_pos'] = protein_positions
    return df


In [77]:
checkClinVar = annotatePP_2(checkClinVar)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['protein_pos'] = protein_positions


In [80]:
filteredC_2 = checkClinVar[checkClinVar["AApos"] != checkClinVar["protein_pos"]]


In [81]:
filteredC_2

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc,cDNA_desc,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,chr,codon,ref,alt,protein_pos


### Create datasets

#### Meta-predictor analysis

In [169]:
revel_bayesdel_df_agg["grch38_pos"].dtype

dtype('int64')

In [23]:
# Load PTPN11_REVEL_BayesDel
revel_bayesdel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel.csv")

# Take aggregate REVEL and BayesDel for multiple SNVs
revel_bayesdel_df['grch38_pos'] = pd.to_numeric(revel_bayesdel_df['grch38_pos'], errors='coerce').astype('Int64')
#revel_bayesdel_df_agg = revel_bayesdel_df.groupby(['aaref','aaalt','protein_pos'], as_index=False)[['REVEL','BayesDel_noAF','BayesDel_addAF']].mean()
revel_bayesdel_df_agg = (
    revel_bayesdel_df
        .groupby(['aaref', 'aaalt', 'protein_pos'], as_index=False)[
            ['REVEL', 'BayesDel_noAF', 'BayesDel_addAF']].mean()
        .rename(columns={
            'aaref': 'AAfrom',
            'aaalt': 'AAto',
            'protein_pos': 'AApos'
        })
)


revel_bayesdel_df_agg.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel_agg.csv", index=False)


In [None]:
"""NEED TO UPDATE THIS AS COLUMNS RENAMED AND ALS CHECK IF SHOULD MERGE ON AAPOS INSTEAD"""

# Merge mean scores on ClinVar
revel_bayesdel_df_agg['grch38_pos'] =revel_bayesdel_df_agg['grch38_pos'].astype(str)
df_stat_long = pd.merge(
    ClinVar_df,
    revel_bayesdel_df_agg,
    left_on=['GRCh38Location', 'AAfrom', 'AAto'], 
    right_on=['grch38_pos', 'aaref', 'aaalt'],
    how='left'  # keep all extended ClinVar missense columns
)

In [174]:
len(df_stat_long)

557

In [181]:
df_stat[df_stat.isna().any(axis=1)]


Unnamed: 0,GRCh38Location,GRCh37Location,Protein_change,Germline classification,REVEL,BayesDel_noAF,BayesDel_addAF,AAfrom,AAto,AApos


In [177]:
# Clean df_stat for analysis
#df_stat = df_stat.dropna(subset=['REVEL', 'Germline classification']) # Keep only rows where REVEL or ClinVar classification is not NaN
#df_stat['REVEL'] = pd.to_numeric(df_stat['REVEL'], errors='coerce') # Make sure REVEL is numeric
#df_stat['Germline classification'] = df_stat['Germline classification'].astype(str).str.strip()

# Select only the columns needed for statistical classification
columns_to_keep = [
    'GRCh38Location',
    'GRCh37Location',
    'Protein_change',
    'Germline classification',
    'REVEL',
    'BayesDel_noAF',
    'BayesDel_addAF',
    'AAfrom',
    'AAto',
    'AApos'
]

df_stat = df_stat_long[columns_to_keep]

# Quick check
print(df_stat.head())

  GRCh38Location GRCh37Location Protein_change Germline classification  REVEL  \
0      112419113      112856917            M1R    Likely pathogenic     0.202   
1      112419116      112856920            T2I  Pathogenic/Likel...     0.213   
2      112419121      112856925            R4G  Conflicting clas...     0.518   
3      112419122      112856926            R4Q    Likely pathogenic     0.393   
4      112446279      112884083            W6C  Uncertain signif...     0.920   

   BayesDel_noAF  BayesDel_addAF AAfrom AAto  AApos  
0       0.660000        0.660000      M    R      1  
1       0.010807        0.010807      T    I      2  
2       0.044882        0.044882      R    G      4  
3      -0.022043       -0.022043      R    Q      4  
4       0.512606        0.512606      W    C      6  


In [180]:
df_stat = df_stat.dropna()

In [182]:
# Save merged file
df_stat.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_ClinVar_REVEL_BayesDel.csv", index=False)


##### For validating meta predictors with NSEuronet

In [20]:
""" FDONE THIS ABOVE ALREADY
# Load PTPN11_REVEL_BayesDel
revel_bayesdel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel.csv")

# Take aggregate REVEL and BayesDel for multiple SNVs
revel_bayesdel_df_agg = revel_bayesdel_df.groupby(['grch38_pos','protein_pos','aaref','aaalt','Ensembl_transcriptid'], as_index=False)[['REVEL','BayesDel_noAF','BayesDel_addAF']].mean()

"""

In [21]:
len(revel_bayesdel_df_agg)


3569

In [161]:

# Merge mean scores on NSEuronet
df_statNSEuronet = pd.merge(
    NSEuronet_df,
    revel_bayesdel_df_agg,
    left_on=['pos', 'AAfrom', 'AAto'],
    right_on=['grch38_pos', 'aaref', 'aaalt'],
    how='left'  # keep all extended ClinVar missense columns
)


In [162]:
# Save 
df_statNSEuronet.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_NSEuronet_REVEL_BayesDel.csv", index=False)


##### For density of meta predictors across gnomAD variants

In [40]:
# Merge mean scores on ClinVar
df_statgnomAD = pd.merge(
    gnomAD_df,
    revel_bayesdel_df_agg,
    left_on=['Position', 'AAfrom', 'AAto'],
    right_on=['grch38_pos', 'aaref', 'aaalt'],
    how='left'  # keep all extended ClinVar missense columns
)

In [41]:
df_statgnomAD.head()

Unnamed: 0,gnomAD ID,Chromosome,Position,rsIDs,Reference,Alternate,Source,Filters - exomes,Filters - genomes,Transcript,HGVS Consequence,Protein Consequence,Transcript Consequence,VEP Annotation,ClinVar Germline Classification,ClinVar Variation ID,Flags,Allele Count,Allele Number,Allele Frequency,Homozygote Count,Hemizygote Count,Filters - joint,GroupMax FAF group,GroupMax FAF frequency,cadd,revel_max,spliceai_ds_max,pangolin_largest_ds,phylop,sift_max,polyphen_max,Allele Count African/African American,Allele Number African/African American,Homozygote Count African/African American,Hemizygote Count African/African American,Allele Count Admixed American,Allele Number Admixed American,Homozygote Count Admixed American,Hemizygote Count Admixed American,Allele Count Ashkenazi Jewish,Allele Number Ashkenazi Jewish,Homozygote Count Ashkenazi Jewish,Hemizygote Count Ashkenazi Jewish,Allele Count East Asian,Allele Number East Asian,Homozygote Count East Asian,Hemizygote Count East Asian,Allele Count European (Finnish),Allele Number European (Finnish),Homozygote Count European (Finnish),Hemizygote Count European (Finnish),Allele Count Middle Eastern,Allele Number Middle Eastern,Homozygote Count Middle Eastern,Hemizygote Count Middle Eastern,Allele Count European (non-Finnish),Allele Number European (non-Finnish),Homozygote Count European (non-Finnish),Hemizygote Count European (non-Finnish),Allele Count Amish,Allele Number Amish,Homozygote Count Amish,Hemizygote Count Amish,Allele Count South Asian,Allele Number South Asian,Homozygote Count South Asian,Hemizygote Count South Asian,Allele Count Remaining,Allele Number Remaining,Homozygote Count Remaining,Hemizygote Count Remaining,Gene,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,grch38_pos,aaref,aaalt,REVEL,BayesDel_noAF,BayesDel_addAF
0,12-112419116-C-T,12,112419116,rs267606990,C,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Thr2Ile,p.Thr2Ile,c.5C>T,missense_variant,Pathogenic/Likel...,13349.0,,4,1528266,2.617345e-06,0,0,PASS,nfe,7e-07,31.0,0.213,0.01,-0.02,8.35,0.03,0.068,0,70690,0,0,0,49938,0,0,1,28078,0,0,0,39198,0,0,0,53300,0,0,0,4318,0,0,3,1140260,0,0,0,912,0,0,0,82348,0,0,0,59224,0,0,PTPN11,T2I,2,T,I,0,0,112419116.0,T,I,0.213,0.010807,0.010807
1,12-112419121-C-G,12,112419121,rs886041517,C,G,gnomAD Exomes,PASS,,ENST00000351677.7,p.Arg4Gly,p.Arg4Gly,c.10C>G,missense_variant,Conflicting clas...,280283.0,,2,1526086,1.310542e-06,0,0,PASS,nfe,2.9e-07,33.0,0.518,0.05,-0.05,8.35,0.01,0.289,0,70428,0,0,0,49746,0,0,0,27996,0,0,0,38958,0,0,0,53024,0,0,0,4312,0,0,2,1139370,0,0,0,910,0,0,0,82190,0,0,0,59152,0,0,PTPN11,R4G,4,R,G,-1,-1,112419121.0,R,G,0.518,0.044882,0.044882
2,12-112419122-G-A,12,112419122,,G,A,gnomAD Exomes,PASS,,ENST00000351677.7,p.Arg4Gln,p.Arg4Gln,c.11G>A,missense_variant,Likely pathogenic,2729546.0,,1,1523540,6.563661e-07,0,0,PASS,,,28.4,0.393,0.0,0.0,8.25,0.01,0.344,0,70172,0,0,0,49524,0,0,0,27924,0,0,0,38626,0,0,0,52776,0,0,0,4304,0,0,1,1138308,0,0,0,912,0,0,0,81938,0,0,0,59056,0,0,PTPN11,R4Q,4,R,Q,-1,-1,112419122.0,R,Q,0.393,-0.022043,-0.022043
3,12-112446278-G-T,12,112446278,,G,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Trp6Leu,p.Trp6Leu,c.17G>T,missense_variant,,,,1,1613914,6.196117e-07,0,0,PASS,,,32.0,0.897,0.0,0.0,6.27,0.0,0.999,0,75018,0,0,0,59984,0,0,0,29602,0,0,0,44880,0,0,0,64026,0,0,0,6048,0,0,1,1179880,0,0,0,912,0,0,0,91074,0,0,0,62490,0,0,PTPN11,W6L,6,W,L,0,0,112446278.0,W,L,0.897,0.550287,0.550287
4,12-112446279-G-T,12,112446279,rs79203122,G,T,gnomAD Exomes,PASS,,ENST00000351677.7,p.Trp6Cys,p.Trp6Cys,c.18G>T,missense_variant,Uncertain signif...,2785856.0,,3,1613968,1.858773e-06,0,0,PASS,nfe,2.8e-07,32.0,0.92,0.0,-0.01,6.27,0.02,0.987,0,75030,0,0,0,59986,0,0,0,29604,0,0,0,44876,0,0,0,64030,0,0,0,6048,0,0,2,1179906,0,0,0,912,0,0,0,91082,0,0,1,62494,0,0,PTPN11,W6C,6,W,C,0,0,112446279.0,W,C,0.92,0.512606,0.512606


In [42]:
# Save 
df_statgnomAD.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_gnomAD_REVEL_BayesDel.csv", index=False)
