In [3]:
import pandas as pd
import os
from Bio import SeqIO
from io import StringIO
import requests
import json
import re 

import seaborn as sns

In [123]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', 20)
pd.set_option('display.max_rows', None) 

In [5]:
# Set gene
gene = 'PTPN11'

# Dictionary of Uniprot IDs for each gene
uniprot_ids = {
    "PTPN11": "Q06124",    
}

#### Processing functions

In [6]:
# Universal tools for dealing with proteins

aaconv = {'CYS': 'C', 'ASP': 'D', 'SER': 'S', 'GLN': 'Q', 'LYS': 'K',
     'ILE': 'I', 'PRO': 'P', 'THR': 'T', 'PHE': 'F', 'ASN': 'N', 
     'GLY': 'G', 'HIS': 'H', 'LEU': 'L', 'ARG': 'R', 'TRP': 'W', 
     'ALA': 'A', 'VAL':'V', 'GLU': 'E', 'TYR': 'Y', 'MET': 'M',
     'H1S': 'H', 'H2S': 'H'}

aacharge = {
         'C': 0, 'D': -1, 'S': 0, 'Q': 0, 'K': 1,
         'I': 0, 'P': 0, 'T': 0, 'F': 0, 'N': 0, 
         'G': 0, 'H': 0, 'L': 0, 'R': 1, 'W': 0, 
         'A': 0, 'V':0, 'E': -1, 'Y': 0, 'M': 0 }

aachargeHP = {
         'C': 0, 'D': -1, 'S': 0, 'Q': 0, 'K': 1,
         'I': 0, 'P': 0, 'T': 0, 'F': 0, 'N': 0, 
         'G': 0, 'H': 1, 'L': 0, 'R': 1, 'W': 0, 
         'A': 0, 'V':0, 'E': -1, 'Y': 0, 'M': 0 }

In [82]:
# Functions to test tables processing for any unexpected results

def threetoone(x):
    if len(x) % 3 != 0: 
        raise ValueError('Input length should be a multiple of three')

    y = ''
    for i in range(len(x) // 3):
            y += aaconv[x[3 * i : 3 * i + 3]]
    return y

def testTable(table,mutation="Protein_desc",threeLetter=False):
    mutation = table[mutation].apply(lambda x: str(x).split(".")[-1] if pd.notna(x) else "")
    locations=mutation.apply(lambda x: int(re.findall('\d+', x)[0]))
    if threeLetter:
        fromAA = mutation.apply(lambda x: aaconv[re.split('\d+',x)[0].upper()])
        toAA = mutation.apply(lambda x: aaconv[re.split('\d+',x)[1].upper()])
    else:
        fromAA = mutation.apply(lambda x: re.findall('[A-Z]+', x.upper())[0])
        toAA = mutation.apply(lambda x: re.findall('[A-Z]+', x.upper())[1])
    problem_idx = set()
    problem_idx.update(locations[locations > protein_length].index.tolist())
    problem_idx.update(fromAA[~fromAA.isin(aacharge.keys())].index.tolist())
    problem_idx.update(toAA[~toAA.isin(aacharge.keys())].index.tolist())

    return problem_idx

In [126]:
# Function to filter table for missense mutations
def filterTable_missense(df, gene, mutation_col="Protein_desc", gene_col="Gene", verbose=True, threeLetter=False):

    # Handle gene column logic
    if gene_col in df.columns:
        df = df.loc[df[gene_col].astype(str).str.contains(gene, na=False)]
    else:
        df.loc[:, gene_col] = gene
        
    # Safe extraction of protein change
    mutation = df[mutation_col].apply(lambda x: str(x).split(".")[-1] if pd.notna(x) else "")

    # Extract positions (integer)
    locations = mutation.apply(lambda x: int(re.findall(r'\d+', x)[0]) if re.findall(r'\d+', x) else -1)

    # Extract AA from/to
    if threeLetter:
        fromAA = mutation.apply(lambda x: aaconv[re.split(r'\d+', x)[0].upper()] if re.split(r'\d+', x)[0].upper() in aaconv else "")
        toAA   = mutation.apply(lambda x: aaconv[re.split(r'\d+', x)[1].upper()] if len(re.split(r'\d+', x)) > 1 and re.split(r'\d+', x)[1].upper() in aaconv else "")
    else:
        fromAA = mutation.apply(lambda x: re.findall(r'[A-Z]+', x.upper())[0] if re.findall(r'[A-Z]+', x.upper()) else "")
        toAA   = mutation.apply(lambda x: re.findall(r'[A-Z]+', x.upper())[1] if len(re.findall(r'[A-Z]+', x.upper())) > 1 else "")

    # Identify problematic rows
    problem_idx = set()
    problem_idx.update(locations[locations > protein_length].index.tolist())
    problem_idx.update(fromAA[~fromAA.isin(aacharge.keys())].index.tolist())
    problem_idx.update(toAA[~toAA.isin(aacharge.keys())].index.tolist())

    missense_df = df.drop(index=problem_idx, errors='ignore')

    # Report mismatched rows
    notmissense_df = df.loc[df.index.isin(problem_idx)]
    if verbose:
        print(notmissense_df)
    else:
        check_notmissense_rows = notmissense_df.loc[notmissense_df[mutation_col].notna()]
        print(check_notmissense_rows)
    
    return missense_df


In [135]:
# Function to extend tables

def extendTable(table,mutation="Protein_desc",threeLetter=False, DCharge=False):
    table["Protein_change"] = table[mutation].astype(str).apply(lambda x: x.split(".")[-1])
    table["AApos"] = table["Protein_change"].apply(lambda x:int( re.findall('\d+', x)[0])) # extract amino acid position
    if threeLetter:
        table["Protein_change"] = table["Protein_change"].str.upper()
        table["AAfrom"] = table["Protein_change"].apply(lambda x: aaconv[re.split('\d+',x)[0].upper()])
        table["AAto"] = table["Protein_change"].apply(lambda x: aaconv[re.split('\d+',x)[1].upper()])
        table["Protein_change"] = table["AAfrom"] + table["AApos"].astype(str) + table["AAto"]
    else:
        table["AAfrom"] = table["Protein_change"].apply(lambda x: re.findall('[A-Z]+', x.upper())[0])
        table["AAto"] = table["Protein_change"].apply(lambda x: re.findall('[A-Z]+', x.upper())[1])
    if DCharge:
        table["DCharge"] = table["Protein_change"].apply(lambda x: aacharge[re.findall('[A-Z]+', x.upper())[1]] - aacharge[re.findall('[A-Z]+', x.upper())[0]]) 
        table["DChargeHP"] = table["Protein_change"].apply(lambda x: aachargeHP[re.findall('[A-Z]+', x.upper())[1]] - aachargeHP[re.findall('[A-Z]+', x.upper())[0]]) 
    return table

### Read in data

#### Uniprot

In [11]:
# Fetch sequence length, domains and sites from UniProt

def get_uniprot_features(gene):
    uniprot_id = uniprot_ids[gene]
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
    response = requests.get(url)
    response.raise_for_status()
    data = response.json()

    sequence_length = len(data['sequence']['value'])
    
    domains_list = []
    sites_list = []
    
    # Assign colors for each feature type
    feature_colours = {
        'Domain': "#66c2a5",
        'Region': "#fc8d62",
        'Active site': "#e78ac3",
        'Binding site': "#a6d854"
    }
    
    for feature in data.get('features', []):
        ftype = feature['type']
        start = feature['location']['start']['value']
        end = feature['location']['end']['value']
        name = feature.get('description', ftype)
        
        # Collect domains
        if ftype in ['Domain', 'Region']:
            domains_list.append([name, start, end, feature_colours[ftype]])
        # Collect sites
        elif ftype in ['Active site', 'Binding site']:
            sites_list.append([name, start, end, feature_colours[ftype]])
    
    # Assign distinct colors for domains
    palette = sns.color_palette("Set2", n_colors=len(domains_list))
    for i, domain in enumerate(domains_list):
        domain[3] = palette[i]
    
    return sequence_length, domains_list, sites_list

In [13]:
protein_length, domains, sites = get_uniprot_features(gene)

print("Gene:", gene)

print("Sequence Length:", protein_length)

print("Domains / regions:")
for domain in domains:
    print (domain)

print("\nActive / binding sites:")
for site in sites:
    print (site)

# Save features dictionaries
all_features = {
    "domains": domains,
    "sites": sites
}

with open(f"{gene}_features.json", "w") as f:
    json.dump(all_features, f, indent=2)

Gene: PTPN11
Sequence Length: 593
Domains / regions:
['SH2 1', 6, 102, (0.4, 0.7607843137254902, 0.6470588235294118)]
['SH2 2', 112, 216, (0.9882352941176471, 0.5529411764705883, 0.3843137254901961)]
['Tyrosine-protein phosphatase', 247, 517, (0.5529411764705883, 0.6274509803921569, 0.796078431372549)]
['Disordered', 548, 571, (0.9058823529411765, 0.5411764705882353, 0.7647058823529411)]

Active / binding sites:
['Phosphocysteine intermediate', 459, 459, '#e78ac3']
['', 425, 425, '#a6d854']
['', 459, 465, '#a6d854']
['', 506, 506, '#a6d854']


#### NSEuronet

In [106]:
# Manually load the NSEuronetData.csv (for all RAS genes)

df1 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\NSEuroNetData.csv", sep=';', header=None)
df1_headers = ["Gene", "cDNA", "Protein_desc", "Disease"] #headers to add to df
df1.columns = df1_headers

print(df1.head())

     Gene       cDNA Protein_desc          Disease
0    KRAS    c.65A>G       p.Q22R  Noonan syndrome
1  PTPN11   c.922A>G      p.N308D  Noonan syndrome
2  PTPN11   c.184T>G       p.Y62D  Noonan syndrome
3  PTPN11   c.922A>G      p.N308D  Noonan syndrome
4  PTPN11  c.1232C>T      p.T411M  Noonan syndrome


In [108]:
# Filter NSEuronet df for missense variants
df1_missense = filterTable_missense(df1, gene, mutation_col="Protein_desc", gene_col="Gene", verbose=True, threeLetter=False)
# prints notmissense df

        Gene                 cDNA Protein_desc              Disease
321   PTPN11         c.181_183del     p.D61del      Noonan syndrome
940   PTPN11         c.179_181del     p.G60del      Noonan syndrome
2276  PTPN11                  NaN          NaN  LEOPARD syndrome...
3618  PTPN11         c.768_770dup    p.Q257dup      Noonan syndrome
3621  PTPN11         c.768_770dup    p.Q257dup      Noonan syndrome
4057  PTPN11                  NaN          NaN      Noonan syndrome
4132  PTPN11            c.1837C>G      p.L613V      Noonan syndrome
4386  PTPN11             c.255C>T       p.H85=      Noonan syndrome
4387  PTPN11  c.525+121_525+13...          NaN      Noonan syndrome
4388  PTPN11  c.525+125_525+13...          NaN      Noonan syndrome
4389  PTPN11  c.525+129_525+13...          NaN      Noonan syndrome
4390  PTPN11          c.854-21C>T          NaN      Noonan syndrome
4393  PTPN11  c.525+129_525+13...          NaN      Noonan syndrome
4394  PTPN11          c.854-21C>T          NaN  

In [117]:
# Extend NSEuronet df to create Protein_change, AA and charge columns
NSEuronet_df = extendTable(df1_missense,mutation="Protein_desc",threeLetter=False,DCharge=True)
NSEuronet_df

Unnamed: 0,Gene,cDNA,Protein_desc,Disease,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP
1,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1
2,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1
3,PTPN11,c.922A>G,p.N308D,Noonan syndrome,N308D,308,N,D,-1,-1
4,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0
5,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0
6,PTPN11,c.1232C>T,p.T411M,Noonan syndrome,T411M,411,T,M,0,0
7,PTPN11,c.1529A>G,p.Q510R,NF1-Noonan syndrome,Q510R,510,Q,R,1,1
8,PTPN11,c.179G>C,p.G60A,Noonan syndrome,G60A,60,G,A,0,0
9,PTPN11,c.182A>G,p.D61G,Noonan syndrome,D61G,61,D,G,1,1
10,PTPN11,c.184T>G,p.Y62D,Noonan syndrome,Y62D,62,Y,D,-1,-1


#### ClinVar

NCBI E-utilities rate is limited to 3 requests per second (without an API key). Can request an NCBI API key to increase limit to ~10 per second.

Try again teXML release (gzipped) and parse for gene of interest

ClinVar maintains a complete set of variant data on an FTP server. Weekly update cycle but only the realease on the 1st Thursday of the month is archived. Download this below. https://pmc.ncbi.nlm.nih.gov/tools/ftp/

In [43]:
# Manually load the clinvar_result.txt download (for single gene e.g. PTPN11)

df2 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\clinvar_result.txt", sep="\t", low_memory=False)
print(df2.head(30))


                   Name              Gene(s) Protein change  \
0   GRCh38/hg38 12p1...  LOC130008261|LOC...            NaN   
1   GRCh38/hg38 12q2...  HECTD4|LOC130008...            NaN   
2   GRCh38/hg38 12q2...  HECTD4|LOC130008...            NaN   
3   GRCh38/hg38 12q2...          PTPN11|RPL6            NaN   
4   NM_002834.3(PTPN...          PTPN11|RPL6            NaN   
5   NC_000012.12:g.1...          PTPN11|RPL6            NaN   
6   NM_002834.3(PTPN...          PTPN11|RPL6            NaN   
7   NM_002834.3(PTPN...          PTPN11|RPL6            NaN   
8   NM_002834.5(PTPN...               PTPN11            NaN   
9   NM_002834.3(PTPN...               PTPN11            NaN   
10  NM_002834.3(PTPN...               PTPN11            NaN   
11  NM_002834.3(PTPN...               PTPN11            NaN   
12  NM_002834.5(PTPN...               PTPN11            NaN   
13  NM_002834.5(PTPN...               PTPN11            NaN   
14  NM_002834.5(PTPN...               PTPN11           

In [127]:
# Filter ClinVar df for missense variants

# But first extract Protein_desc from 'Name' column
df2['Protein_desc'] = (df2['Name'].str.extract(r'(p\.[A-Za-z0-9]+)').iloc[:, 0].str.replace(')', '', regex=False).str.strip())

with pd.option_context('display.max_colwidth', 15):
    df2_missense = filterTable_missense(df2, gene, mutation_col="Protein_desc",  gene_col="Gene(s)", verbose=False, threeLetter=True)


                Name Gene(s)  Protein change    Condition(s)     Accession  \
26    NM_002834.5...  PTPN11             NaN  Cardiovascu...  VCV003427731   
27    NM_002834.5...  PTPN11             NaN  RASopathy|C...  VCV001160776   
28    NM_002834.5...  PTPN11             NaN  Cardiovascu...  VCV003940399   
29    NM_002834.5...  PTPN11             NaN  RASopathy|C...  VCV001530049   
32    NM_002834.5...  PTPN11             NaN  Cardiovascu...  VCV003784946   
56    NM_002834.5...  PTPN11             NaN       RASopathy  VCV003663504   
57    NM_002834.5...  PTPN11            W6fs       RASopathy  VCV001435208   
67    NM_002834.5...  PTPN11             NaN  Cardiovascu...  VCV000736408   
68    NM_002834.5...  PTPN11             NaN  Cardiovascu...  VCV003940411   
70    NM_002834.5...  PTPN11             NaN       RASopathy  VCV002861237   
71    NM_002834.5...  PTPN11             NaN       RASopathy  VCV000044608   
74    NM_002834.5...  PTPN11             NaN  Cardiovascu...  VC

In [128]:
df2_missense

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,p.Met1Arg
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,p.Thr2Ile
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,p.Arg4Gly
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,p.Arg4Gln
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,p.Trp6Cys
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,p.Trp6Cys
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,p.Asn10Tyr
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,p.Asn10His
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,p.Asn10Asp
63,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,p.Asn10Thr


In [129]:
print(df2_missense["Molecular consequence"].unique())


['missense variant|initiator_codon_variant' 'missense variant']


In [132]:
with pd.option_context('display.max_colwidth', None):
    print(df2_missense[df2_missense["Molecular consequence"].str.lower() != "missense variant"])


                                      Name Gene(s) Protein change  \
24  NM_002834.5(PTPN11):c.2T>G (p.Met1Arg)  PTPN11            M1R   

                                                                                Condition(s)  \
24  Juvenile myelomonocytic leukemia|Noonan syndrome 1|Metachondromatosis|LEOPARD syndrome 1   

       Accession  GRCh37Chromosome GRCh37Location  GRCh38Chromosome  \
24  VCV003574257              12.0      112856917              12.0   

   GRCh38Location  VariationID  AlleleID(s) dbSNP ID  \
24      112419113      3574257      3707172      NaN   

                Canonical SPDI               Variant type  \
24  NC_000012.12:112419112:T:G  single nucleotide variant   

                       Molecular consequence Germline classification  \
24  missense variant|initiator_codon_variant       Likely pathogenic   

   Germline date last evaluated               Germline review status  \
24                 Jun 20, 2024  criteria provided, single submitter   


In [136]:
# Extend ClinVar df to create Protein_change, AA and charge columns
ClinVar_df = extendTable(df2_missense,mutation="Protein_desc",threeLetter=True,DCharge=True)
ClinVar_df

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,p.Met1Arg,M1R,1,M,R,1,1
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,p.Thr2Ile,T2I,2,T,I,0,0
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,p.Arg4Gly,R4G,4,R,G,-1,-1
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,p.Arg4Gln,R4Q,4,R,Q,-1,-1
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,p.Trp6Cys,W6C,6,W,C,0,0
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,p.Trp6Cys,W6C,6,W,C,0,0
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,p.Asn10Tyr,N10Y,10,N,Y,0,0
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,p.Asn10His,N10H,10,N,H,0,1
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,p.Asn10Asp,N10D,10,N,D,-1,-1
63,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,p.Asn10Thr,N10T,10,N,T,0,0


#### Meta predictors

## ?? get chromosone range from uniprot but it only has GRCH38 >>

In [148]:
def get_chr_range(df, gene, gene_col="Gene(s)", loc_col="GRCh38Location", chrom_col="GRCh38Chromosome"):
    # Filter for specified gene
    gene_df = df[df[gene_col].astype(str).str.contains(gene, na=False)].copy()

    # Convert location column to numeric (ignore errors)
    gene_df[loc_col] = pd.to_numeric(gene_df[loc_col], errors='coerce')

    # Drop missing positions
    #gene_df = gene_df.dropna(subset=[loc_col])

    # Find chromosome, min, and max positions
    chrom = gene_df[chrom_col].dropna().unique()[0]
    min_pos = gene_df[loc_col].min()
    max_pos = gene_df[loc_col].max()

    print(f" Gene: {gene}")
    print(f" Chromosome: {chrom}")
    print(f" Position range: {int(min_pos)} - {int(max_pos)}")

    return chrom, min_pos, max_pos


##### REVEL

Downloaded from https://sites.google.com/site/revelgenomics/downloads

Try again to curl from dfNSFP

In [None]:
# Manually download the REVEL data (for all genes)

# Read just the first 10 rows (6GB REVEL file)
df3 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\revel-v1.3_all_chromosomes\revel_with_transcript_ids", nrows=10)
df3

In [150]:
chrom, min_pos, max_pos = get_chr_range(df2, gene, gene_col="Gene(s)", loc_col="GRCh38Location", chrom_col="GRCh38Chromosome")


 Gene: PTPN11
 Chromosome: 12.0
 Position range: 112418795 - 112509811


In [235]:
revel_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\revel-v1.3_all_chromosomes\revel_with_transcript_ids"

chunksize = 1_000_000
ptpn11_revel_chunks = []

for chunk in pd.read_csv(revel_file, chunksize=chunksize, low_memory=False):
    # Convert positions in revel_file to numeric
    chunk['grch38_pos'] = pd.to_numeric(chunk['grch38_pos'], errors='coerce')
    
    # Only keep PTPN11 chromosome
    chunk = chunk[chunk['chr'] == chrom]
    
    # Keep only positions within PTPN11 range
    filtered_chunk = chunk[
        (chunk['grch38_pos'] >= min_pos) &
        (chunk['grch38_pos'] <= max_pos)
    ]
    
    if not filtered_chunk.empty:
        ptpn11_revel_chunks.append(filtered_chunk)
    
    # Early stop: REVEL is sorted by grch38_pos
    if chunk['grch38_pos'].max() > max_pos:
        break

# Concatenate all filtered chunks
if ptpn11_revel_chunks:
    ptpn11_revel = pd.concat(ptpn11_revel_chunks)
    ptpn11_revel.to_csv( r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv", index=False)


##### BayesDel

Downloaded from https://drive.google.com/drive/folders/1K4LI6ZSsUGBhHoChUtegC8bgCt7hbQlA (2017! version)

Need to install and run VICTOR locally (Linux)


In [152]:
chrom37, min_pos37, max_pos37 = get_chr_range(df2, gene, gene_col="Gene(s)", loc_col="GRCh37Location", chrom_col="GRCh37Chromosome")


 Gene: PTPN11
 Chromosome: 12.0
 Position range: 112856599 - 112947615


In [None]:
# Unzipped file path for gene specific chrom and GRCH37 location
bayesdel_noAF_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\BayesDel_170824_noAF\BayesDel_170824_noAF\BayesDel_170824_noAF_chr12\BayesDel_170824_noAF_chr12"

with open(bayesdel_noAF_file, "r") as f:
    for _ in range(10):
        print(f.readline().strip())


In [153]:
def load_bayesdel_region(file_path, min_pos, max_pos, output_csv=None, chunksize=100_000, columns=None):
    chunks = []
    if columns is None:
        columns = ["Chr", "Start", "Ref", "Alt", "Score"]

    for chunk in pd.read_csv(file_path, sep="\t", names=columns, chunksize=chunksize, comment="#"):
        chunk["Start"] = pd.to_numeric(chunk["Start"], errors="coerce")
        # Filter by genomic position
        filtered = chunk[(chunk["Start"] >= min_pos) & (chunk["Start"] <= max_pos)]

        if not filtered.empty:
            chunks.append(filtered)

    if chunks:
        df_region = pd.concat(chunks, ignore_index=True)

    # Save to CSV 
    if output_csv:
        df_region.to_csv(output_csv, index=False)
        
    return df_region





In [154]:
bayesdel_noAF_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\BayesDel_170824_noAF\BayesDel_170824_noAF\BayesDel_170824_noAF_chr12\BayesDel_170824_noAF_chr12"
bayesdel_addAF_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\BayesDel_170824_noAF\BayesDel_170824_noAF\BayesDel_170824_noAF_chr12\BayesDel_170824_noAF_chr12"

# Load noAF region and save output
PTPN11_bayesdel_noAF = load_bayesdel_region(bayesdel_noAF_file, min_pos37, max_pos37, output_csv=r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_noAF.csv")

# Load addAF region and save output
PTPN11_bayesdel_addAF = load_bayesdel_region(bayesdel_addAF_file, min_pos37, max_pos37, output_csv=r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_addAF.csv")

#print("\nnoAF columns:", df_noAF.columns.tolist())
#print("\naddAF columns:", df_addAF.columns.tolist())

#### Merge Revel & BayesDel

In [269]:
print(revel_df["Ensembl_transcriptid"].unique())


['ENST00000392597;ENST00000351677;ENST00000392596' 'ENST00000530818'
 'ENST00000392596' 'ENST00000392597;ENST00000351677' 'ENST00000351677'
 'ENST00000392597']


In [270]:
# Load PTPN11_REVEL and BayesDel
revel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv")
bayesdel_noAF_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_noAF.csv")
bayesdel_addAF_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_BayesDel_addAF.csv")

# Merge PTPN11_REVEL scores on ClinVar
bayesdel_noAF_df = bayesdel_noAF_df.rename(columns={"Score": "BayesDel_noAF"})
bayesdel_addAF_df = bayesdel_addAF_df.rename(columns={"Score": "BayesDel_addAF"})

revel_bayesdel_noAF_df = pd.merge(
    revel_df,
    bayesdel_noAF_df,
    left_on=['hg19_pos', 'ref', 'alt'],
    right_on=['Start', 'Ref', 'Alt'],
    how='left',
)

revel_bayesdel_noAF_df = revel_bayesdel_noAF_df.drop(columns=['Chr','Start', 'Ref', 'Alt'])

revel_bayesdel_df = pd.merge(
    revel_bayesdel_noAF_df,
    bayesdel_addAF_df,
    left_on=['hg19_pos', 'ref', 'alt'],
    right_on=['Start', 'Ref', 'Alt'],
    how='left',
)

revel_bayesdel_df = revel_bayesdel_df.drop(columns=['Chr','Start', 'Ref', 'Alt'])
revel_bayesdel_df['grch38_pos'] = pd.to_numeric(revel_bayesdel_df['grch38_pos'], errors='coerce').astype('Int64')



In [272]:
revel_bayesdel_df.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel.csv", index=False)


## Look at PTPN11 DS notebook for mapping genomic coordinates

In [None]:
"""
from pyensembl import EnsemblRelease

data = EnsemblRelease(77)  # GRCh38
data.download()  # downloads gene/transcript info if not already present
data.index()     # indexes data for lookup
# Get all transcripts for PTPN11
ptpn11_transcripts = data.transcripts_by_gene_name("PTPN11")

# Pick the first one (canonical) for testing
transcript = ptpn11_transcripts[0]

print(f"Transcript ID: {transcript.id}")"""

### Create datasets

#### Meta-predictor analysis

In [256]:
# Load PTPN11_REVEL_BayesDel
#revel_bayesdel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL_BayesDel.csv")

# Take aggregate REVEL and BayesDel for multiple SNVs
revel_bayesdel_df_agg = revel_bayesdel_df.groupby(['grch38_pos','aaref','aaalt'], as_index=False)[['REVEL','BayesDel_noAF','BayesDel_addAF']].mean()

# Merge mean scores on ClinVar
df_stat_long = pd.merge(
    df2_missense,
    revel_bayesdel_df_agg,
    left_on=['GRCh38Location', 'AAfrom', 'AAto'],
    right_on=['grch38_pos', 'aaref', 'aaalt'],
    how='left'  # keep all extended ClinVar missense columns
)


In [255]:
df_stat_long

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,grch38_pos,aaref,aaalt,REVEL,BayesDel_noAF,BayesDel_addAF
0,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113.0,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,p.Met1Arg,M1R,1,M,R,1,1,112419113.0,M,R,0.202,0.66,0.66
1,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116.0,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,p.Thr2Ile,T2I,2,T,I,0,0,112419116.0,T,I,0.213,0.010807,0.010807
2,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121.0,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,p.Arg4Gly,R4G,4,R,G,-1,-1,112419121.0,R,G,0.518,0.044882,0.044882
3,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122.0,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,p.Arg4Gln,R4Q,4,R,Q,-1,-1,112419122.0,R,Q,0.393,-0.022043,-0.022043
4,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279.0,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,p.Trp6Cys,W6C,6,W,C,0,0,112446279.0,W,C,0.92,0.512606,0.512606
5,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279.0,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,p.Trp6Cys,W6C,6,W,C,0,0,112446279.0,W,C,0.92,0.512606,0.512606
6,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289.0,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,p.Asn10Tyr,N10Y,10,N,Y,0,0,112446289.0,N,Y,0.703,0.1565,0.1565
7,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289.0,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,p.Asn10His,N10H,10,N,H,0,1,112446289.0,N,H,0.436,-0.07798,-0.07798
8,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289.0,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,p.Asn10Asp,N10D,10,N,D,-1,-1,112446289.0,N,D,0.372,-0.184288,-0.184288
9,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290.0,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,p.Asn10Thr,N10T,10,N,T,0,0,112446290.0,N,T,0.471,-0.082037,-0.082037


In [225]:
df_stat_long

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,Protein_desc,Protein_change,AApos,AAfrom,AAto,DCharge,DChargeHP,grch38_pos,aaref,aaalt,REVEL,BayesDel_noAF,BayesDel_addAF
0,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113.0,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,p.Met1Arg,M1R,1,M,R,1,1,112419113.0,M,R,0.202,0.66,0.66
1,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116.0,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,p.Thr2Ile,T2I,2,T,I,0,0,112419116.0,T,I,0.213,0.010807,0.010807
2,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121.0,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,p.Arg4Gly,R4G,4,R,G,-1,-1,112419121.0,R,G,0.518,0.044882,0.044882
3,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122.0,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,p.Arg4Gln,R4Q,4,R,Q,-1,-1,112419122.0,R,Q,0.393,-0.022043,-0.022043
4,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279.0,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,p.Trp6Cys,W6C,6,W,C,0,0,112446279.0,W,C,0.92,0.512606,0.512606
5,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279.0,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,p.Trp6Cys,W6C,6,W,C,0,0,112446279.0,W,C,0.92,0.512606,0.512606
6,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289.0,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,p.Asn10Tyr,N10Y,10,N,Y,0,0,112446289.0,N,Y,0.703,0.1565,0.1565
7,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289.0,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,p.Asn10His,N10H,10,N,H,0,1,112446289.0,N,H,0.436,-0.07798,-0.07798
8,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289.0,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,p.Asn10Asp,N10D,10,N,D,-1,-1,112446289.0,N,D,0.372,-0.184288,-0.184288
9,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290.0,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,p.Asn10Thr,N10T,10,N,T,0,0,112446290.0,N,T,0.471,-0.082037,-0.082037


In [233]:
# Clean df_stat for analysis
#df_stat = df_stat.dropna(subset=['REVEL', 'Germline classification']) # Keep only rows where REVEL or ClinVar classification is not NaN
#df_stat['REVEL'] = pd.to_numeric(df_stat['REVEL'], errors='coerce') # Make sure REVEL is numeric
#df_stat['Germline classification'] = df_stat['Germline classification'].astype(str).str.strip()

# Select only the columns needed for statistical classification
columns_to_keep = [
    'GRCh38Location',
    'GRCh37Location',
    'Protein_change',
    'Germline classification',
    'REVEL',
    'BayesDel_noAF',
    'BayesDel_addAF',
    'AAfrom',
    'AAto',
    'AApos'
]

df_stat = df_stat[columns_to_keep]

# Quick check
print(df_stat.head())

   GRCh38Location GRCh37Location Protein_change Germline classification  \
0       112419113      112856917            M1R    Likely pathogenic      
1       112419116      112856920            T2I  Pathogenic/Likel...      
2       112419121      112856925            R4G  Conflicting clas...      
3       112419122      112856926            R4Q    Likely pathogenic      
4       112446279      112884083            W6C  Uncertain signif...      

   REVEL  BayesDel_noAF  BayesDel_addAF AAfrom AAto  AApos  
0  0.202       0.660000        0.660000      M    R      1  
1  0.213       0.010807        0.010807      T    I      2  
2  0.518       0.044882        0.044882      R    G      4  
3  0.393      -0.022043       -0.022043      R    Q      4  
4  0.920       0.512606        0.512606      W    C      6  


In [234]:
# Save merged file
df_stat.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_ClinVar_REVEL_BayesDel.csv", index=False)


##### Validate meta predictors with NSEuronet

In [None]:
# cDNA/protein --> genome as works even if multiple codons map to same AA change

