In [2]:
import pandas as pd
import os
from Bio import PDB
import requests


In [11]:
# Notebook-wide pandas display settings: no limit on the number of columns or
# rows shown, and a column width capped at 20 characters.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_colwidth', 20),
    ('display.max_rows', None),
):
    pd.set_option(option_name, option_value)

In [4]:
# Gene symbol analysed in this notebook; reused below to filter the NSEuroNet
# table and to look up the UniProt accession.
gene = 'PTPN11'

### Read in data

##### NSEuronet

In [9]:
# Manually load the NSEuronetData.csv (for all RAS genes)
# The file is semicolon-separated and is read without a header row, so the
# columns are labelled by position (0, 1, 2, 3).

df1 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\NSEuroNetData.csv", sep=';', header=None)
print(df1.head())

        0          1        2                3
0    KRAS    c.65A>G   p.Q22R  Noonan syndrome
1  PTPN11   c.922A>G  p.N308D  Noonan syndrome
2  PTPN11   c.184T>G   p.Y62D  Noonan syndrome
3  PTPN11   c.922A>G  p.N308D  Noonan syndrome
4  PTPN11  c.1232C>T  p.T411M  Noonan syndrome


In [8]:
# Normalise the protein-change column (position 2) to stripped strings.
# Missing values become the literal text 'nan' and therefore fail the pattern below.
df1[2] = df1[2].astype(str).str.strip()

# Simple substitution pattern: "p." + original residue + position + new residue.
is_simple_substitution = df1[2].str.match(r"^p\.[A-Za-z]\d+[A-Za-z]$", na=False)
NSEuronet_df = df1[is_simple_substitution]

# Show the rejected rows (deletions, duplications, entries with no protein change, ...).
print(df1[~is_simple_substitution])


           0                    1                   2                    3
321   PTPN11         c.181_183del            p.D61del      Noonan syndrome
760   PTPN11     c.179_182delinsT    p.G60_D61delinsV      Noonan syndrome
940   PTPN11         c.179_181del            p.G60del      Noonan syndrome
1109  MAP2K2         c.186_197del        p.K63_E66del         CFC syndrome
1120  MAP2K2         c.186_197del        p.K63_E66del         CFC syndrome
1233  MAP2K1         c.175_177del            p.K59del    Costello syndrome
1484    SOS1       c.1435_1443dup      p.R479_P481dup         CFC syndrome
1527  MAP2K2         c.136_165del        p.L46_E55del         CFC syndrome
1544     CBL          c.1096-1G>C                 nan      Noonan syndrome
1545     CBL          c.1228-4A>G                 nan      Noonan syndrome
1695    BRAF       c.1384_1407del      p.R462_G469del         CFC syndrome
1702    BRAF       c.1408_1410del           p.T470del         CFC syndrome
1710  MAP2K1         c.17

In [11]:
# Give the four positional NSEuroNet columns descriptive names.
NSEuronet_headers = ["Gene", "cDNA", "Protein", "Disease"]
NSEuronet_df.columns = NSEuronet_headers

# Keep only the rows recorded for the gene of interest.
is_target_gene = NSEuronet_df["Gene"].eq(f'{gene}')
NSEuronet_gene_df = NSEuronet_df.loc[is_target_gene]

In [12]:
NSEuronet_gene_df

Unnamed: 0,Gene,cDNA,Protein,Disease
1,PTPN11,c.922A>G,p.N308D,Noonan syndrome
2,PTPN11,c.184T>G,p.Y62D,Noonan syndrome
3,PTPN11,c.922A>G,p.N308D,Noonan syndrome
4,PTPN11,c.1232C>T,p.T411M,Noonan syndrome
5,PTPN11,c.1232C>T,p.T411M,Noonan syndrome
6,PTPN11,c.1232C>T,p.T411M,Noonan syndrome
7,PTPN11,c.1529A>G,p.Q510R,NF1-Noonan syndrome
8,PTPN11,c.179G>C,p.G60A,Noonan syndrome
9,PTPN11,c.182A>G,p.D61G,Noonan syndrome
10,PTPN11,c.184T>G,p.Y62D,Noonan syndrome


In [13]:
NSEuronet_gene_df.shape

(2193, 4)

#### ClinVar

NCBI E-utilities is rate-limited to 3 requests per second without an API key. Requesting an NCBI API key raises the limit to ~10 requests per second.

TODO: try again with the XML release (gzipped) and parse it for the gene of interest.

ClinVar maintains a complete set of variant data on an FTP server. The data are updated weekly, but only the release on the first Thursday of each month is archived. Download that release below. https://pmc.ncbi.nlm.nih.gov/tools/ftp/

In [12]:
# Manually load the clinvar_result.txt download (for single gene e.g. PTPN11)
# The export is tab-separated; low_memory=False lets pandas infer each column's
# dtype from the whole file instead of piecewise.

df2 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\clinvar_result.txt", sep="\t", low_memory=False)
print(df2.head())


                  Name              Gene(s) Protein change  \
0  GRCh38/hg38 12p1...  LOC130008261|LOC...            NaN   
1  GRCh38/hg38 12q2...  HECTD4|LOC130008...            NaN   
2  GRCh38/hg38 12q2...  HECTD4|LOC130008...            NaN   
3  GRCh38/hg38 12q2...          PTPN11|RPL6            NaN   
4  NM_002834.3(PTPN...          PTPN11|RPL6            NaN   

          Condition(s)     Accession  GRCh37Chromosome       GRCh37Location  \
0            See cases  VCV000150740              12.0   282465 - 133773393   
1            See cases  VCV000059818              12.0  112741234 - 1131...   
2            See cases  VCV000059819              12.0  112745336 - 1131...   
3            See cases  VCV000145960              12.0  112854667 - 1128...   
4  Noonan syndrome ...  VCV000882155              12.0            112856599   

   GRCh38Chromosome       GRCh38Location  VariationID  AlleleID(s)  \
0              12.0   121271 - 133196807       150740       160491   
1           

In [13]:
# Keep variants mapped to a single GRCh38 coordinate: ranges are written as
# "start - end", so every location containing a hyphen is discarded, followed
# by rows that have no location at all.
# NOTE(review): this is a positional filter, not a check of the 'Variant type'
# column, so single-position records that are not SNVs can pass through —
# confirm that is acceptable for the downstream steps.
has_hyphen = df2['GRCh38Location'].astype(str).str.contains('-')
df2_snvs = df2[~has_hyphen].copy()
df2_snvs = df2_snvs.dropna(subset=['GRCh38Location'])

# Store the coordinate as an integer so it can serve as a merge key later.
df2_snvs['GRCh38Location'] = df2_snvs['GRCh38Location'].astype(int)


In [16]:
df2_snvs[df2_snvs["Protein change"].notna()]

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,
57,NM_002834.5(PTPN...,PTPN11,W6fs,RASopathy,VCV001435208,12.0,112884082,12.0,112446278,1435208,1422183,rs2135856266,NC_000012.12:112...,Deletion,frameshift variant,Pathogenic,"Jun 13, 2022",criteria provide...,,,,,,,
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,


In [29]:
print(df2_snvs.columns.tolist())

['Name', 'Gene(s)', 'Protein change', 'Condition(s)', 'Accession', 'GRCh37Chromosome', 'GRCh37Location', 'GRCh38Chromosome', 'GRCh38Location', 'VariationID', 'AlleleID(s)', 'dbSNP ID', 'Canonical SPDI', 'Variant type', 'Molecular consequence', 'Germline classification', 'Germline date last evaluated', 'Germline review status', 'Somatic clinical impact', 'Somatic clinical impact date last evaluated', 'Somatic clinical impact review status', 'Oncogenicity classification', 'Oncogenicity date last evaluated', 'Oncogenicity review status', 'Unnamed: 24']


In [19]:
# Filter for missense variants
# A ClinVar record can list several comma-separated protein changes; split them
# and give each one its own row (the other columns are repeated).
df2_missense = df2_snvs.assign(
    **{'Protein change': df2_snvs['Protein change'].str.split(',')}
).explode('Protein change')

df2_missense['Protein change'] = df2_missense['Protein change'].str.strip() # remove leading/trailing spaces

# regex pattern: original aa - position - new aa
is_substitution = df2_missense['Protein change'].str.match(r"^[A-Za-z]\d+[A-Za-z]$", na=False)

# .copy() makes the filtered frame independent of its parent, so the columns
# added to it in later cells do not trigger pandas' SettingWithCopyWarning.
df2_missense = df2_missense.loc[is_substitution].copy()


In [20]:
# Extract ClinVar version of AA info
# .assign returns a new frame, so nothing is written in place on the filtered
# slice produced by the previous cell, and the cell can be re-run safely.
protein_change = df2_missense['Protein change']
df2_missense = df2_missense.assign(
    AAfrom=protein_change.str[0],    # first character: reference residue
    AAto=protein_change.str[-1],     # last character: substituted residue
    # expand=False returns a Series (one capture group) rather than a
    # one-column DataFrame, which is the right shape for a single column.
    AApos_ClinVar=protein_change.str.extract(r'(\d+)', expand=False).astype(int),
)


In [23]:
df2_missense

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,AAfrom,AAto,AApos_ClinVar
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,M,R,1
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,T,I,2
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,R,G,4
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,R,Q,4
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,W,C,6
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,W,C,6
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,N,Y,10
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,N,H,10
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,N,D,10
63,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,N,T,10


###### Parse Uniprot reference sequence

In [16]:
# Dictionary of Uniprot IDs for each gene
uniprot_ids = {
    "PTPN11": "Q06124",
}


def get_uniprot_sequence(gene, timeout=30):
    """Fetch the UniProt reference protein sequence for a gene.

    Parameters
    ----------
    gene : str
        Gene symbol; must be a key of ``uniprot_ids``.
    timeout : float, optional
        Seconds to wait for the UniProt server. Without a timeout
        ``requests.get`` can block indefinitely on an unresponsive server.

    Returns
    -------
    str or None
        The amino-acid sequence as one string (FASTA header removed, lines
        joined), or ``None`` if the server did not answer with HTTP 200.

    Raises
    ------
    KeyError
        If ``gene`` has no entry in ``uniprot_ids``.
    """
    uniprot_id = uniprot_ids[gene]
    # Current UniProt REST endpoint; the older www.uniprot.org/uniprot/<id>.fasta
    # address only works through a redirect.
    url = f'https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta'
    response = requests.get(url, timeout=timeout)
    if response.status_code != 200:
        return None

    # First line is the FASTA header; the remaining lines are the sequence.
    # splitlines() handles both "\n" and "\r\n" line endings.
    fasta_lines = response.text.strip().splitlines()
    return ''.join(line.strip() for line in fasta_lines[1:])


parser = PDB.PDBParser(QUIET=True) # Initialise PDB parser

uniprot_sequence = get_uniprot_sequence(gene) # Retrieve UniProt reference sequence for gene

# get_uniprot_sequence returns None when the download fails. Stop here with a
# clear message instead of failing later with a TypeError on len(None).
if not uniprot_sequence:
    raise RuntimeError(f"Could not retrieve the UniProt reference sequence for {gene}")



In [24]:
# Flag each protein change whose reference residue agrees with UniProt:
# the position must lie inside the sequence (1-based) and the residue found
# there must equal AAfrom.
sequence_length = len(uniprot_sequence)
df2_missense['matches_Uniprot'] = [
    1 <= position <= sequence_length and uniprot_sequence[position - 1] == residue
    for position, residue in zip(df2_missense['AApos_ClinVar'], df2_missense['AAfrom'])
]



In [27]:
# Retain only the protein changes whose reference residue agrees with UniProt.
agrees_with_uniprot = df2_missense['matches_Uniprot']
df2_missense = df2_missense[agrees_with_uniprot]

In [29]:
df2_missense

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,AAfrom,AAto,AApos_ClinVar,matches_Uniprot
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,M,R,1,True
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,T,I,2,True
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,R,G,4,True
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,R,Q,4,True
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,W,C,6,True
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,W,C,6,True
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,N,Y,10,True
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,N,H,10,True
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,N,D,10,True
63,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,N,T,10,True


#### REVEL

Downloaded from https://sites.google.com/site/revelgenomics/downloads

TODO: try again to download it with curl from dbNSFP.

In [24]:
# Manually download the REVEL data (for all genes)

# Read just the first 10 rows (6GB REVEL file)
# Preview only: nrows=10 shows the column layout without loading the whole file.
df3 = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\revel-v1.3_all_chromosomes\revel_with_transcript_ids", nrows=10)
df3

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid
0,1,35142,35142,G,A,T,M,0.027,ENST00000417324
1,1,35142,35142,G,C,T,R,0.035,ENST00000417324
2,1,35142,35142,G,T,T,K,0.043,ENST00000417324
3,1,35143,35143,T,A,T,S,0.018,ENST00000417324
4,1,35143,35143,T,C,T,A,0.034,ENST00000417324
5,1,35143,35143,T,G,T,P,0.039,ENST00000417324
6,1,35144,35144,A,C,C,W,0.012,ENST00000417324
7,1,35145,35145,C,A,C,F,0.023,ENST00000417324
8,1,35145,35145,C,G,C,S,0.029,ENST00000417324
9,1,35145,35145,C,T,C,Y,0.016,ENST00000417324


In [62]:
revel_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\revel-v1.3_all_chromosomes\revel_with_transcript_ids"
PTPN11_revel_file = r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv" #output file

# Genomic window to extract: the span of GRCh38 positions covered by the
# single-position ClinVar variants.
min_pos = df2_snvs['GRCh38Location'].min()
max_pos = df2_snvs['GRCh38Location'].max()
chrom = df2_snvs['GRCh38Chromosome'].unique()[0]

# ClinVar holds the chromosome as a float (12.0) here, whereas REVEL labels can
# be numbers or letters. Compare them as text ("12", "X") so the match does not
# depend on the dtype pandas happens to infer for a given chunk.
chrom_label = str(chrom)
if chrom_label.endswith('.0'):
    chrom_label = chrom_label[:-2]

chunksize = 1_000_000
ptpn11_revel_chunks = []

# dtype={'chr': str} keeps the chromosome column as text in every chunk.
for chunk in pd.read_csv(revel_file, chunksize=chunksize, dtype={'chr': str}, low_memory=False):
    # Convert positions in revel_file to numeric (unparseable entries become NaN)
    chunk['grch38_pos'] = pd.to_numeric(chunk['grch38_pos'], errors='coerce')

    # Only keep the chromosome of interest
    chrom_rows = chunk[chunk['chr'] == chrom_label]

    # Keep only positions within the gene window (NaN positions never match)
    filtered_chunk = chrom_rows[
        (chrom_rows['grch38_pos'] >= min_pos) &
        (chrom_rows['grch38_pos'] <= max_pos)
    ]

    if not filtered_chunk.empty:
        ptpn11_revel_chunks.append(filtered_chunk)

    # Early stop: only once EVERY row of this chromosome in the chunk lies
    # beyond the window. Stopping on the chunk maximum would end the scan on a
    # single out-of-order position.
    # NOTE(review): this still assumes positions are broadly ascending within a
    # chromosome; the file may be ordered by hg19_pos rather than grch38_pos —
    # confirm against the REVEL download notes.
    # min() of an empty/all-NaN selection is NaN, and NaN > max_pos is False.
    if chrom_rows['grch38_pos'].min() > max_pos:
        break

# Concatenate all filtered chunks
if not ptpn11_revel_chunks:
    # Fail loudly: otherwise ptpn11_revel stays undefined and a stale CSV from
    # an earlier run could be picked up by the next cell without notice.
    raise ValueError(
        f"No REVEL rows found on chromosome {chrom_label} between {min_pos} and {max_pos}"
    )

ptpn11_revel = pd.concat(ptpn11_revel_chunks)
ptpn11_revel.to_csv(PTPN11_revel_file, index=False)
print(f"Filtered REVEL for PTPN11 saved to:\n{PTPN11_revel_file}")

Filtered REVEL for PTPN11 saved to:
\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv


In [7]:
# Reload the gene-level REVEL extract from disk. This is the file written by the
# chunked scan above, so the scan does not have to be repeated in a new session.
ptpn11_revel = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv") #output file


In [8]:
ptpn11_revel

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid
0,12,112856916,112419112.0,A,C,M,L,0.263,ENST00000392597;ENST00000351677;ENST00000392596
1,12,112856916,112419112.0,A,G,M,V,0.301,ENST00000392597;ENST00000351677;ENST00000392596
2,12,112856916,112419112.0,A,T,M,L,0.263,ENST00000392597;ENST00000351677;ENST00000392596
3,12,112856917,112419113.0,T,A,M,K,0.294,ENST00000392597;ENST00000351677;ENST00000392596
4,12,112856917,112419113.0,T,C,M,T,0.315,ENST00000392597;ENST00000351677;ENST00000392596
5,12,112856917,112419113.0,T,G,M,R,0.202,ENST00000392597;ENST00000351677;ENST00000392596
6,12,112856918,112419114.0,G,A,M,I,0.222,ENST00000392597;ENST00000351677;ENST00000392596
7,12,112856918,112419114.0,G,C,M,I,0.222,ENST00000392597;ENST00000351677;ENST00000392596
8,12,112856918,112419114.0,G,T,M,I,0.222,ENST00000392597;ENST00000351677;ENST00000392596
9,12,112856919,112419115.0,A,C,T,P,0.194,ENST00000392597;ENST00000351677;ENST00000392596


In [63]:
ptpn11_revel.shape

(4645, 9)

### Create datasets

#### Meta-predictor analysis

In [35]:
# Rows whose (position, reference aa, alternate aa) combination occurs more than
# once, i.e. different nucleotide changes that give the same substitution.
# Uses ptpn11_revel, which is already loaded above from the same CSV; revel_df is
# only created in the next cell, so referencing it here fails on a fresh kernel.
ptpn11_revel[ptpn11_revel.duplicated(subset=['grch38_pos','aaref','aaalt'], keep=False)]

Unnamed: 0,chr,hg19_pos,grch38_pos,ref,alt,aaref,aaalt,REVEL,Ensembl_transcriptid
0,12,112856916,112419112.0,A,C,M,L,0.263,ENST00000392597;...
2,12,112856916,112419112.0,A,T,M,L,0.263,ENST00000392597;...
6,12,112856918,112419114.0,G,A,M,I,0.222,ENST00000392597;...
7,12,112856918,112419114.0,G,C,M,I,0.222,ENST00000392597;...
8,12,112856918,112419114.0,G,T,M,I,0.222,ENST00000392597;...
29,12,112884080,112446276.0,A,C,R,S,0.721,ENST00000392597;...
30,12,112884080,112446276.0,A,T,R,S,0.716,ENST00000392597;...
31,12,112884081,112446277.0,T,A,W,R,0.941,ENST00000392597;...
32,12,112884081,112446277.0,T,C,W,R,0.941,ENST00000392597;...
36,12,112884083,112446279.0,G,C,W,C,0.92,ENST00000392597;...


In [36]:
# Load PTPN11_REVEL
revel_df = pd.read_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_REVEL.csv")

# Take aggregate REVEL for multiple SNVs: several nucleotide changes at one
# position can encode the same amino-acid substitution, so average their scores
# to get exactly one row per (position, from, to) key.
revel_df_agg = revel_df.groupby(['grch38_pos','aaref','aaalt'], as_index=False)['REVEL'].mean()

# grch38_pos is read back from the CSV as float (e.g. 112419112.0). Cast it so
# both merge keys are integers. groupby has already dropped rows with a missing
# key, so the cast cannot hit NaN.
revel_df_agg['grch38_pos'] = revel_df_agg['grch38_pos'].astype('int64')

# Merge PTPN11_REVEL mean scores on ClinVar
df_stat = pd.merge(
    df2_missense,
    revel_df_agg,
    left_on=['GRCh38Location', 'AAfrom', 'AAto'],
    right_on=['grch38_pos', 'aaref', 'aaalt'],
    how='left'  # keep all ClinVar missense
)

# The right-hand keys are unique, so a left join must neither add nor drop rows.
assert len(df_stat) == len(df2_missense), "merge changed the number of ClinVar rows"


In [37]:
df2_missense.shape

(588, 29)

In [38]:
df2_missense

Unnamed: 0,Name,Gene(s),Protein change,Condition(s),Accession,GRCh37Chromosome,GRCh37Location,GRCh38Chromosome,GRCh38Location,VariationID,AlleleID(s),dbSNP ID,Canonical SPDI,Variant type,Molecular consequence,Germline classification,Germline date last evaluated,Germline review status,Somatic clinical impact,Somatic clinical impact date last evaluated,Somatic clinical impact review status,Oncogenicity classification,Oncogenicity date last evaluated,Oncogenicity review status,Unnamed: 24,AAfrom,AAto,AApos_ClinVar,matches_Uniprot
24,NM_002834.5(PTPN...,PTPN11,M1R,Juvenile myelomo...,VCV003574257,12.0,112856917,12.0,112419113,3574257,3707172,,NC_000012.12:112...,single nucleotid...,missense variant...,Likely pathogenic,"Jun 20, 2024",criteria provide...,,,,,,,,M,R,1,True
25,NM_002834.5(PTPN...,PTPN11,T2I,LEOPARD syndrome...,VCV000013349,12.0,112856920,12.0,112419116,13349,28388,rs267606990,NC_000012.12:112...,single nucleotid...,missense variant,Pathogenic/Likel...,"May 29, 2025",criteria provide...,,,,,,,,T,I,2,True
30,NM_002834.5(PTPN...,PTPN11,R4G,not provided,VCV000280283,12.0,112856925,12.0,112419121,280283,264573,rs886041517,NC_000012.12:112...,single nucleotid...,missense variant,Conflicting clas...,"Nov 8, 2024",criteria provide...,,,,,,,,R,G,4,True
31,NM_002834.5(PTPN...,PTPN11,R4Q,RASopathy,VCV002729546,12.0,112856926,12.0,112419122,2729546,2893458,rs2499756229,NC_000012.12:112...,single nucleotid...,missense variant,Likely pathogenic,"Dec 3, 2024",criteria provide...,,,,,,,,R,Q,4,True
58,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002785856,12.0,112884083,12.0,112446279,2785856,2941818,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jan 2, 2025",criteria provide...,,,,,,,,W,C,6,True
59,NM_002834.5(PTPN...,PTPN11,W6C,Cardiovascular p...,VCV002568101,12.0,112884083,12.0,112446279,2568101,2734639,rs79203122,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 22, 2023",criteria provide...,,,,,,,,W,C,6,True
60,NM_002834.5(PTPN...,PTPN11,N10Y,not provided|Car...,VCV002587734,12.0,112884093,12.0,112446289,2587734,2764072,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 3, 2024",criteria provide...,,,,,,,,N,Y,10,True
61,NM_002834.5(PTPN...,PTPN11,N10H,Metachondromatosis,VCV001684677,12.0,112884093,12.0,112446289,1684677,1676705,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"May 20, 2022",criteria provide...,,,,,,,,N,H,10,True
62,NM_002834.5(PTPN...,PTPN11,N10D,Cardiovascular p...,VCV000838860,12.0,112884093,12.0,112446289,838860,839322,rs368633510,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Jul 3, 2025",criteria provide...,,,,,,,,N,D,10,True
63,NM_002834.5(PTPN...,PTPN11,N10T,RASopathy|not pr...,VCV002705464,12.0,112884094,12.0,112446290,2705464,2856602,rs200613531,NC_000012.12:112...,single nucleotid...,missense variant,Uncertain signif...,"Aug 22, 2024",criteria provide...,,,,,,,,N,T,10,True


In [39]:
# Clean df_stat for analysis

# Select only the columns needed for statistical classification
columns_to_keep = [
    'GRCh38Location',
    'Germline classification',
    'REVEL',
    'AAfrom',
    'AAto',
    'AApos_ClinVar'
]

df_stat = (
    df_stat
    # Make sure REVEL is numeric BEFORE dropping missing values, so that a score
    # that cannot be parsed (coerced to NaN) is removed as well.
    .assign(REVEL=lambda frame: pd.to_numeric(frame['REVEL'], errors='coerce'))
    # Keep only rows that have BOTH a REVEL score and a ClinVar classification.
    .dropna(subset=['REVEL', 'Germline classification'])
    .assign(**{
        'Germline classification':
            lambda frame: frame['Germline classification'].astype(str).str.strip()
    })
    .loc[:, columns_to_keep]
)

# Quick check
print(df_stat.head())

   GRCh38Location Germline classification  REVEL AAfrom AAto  AApos_ClinVar
0       112419113    Likely pathogenic     0.202      M    R              1
1       112419116  Pathogenic/Likel...     0.213      T    I              2
2       112419121  Conflicting clas...     0.518      R    G              4
3       112419122    Likely pathogenic     0.393      R    Q              4
4       112446279  Uncertain signif...     0.920      W    C              6


In [40]:
# Save merged file
# index=False leaves the DataFrame index out of the CSV.
df_stat.to_csv(r"\\rdp.arc.ucl.ac.uk\ritd-ag-project-rd025c-bhall50\ModellingRASopathies RDSS\PTPN11_ClinVar_REVEL.csv", index=False)


In [41]:
df_stat

Unnamed: 0,GRCh38Location,Germline classification,REVEL,AAfrom,AAto,AApos_ClinVar
0,112419113,Likely pathogenic,0.202,M,R,1
1,112419116,Pathogenic/Likel...,0.213,T,I,2
2,112419121,Conflicting clas...,0.518,R,G,4
3,112419122,Likely pathogenic,0.393,R,Q,4
4,112446279,Uncertain signif...,0.92,W,C,6
5,112446279,Uncertain signif...,0.92,W,C,6
6,112446289,Uncertain signif...,0.703,N,Y,10
7,112446289,Uncertain signif...,0.436,N,H,10
8,112446289,Uncertain signif...,0.372,N,D,10
9,112446290,Uncertain signif...,0.471,N,T,10
