In [1]:

import pandas as pd
from pathlib import Path
import json
# Directory for this dataset: later cells read the raw supplementary workbook
# from its `raw/` subfolder and write `scoreset.csv` and `metadata.json` into it.
dataset_path = Path("/data/dzeiberg/mave_calibration/data/Erwood_NPC1_RPE1/")
# Fail fast and name the missing directory instead of raising an empty AssertionError.
assert dataset_path.exists(), f"Dataset directory not found: {dataset_path}"
# Transcript accession attached to every row as the `author_transcript` column.
AUTHOR_TRANSCRIPT = "NM_000271"

In [2]:
# Read the supplementary table (header=1: the column names are taken from the
# sheet's second row), rename the columns to the scoreset names, then append
# the per-row transcript and chromosome columns.
data = (
    pd.read_excel(
        dataset_path / 'raw' / '41587_2021_1201_MOESM3_ESM.xlsx',
        sheet_name='Supplementary Table 2',
        header=1,
    )
    .rename(
        columns={
            'end': 'POSITION',
            'reference_base': 'REF',
            'alternate_base': 'ALT',
            'Function Score': 'score',
        }
    )
    .assign(
        author_transcript=AUTHOR_TRANSCRIPT,
        # `id` is dash-separated and starts with the chromosome
        # (e.g. "chr18-..."); keep that first field without the "chr" prefix.
        CHROM=lambda frame: frame['id'].str.split('-').str[0].str.replace("chr", ""),
    )
)

In [3]:
# Inspect the frame after the column renames and the added
# `author_transcript` / `CHROM` columns.
data

Unnamed: 0,Protein Annotation,Wild type Base,Edited Base,CDS,Consequence,SPE Classification,score,id,start,POSITION,REF,ALT,refseq_id,Clinvar_SIG,CADD_phred,Unadjusted Function Score,author_transcript,CHROM
0,Y420N,T,A,1258,missense,Deleterious,0.691882,chr18-23556310-23556311-A-T,23556310,23556311,A,T,NPC1:NM_000271:exon8:c.T1258A:p.Y420N:Select,,23.5,1.547779,NM_000271,18
1,Y420H,T,C,1258,missense,Functional,0.962923,chr18-23556310-23556311-A-G,23556310,23556311,A,G,NPC1:NM_000271:exon8:c.T1258C:p.Y420H:Select,,20.9,0.185460,NM_000271,18
2,Y420D,T,G,1258,missense,Deleterious,0.467523,chr18-23556310-23556311-A-C,23556310,23556311,A,C,NPC1:NM_000271:exon8:c.T1258G:p.Y420D:Select,,25.0,2.690271,NM_000271,18
3,Y420S,A,C,1259,missense,Deleterious,0.723563,chr18-23556309-23556310-T-G,23556309,23556310,T,G,NPC1:NM_000271:exon8:c.A1259C:p.Y420S:Select,,24.5,1.393231,NM_000271,18
4,Y420C,A,G,1259,missense,Functional,0.836480,chr18-23556309-23556310-T-C,23556309,23556310,T,C,NPC1:NM_000271:exon8:c.A1259G:p.Y420C:Select,,25.1,0.822727,NM_000271,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,L919L,G,A,2757,synonymous,Functional,1.010511,chr18-23539848-23539849-C-T,23539848,23539849,C,T,NPC1:NM_000271:exon18:c.G2757A:p.L919L:Select,,,-0.051112,NM_000271,18
92,L919L,G,C,2757,synonymous,Functional,1.002218,chr18-23539848-23539849-C-G,23539848,23539849,C,G,NPC1:NM_000271:exon18:c.G2757C:p.L919L:Select,,,-0.012112,NM_000271,18
93,L919L,G,T,2757,synonymous,Functional,1.010500,chr18-23539848-23539849-C-A,23539848,23539849,C,A,NPC1:NM_000271:exon18:c.G2757T:p.L919L:Select,,,-0.054215,NM_000271,18
94,V920L,G,C,2758,missense,Deleterious,0.357568,chr18-23539847-23539848-C-G,23539847,23539848,C,G,NPC1:NM_000271:exon18:c.G2758C:p.V920L:Select,,22.7,3.370410,NM_000271,18


In [4]:
from Bio.PDB.Polypeptide import protein_letters_3to1
# Invert the imported three-letter -> one-letter table so a one-letter code
# looks up its title-cased three-letter name.
protein_letters_1to3 = dict(
    (one_letter, three_letter.title())
    for three_letter, one_letter in protein_letters_3to1.items()
)
def clean(s):
    """Convert a one-letter protein annotation (e.g. "Y420N") to an HGVS-style string.

    The first character is read as the reference residue and the last
    character as the variant residue ("*" becomes "Ter"); everything in
    between is copied through unchanged.

    Parameters
    ----------
    s : str
        Annotation such as "Y420N". Surrounding whitespace is ignored.

    Returns
    -------
    str
        "p." followed by the three-letter form. An empty string is returned
        when `s` is not a string (for example the NaN pandas produces for a
        blank cell) or is too short to hold a reference residue, a position
        and a variant residue, so that such rows can be removed by filtering
        on `hgvs_pro != ""`.

    Raises
    ------
    KeyError
        If the reference or variant letter is not a key of
        `protein_letters_1to3`.
    """
    # Blank cells arrive as NaN/None rather than str; indexing them would raise.
    if not isinstance(s, str):
        return ""
    s = s.strip()
    # Need at least <ref letter><position><alt letter>; shorter input would
    # otherwise yield a meaningless string such as "p.TyrTyr".
    if len(s) < 3:
        return ""
    ref_letter, position, alt_letter = s[0], s[1:-1], s[-1]
    alt_name = "Ter" if alt_letter == "*" else protein_letters_1to3[alt_letter]
    return "p." + protein_letters_1to3[ref_letter] + position + alt_name

In [5]:
# Add the HGVS-style protein string for every row, then keep only the rows
# whose string is non-empty.
data = (
    data
    .assign(hgvs_pro=lambda frame: frame['Protein Annotation'].apply(clean))
    .loc[lambda frame: frame['hgvs_pro'] != ""]
)

In [6]:
# Write the processed table next to the raw data, without the row index.
data.to_csv(dataset_path.joinpath('scoreset.csv'), index=False)

In [7]:
# (rows, columns) of the frame that was just written to scoreset.csv.
data.shape

(96, 19)

In [8]:
# Store the UniProt accession for this dataset in metadata.json in the
# dataset directory.
metadata = {'uniprot_acc': 'O15118'}
with (dataset_path / 'metadata.json').open('w') as f:
    f.write(json.dumps(metadata))