In [1]:

import pandas as pd
from pathlib import Path
import json
# Root directory of this dataset: the raw workbook is read from its `raw/`
# subfolder and the processed outputs are written directly into it.
dataset_path = Path("/data/dzeiberg/mave_calibration/data/Erwood_NPC1_HEK293T/")
# Fail fast on a fresh kernel, and say which path is missing, rather than
# surfacing an opaque error from the Excel reader further down.
assert dataset_path.exists(), f"dataset directory not found: {dataset_path}"
# Transcript accession stamped onto every row as `author_transcript`.
AUTHOR_TRANSCRIPT = "NM_000271"

In [2]:
# Load Supplementary Table 1 (column names are on the second sheet row, hence
# header=1), tag each row with the transcript, derive the chromosome from the
# variant id, and rename the coordinate/score columns.
data = (
    pd.read_excel(
        dataset_path / 'raw' / '41587_2021_1201_MOESM3_ESM.xlsx',
        sheet_name='Supplementary Table 1',
        header=1,
    )
    .assign(
        author_transcript=AUTHOR_TRANSCRIPT,
        # ids look like "chr18-23536836-23536837-A-T": keep the first
        # dash-separated field and drop its "chr" prefix.
        CHROM=lambda table: table.id.str.split('-').str[0].str.replace("chr", ""),
    )
    .rename(
        columns={
            'end': 'POSITION',
            'reference_base': 'REF',
            'alternate_base': 'ALT',
            'Function Score': 'score',
        }
    )
)

In [3]:
# Display the loaded table to eyeball the renamed columns and the derived
# author_transcript / CHROM values.
data

Unnamed: 0,Protein Annotation,Wild type Base,Edited Base,CDS,Consequence,SPE Classification,score,id,start,POSITION,REF,ALT,refseq_id,Clinvar_SIG,CADD_phred,Unadjusted Function Score,author_transcript,CHROM
0,L1027L,T,A,3081,synonymous,Functional,0.954726,chr18-23536836-23536837-A-T,23536836,23536837,A,T,NPC1:NM_000271:exon21:c.T3081A:p.L1027L:Select,,,0.188709,NM_000271,18
1,L1027L,T,C,3081,synonymous,Functional,0.999139,chr18-23536836-23536837-A-G,23536836,23536837,A,G,NPC1:NM_000271:exon21:c.T3081C:p.L1027L:Select,,,0.008803,NM_000271,18
2,L1027L,T,G,3081,synonymous,Functional,0.963492,chr18-23536836-23536837-A-C,23536836,23536837,A,C,NPC1:NM_000271:exon21:c.T3081G:p.L1027L:Select,,,0.145510,NM_000271,18
3,G1028S,G,A,3082,missense,Functional,0.936084,chr18-23536835-23536836-C-T,23536835,23536836,C,T,NPC1:NM_000271:exon21:c.G3082A:p.G1028S:Select,,,0.260211,NM_000271,18
4,G1028R,G,C,3082,missense,Functional,0.953889,chr18-23536835-23536836-C-G,23536835,23536836,C,G,NPC1:NM_000271:exon21:c.G3082C:p.G1028R:Select,,11.22,0.191542,NM_000271,18
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,L919L,G,C,2757,synonymous,Functional,1.008759,chr18-23539848-23539849-C-G,23539848,23539849,C,G,NPC1:NM_000271:exon18:c.G2757C:p.L919L:Select,,,-0.031451,NM_000271,18
974,L919L,G,T,2757,synonymous,Functional,1.005450,chr18-23539848-23539849-C-A,23539848,23539849,C,A,NPC1:NM_000271:exon18:c.G2757T:p.L919L:Select,,,-0.023142,NM_000271,18
975,V920M,G,A,2758,missense,Deleterious,-0.010638,chr18-23539847-23539848-C-T,23539847,23539848,C,T,NPC1:NM_000271:exon18:c.G2758A:p.V920M:Select,,27.10,3.304521,NM_000271,18
976,V920L,G,C,2758,missense,Deleterious,0.480805,chr18-23539847-23539848-C-G,23539847,23539848,C,G,NPC1:NM_000271:exon18:c.G2758C:p.V920L:Select,,22.70,1.693627,NM_000271,18


In [4]:
from Bio.PDB.Polypeptide import protein_letters_3to1
# Invert the three-letter -> one-letter table so it can be indexed by the
# one-letter code; .title() gives the capitalised form (e.g. "Leu").
protein_letters_1to3 = {
    one_letter: three_letter.title()
    for three_letter, one_letter in protein_letters_3to1.items()
}
def clean(s):
    """Convert a one-letter protein annotation into a ``p.`` three-letter string.

    The annotation is read as ``<ref residue><position><alt residue>``, e.g.
    ``"G1028S"`` -> ``"p.Gly1028Ser"``. A trailing ``*`` is written as ``Ter``
    (``"W24*"`` -> ``"p.Trp24Ter"``).

    Parameters
    ----------
    s : str or any
        The raw annotation cell. Non-string values (e.g. NaN/None for a
        missing cell) are treated as "no annotation".

    Returns
    -------
    str
        The converted string, or ``""`` when ``s`` is not a string or is too
        short to hold a reference residue, a position and an alternate
        residue. Callers use ``""`` as the marker for rows to drop.

    Raises
    ------
    KeyError
        If the first or last character is not a one-letter code present in
        ``protein_letters_1to3`` (and the last character is not ``*``).
    """
    # Missing cells arrive as non-string objects; test for that directly
    # instead of relying on a TypeError from indexing.
    if not isinstance(s, str):
        return ""
    # Spreadsheet cells may carry stray whitespace around the annotation.
    s = s.strip()
    # Need at least ref residue + one position character + alt residue;
    # this also covers empty / whitespace-only strings.
    if len(s) < 3:
        return ""
    ref, position, alt = s[0], s[1:-1], s[-1]
    hgvs_pro = protein_letters_1to3[ref] + position
    # "*" is not in the residue table, so it is mapped explicitly.
    hgvs_pro += "Ter" if alt == "*" else protein_letters_1to3[alt]
    return "p." + hgvs_pro

In [5]:
# Add the converted protein annotation, then keep only rows where the
# conversion produced a non-empty string.
data = (
    data
    .assign(hgvs_pro=data['Protein Annotation'].apply(clean))
    .loc[lambda table: table.hgvs_pro != ""]
)

In [6]:
# Display the table again to check the new hgvs_pro column after the
# empty-annotation rows were filtered out.
data

Unnamed: 0,Protein Annotation,Wild type Base,Edited Base,CDS,Consequence,SPE Classification,score,id,start,POSITION,REF,ALT,refseq_id,Clinvar_SIG,CADD_phred,Unadjusted Function Score,author_transcript,CHROM,hgvs_pro
0,L1027L,T,A,3081,synonymous,Functional,0.954726,chr18-23536836-23536837-A-T,23536836,23536837,A,T,NPC1:NM_000271:exon21:c.T3081A:p.L1027L:Select,,,0.188709,NM_000271,18,p.Leu1027Leu
1,L1027L,T,C,3081,synonymous,Functional,0.999139,chr18-23536836-23536837-A-G,23536836,23536837,A,G,NPC1:NM_000271:exon21:c.T3081C:p.L1027L:Select,,,0.008803,NM_000271,18,p.Leu1027Leu
2,L1027L,T,G,3081,synonymous,Functional,0.963492,chr18-23536836-23536837-A-C,23536836,23536837,A,C,NPC1:NM_000271:exon21:c.T3081G:p.L1027L:Select,,,0.145510,NM_000271,18,p.Leu1027Leu
3,G1028S,G,A,3082,missense,Functional,0.936084,chr18-23536835-23536836-C-T,23536835,23536836,C,T,NPC1:NM_000271:exon21:c.G3082A:p.G1028S:Select,,,0.260211,NM_000271,18,p.Gly1028Ser
4,G1028R,G,C,3082,missense,Functional,0.953889,chr18-23536835-23536836-C-G,23536835,23536836,C,G,NPC1:NM_000271:exon21:c.G3082C:p.G1028R:Select,,11.22,0.191542,NM_000271,18,p.Gly1028Arg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
973,L919L,G,C,2757,synonymous,Functional,1.008759,chr18-23539848-23539849-C-G,23539848,23539849,C,G,NPC1:NM_000271:exon18:c.G2757C:p.L919L:Select,,,-0.031451,NM_000271,18,p.Leu919Leu
974,L919L,G,T,2757,synonymous,Functional,1.005450,chr18-23539848-23539849-C-A,23539848,23539849,C,A,NPC1:NM_000271:exon18:c.G2757T:p.L919L:Select,,,-0.023142,NM_000271,18,p.Leu919Leu
975,V920M,G,A,2758,missense,Deleterious,-0.010638,chr18-23539847-23539848-C-T,23539847,23539848,C,T,NPC1:NM_000271:exon18:c.G2758A:p.V920M:Select,,27.10,3.304521,NM_000271,18,p.Val920Met
976,V920L,G,C,2758,missense,Deleterious,0.480805,chr18-23539847-23539848-C-G,23539847,23539848,C,G,NPC1:NM_000271:exon18:c.G2758C:p.V920L:Select,,22.70,1.693627,NM_000271,18,p.Val920Leu


In [7]:
# Persist the processed table next to the raw data, without the row index.
data.to_csv(dataset_path.joinpath('scoreset.csv'), index=False)

In [8]:
# Sidecar metadata written alongside the score set.
# NOTE(review): 'O15118' is presumably the UniProt accession of the assayed protein — confirm.
metadata = {'uniprot_acc': 'O15118'}
with (dataset_path / 'metadata.json').open('w') as handle:
    handle.write(json.dumps(metadata))