In [1]:
import json
import os
from pathlib import Path

import pandas as pd
# Dataset directory. The original hard-coded location remains the default, but
# it can be overridden through the MAVE_DATASET_PATH environment variable so
# the notebook also runs on machines where the data lives elsewhere.
dataset_path = Path(os.environ.get(
    "MAVE_DATASET_PATH",
    "/data/dzeiberg/mave_calibration/data/Erwood_BRCA2_HEK293T/",
))
# Fail fast, and report which path was tried, before any cell reads from it.
assert dataset_path.exists(), f"Dataset directory not found: {dataset_path}"
# Raise the column display limit so wide frames render without elided columns.
pd.set_option('display.max_columns', 500)

# Transcript accession stored on every row as the `author_transcript` column.
AUTHOR_TRANSCRIPT = "NM_000059"

In [2]:
# Load the supplementary table, tag each row with the author transcript and a
# chromosome label derived from the leading "chr<N>" token of `id`, then
# rename the coordinate/allele/score columns.
data = (
    pd.read_excel(
        dataset_path / 'raw' / '41587_2021_1201_MOESM3_ESM.xlsx',
        sheet_name='Supplementary Table 3',
        header=1,
    )
    .assign(
        author_transcript=AUTHOR_TRANSCRIPT,
        CHROM=lambda frame: frame["id"].str.split("-").str[0].str.replace("chr", ""),
    )
    .rename(columns={
        "end": "POSITION",
        "reference_base": "REF",
        "alternate_base": "ALT",
        'Function Score': 'score',
    })
)

In [3]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,Protein Annotation,Wild type Base,Edited Base,CDS,Consequence,SPE Classification,score,id,start,POSITION,REF,ALT,refseq_id,Clinvar_SIG,CADD_phred,Unadjusted Function Score,author_transcript,CHROM
0,T2515T,A,C,7545,synonymous,Functional,0.999369,chr13-32356536-32356537-A-C,32356536,32356537,A,C,BRCA2:NM_000059:exon15:c.A7545C:p.T2515T:Select,,,0.003596,NM_000059,13
1,T2515T,A,T,7545,synonymous,Functional,0.995782,chr13-32356536-32356537-A-T,32356536,32356537,A,T,BRCA2:NM_000059:exon15:c.A7545T:p.T2515T:Select,,,0.009437,NM_000059,13
2,T2515T,A,G,7545,synonymous,Functional,0.957546,chr13-32356536-32356537-A-G,32356536,32356537,A,G,BRCA2:NM_000059:exon15:c.A7545G:p.T2515T:Select,Likely_benign,,0.108622,NM_000059,13
3,S2516T,T,A,7546,missense,Functional,1.006952,chr13-32356537-32356538-T-A,32356537,32356538,T,A,BRCA2:NM_000059:exon15:c.T7546A:p.S2516T:Select,,26.2,-0.016045,NM_000059,13
4,S2516P,T,C,7546,missense,Functional,0.960612,chr13-32356537-32356538-T-C,32356537,32356538,T,C,BRCA2:NM_000059:exon15:c.T7546C:p.S2516P:Select,,28.1,0.098578,NM_000059,13


In [4]:
from Bio.PDB.Polypeptide import one_to_index, index_to_three
def one2three(one):
    """Convert a one-letter residue code to a title-cased three-letter code.

    "X" is special-cased to "Ter"; every other letter is resolved through
    Biopython's ``one_to_index`` / ``index_to_three`` lookup and title-cased
    (e.g. "T" -> "Thr").
    """
    if one != "X":
        three_letter = index_to_three(one_to_index(one))
        return three_letter.title()
    return "Ter"
    
def parse_refseq_id(id):
    """Split a colon-delimited ``refseq_id`` annotation into named fields.

    The input must consist of six ``:``-separated parts, for example
    ``BRCA2:NM_000059:exon15:c.A7545C:p.T2515T:Select``. The nucleotide part
    is rewritten as ``c.<pos><ref>><alt>`` (``c.7545A>C``) and the protein
    part as ``p.<Ref><pos><Alt>`` with three-letter residue codes
    (``p.Thr2515Thr``).

    Returns:
        dict with keys ``symbol``, ``transcript``, ``exon``, ``hgvs_nuc``,
        ``hgvs_pro`` and ``select`` (in that order).

    Raises:
        ValueError: if the input does not split into exactly six parts.

    NOTE(review): synonymous changes come out as e.g. ``p.Thr2515Thr`` rather
    than the HGVS-recommended ``p.Thr2515=`` -- confirm this is what downstream
    consumers expect.
    """
    fields = id.split(':')
    if len(fields) != 6:
        raise ValueError(f"Expected 6 parts in {id}")
    symbol, transcript, exon, nuc_token, pro_token, select = fields

    # Both tokens are read as <2-char prefix><ref><position><alt>, with the
    # reference and alternate each taken as a single character.
    nuc_ref, nuc_pos, nuc_alt = nuc_token[2], nuc_token[3:-1], nuc_token[-1]
    pro_ref, pro_pos, pro_alt = pro_token[2], pro_token[3:-1], pro_token[-1]

    return {
        "symbol": symbol,
        "transcript": transcript,
        "exon": exon,
        "hgvs_nuc": f"c.{nuc_pos}{nuc_ref}>{nuc_alt}",
        "hgvs_pro": f"p.{one2three(pro_ref)}{pro_pos}{one2three(pro_alt)}",
        "select": select,
    }

In [5]:
# Expand every refseq_id into its parsed components, one column per field,
# aligned on the index of `data`.
parsed_id = pd.DataFrame(data.refseq_id.apply(parse_refseq_id).tolist(), index=data.index)
# Drop parsed columns left over from an earlier run of this cell before
# concatenating, so re-executing the cell replaces them instead of appending
# duplicate columns (which would also be written to the exported CSV).
data = pd.concat(
    [data.drop(columns=parsed_id.columns, errors='ignore'), parsed_id],
    axis=1,
)

In [6]:
# Display the frame with the parsed refseq_id columns appended.
data

Unnamed: 0,Protein Annotation,Wild type Base,Edited Base,CDS,Consequence,SPE Classification,score,id,start,POSITION,REF,ALT,refseq_id,Clinvar_SIG,CADD_phred,Unadjusted Function Score,author_transcript,CHROM,symbol,transcript,exon,hgvs_nuc,hgvs_pro,select
0,T2515T,A,C,7545,synonymous,Functional,0.999369,chr13-32356536-32356537-A-C,32356536,32356537,A,C,BRCA2:NM_000059:exon15:c.A7545C:p.T2515T:Select,,,0.003596,NM_000059,13,BRCA2,NM_000059,exon15,c.7545A>C,p.Thr2515Thr,Select
1,T2515T,A,T,7545,synonymous,Functional,0.995782,chr13-32356536-32356537-A-T,32356536,32356537,A,T,BRCA2:NM_000059:exon15:c.A7545T:p.T2515T:Select,,,0.009437,NM_000059,13,BRCA2,NM_000059,exon15,c.7545A>T,p.Thr2515Thr,Select
2,T2515T,A,G,7545,synonymous,Functional,0.957546,chr13-32356536-32356537-A-G,32356536,32356537,A,G,BRCA2:NM_000059:exon15:c.A7545G:p.T2515T:Select,Likely_benign,,0.108622,NM_000059,13,BRCA2,NM_000059,exon15,c.7545A>G,p.Thr2515Thr,Select
3,S2516T,T,A,7546,missense,Functional,1.006952,chr13-32356537-32356538-T-A,32356537,32356538,T,A,BRCA2:NM_000059:exon15:c.T7546A:p.S2516T:Select,,26.20,-0.016045,NM_000059,13,BRCA2,NM_000059,exon15,c.7546T>A,p.Ser2516Thr,Select
4,S2516P,T,C,7546,missense,Functional,0.960612,chr13-32356537-32356538-T-C,32356537,32356538,T,C,BRCA2:NM_000059:exon15:c.T7546C:p.S2516P:Select,,28.10,0.098578,NM_000059,13,BRCA2,NM_000059,exon15,c.7546T>C,p.Ser2516Pro,Select
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,T1155T,T,C,3465,synonymous,Functional,0.984006,chr13-32337819-32337820-T-C,32337819,32337820,T,C,BRCA2:NM_000059:exon11:c.T3465C:p.T1155T:Select,Likely_benign,12.04,0.032910,NM_000059,13,BRCA2,NM_000059,exon11,c.3465T>C,p.Thr1155Thr,Select
422,T1155T,T,G,3465,synonymous,Functional,1.022418,chr13-32337819-32337820-T-G,32337819,32337820,T,G,BRCA2:NM_000059:exon11:c.T3465G:p.T1155T:Select,,11.75,-0.047259,NM_000059,13,BRCA2,NM_000059,exon11,c.3465T>G,p.Thr1155Thr,Select
423,S1156T,T,A,3466,missense,Functional,1.041031,chr13-32337820-32337821-T-A,32337820,32337821,T,A,BRCA2:NM_000059:exon11:c.T3466A:p.S1156T:Select,,16.72,-0.082471,NM_000059,13,BRCA2,NM_000059,exon11,c.3466T>A,p.Ser1156Thr,Select
424,S1156P,T,C,3466,missense,Functional,1.028343,chr13-32337820-32337821-T-C,32337820,32337821,T,C,BRCA2:NM_000059:exon11:c.T3466C:p.S1156P:Select,,23.60,-0.055343,NM_000059,13,BRCA2,NM_000059,exon11,c.3466T>C,p.Ser1156Pro,Select


In [7]:
# Write the annotated table next to the raw data, without the row index.
data.to_csv(dataset_path.joinpath('scoreset.csv'), index=False)

In [None]:
# Store the UniProt accession for this dataset alongside the score set.
metadata = {'uniprot_acc': 'P51587'}
with (dataset_path / 'metadata.json').open('w') as handle:
    handle.write(json.dumps(metadata))