In [3]:

import pandas as pd
# Allow up to 500 columns to be shown when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)
from pathlib import Path
import json
# Root directory of this dataset; later cells read the raw workbook from it and write outputs into it.
dataset_path = Path("/mnt/i/bio/mave_curation/Erwood_BRCA2_HEK293T/")
# Stop immediately if the dataset directory is not present on this machine.
assert dataset_path.exists()

import sys
# Extend the import path with the parent directory before importing clinvar
# (presumably where the clinvar module lives -- confirm against the repository layout).
sys.path.append("..")
# NOTE(review): getClinvar is not used in the cells visible here.
from clinvar import getClinvar

In [4]:
# Load the 'Supplementary Table 3' sheet of the raw workbook; header=1 takes the column names from row index 1 of the sheet.
data = pd.read_excel(dataset_path / 'raw' / '41587_2021_1201_MOESM3_ESM.xlsx', sheet_name='Supplementary Table 3',header=1)

In [5]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,Protein Annotation,Wild type Base,Edited Base,CDS,Consequence,SPE Classification,Function Score,id,start,end,reference_base,alternate_base,refseq_id,Clinvar_SIG,CADD_phred,Unadjusted Function Score
0,T2515T,A,C,7545,synonymous,Functional,0.999369,chr13-32356536-32356537-A-C,32356536,32356537,A,C,BRCA2:NM_000059:exon15:c.A7545C:p.T2515T:Select,,,0.003596
1,T2515T,A,T,7545,synonymous,Functional,0.995782,chr13-32356536-32356537-A-T,32356536,32356537,A,T,BRCA2:NM_000059:exon15:c.A7545T:p.T2515T:Select,,,0.009437
2,T2515T,A,G,7545,synonymous,Functional,0.957546,chr13-32356536-32356537-A-G,32356536,32356537,A,G,BRCA2:NM_000059:exon15:c.A7545G:p.T2515T:Select,Likely_benign,,0.108622
3,S2516T,T,A,7546,missense,Functional,1.006952,chr13-32356537-32356538-T-A,32356537,32356538,T,A,BRCA2:NM_000059:exon15:c.T7546A:p.S2516T:Select,,26.2,-0.016045
4,S2516P,T,C,7546,missense,Functional,0.960612,chr13-32356537-32356538-T-C,32356537,32356538,T,C,BRCA2:NM_000059:exon15:c.T7546C:p.S2516P:Select,,28.1,0.098578


In [6]:
from Bio.PDB.Polypeptide import one_to_index, index_to_three
def one2three(one):
    """Convert a one-letter amino-acid code to its title-cased three-letter code.

    Parameters
    ----------
    one : str
        One-letter residue code. Both ``"X"`` and ``"*"`` are treated as a
        stop codon.

    Returns
    -------
    str
        ``"Ter"`` for a stop codon, otherwise the three-letter code in title
        case (e.g. ``"Ala"``).

    Notes
    -----
    Any other letter is looked up through Bio.PDB's ``one_to_index`` /
    ``index_to_three``; whatever error that lookup raises for an unrecognised
    letter propagates unchanged.
    """
    # Stop codons have no entry in the residue lookup, so map them by hand.
    # "*" is accepted alongside "X" because both notations denote a stop.
    if one in ("X", "*"):
        return "Ter"
    return index_to_three(one_to_index(one)).title()
    
def parse_refseq_id(id):
    """Split a colon-delimited RefSeq annotation into HGVS-style fields.

    The expected layout is
    ``symbol:transcript:exon:c.<ref><pos><alt>:p.<ref><pos><alt>:select``,
    for example ``BRCA2:NM_000059:exon15:c.T7546C:p.S2516P:Select``.

    Parameters
    ----------
    id : str
        Annotation string with exactly six ``:``-separated parts.

    Returns
    -------
    dict
        Keys ``symbol``, ``transcript``, ``exon``, ``hgvs_nuc``
        (``c.<pos><ref>><alt>``), ``hgvs_pro`` (``p.<Ref3><pos><Alt3>``)
        and ``select``.

    Raises
    ------
    ValueError
        If the string does not split into six parts, or if the coding /
        protein field is not ``c.`` / ``p.`` followed by at least a reference
        letter, a position and an alternate letter.

    Notes
    -----
    Synonymous changes are written with the residue repeated
    (e.g. ``p.Thr2515Thr``); they are not collapsed to the ``=`` form.
    """
    try:
        symbol, transcript, exon, dna_variant, protein_variant, select = id.split(':')
    except ValueError as err:
        raise ValueError(f"Expected 6 parts in {id}") from err

    # Both variant fields are sliced by position below, so a field without the
    # expected prefix or without a position would silently produce a garbage
    # HGVS string. Reject such input up front instead.
    for prefix, variant in (("c.", dna_variant), ("p.", protein_variant)):
        if not variant.startswith(prefix) or len(variant) < len(prefix) + 3:
            raise ValueError(
                f"Expected '{prefix}<ref><position><alt>' but got {variant!r} in {id}"
            )

    # c.<ref><pos><alt>  ->  c.<pos><ref>><alt>
    dna_ref, transcript_pos, dna_alt = dna_variant[2], dna_variant[3:-1], dna_variant[-1]
    hgvs_nuc = f"c.{transcript_pos}{dna_ref}>{dna_alt}"

    # p.<ref><pos><alt> with one-letter residues  ->  three-letter residues
    protein_ref, protein_pos, protein_alt = protein_variant[2], protein_variant[3:-1], protein_variant[-1]
    hgvs_pro = f"p.{one2three(protein_ref)}{protein_pos}{one2three(protein_alt)}"

    return dict(symbol=symbol, transcript=transcript, exon=exon, hgvs_nuc=hgvs_nuc, hgvs_pro=hgvs_pro, select=select)

In [7]:
# Parse every refseq_id into its component fields, one row per variant, aligned on data's index.
parsed_id = pd.DataFrame(data["refseq_id"].apply(parse_refseq_id).tolist(), index=data.index)
# Drop any parsed columns left over from an earlier run of this cell first, so that
# re-running the cell replaces them instead of appending duplicate columns.
data = pd.concat([data.drop(columns=parsed_id.columns, errors="ignore"), parsed_id], axis=1)

In [12]:
# Mean 'Function Score' over all rows that share the same hgvs_pro value,
# returned as a two-column frame (hgvs_pro, score) with a fresh integer index.
hgvs_pro_scoreset = (
    data.groupby("hgvs_pro")['Function Score']
    .mean()
    .rename('score')
    .reset_index()
)

In [13]:
# Display the mean score per hgvs_pro value.
hgvs_pro_scoreset

Unnamed: 0,hgvs_pro,score
0,p.Ala1896Ala,1.105465
1,p.Ala1896Glu,1.116942
2,p.Ala1896Gly,1.064648
3,p.Ala1896Val,1.087023
4,p.Ala392Ala,1.018658
...,...,...
327,p.Val2652Val,1.028852
328,p.Val388Ala,1.007652
329,p.Val388Glu,1.027413
330,p.Val388Gly,1.022345


In [14]:
# Write the (hgvs_pro, score) table to scoreset.csv in the dataset directory, without the index column.
hgvs_pro_scoreset.to_csv(dataset_path / 'scoreset.csv',index=False)

In [None]:
# Write the full variant table (`data`, all columns) to hgvs_nuc_scoreset.csv, without the index column.
data.to_csv(dataset_path / 'hgvs_nuc_scoreset.csv',index=False)

In [None]:
# Store the dataset's uniprot_acc value as metadata.json in the dataset directory.
metadata = {'uniprot_acc': 'P51587'}
with (dataset_path / 'metadata.json').open('w') as handle:
    handle.write(json.dumps(metadata))

In [16]:
import pandas as pd
# Read the score set back from the same dataset_path location the notebook wrote it to,
# rather than repeating the absolute path, so the two cannot drift apart.
ss = pd.read_csv(dataset_path / 'scoreset.csv')

In [17]:
# Display the score set as read back from disk.
ss

Unnamed: 0,hgvs_pro,score
0,p.Ala1896Ala,1.105465
1,p.Ala1896Glu,1.116942
2,p.Ala1896Gly,1.064648
3,p.Ala1896Val,1.087023
4,p.Ala392Ala,1.018658
...,...,...
327,p.Val2652Val,1.028852
328,p.Val388Ala,1.007652
329,p.Val388Glu,1.027413
330,p.Val388Gly,1.022345
