In [1]:
import pandas as pd
import joblib
from data_processing.main import read_scoreset
from tqdm import tqdm
from pathlib import Path
from data_processing.mapping_nbs import mapping_utils
from data_processing.mapping_nbs import plotting
# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

# Name of the dataset processed by this notebook; also the name of its data directory.
dataset_name = "SCN5A"
# NOTE(review): absolute, machine-specific location; adjust when running elsewhere.
dataset_path = Path("/data/dzeiberg/mave_calibration/data/") / dataset_name
# Fail early and name the missing directory instead of raising a bare AssertionError.
assert dataset_path.exists(), f"dataset directory not found: {dataset_path}"

In [2]:
# Load the shared reference tables used by the annotation steps below
# (produced by mapping_utils.load_mapping_data; their contents are not visible in this notebook).
# NOTE(review): spliceAI_df is not referenced again in the visible cells.
metadata, gene_info, gnomAD_df, spliceAI_df, clinvar_df = mapping_utils.load_mapping_data()

In [4]:
# Read this dataset's score set from scoreset.csv inside the dataset directory.
scoreset = read_scoreset(dataset_path / "scoreset.csv")

In [5]:
scoreset

Unnamed: 0,mutation,mutation_type,aa_num,wt_allele,mut_allele,score,dms_se,dms_95ci,dms_95ci_lower,dms_95ci_upper,class,author_transcript,hgvs_pro,synonymous,nonsense
0,L1621X,nonsense,1621,L,X,-11.5,3.4,6.6,-18.1,-4.9,LOF,ENST00000333535,p.Leu1621Ter,False,True
1,F1622X,nonsense,1622,F,X,14.2,1.5,2.9,11.3,17.1,LOF,ENST00000333535,p.Phe1622Ter,False,True
2,R1623X,nonsense,1623,R,X,-0.8,2.3,4.5,-5.2,3.7,LOF,ENST00000333535,p.Arg1623Ter,False,True
3,V1624X,nonsense,1624,V,X,25.5,3.7,7.2,18.3,32.6,LOF,ENST00000333535,p.Val1624Ter,False,True
4,I1625X,nonsense,1625,I,X,-6.3,2.7,5.2,-11.5,-1.1,LOF,ENST00000333535,p.Ile1625Ter,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
243,R1632Q,nonsynonymous,1632,R,Q,61.0,11.5,22.6,38.4,83.5,possiblyLOF,ENST00000333535,p.Arg1632Gln,False,False
244,R1632S,nonsynonymous,1632,R,S,117.6,8.5,16.6,101.0,134.3,possiblyWT,ENST00000333535,p.Arg1632Ser,False,False
245,R1632T,nonsynonymous,1632,R,T,115.2,6.1,12.0,103.2,127.3,possiblyWT,ENST00000333535,p.Arg1632Thr,False,False
246,R1632V,nonsynonymous,1632,R,V,88.1,10.2,20.0,68.1,108.1,possiblyWT,ENST00000333535,p.Arg1632Val,False,False


In [20]:
# Resolve the transcript the scores were reported against, without its version suffix.
# Use the first non-missing author-provided transcript; when the column is absent or
# entirely missing, fall back to the 'MANE_RefSeq_nuc' entry of gene_info for this dataset.
# (Taking .iloc[0] directly would raise AttributeError when the first row is NaN while
# later rows do carry a transcript.)
if "author_transcript" in scoreset.columns and scoreset.author_transcript.notna().any():
    AUTHOR_TRANSCRIPT = scoreset.author_transcript.dropna().iloc[0].split(".")[0]
else:
    AUTHOR_TRANSCRIPT = gene_info.loc[metadata.loc[dataset_name],'MANE_RefSeq_nuc'].values[0].split(".")[0]

In [13]:
# Tab-separated transcript mapping table; the cells below read its `stable_id` and
# `display_label` columns to translate an Ensembl transcript id.
# NOTE(review): absolute, machine-specific path.
translation_table  = pd.read_csv("/data/dzeiberg/mave_calibration/cache/transcript_mapping_table.tsv",sep="\t")

In [22]:
AUTHOR_TRANSCRIPT

'ENST00000333535'

In [21]:
translation_table[translation_table.stable_id == AUTHOR_TRANSCRIPT].display_label

70097    NM_198056.3
Name: display_label, dtype: object

In [16]:
# An Ensembl transcript id (ENST...) is replaced by the display label of its row in the
# translation table, with the version suffix removed. Re-running the cell is harmless:
# once translated, the id no longer starts with "ENST".
if AUTHOR_TRANSCRIPT.startswith("ENST"):
    AUTHOR_TRANSCRIPT = (
        translation_table
        .loc[translation_table.stable_id == AUTHOR_TRANSCRIPT, "display_label"]
        .values[0]
        .split(".")[0]
    )

In [17]:
AUTHOR_TRANSCRIPT

'NM_198056'

# Add ClinVar annotations

In [18]:
# Summarise the ClinVar records for the resolved transcript. The result is joined on its
# index against the scoreset's hgvs_pro values in the next cell.
clinvar_hgvs_pro_summaries = mapping_utils.get_clinvar_summaries(clinvar_df,AUTHOR_TRANSCRIPT)

0it [00:00, ?it/s]


In [8]:
# Left-join the ClinVar summaries onto the scoreset, keyed by hgvs_pro on both sides.
# Every scoreset row is kept; `validate` makes the join fail if either side has a
# duplicated key. Overlapping ClinVar column names get the "_clinvar" suffix.
scoreset_w_clinvar = scoreset.set_index("hgvs_pro").merge(
    clinvar_hgvs_pro_summaries,
    how="left",
    left_index=True,
    right_index=True,
    suffixes=("", "_clinvar"),
    validate="one_to_one",
)

# Add gnomAD annotations

In [9]:
# Translate the RefSeq transcript back to Ensembl transcript stable id(s); these are passed
# to the gnomAD lookup below.
Ensembl_transcript_stable_ids = mapping_utils.translate_refseq_to_ensembl(AUTHOR_TRANSCRIPT)

In [None]:
Ensembl_transcript_stable_ids

In [11]:
# Collect gnomAD annotations for the Ensembl transcript id(s) and the RefSeq transcript
# (the latter passed as a one-element list). The result is joined on its index in the next cell.
gnomAD_info = mapping_utils.gather_gnomAD_info(gnomAD_df,Ensembl_transcript_stable_ids,[AUTHOR_TRANSCRIPT,])

In [12]:
# Left-join the gnomAD annotations onto the ClinVar-annotated scoreset (both keyed by their
# index), enforcing unique keys on both sides, then turn the index back into a column.
scoreset_processed = (
    scoreset_w_clinvar
    .merge(
        gnomAD_info,
        how="left",
        left_index=True,
        right_index=True,
        validate="one_to_one",
    )
    .reset_index()
)

In [None]:
scoreset_processed

In [14]:
# Persist the annotated scoreset next to the raw one so later cells can start from the file.
scoreset_processed.to_csv(dataset_path / "scoreset_processed.csv",index=False)

# Reload Processed

In [8]:
# Reload the annotated scoreset written above; this rebinds `scoreset_processed`.
# NOTE(review): a CSV round trip re-infers column dtypes — confirm the boolean columns
# (`synonymous`, `nonsense`) still load as booleans.
scoreset_processed = pd.read_csv(dataset_path / "scoreset_processed.csv")

In [15]:
def conflicting_interpretations(r):
    """
    Tell whether a record mixes incompatible ClinVar classifications.

    A record conflicts when it has at least one P/LP or B/LB count together with
    a count from a different category, i.e. any of:
    P/LP and B/LB ; P/LP and VUS ; B/LB and VUS ; P/LP and conflicting ; B/LB and conflicting.
    If data is mapped at the protein level, this could be a result of different RNA substitutions.
    If data is mapped at the RNA level, this is a true conflict.

    Parameters
    ----------
    r : pd.Series
        A record exposing `num_p_lp`, `num_b_lb`, `num_VUS` and `num_conflicting`.
        Missing (NaN) counts compare as "not greater than zero".

    Returns
    -------
    bool
        True if there are conflicting interpretations, False otherwise
    """
    has_p_lp = r.num_p_lp > 0
    has_b_lb = r.num_b_lb > 0
    # Both definitive classes present: conflict regardless of the other counts.
    if has_p_lp and has_b_lb:
        return True
    # Neither definitive class present: VUS/conflicting alone are not counted as a conflict here.
    if not (has_p_lp or has_b_lb):
        return False
    # Exactly one definitive class: conflict only when accompanied by VUS or "conflicting".
    return r.num_VUS > 0 or r.num_conflicting > 0


def is_pathogenic(r, spliceAI_threshold=.5):
    """
    Tell whether a record qualifies for the P/LP sample.

    Parameters
    ----------
    r : pd.Series
        A record exposing `num_p_lp`, `clinvar_spliceAI_max` and the count fields
        read by `conflicting_interpretations`.
    spliceAI_threshold : float, optional
        Largest accepted `clinvar_spliceAI_max`; defaults to the previously
        hard-coded 0.5.

    Returns
    -------
    bool
        True when the record has at least one P/LP count, no conflicting
        interpretations, and `clinvar_spliceAI_max` at or below the threshold.
        A missing (NaN) count or score makes the result False.
    """
    return r.num_p_lp > 0 and not conflicting_interpretations(r) and r.clinvar_spliceAI_max <= spliceAI_threshold

def is_benign(r, spliceAI_threshold=.5):
    """
    Tell whether a record qualifies for the B/LB sample.

    Parameters
    ----------
    r : pd.Series
        A record exposing `num_b_lb`, `clinvar_spliceAI_max` and the count fields
        read by `conflicting_interpretations`.
    spliceAI_threshold : float, optional
        Largest accepted `clinvar_spliceAI_max`; defaults to the previously
        hard-coded 0.5.

    Returns
    -------
    bool
        True when the record has at least one B/LB count, no conflicting
        interpretations, and `clinvar_spliceAI_max` at or below the threshold.
        A missing (NaN) count or score makes the result False.
    """
    return r.num_b_lb > 0 and not conflicting_interpretations(r) and r.clinvar_spliceAI_max <= spliceAI_threshold

def is_vus(r):
    """Return True when the record has at least one VUS count (`num_VUS` above zero)."""
    return 0 < r.num_VUS

def is_conflicting(r):
    """Return True when the record has at least one "conflicting" count (`num_conflicting` above zero)."""
    return 0 < r.num_conflicting

def is_gnomAD(r, spliceAI_threshold=.5):
    """
    Tell whether a record qualifies for the gnomAD sample.

    Parameters
    ----------
    r : pd.Series
        A record exposing `gnomAD_variants_maxAC_AF` and
        `gnomAD_variants_max_spliceAI_score`.
    spliceAI_threshold : float, optional
        Largest accepted `gnomAD_variants_max_spliceAI_score`; defaults to the
        previously hard-coded 0.5.

    Returns
    -------
    bool
        True when `gnomAD_variants_maxAC_AF` is above zero and the SpliceAI score
        is at or below the threshold. A missing (NaN) value in either field makes
        the result False.
    """
    return r.gnomAD_variants_maxAC_AF > 0 and r.gnomAD_variants_max_spliceAI_score <= spliceAI_threshold

def is_synonymous(r, spliceAI_threshold=.5):
    """
    Tell whether a record qualifies for the synonymous sample.

    Parameters
    ----------
    r : pd.Series
        A record exposing `synonymous`, `num_p_lp` and `clinvar_spliceAI_max`.
    spliceAI_threshold : float, optional
        Largest accepted `clinvar_spliceAI_max`; defaults to the previously
        hard-coded 0.5.

    Returns
    -------
    bool
        True when the record is flagged synonymous, has a P/LP count of exactly
        zero, and `clinvar_spliceAI_max` is at or below the threshold.

    Notes
    -----
    NOTE(review): a missing (NaN) `num_p_lp` or `clinvar_spliceAI_max` makes the
    result False, so synonymous variants without those annotations are excluded.
    Confirm this is intended.
    """
    return r.synonymous and r.num_p_lp == 0 and r.clinvar_spliceAI_max <= spliceAI_threshold



In [19]:
# Row-level membership test for each sample. Nonsense variants are excluded from the
# P/LP, B/LB and gnomAD samples; the synonymous sample uses its predicate as-is.
sample_predicates = {
    "P/LP": lambda r: not r.nonsense and is_pathogenic(r),
    "B/LB": lambda r: not r.nonsense and is_benign(r),
    'gnomAD': lambda r: not r.nonsense and is_gnomAD(r),
    'synonymous': is_synonymous,
}

# Select the rows of each sample, keeping only samples that contain at least one row.
sample_data = {}
for sample_name, predicate in sample_predicates.items():
    selected = scoreset_processed[scoreset_processed.apply(predicate, axis=1)]
    if len(selected):
        sample_data[sample_name] = selected

# Set to True to flip the sign of every score in every sample.
INVERT_SCORES = False
if INVERT_SCORES:
    sample_data = {name: rows.assign(score=-rows.score) for name, rows in sample_data.items()}

In [None]:
# Report how many rows ended up in each sample.
for sample_name, sample in sample_data.items():
    print(f"{sample_name}: {sample.shape[0]}")

In [None]:
# Plot the scores of each sample; plot_samples receives a dict mapping sample name to
# that sample's array of scores.
plotting.plot_samples({k : v.score.values for k,v in sample_data.items()})

In [22]:
# Write one (sample_name, score) row per variant to samples.csv.
# Iterate over whatever samples exist: `sample_data` drops empty samples, so indexing it
# with a fixed label such as 'P/LP' raises KeyError whenever that sample is empty.
# Dict order keeps the rows grouped as P/LP, B/LB, gnomAD, synonymous, and the explicit
# column list keeps the header in place even when there are no rows at all.
pd.DataFrame(
    [
        dict(sample_name=sample_name, score=score)
        for sample_name, sample in sample_data.items()
        for score in sample.score.values
    ],
    columns=["sample_name", "score"],
).to_csv(dataset_path / "samples.csv", index=False)

In [23]:
# Load the previously saved variant lists (its 'p_lp' and 'b_lb' entries are read below)
# to compare against the samples built in this run.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only
# load files from a trusted source.
old = joblib.load(dataset_path / "hgvs_pro.pkl")

In [None]:
# ClinVar summaries of variants that were in the previously saved P/LP list but are not in
# the current P/LP sample. `sample_data` drops empty samples, so a missing 'P/LP' entry is
# treated as an empty sample instead of raising KeyError.
clinvar_hgvs_pro_summaries[clinvar_hgvs_pro_summaries.index.isin(
    set(old['p_lp'].values)
    - set(sample_data['P/LP'].hgvs_pro.values if 'P/LP' in sample_data else ())
)]

In [None]:
# ClinVar summaries of variants that were in the previously saved B/LB list but are not in
# the current B/LB sample. `sample_data` drops empty samples, so a missing 'B/LB' entry is
# treated as an empty sample instead of raising KeyError.
clinvar_hgvs_pro_summaries[clinvar_hgvs_pro_summaries.index.isin(
    set(old['b_lb'].values)
    - set(sample_data['B/LB'].hgvs_pro.values if 'B/LB' in sample_data else ())
)]