# All missense variants in dbNSFP 5.2 for precomputed scores

In [None]:
import pandas as pd
from tqdm import tqdm

# Chromosome labels used in the dbNSFP file names: 1-22, then X and Y.
chromosomes = [*range(1, 23), 'X', 'Y']
# chromosomes = ['M']

# The 20 one-letter residue codes accepted as a missense reference/alternate.
amino_acid = set('LVKTRIHDYSQAGCNEPMFW')

# dbNSFP columns kept after loading: variant and transcript annotation,
# then the ClinVar fields, then one column per prediction/conservation score.
columns_preserve = (
    [
        '#chr', 'pos(1-based)', 'ref', 'alt',
        'aaref', 'aaalt', 'rs_dbSNP',
        'hg19_chr', 'hg19_pos(1-based)',
        'aapos', 'genename',
        'Ensembl_geneid', 'Ensembl_transcriptid', 'Ensembl_proteinid',
        'Uniprot_acc', 'Uniprot_entry',
        'HGVSc_snpEff', 'HGVSp_snpEff', 'HGVSc_VEP', 'HGVSp_VEP',
        'MANE', 'cds_strand', 'refcodon', 'codonpos',
    ]
    + ['clinvar_id', 'clinvar_clnsig', 'clinvar_review']
    + [
        'SIFT_score',
        'SIFT4G_score',
        'Polyphen2_HDIV_score',
        'Polyphen2_HVAR_score',
        'MutationTaster_score',
        'MutationAssessor_score',
        'PROVEAN_score',
        'VEST4_score',
        'MetaSVM_score',
        'MetaLR_score',
        'MetaRNN_score',
        'M-CAP_score',
        'REVEL_score',
        'MutPred2_score',
        'MVP_score',
        'gMVP_score',
        'MPC_score',
        'PrimateAI_score',
        'DEOGEN2_score',
        'BayesDel_noAF_score',
        'ClinPred_score',
        'LIST-S2_score',
        'VARITY_R_score',
        'ESM1b_score',
        'AlphaMissense_score',
        'PHACTboost_score',
        'MutFormer_score',
        'MutScore_score',
        'Aloft_prob_Tolerant',
        'CADD_phred',
        'DANN_score',
        'fathmm-XF_coding_score',
        'Eigen-PC-raw_coding',
        'GERP++_RS',
        'GERP_91_mammals',
        'phyloP100way_vertebrate',
        'phyloP470way_mammalian',
        'phyloP17way_primate',
        'phastCons100way_vertebrate',
        'phastCons470way_mammalian',
        'phastCons17way_primate',
        'bStatistic',
    ]
)

# Export layout for hg38: the four coordinate/allele keys, then every score.
hg38_column = [
    '#chr', 'pos(1-based)', 'ref', 'alt',
    'SIFT_score',
    'SIFT4G_score',
    'Polyphen2_HDIV_score',
    'Polyphen2_HVAR_score',
    'MutationTaster_score',
    'MutationAssessor_score',
    'PROVEAN_score',
    'VEST4_score',
    'MetaSVM_score',
    'MetaLR_score',
    'MetaRNN_score',
    'M-CAP_score',
    'REVEL_score',
    'MutPred2_score',
    'MVP_score',
    'gMVP_score',
    'MPC_score',
    'PrimateAI_score',
    'DEOGEN2_score',
    'BayesDel_noAF_score',
    'ClinPred_score',
    'LIST-S2_score',
    'VARITY_R_score',
    'ESM1b_score',
    'AlphaMissense_score',
    'PHACTboost_score',
    'MutFormer_score',
    'MutScore_score',
    'Aloft_prob_Tolerant',
    'CADD_phred',
    'DANN_score',
    'fathmm-XF_coding_score',
    'Eigen-PC-raw_coding',
    'GERP++_RS',
    'GERP_91_mammals',
    'phyloP100way_vertebrate',
    'phyloP470way_mammalian',
    'phyloP17way_primate',
    'phastCons100way_vertebrate',
    'phastCons470way_mammalian',
    'phastCons17way_primate',
    'bStatistic',
]

# The hg19 layout swaps in the hg19 coordinate keys and reuses the same scores.
hg19_column = ['hg19_chr', 'hg19_pos(1-based)', 'ref', 'alt'] + hg38_column[4:]

# Per genome build: (columns to export, keys that identify one variant).
group_sets = dict(
    hg38=(hg38_column, hg38_column[:4]),
    hg19=(hg19_column, hg19_column[:4]),
)

In [None]:
# Get all missense variants from dbNSFP

import pandas as pd
from tqdm import tqdm

# For every chromosome: load the dbNSFP variant table, keep the missense rows,
# and write one merged, tab-separated table per genome build in `group_sets`.
for chr_num in chromosomes:
    tqdm.write(f"Chr {chr_num} started")
    
    file_path = f'/gpfs/home/pl2948/VariantInterpretation/dbNSFP/dbNSFP5.2a_variant.chr{chr_num}.gz'
    df = pd.read_csv(file_path, compression='gzip', sep='\t', low_memory=False)
    df = df[columns_preserve]

    # Missense filter: both residues must be in `amino_acid` and must differ.
    df = df[(
        (df['aaref'].isin(amino_acid)) & 
        (df['aaalt'].isin(amino_acid)) & 
        (df['aaalt']!=df['aaref'])
    )].reset_index(drop=True)
    
    for ver, (column_set, group_keys) in group_sets.items():
        df_ver = df[column_set].copy()
        # Drop rows whose chromosome or position is the '.' placeholder, then
        # cast the position to int so the sort below orders it numerically.
        df_ver = df_ver[(df_ver[group_keys[0]]!= '.') & (df_ver[group_keys[1]]!= '.')]
        df_ver[group_keys[1]] = df_ver[group_keys[1]].astype(int)
        
        # After sorting, rows that share all group keys are adjacent.
        df_ver_sorted = df_ver.sort_values(group_keys).reset_index(drop=True)

        merged_rows = []
        prev_key = None
        current_row = None

        # Single pass over the sorted rows: the first row of each run of equal
        # keys is kept, and the non-key values of every later row in the run
        # are appended to it, separated by ';'.
        for _, row in tqdm(df_ver_sorted.iterrows(), total=len(df_ver_sorted), desc=f"Processing chr{chr_num} {ver}"):
            key = tuple(row[col] for col in group_keys)
            if key != prev_key:
                if current_row is not None:
                    merged_rows.append(current_row)
                current_row = row.copy()
                prev_key = key
            else:
                for col in df_ver.columns:
                    if col not in group_keys:
                        val = str(row[col])
                        current_row[col] = f"{current_row[col]};{val}"

        # Flush the last run, which the loop above never appends.
        if current_row is not None:
            merged_rows.append(current_row)

        merged_df = pd.DataFrame(merged_rows)

        # Add an End column equal to the position and rename the key columns
        # to '#Chr', 'Start', 'Ref', 'Alt' for the selected build.
        if ver == 'hg38':
            merged_df = merged_df.reset_index(drop=True)
            merged_df['End'] = merged_df['pos(1-based)']
            merged_df.rename(columns={'#chr': '#Chr', 'pos(1-based)': 'Start', 'ref': 'Ref', 'alt': 'Alt'}, inplace=True)
        else:
            merged_df = merged_df.reset_index(drop=True)
            merged_df['End'] = merged_df['hg19_pos(1-based)']
            merged_df.rename(columns={'hg19_chr': '#Chr', 'hg19_pos(1-based)': 'Start', 'ref': 'Ref', 'alt': 'Alt'}, inplace=True)

        # Put the five coordinate/allele columns first, scores after them.
        front_columns = ['#Chr', 'Start', 'End', 'Ref', 'Alt']
        other_columns = [col for col in merged_df.columns if col not in front_columns]
        merged_df = merged_df[front_columns + other_columns]

        # NOTE(review): the output file is named after the file's chromosome
        # (chr_num) for both builds, even though the hg19 table is keyed by
        # hg19_chr — confirm the two always agree.
        merged_df.to_csv(
            f"/gpfs/home/pl2948/VariantInterpretation/dbNSFP/{ver}_missense_dbNSFP_chr{chr_num}.csv",
            sep='\t', 
            index=False
        )
        display(merged_df)

    tqdm.write(f"Chr {chr_num} completed")

In [None]:
# Get the mean value across all transcripts for each genomic coordinate

import pandas as pd
from tqdm import tqdm
tqdm.pandas()

# Chromosome labels used in the per-chromosome file names.
chromosomes = [*range(1, 23), 'X', 'Y']

# Score columns that are averaged across the ';'-joined values.
tool_list = [
    'SIFT_score',
    'SIFT4G_score',
    'Polyphen2_HDIV_score',
    'Polyphen2_HVAR_score',
    'MutationAssessor_score',
    'PROVEAN_score',
    'VEST4_score',
    'REVEL_score',
    'MutPred2_score',
    'MPC_score',
    'PrimateAI_score',
    'BayesDel_noAF_score',
    'LIST-S2_score',
    'ESM1b_score',
    'AlphaMissense_score',
    'CADD_phred',
    'DANN_score',
    'Eigen-PC-raw_coding',
    'GERP++_RS',
    'GERP_91_mammals',
    'phyloP100way_vertebrate',
    'phyloP470way_mammalian',
    'phyloP17way_primate',
    'phastCons100way_vertebrate',
    'phastCons470way_mammalian',
    'phastCons17way_primate',
    'bStatistic',
]

# Columns kept in the output: coordinates and alleles, then the scores.
column_list = ['#Chr', 'Start', 'End', 'Ref', 'Alt', *tool_list]

def process_and_convert_column(col):
    """Collapse every cell of a score column to one number, or "." if none.

    Args:
        col: a pandas Series whose cells are numbers, numeric strings,
            ';'-separated lists of numeric strings, or placeholders.

    Returns:
        A Series of the same length. A cell that parses as a float is returned
        as that float. A ';'-separated cell becomes the arithmetic mean of its
        entries, ignoring '.' and empty entries. Every other cell (including a
        ';'-separated cell with a non-numeric entry or no usable entry)
        becomes the string ".".
    """
    def process_cell(cell):
        # Plain numbers and numeric strings pass straight through. Only the
        # two errors float() raises for bad input are caught, so unrelated
        # failures (e.g. KeyboardInterrupt) are no longer swallowed.
        try:
            return float(cell)
        except (TypeError, ValueError):
            pass

        if isinstance(cell, str) and ";" in cell:
            try:
                values = [float(x) for x in cell.split(";") if x != '.' and x != '']
            except ValueError:
                # One unparsable entry invalidates the whole cell.
                return "."
            if values:
                return sum(values) / len(values)
        return "."
    return col.apply(process_cell)


# For each chromosome and genome build: read the tab-separated merged table,
# keep only the columns in `column_list`, collapse every tool column with
# process_and_convert_column, and write the result (comma-separated, the
# to_csv default) to the P_KNN_input directory.
for chr_num in chromosomes:
    print('Chromosome',chr_num,'start')
    for ver in ['hg38', 'hg19']:
        chromosome_df = pd.read_csv(f"/gpfs/home/pl2948/VariantInterpretation/dbNSFP/{ver}_missense_dbNSFP_chr{chr_num}.csv", sep='\t', low_memory=False)
        chromosome_df = chromosome_df[column_list]
        # display(chromosome_df)
        # progress_apply is provided by tqdm.pandas() and applies the function column by column.
        chromosome_df[tool_list] = chromosome_df[tool_list].progress_apply(process_and_convert_column)
        chromosome_df.to_csv(f"/gpfs/home/pl2948/VariantInterpretation/P_KNN_input/{ver}_missense_dbNSFP_chr{chr_num}.csv", index=False)
        display(chromosome_df)
        print(ver, 'complete')

# Missense VUS in ClinVar dbNSFP5.1

In [None]:
import pandas as pd

# Running totals of ClinVar benign / pathogenic rows across all chromosomes.
pathogenic_count = 0
benign_count = 0

chromosomes = list(range(1, 23)) + ['X', 'Y']
pathogenic = ['Pathogenic', 'Likely_pathogenic', 'Pathogenic/Likely_pathogenic']
benign = ['Benign', 'Likely_benign', 'Benign/Likely_benign']
# clinvar_clnsig values to retain: the two groups above plus VUS.
clinvar_annotation = pathogenic + benign + ['Uncertain_significance']

# Per-chromosome subsets are collected here and concatenated once at the end;
# concatenating inside the loop would re-copy the growing frame every time.
clinvar_frames = []

# The loop variable is `chrom`, not `chr`, so the builtin chr() is not shadowed.
for chrom in chromosomes:
    file_path = f'/gpfs/home/pl2948/VariantInterpretation/dbNSFP/dbNSFP5.1a_variant.chr{chrom}.gz'
    df = pd.read_csv(file_path, compression='gzip', sep='\t', low_memory=False)

    chr_benign = df['clinvar_clnsig'].isin(benign).sum()
    chr_pathogenic = df['clinvar_clnsig'].isin(pathogenic).sum()
    print(f'Chr: {chrom:<10}, Benign: {chr_benign:<10}, Pathogenic: {chr_pathogenic:<10}')

    benign_count += chr_benign
    pathogenic_count += chr_pathogenic

    clinvar_frames.append(df[df['clinvar_clnsig'].isin(clinvar_annotation)])

clinvar_dbNSFP = pd.concat(clinvar_frames, ignore_index=True)

print(f'Total: Benign: {benign_count:<10}, Pathogenic: {pathogenic_count:<10}')

display(clinvar_dbNSFP)

In [None]:
# Save the collected ClinVar-annotated rows as a gzip-compressed CSV (no index column).
clinvar_dbNSFP.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a.csv.gz", 
                      index=False, compression='gzip')

## Filter missense VUS

In [None]:
import pandas as pd

# Reload the saved ClinVar-annotated table so later cells can start from disk.
clinvar_dbNSFP = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a.csv.gz", 
                            compression='gzip', low_memory=False)

display(clinvar_dbNSFP)

In [None]:
# Print the column names ten at a time. The range is derived from the actual
# column count, so no column is skipped and no empty slice is printed.
for i in range(0, len(clinvar_dbNSFP.columns), 10):
    print(clinvar_dbNSFP.columns[i:i+10])

In [None]:
# Columns retained for the VUS table: variant/transcript annotation, the
# ClinVar fields, ten score columns, and one gnomAD allele-frequency column.
columns_to_keep = ['#chr', 'pos(1-based)', 'ref', 'alt', 'aaref', 'aaalt', 'rs_dbSNP', 'hg19_chr', 'hg19_pos(1-based)',
                   'aapos', 'genename', 'Ensembl_geneid', 'Ensembl_transcriptid', 'Ensembl_proteinid', 'MANE', 
                   'clinvar_id', 'clinvar_clnsig', 'clinvar_review',
                   'SIFT_score', 'fathmm-XF_coding_score', 'VEST4_score', 'REVEL_score', 
                   'GERP++_RS', 'phyloP100way_vertebrate', 'BayesDel_noAF_score', 
                   'Polyphen2_HVAR_score', 'MPC_score', 'PrimateAI_score', 'gnomAD2.1.1_exomes_controls_AF']

In [None]:
# Narrow the ClinVar-annotated table to missense variants of uncertain
# significance, printing the row count after each filtering step.
clinvar_dbNSFP_VUS = clinvar_dbNSFP[columns_to_keep]

clinvar_dbNSFP_VUS = clinvar_dbNSFP_VUS[clinvar_dbNSFP_VUS['clinvar_clnsig']=='Uncertain_significance']
print('VUS', len(clinvar_dbNSFP_VUS))

# remove splice: rows with no amino-acid annotation ('.') on either side
clinvar_dbNSFP_VUS = clinvar_dbNSFP_VUS[(clinvar_dbNSFP_VUS['aaalt']!='.') & (clinvar_dbNSFP_VUS['aaref']!='.')]
print('VUS, exclude splice', len(clinvar_dbNSFP_VUS))

# remove nonsense: a row is dropped when EITHER residue is the stop symbol 'X'.
# The two tests must be combined with `&`; with `|` only rows where both
# residues are 'X' were dropped, so stop-gain and stop-loss variants survived.
clinvar_dbNSFP_VUS = clinvar_dbNSFP_VUS[(clinvar_dbNSFP_VUS['aaalt']!='X') & (clinvar_dbNSFP_VUS['aaref']!='X')]
print('VUS, exclude splice, exclude nonsense', len(clinvar_dbNSFP_VUS))

# remove synonymous: reference and alternate residue are identical
clinvar_dbNSFP_VUS = clinvar_dbNSFP_VUS[(clinvar_dbNSFP_VUS["aaref"] != clinvar_dbNSFP_VUS["aaalt"])]
print('VUS, exclude splice, exclude nonsense, exclude synomymous', len(clinvar_dbNSFP_VUS))

In [None]:
# Save the filtered missense VUS table as a gzip-compressed CSV.
clinvar_dbNSFP_VUS.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv.gz", 
                      index=False, compression='gzip')

## Validate with ClinVar

In [None]:
import pandas as pd

# Load the ClinVar benchmark table used to cross-check the dbNSFP VUS set.
ClinVar_annotation = pd.read_csv('/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/AllClinVarBenchmark_202504.csv.gz', 
                                 compression='gzip')

display(ClinVar_annotation)

In [None]:
# Distinct ClinVar IDs of the dbNSFP VUS rows, cast to int for set comparison.
VUS_NSFP = set(clinvar_dbNSFP_VUS['clinvar_id'].unique().astype(int))

# VariationIDs of benchmark rows with ClinVarName_coding_sequence == 1 and
# ClinVar_annotation == 2 (presumably the VUS code — confirm against the
# benchmark's encoding).
VUS_clinvar = set(ClinVar_annotation[((ClinVar_annotation['ClinVarName_coding_sequence'] == 1) & 
                                      (ClinVar_annotation['ClinVar_annotation'].isin({2})))]['VariationID'].unique())

In [None]:
# Report the size of the dbNSFP VUS ID set and the two set differences
# against the benchmark VUS ID set.
print('VUS in NSFP', len(VUS_NSFP))
print('VUS in NSFP, not in clinvar', len(VUS_NSFP - VUS_clinvar))
print('VUS in clinvar, not in NSFP', len(VUS_clinvar - VUS_NSFP))

# VUS_NSFP - VUS_clinvar
# VUS_clinvar - VUS_NSFP

In [None]:
# Keep only VUS rows whose ClinVar ID also appears in the benchmark VUS set.
# The IDs are cast to int first, matching how VUS_NSFP was built, so the
# membership test cannot silently match nothing if the column holds strings.
clinvar_dbNSFP_VUS = clinvar_dbNSFP_VUS[clinvar_dbNSFP_VUS['clinvar_id'].astype(int).isin(VUS_clinvar)].reset_index(drop=True)
clinvar_dbNSFP_VUS

In [None]:
# Overwrite the VUS file with the benchmark-validated subset.
clinvar_dbNSFP_VUS.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv.gz", 
                      index=False, compression='gzip')

## Expand tuple column and select

In [None]:
# Find the columns that hold ';'-separated values by scanning rows in order
# and stopping at the first row that contains at least one such cell; only
# that row's columns are reported.
count = 0
tuple_column = []

for _, record in clinvar_dbNSFP_VUS.iterrows():
    for column, content in record.items():
        if ';' not in str(content):
            continue
        print(f"{column:<20}{content}")
        tuple_column.append(column)
        count += 1
    if count != 0:
        break

print(count)
print(tuple_column)

In [None]:
# Turn every cell of the tuple columns into a tuple: a string containing ';'
# is split on ';', any other value is wrapped in a one-element tuple.
for col in tuple_column:
    clinvar_dbNSFP_VUS[col] = clinvar_dbNSFP_VUS[col].apply(
        lambda x: tuple(x.split(';')) if isinstance(x, str) and ';' in x else (x,))

display(clinvar_dbNSFP_VUS)

In [None]:
import ast

def convert_to_tuple(value):
    """Parse a Python-literal string into a tuple, or return the input unchanged.

    Args:
        value: typically the text form of a tuple or list, e.g. "('a', 'b')".

    Returns:
        ``tuple(ast.literal_eval(value))`` when the value parses to something
        iterable; otherwise ``value`` itself. Inputs that are not valid
        literals, that are not strings at all, or that parse to a
        non-iterable scalar (such as "0.5") are returned as-is.
    """
    try:
        return tuple(ast.literal_eval(value))
    except (ValueError, SyntaxError, TypeError):
        # TypeError covers literals that parse to a scalar: tuple(0.5) fails.
        return value

# Run convert_to_tuple over every non-missing cell of the tuple columns, then
# renumber the rows 0..n-1.
for col in tuple_column:
    clinvar_dbNSFP_VUS[col] = clinvar_dbNSFP_VUS[col].apply(lambda x: convert_to_tuple(x) if pd.notna(x) else x)

clinvar_dbNSFP_VUS = clinvar_dbNSFP_VUS.reset_index(drop=True)
display(clinvar_dbNSFP_VUS)

In [None]:
from tqdm import tqdm

# Number of output rows per variant: the longest tuple among its tuple columns.
row_lengths = clinvar_dbNSFP_VUS[tuple_column].apply(lambda col: col.map(len)).max(axis=1).tolist()

expanded_rows = []

# Emit one copy of the row per tuple position. In each copy a tuple column
# holds the entry at that position, or None when its tuple is shorter.
for idx, row in tqdm(clinvar_dbNSFP_VUS.iterrows(), total=len(clinvar_dbNSFP_VUS), desc="Processing Rows"):
    # idx (the index label) is used as a list position, so this relies on the
    # frame having a 0..n-1 index.
    length = row_lengths[idx]
    for i in range(length):
        new_row = row.copy()
        for col in tuple_column:
            new_row[col] = row[col][i] if i < len(row[col]) else None
        expanded_rows.append(new_row)

expanded_clinvar_dbNSFP_VUS = pd.DataFrame(expanded_rows)

display(expanded_clinvar_dbNSFP_VUS)

In [None]:
import pandas as pd

# Overwrite the VUS file with the expanded (one row per tuple position) table.
expanded_clinvar_dbNSFP_VUS.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv.gz", 
                      index=False, compression='gzip')

In [None]:
import pandas as pd
# Reload the expanded VUS table from disk.
expanded_clinvar_dbNSFP_VUS = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv.gz", 
                      compression='gzip')
expanded_clinvar_dbNSFP_VUS

In [None]:
# Count the rows whose MANE value is 'Select' and 'Plus_Clinical' respectively.
print('select', sum(expanded_clinvar_dbNSFP_VUS['MANE'] == 'Select'))
print('plus clinical',  sum(expanded_clinvar_dbNSFP_VUS['MANE'] == 'Plus_Clinical'))

In [None]:
import pandas as pd
from tqdm import tqdm

# Score columns inspected when ranking the candidate rows of one variant.
select_column = [
    'SIFT_score',
    'fathmm-XF_coding_score',
    'VEST4_score',
    'REVEL_score',
    'GERP++_RS',
    'phyloP100way_vertebrate',
    'BayesDel_noAF_score',
    'Polyphen2_HVAR_score',
    'MPC_score',
    'PrimateAI_score',
]

# Per row, the number of selected score cells equal to the '.' placeholder.
expanded_clinvar_dbNSFP_VUS['dot_count'] = expanded_clinvar_dbNSFP_VUS[select_column].apply(
    lambda row: sum(1 for value in row.tolist() if value == '.'),
    axis=1,
)

def select_row(group):
    """Pick the single representative row of one group.

    Only rows with the smallest 'dot_count' are considered. If several rows
    tie, a row whose 'MANE' value is 'Select' wins, then one whose value is
    'Plus_Clinical'; when neither is present the first tied row is returned.

    Args:
        group: DataFrame with at least the 'dot_count' and 'MANE' columns.

    Returns:
        The chosen row as a Series.
    """
    fewest_missing = group[group['dot_count'] == group['dot_count'].min()]

    if len(fewest_missing) != 1:
        for mane_label in ('Select', 'Plus_Clinical'):
            preferred = fewest_missing[fewest_missing['MANE'] == mane_label]
            if not preferred.empty:
                return preferred.iloc[0]

    return fewest_missing.iloc[0]
    
# Reduce the expanded table to one row per clinvar_id using select_row,
# with a tqdm progress bar, then renumber the rows.
tqdm.pandas(desc="Processing Rows")
result = expanded_clinvar_dbNSFP_VUS.groupby('clinvar_id').progress_apply(select_row).reset_index(drop=True)

display(result)

In [None]:
# Save the one-row-per-variant table as an uncompressed CSV.
result.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv", 
                index=False)

## Prepare for annovar

In [None]:
# Build the five-column table (Chr, Start, End, Ref, Alt) from the hg19
# coordinates. The explicit .copy() makes annovar_df an independent frame, so
# adding the 'End' column below is not an assignment on a slice of `result`.
annovar_df = result[['hg19_chr', 'hg19_pos(1-based)', 'ref', 'alt']].copy()
annovar_df.columns = ['Chr', 'Start', 'Ref', 'Alt']
# Start and End are the same position for every row.
annovar_df['End'] = annovar_df['Start']
annovar_df = annovar_df[['Chr', 'Start', 'End', 'Ref', 'Alt']]
annovar_df

In [None]:
# Write the table tab-separated with no header and no index.
annovar_df.to_csv('/gpfs/home/pl2948/VariantInterpretation/Data/dbNSFP_VUS.avinput', 
                  sep='\t', 
                  index=False, header=False)

In [None]:
import pandas as pd
# Load the multianno CSV for the VUS set (presumably the ANNOVAR output for
# the .avinput file written above — confirm).
VUS_CADD = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/Data/dbNSFP_VUS_dbnsfp42a.hg19_multianno.csv")
VUS_CADD

In [None]:
# Copy the two annotation columns into `result` by row position (the raw
# arrays are used, so index labels are ignored), then drop the helper column
# and the score column that FATHMM_score replaces.
result['FATHMM_score'] = VUS_CADD['FATHMM_score'].values
result['CADDv1.6_PHRED'] = VUS_CADD['CADD_phred'].values
result = result[[
    *(name for name in result.columns if name not in ('CADDv1.6_PHRED', 'FATHMM_score')),
    'CADDv1.6_PHRED',
    'FATHMM_score',
]]

result = result.drop(columns=['dot_count']).drop(columns=['fathmm-XF_coding_score'])

result

## Exclude zero star and AF >= 0.01

In [None]:
# Inspect the distinct allele-frequency values, treating the '.' placeholder as 0.
result['gnomAD2.1.1_exomes_controls_AF'].replace('.', 0).astype(float).unique()

In [None]:
import numpy as np

# Map each ClinVar review status to a star rating. Statuses missing from this
# table map to NaN and are therefore removed by the `>= 1` filter below.
# NOTE(review): other review-status strings (e.g. practice guideline or
# conflicting classifications) are not listed — confirm they should be dropped.
REVIEW_STATUS_TO_GOLD_STARS = {
    'criteria_provided,_single_submitter': 1,
    'criteria_provided,_multiple_submitters,_no_conflicts': 2,
    'no_assertion_criteria_provided': np.nan,
    'reviewed_by_expert_panel': 3
    }

result['gold_stars'] = result['clinvar_review'].map(REVIEW_STATUS_TO_GOLD_STARS)
print(len(result))
# Keep rows with at least one star. The .copy() makes the filtered frame
# independent, so the column assignment below is not made on a slice.
result = result[result['gold_stars']>=1].copy()
print(len(result))
# Treat the '.' placeholder as frequency 0, then keep rows with AF < 0.01.
result['gnomAD2.1.1_exomes_controls_AF'] = result['gnomAD2.1.1_exomes_controls_AF'].replace('.', 0).astype(float)
result = result[result['gnomAD2.1.1_exomes_controls_AF']<0.01].reset_index(drop=True)
print(len(result))

In [None]:
# Overwrite the VUS CSV with the star- and frequency-filtered table.
result.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv", 
                index=False)

In [None]:
import pandas as pd
# Load the gnomAD multianno CSV for the ClinVar 2025 variants.
gnomAD_AF = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2025_gnomAD.hg38_multianno.csv")
gnomAD_AF

In [None]:
# Attach the allele-frequency column; the assignment aligns on index labels.
# NOTE(review): ClinVar2025 is only loaded in a later cell of this notebook,
# so this cell depends on that cell having been run first.
ClinVar2025['gnomAD_AF'] = gnomAD_AF['controls_AF_popmax']
ClinVar2025

In [None]:
# Apply three filters to ClinVar2025, printing the row count after each.
print('origin', len(ClinVar2025))

# Remove 0 star: keep only rows whose ClinVar_annotation code is 1, 1.1, 0 or 0.1
ClinVar2025 = ClinVar2025[ClinVar2025['ClinVar_annotation'].isin([1. , 1.1, 0. , 0.1])].reset_index(drop=True)
print('remove 0 star', len(ClinVar2025))

# Remove all benign: drop every gene whose rows all have ClinVar_annotation == 0.
# NOTE(review): only code 0 is tested here; a gene whose rows mix 0 and 0.1 is
# kept — confirm that is intended.
ClinVar2025 = ClinVar2025.groupby('GeneSymbol').filter(lambda x: not all(x['ClinVar_annotation'] == 0))
print('remove all benign', len(ClinVar2025))

# Remove AF >= 0.01 ('.' is treated as frequency 0)
ClinVar2025['gnomAD_AF'] = ClinVar2025['gnomAD_AF'].replace('.', 0).astype(float)
ClinVar2025 = ClinVar2025[ClinVar2025['gnomAD_AF']<0.01].reset_index(drop=True)
print('remove AF > 0.01', len(ClinVar2025))

display(ClinVar2025)

In [None]:
# VariationIDs present in ClinVar2025 but absent from ClinVar2020.
# NOTE(review): ClinVar2020 is not defined in any cell visible here — confirm
# where it is loaded.
new_set = set(ClinVar2025['VariationID']) - set(ClinVar2020['VariationID'])
len(new_set)

In [None]:
import pandas as pd

# Load the ClinVar 2019 set.
ClinVar_2019 = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2019Set.csv")
ClinVar_2019

In [None]:
import pandas as pd

# Keep the ClinVar2025 rows whose VariationID is in new_set.
filtered_rows = ClinVar2025[(ClinVar2025['VariationID'].isin(new_set))].reset_index(drop=True)
print(filtered_rows.shape) # Duplication is related to X and Y

In [None]:
# Save the newly added variants as CSV.
filtered_rows.to_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_2025-2020_PBLPB.csv", 
                      index=False)

# Calibrate ESM1b and DMS

In [None]:
import pandas as pd

# Load the full ClinVar benchmark table.
ClinVar2025 = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/AllClinVarBenchmark_202504.csv.gz", 
                                 compression='gzip')

display(ClinVar2025)

In [None]:
import pandas as pd
# Load the gnomAD multianno CSV and attach its controls_AF_popmax column to
# ClinVar2025; the assignment aligns on index labels.
gnomAD_AF = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2025_gnomAD.hg38_multianno.csv")
ClinVar2025['gnomAD_AF'] = gnomAD_AF['controls_AF_popmax']
gnomAD_AF

In [None]:
# One-letter residue codes accepted as the start and the alternate residue.
amino_acid = list('GQNIVREACWDFKPMLYTHS')

# Keep rows where both the start residue and the alternate residue are listed.
start_is_residue = ClinVar2025['ClinVarName_start_aa'].isin(amino_acid)
alt_is_residue = ClinVar2025['ClinVarName_alt'].isin(amino_acid)
filtered_ClinVar2025 = ClinVar2025[start_is_residue & alt_is_residue].reset_index(drop=True)

# Label each row from its annotation code: 0 / 0.1 -> "Benign",
# 1 / 1.1 -> "Pathogenic", anything else -> "other".
filtered_ClinVar2025["ClinVar_category"] = filtered_ClinVar2025["ClinVar_annotation"].apply(
    lambda code: "Benign" if code in [0, 0.1] else "Pathogenic" if code in [1, 1.1] else "other"
)

print(len(filtered_ClinVar2025))
display(filtered_ClinVar2025)

In [None]:
# Apply three filters to filtered_ClinVar2025, printing the row count after each.
print('origin', len(filtered_ClinVar2025))

# Remove 0 star: keep only rows whose ClinVar_annotation code is 1, 1.1, 0 or 0.1
filtered_ClinVar2025 = filtered_ClinVar2025[filtered_ClinVar2025['ClinVar_annotation'].isin([1. , 1.1, 0. , 0.1])].reset_index(drop=True)
print('remove 0 star', len(filtered_ClinVar2025))

# Remove all benign: drop every gene whose rows all have ClinVar_annotation == 0.
# NOTE(review): only code 0 is tested, although 0.1 is also labelled "Benign"
# in ClinVar_category — confirm that genes mixing 0 and 0.1 should be kept.
filtered_ClinVar2025 = filtered_ClinVar2025.groupby('GeneSymbol').filter(lambda x: not all(x['ClinVar_annotation'] == 0))
print('remove all benign', len(filtered_ClinVar2025))

# Remove AF >= 0.01 ('.' is treated as frequency 0)
filtered_ClinVar2025['gnomAD_AF'] = filtered_ClinVar2025['gnomAD_AF'].replace('.', 0).astype(float)
filtered_ClinVar2025 = filtered_ClinVar2025[filtered_ClinVar2025['gnomAD_AF']<0.01].reset_index(drop=True)
print('remove AF > 0.01', len(filtered_ClinVar2025))

display(filtered_ClinVar2025)

In [None]:
# Threshold: a gene must have more than `i` Benign and more than `i`
# Pathogenic variants to be listed.
i = 100
# Per-gene counts of each ClinVar_category, plus a row total.
grouped_summary = filtered_ClinVar2025.groupby("GeneSymbol")["ClinVar_category"].value_counts().unstack(fill_value=0)
grouped_summary["Total"] = grouped_summary.sum(axis=1)

# Genes passing the threshold in both categories, largest total first. The
# filter is evaluated once here; the earlier copy of it was discarded unused.
sorted_summary = grouped_summary[(grouped_summary['Benign']>i) & (grouped_summary['Pathogenic']>i)].sort_values(by="Total", ascending=False)
sorted_summary

## TP53

In [None]:
# Extract the TP53 rows and renumber them 0..n-1.
TP53_ClinVar = filtered_ClinVar2025[((filtered_ClinVar2025['GeneSymbol']=='TP53'))].reset_index(drop=True)
TP53_ClinVar

### Add ESM

In [None]:
import pandas as pd

# Load the TP53 ESM score file. It is read as header-less, so the five column
# names are supplied here.
file_path = "/gpfs/data/brandeslab/Project/esm_scores/refseq_esm_scores/TP53.csv"
column_names = ["Template", "AA_POS", "AA_ALT", "AA_Change", "ESM1b_score"]

TP53_ESM = pd.read_csv(file_path, names=column_names, header=None)

display(TP53_ESM)

In [None]:
import pandas as pd
import numpy as np

def add_ESM_score(ClinVar: pd.DataFrame, ESM: pd.DataFrame) -> pd.DataFrame:
    """Add an 'ESM1b_score' column to ``ClinVar`` by matching amino-acid changes.

    Each ClinVar row is keyed as ``<start_aa><start_pos><alt>`` (for example
    ``R175H``) and looked up in ``ESM['AA_Change']``. When a change occurs
    more than once in ``ESM``, the first occurrence is used.

    Args:
        ClinVar: frame with 'ClinVarName_start_aa', 'ClinVarName_start_pos'
            and 'ClinVarName_alt' columns. It is modified in place.
        ESM: frame with 'AA_Change' and 'ESM1b_score' columns.

    Returns:
        The same ``ClinVar`` object. Unmatched rows keep ``None`` in
        'ESM1b_score'. The number of matched rows is printed.
    """
    # Build the change -> score lookup once instead of scanning ESM per row.
    first_hits = ESM.drop_duplicates(subset='AA_Change', keep='first')
    score_by_change = dict(zip(first_hits['AA_Change'], first_hits['ESM1b_score']))

    ClinVar['ESM1b_score'] = None
    count = 0

    for idx, row in ClinVar.iterrows():
        position = row['ClinVarName_start_pos']
        # A missing position cannot be converted to int; leave the row unmatched.
        if pd.isna(position):
            continue
        AA_Change = f"{row['ClinVarName_start_aa']}{int(position)}{row['ClinVarName_alt']}"
        if AA_Change not in score_by_change:
            continue
        count += 1
        ClinVar.loc[idx, 'ESM1b_score'] = score_by_change[AA_Change]
    print(count)
    return ClinVar

In [None]:
# Annotate the TP53 rows with their ESM1b scores.
TP53_ClinVar = add_ESM_score(TP53_ClinVar, TP53_ESM)
TP53_ClinVar

### Add MAVEs

In [None]:
import os
import pandas as pd

# Load every .csv file in the TP53 data folder into csv_dict, keyed by the
# file name without its ".csv" suffix.
folder_path = "/gpfs/home/pl2948/VariantInterpretation/Data/TP53"
csv_dict = {}

# os.listdir returns entries in arbitrary order, so the key order of csv_dict
# can differ between runs.
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        key = filename[:-4] 
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)
        csv_dict[key] = df
        print(key)
        display(csv_dict[key])

In [None]:
# Add one column per MAVE dataset to TP53_ClinVar, holding the dataset's score
# for the row's protein change (None when the dataset does not contain it).
transcript_aa_change_dict = {}
count_dict = {}
# Per dataset: protein change -> score of the first row carrying that change.
score_lookup = {}

for key, MAVE in csv_dict.items():
    TP53_ClinVar[key] = None
    count_dict[key] = 0
    if key=='urn_mavedb_00001213-a-1_scores':
        # This file stores the change as "<prefix>:<p.change>" in
        # 'HGVS(protein)'; the score is read from the column at position 7.
        changes = MAVE['HGVS(protein)'].str.split(':').str.get(1)
        scores = MAVE.iloc[:, 7]
        transcript_aa_change_dict[key] = changes.tolist()
    else:
        # Other files: change in 'hgvs_pro', score in the column at position 2.
        changes = MAVE['hgvs_pro']
        scores = MAVE.iloc[:, 2]
        transcript_aa_change_dict[key] = changes.unique()

    # Exact-match lookup, first occurrence wins. This replaces a
    # str.contains() search, which treats the change as a regular expression
    # and matches substrings, so it could select a different row (e.g. a
    # longer change that starts with the same text) and fails on missing values.
    lookup = {}
    for change, score in zip(changes, scores):
        lookup.setdefault(change, score)
    score_lookup[key] = lookup

for idx, row in TP53_ClinVar.iterrows():
    aa_change = f"p.{row['ClinVarName_raw_protein_change']}"

    for key, lookup in score_lookup.items():
        if aa_change in lookup:
            count_dict[key]+=1
            TP53_ClinVar.loc[idx, key] = lookup[aa_change]

print(count_dict)
display(TP53_ClinVar)

In [None]:
# Save the TP53 table with ESM and MAVE score columns.
TP53_ClinVar.to_csv("/gpfs/home/pl2948/VariantInterpretation/Data/TP53_ESM_DMS.csv", 
                        index=False)

In [None]:
import pandas as pd
# Reload the saved TP53 table.
TP53_ClinVar = pd.read_csv("/gpfs/home/pl2948/VariantInterpretation/Data/TP53_ESM_DMS.csv")
TP53_ClinVar

In [None]:
# Rebind csv_dict to a plain {dataset name: integer} mapping; the cells below
# use only its keys as column names. The integers are presumably the match
# counts printed earlier (count_dict) — confirm.
csv_dict = {'urn_mavedb_00000068-b-1_scores': 322, 'urn_mavedb_00001234-g-1_scores': 324, 'TP53_DNE_urn_mavedb_00001235-a-1_scores': 322, 'urn_mavedb_00001234-e-1_scores': 324, 'urn_mavedb_00001234-d-1_scores': 324, 'urn_mavedb_00001213-a-1_scores': 215, 'TP53_transcription_urn_mavedb_00001234-0-1_scores': 324, 'urn_mavedb_00001236-0-1_scores': 322, 'urn_mavedb_00001234-h-1_scores': 324, 'urn_mavedb_00000068-c-1_scores': 322, 'urn_mavedb_00001234-a-1_scores': 324, 'urn_mavedb_00001234-c-1_scores': 324, 'urn_mavedb_00001234-b-1_scores': 324, 'urn_mavedb_00001234-f-1_scores': 324, 'urn_mavedb_00000068-0-1_scores': 322, 'urn_mavedb_00000068-a-1_scores': 322}

In [None]:
# Number of missing values in each MAVE score column.
TP53_ClinVar[csv_dict.keys()].isna().sum()

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Absolute Pearson correlation between every pair of MAVE score columns,
# drawn as an annotated heatmap.
columns_to_corr = [*csv_dict.keys()]
df_corr = TP53_ClinVar[columns_to_corr]

corr_matrix = np.abs(df_corr.corr(method='pearson'))

plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    cbar_kws={"shrink": .75},
    linewidths=0.5,
    linecolor='white',
)
plt.title("Pearson Correlation Heatmap between MAVEs Scores")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()
