# Import ClinGen Dataset

In [None]:
# Columns kept from every input table. The score columns come first and the
# label column ('ClinVar_annotation') is deliberately last, so that
# `select_column[:-1]` yields exactly the score columns.
select_column = [
    'SIFT_score', 'FATHMM_score', 'VEST4_score', 'REVEL_score',
    'GERP++_RS', 'phyloP100way_vertebrate', 'EA_1.0',
    'BayesDel_nsfp33a_noAF', 'MutPred2.0_score', 'CADDv1.6_PHRED',
    'pph2_prob', 'MPC_score', 'PrimateAI_score',
    'ClinVar_annotation',
]

## ClinVar 2019

In [None]:
import pandas as pd

# Load the raw ClinVar 2019 table; the frame is the cell's last expression so
# the notebook renders a preview of it.
clinvar_2019_csv = "/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2019Set.csv"
ClinVar_2019 = pd.read_csv(filepath_or_buffer=clinvar_2019_csv)
ClinVar_2019

In [None]:
# Build the calibration table from the ClinVar 2019 set.
# The raw frame is renamed into `calibration_data` and then released, so this
# cell needs the load cell to be re-run before it can be executed again.
calibration_data = ClinVar_2019.rename(columns={'clnsig': 'ClinVar_annotation'})
del ClinVar_2019

# ClinVar clinical-significance strings -> binary label (0 = benign, 1 = pathogenic).
# Any annotation not listed here maps to NaN and is dropped below.
ClinVar_ann = {
    'Benign/Likely_benign': 0,
    'Likely_benign': 0,
    'Pathogenic': 1,
    'Likely_pathogenic': 1,
    'Benign': 0,
    'Pathogenic/Likely_pathogenic': 1,
}

calibration_data['ClinVar_annotation'] = calibration_data['ClinVar_annotation'].map(ClinVar_ann)

# Optional filter left disabled by the author:
# calibration_data = calibration_data[calibration_data['hg19_chr']!='MT']

# Keep labelled rows and the selected columns in one step. The explicit .copy()
# makes the result an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning (or silently write to a temporary).
labelled_rows = calibration_data['ClinVar_annotation'].notna()
calibration_data = calibration_data.loc[labelled_rows, select_column].copy()

print(len(calibration_data))

for col in select_column[:-1]:
    # Non-numeric entries become NaN instead of raising.
    calibration_data[col] = pd.to_numeric(calibration_data[col], errors='coerce')
    if col in ['SIFT_score', 'FATHMM_score']:
        # Sign flip for these two tools -- presumably so that larger values point
        # towards pathogenicity like the other scores; confirm against tool docs.
        calibration_data[col] *= -1

calibration_data = calibration_data.reset_index(drop=True)

display(calibration_data)
# Fraction of missing values per column.
print(calibration_data.isna().sum() / len(calibration_data))

In [None]:
# Class balance of the calibration set (labels: 1 = pathogenic, 0 = benign).
calibration_annotation = calibration_data['ClinVar_annotation']
print("Pathogenic", (calibration_annotation == 1).sum())
print("Benign", (calibration_annotation == 0).sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def calculate_roc_and_auc(y_true, y_score, label):
    """Compute the ROC curve and its AUC for one score, ignoring missing values.

    Observations where either the label or the score is NaN are discarded
    before the curve is computed.

    Parameters
    ----------
    y_true : array-like
        Class labels, one per observation.
    y_score : array-like
        Scores to evaluate, aligned with ``y_true``.
    label : object
        Passed through unchanged so the caller can tag the curve (e.g. for a legend).

    Returns
    -------
    tuple
        ``(fpr, tpr, roc_auc, label)`` where ``fpr``/``tpr`` come from
        ``roc_curve`` and ``roc_auc`` is ``auc(fpr, tpr)``.
    """
    labels = np.array(y_true)
    scores = np.array(y_score)

    # Keep a pair only when neither member is NaN.
    keep = ~(np.isnan(scores) | np.isnan(labels))

    fpr, tpr, _thresholds = roc_curve(labels[keep], scores[keep])
    return fpr, tpr, auc(fpr, tpr), label

def calculate_MI(y_true, y_score, random_state=None):
    """Estimate the mutual information between one score and the class labels.

    Observations where either the label or the score is NaN are discarded
    before estimation.

    Parameters
    ----------
    y_true : array-like
        Class labels, one per observation.
    y_score : array-like
        One-dimensional scores aligned with ``y_true``.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``mutual_info_classif``. The estimator is stochastic, so
        pass a fixed value for reproducible results. The default ``None``
        keeps the previous (unseeded) behaviour.

    Returns
    -------
    float
        The mutual-information estimate for the single feature.
    """
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    valid_indices = ~np.isnan(y_score) & ~np.isnan(y_true)

    # mutual_info_classif expects a 2-D feature matrix: one column, one row per sample.
    y_score_cleaned = y_score[valid_indices].reshape(-1, 1)
    y_true_cleaned = y_true[valid_indices]

    mutual_information = mutual_info_classif(
        y_score_cleaned, y_true_cleaned, random_state=random_state)
    return mutual_information[0]

    
# Per-tool discrimination on the calibration set: ROC/AUC and mutual information.
score_columns = select_column[:-1]  # everything except the trailing 'ClinVar_annotation'
calibration_truth = calibration_data['ClinVar_annotation']

roc_curves = [
    calculate_roc_and_auc(calibration_truth, calibration_data[name], name)
    for name in score_columns
]
MI = [calculate_MI(calibration_truth, calibration_data[name]) for name in score_columns]
# Third element of each ROC tuple is the AUC, in the same order as MI.
AUC = [curve[2] for curve in roc_curves]

# ROC curves, one line per tool.
fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
for fpr, tpr, roc_auc, label in roc_curves:
    ax_roc.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

ax_roc.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curves')
ax_roc.legend(loc='lower right')
plt.show()

# Mutual information versus AUC, one point per tool.
fig_mi, ax_mi = plt.subplots(figsize=(8, 6))
ax_mi.scatter(MI, AUC)
ax_mi.set_xlabel('Mutual Information')
ax_mi.set_ylabel('AUC')

## ClinVar 2020

In [None]:
import pandas as pd

# Load the raw ClinVar 2020 table; the frame is the cell's last expression so
# the notebook renders a preview of it.
clinvar_2020_csv = "/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2020Set.csv"
ClinVar_2020 = pd.read_csv(filepath_or_buffer=clinvar_2020_csv)
ClinVar_2020

In [None]:
# Build the test table from the ClinVar 2020 set.
# The raw frame is renamed into `test_data` and then released, so this cell
# needs the load cell to be re-run before it can be executed again.
test_data = ClinVar_2020.rename(columns={'clnsig': 'ClinVar_annotation'})
del ClinVar_2020

# ClinVar clinical-significance strings -> binary label (0 = benign, 1 = pathogenic).
# Any annotation not listed here maps to NaN and is dropped below.
ClinVar_ann = {
    'Benign/Likely_benign': 0,
    'Likely_benign': 0,
    'Pathogenic': 1,
    'Likely_pathogenic': 1,
    'Benign': 0,
    'Pathogenic/Likely_pathogenic': 1,
}

test_data['ClinVar_annotation'] = test_data['ClinVar_annotation'].map(ClinVar_ann)

# Keep labelled rows and the selected columns in one step. The explicit .copy()
# makes the result an independent frame, so the column assignments below do not
# raise SettingWithCopyWarning (or silently write to a temporary).
labelled_rows = test_data['ClinVar_annotation'].notna()
test_data = test_data.loc[labelled_rows, select_column].copy()

print(len(test_data))

for col in select_column[:-1]:
    # Non-numeric entries become NaN instead of raising.
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
    if col in ['SIFT_score', 'FATHMM_score']:
        # Sign flip for these two tools -- presumably so that larger values point
        # towards pathogenicity like the other scores; confirm against tool docs.
        test_data[col] *= -1

test_data = test_data.reset_index(drop=True)

display(test_data)
# Fraction of missing values per column.
print(test_data.isna().sum() / len(test_data))

In [None]:
# Class balance of the test set (labels: 1 = pathogenic, 0 = benign).
test_annotation = test_data['ClinVar_annotation']
print("Pathogenic", (test_annotation == 1).sum())
print("Benign", (test_annotation == 0).sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Per-tool ROC curves on the test set.
test_truth = test_data['ClinVar_annotation']
roc_curves = [
    calculate_roc_and_auc(test_truth, test_data[name], name)
    for name in select_column[:-1]  # skip the trailing 'ClinVar_annotation' label
]

fig_roc, ax_roc = plt.subplots(figsize=(8, 6))
for fpr, tpr, roc_auc, label in roc_curves:
    ax_roc.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

ax_roc.plot([0, 1], [0, 1], color='gray', linestyle='--')  # chance-level diagonal
ax_roc.set_xlabel('False Positive Rate')
ax_roc.set_ylabel('True Positive Rate')
ax_roc.set_title('ROC Curves')
ax_roc.legend(loc='lower right')
plt.show()

## gnomAD set

In [None]:
import pandas as pd

# Load the raw gnomAD table; the frame is the cell's last expression so the
# notebook renders a preview of it.
gnomad_csv = "/gpfs/home/pl2948/VariantInterpretation/Data/GnomADSet.csv"
gnomAD_set = pd.read_csv(filepath_or_buffer=gnomad_csv)
gnomAD_set

In [None]:
# Reduce the gnomAD table to the score columns (the label column is not selected).
# The explicit .copy() makes the subset an independent frame, so the column
# assignments below do not raise SettingWithCopyWarning.
#
# NOTE: this cell rebinds `gnomAD_set` from itself. Running it a second time
# without re-running the load cell negates SIFT/FATHMM again and undoes the flip.
gnomAD_set = gnomAD_set[select_column[:-1]].copy()

for col in select_column[:-1]:
    # Non-numeric entries become NaN; rows are kept rather than dropped.
    gnomAD_set[col] = pd.to_numeric(gnomAD_set[col], errors='coerce')
    if col in ['SIFT_score', 'FATHMM_score']:
        # Sign flip for these two tools -- presumably so that larger values point
        # towards pathogenicity like the other scores; confirm against tool docs.
        gnomAD_set[col] *= -1

gnomAD_set = gnomAD_set.reset_index(drop=True)

display(gnomAD_set)
# Fraction of missing values per column.
print(gnomAD_set.isna().sum() / len(gnomAD_set))

# Single Tool 1D calibration

In [None]:
import copy
import numpy as np

# Score columns first, label column last (same list as used when the tables were built).
select_column = [
    'SIFT_score', 'FATHMM_score', 'VEST4_score', 'REVEL_score',
    'GERP++_RS', 'phyloP100way_vertebrate', 'EA_1.0',
    'BayesDel_nsfp33a_noAF', 'MutPred2.0_score', 'CADDv1.6_PHRED',
    'pph2_prob', 'MPC_score', 'PrimateAI_score',
    'ClinVar_annotation',
]

score_columns = select_column[:-1]
label_column = select_column[-1]

# Feature matrices: one row per variant, one column per tool.
calibration_feature = calibration_data[score_columns].to_numpy()
test_feature = test_data[score_columns].to_numpy()
regularization_feature = gnomAD_set[score_columns].to_numpy()

# Labels, plus untouched backups: later cells overwrite calibration_label /
# test_label with filtered subsets and re-derive them from the *_bk copies.
calibration_label = calibration_data[label_column].to_numpy()
calibration_label_bk = calibration_label.copy()
test_label = test_data[label_column].to_numpy()
test_label_bk = test_label.copy()

print(calibration_feature.shape, test_feature.shape, regularization_feature.shape)

In [None]:
from P_KNN_CPU import get_bootstrap_KNN_score, get_P_KNN_ACMG_score_1D, evaluate_result_1D
import copy

# Parameter setting
# Pprior / p_value / logbase are passed to the P_KNN scoring and evaluation helpers.
# (Pprior and logbase presumably follow the published ClinGen calibration
# constants -- TODO confirm.)
# The remaining settings are only consumed by the bootstrap call, which is
# commented out in this cell because its results are loaded from disk.
Pprior = 0.0441
w_calibration = None
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
parallel = True
normalization = None
impute = False
mi_scaling = False
n_bootstrap = 100

p_value = 0.05
logbase = 1124

# Track the winner explicitly. Starting from -inf (rather than 0) guarantees a
# tool is selected whenever at least one tool has a non-NaN mean, and
# `best_tool` is always bound so the code after the loop cannot hit a NameError.
best_mean_evidence_strength = -np.inf
best_tool = None

# Per-tool evidence tables; concatenated once after the loop.
per_tool_evidence = []


def _single_tool_inputs(feature_idx):
    """Slice the NaN-free 1-D inputs for the tool stored in column ``feature_idx``.

    Returns, in order: calibration mask, calibration scores, calibration labels,
    test mask, test scores, test labels, regularization mask, regularization
    scores. Score arrays are reshaped to a single column, i.e. shape (n, 1).
    """
    cal_mask = ~np.isnan(calibration_feature[:, feature_idx])
    tst_mask = ~np.isnan(test_feature[:, feature_idx])
    reg_mask = ~np.isnan(regularization_feature[:, feature_idx])
    return (
        cal_mask,
        calibration_feature[cal_mask][:, feature_idx].reshape(-1, 1),
        copy.deepcopy(calibration_label_bk[cal_mask]),
        tst_mask,
        test_feature[tst_mask][:, feature_idx].reshape(-1, 1),
        copy.deepcopy(test_label_bk[tst_mask]),
        reg_mask,
        regularization_feature[reg_mask][:, feature_idx].reshape(-1, 1),
    )


def _load_cached_scores(tool_name):
    """Load the bootstrap result array previously saved for ``tool_name``."""
    return np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_{tool_name}.npy')


for i in range(len(select_column) - 1):
    condition_string = select_column[i]
    select_feature = i

    (valid_calibration_idx, calibration_array, calibration_label,
     valid_test_idx, test_array, test_label,
     valid_regularization_idx, regularization_array) = _single_tool_inputs(select_feature)

    print("")
    print(f"Tool {condition_string}")
    print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    # The cached array loaded below was produced once with:
    # test_results_array = get_bootstrap_KNN_score(calibration_array, test_array, regularization_array,
    #                                              calibration_label, Pprior, w_calibration,
    #                                              n_calibration_in_window, frac_regularization_in_window,
    #                                              normalization, impute, mi_scaling, n_bootstrap, parallel)
    # np.save(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_{condition_string}.npy', test_results_array)
    test_results_array = _load_cached_scores(condition_string)

    P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score_1D(
        test_results_array, test_array, p_value, Pprior, logbase)

    evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result_1D(
        test_results_array,
        test_array,
        test_label,
        p_value,
        Pprior,
        logbase,
        category=condition_string,
        show_plot=True,
        save_name=condition_string)

    # Only rows that have a score for this tool receive a value; the rest stay NaN.
    test_data.loc[valid_test_idx, f"P-KNN_Pathogenic_{condition_string}"] = P_KNN_pathogenic

    per_tool_evidence.append(evidence_strength_data)

    pathogenic_mean = evidence_strength_data.loc[
        evidence_strength_data['Label'] == 'Pathogenic variants', 'Score'].mean()
    benign_mean = evidence_strength_data.loc[
        evidence_strength_data['Label'] == 'Benign variants', 'Score'].mean()

    # Selection criterion: sum of the two class means. A NaN sum never compares
    # greater, so tools without a usable mean are skipped.
    mean_evidence_strength = pathogenic_mean + benign_mean
    if mean_evidence_strength > best_mean_evidence_strength:
        best_mean_evidence_strength = mean_evidence_strength
        best_tool = i

    print("pathogenic evidence mean", f"{pathogenic_mean:.3f}")
    print("benign evidence mean", f"{benign_mean:.3f}")

# One concat instead of growing a frame inside the loop.
combine_data = pd.concat(per_tool_evidence, ignore_index=True) if per_tool_evidence else pd.DataFrame()

if best_tool is None:
    raise RuntimeError(
        "No tool produced a usable mean evidence strength; cannot select a best tool.")

# Recompute the inputs and ACMG scores for the best-scoring tool so the
# variables left behind by this cell describe that tool.
select_feature = best_tool
condition_string = select_column[select_feature]

(valid_calibration_idx, calibration_array, calibration_label,
 valid_test_idx, test_array, test_label,
 valid_regularization_idx, regularization_array) = _single_tool_inputs(select_feature)

print("")
print(f"Best Tool {condition_string}")
print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

test_results_array = _load_cached_scores(condition_string)

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score_1D(
    test_results_array, test_array, p_value, Pprior, logbase)
test_data.loc[valid_test_idx, f"ACMGLLR_{condition_string}"] = ACMG_scores

# Integrating All 13 tools

In [None]:
import copy
import numpy as np

# Score columns first, label column last (same list as used when the tables were built).
select_column = [
    'SIFT_score', 'FATHMM_score', 'VEST4_score', 'REVEL_score',
    'GERP++_RS', 'phyloP100way_vertebrate', 'EA_1.0',
    'BayesDel_nsfp33a_noAF', 'MutPred2.0_score', 'CADDv1.6_PHRED',
    'pph2_prob', 'MPC_score', 'PrimateAI_score',
    'ClinVar_annotation',
]

score_columns = select_column[:-1]
label_column = select_column[-1]

# Feature matrices: one row per variant, one column per tool. Selecting by name
# means any extra columns added to test_data since it was built are ignored.
calibration_feature = calibration_data[score_columns].to_numpy()
test_feature = test_data[score_columns].to_numpy()
regularization_feature = gnomAD_set[score_columns].to_numpy()

# Labels, plus untouched backups: the next cell overwrites calibration_label /
# test_label with filtered subsets derived from the *_bk copies.
calibration_label = calibration_data[label_column].to_numpy()
calibration_label_bk = calibration_label.copy()
test_label = test_data[label_column].to_numpy()
test_label_bk = test_label.copy()

print(calibration_feature.shape, test_feature.shape, regularization_feature.shape)

In [None]:
from P_KNN_CPU import get_bootstrap_KNN_score, get_P_KNN_ACMG_score, evaluate_result
import copy

# Parameter setting
# These values are passed positionally to get_bootstrap_KNN_score and the
# scoring/evaluation helpers below; their semantics are defined in P_KNN_CPU.
Pprior = 0.0441
w_calibration = None
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
parallel = True
normalization = 'rank'
impute = True
mi_scaling = True
n_bootstrap = 100

p_value = 0.05
logbase = 1124

condition_string = 'P-KNN'

# Keep every row that has at least one non-NaN tool score
# (equivalently: drop rows where all scores are NaN).
valid_calibration_idx = (~np.isnan(calibration_feature)).any(axis=1)
calibration_array = calibration_feature[valid_calibration_idx]
calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

valid_test_idx = (~np.isnan(test_feature)).any(axis=1)
test_array = test_feature[valid_test_idx]
test_label = copy.deepcopy(test_label_bk[valid_test_idx])

valid_regularization_idx = (~np.isnan(regularization_feature)).any(axis=1)
regularization_array = regularization_feature[valid_regularization_idx]

print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

test_results_array = get_bootstrap_KNN_score(
    calibration_array, test_array, regularization_array,
    calibration_label, Pprior, w_calibration,
    n_calibration_in_window, frac_regularization_in_window,
    normalization, impute, mi_scaling, n_bootstrap, parallel,
)

# Persist the bootstrap output, then read it back so the rest of the cell works
# from exactly what is stored on disk.
all_tools_result_path = '/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_All.npy'
np.save(all_tools_result_path, test_results_array)
test_results_array = np.load(all_tools_result_path)

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(
    test_results_array, p_value, Pprior, logbase)

# Only rows that passed the NaN filter receive values; the rest stay NaN.
test_data.loc[valid_test_idx, "ACMGLLR"] = ACMG_scores
test_data.loc[valid_test_idx, "P_KNN_pathogenic"] = P_KNN_pathogenic

evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(
    test_results_array,
    test_label,
    p_value,
    Pprior,
    logbase,
    category=condition_string,
    show_plot=True,
    save_name="Clingen13_All",
)

# Append the combined-tool evidence to the per-tool table built earlier.
combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

pathogenic_scores = evidence_strength_data.loc[
    evidence_strength_data['Label'] == 'Pathogenic variants', 'Score']
benign_scores = evidence_strength_data.loc[
    evidence_strength_data['Label'] == 'Benign variants', 'Score']
print("pathogenic evidence mean", f"{pathogenic_scores.mean():.3f}")
print("benign evidence mean", f"{benign_scores.mean():.3f}")