# Import Clingen Dataset

In [None]:
# Columns used throughout the notebook: 13 predictor scores followed by the
# label column. The label must stay last because later cells split the list
# with select_column[:-1] (features) and select_column[-1] (label).
select_column = [
    'SIFT_score', 'FATHMM_score', 'VEST4_score', 'REVEL_score',
    'GERP++_RS', 'phyloP100way_vertebrate', 'EA_1.0',
    'BayesDel_nsfp33a_noAF', 'MutPred2.0_score', 'CADDv1.6_PHRED',
    'pph2_prob', 'MPC_score', 'PrimateAI_score',
    'ClinVar_annotation',
]

## ClinVar 2019

In [None]:
import pandas as pd

# Load the ClinVar 2019 table from disk; the bare name on the last line makes
# the notebook render the frame as the cell output.
ClinVar_2019 = pd.read_csv(
    filepath_or_buffer="/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2019Set.csv",
)
ClinVar_2019

In [None]:
# Build the calibration table from the ClinVar 2019 export: rename the label
# column, binarise it, keep only the selected columns and coerce every
# predictor score to a number.
calibration_data = ClinVar_2019.rename(columns={'clnsig': 'ClinVar_annotation'})
del ClinVar_2019  # the raw name is no longer needed once the renamed frame exists

# Clinical-significance strings -> binary label (0 for the benign classes,
# 1 for the pathogenic classes). Any value not listed here becomes NaN under
# .map() and is removed by the notna() filter below.
ClinVar_ann = {
    'Benign/Likely_benign': 0,
    'Likely_benign': 0,
    'Pathogenic': 1,
    'Likely_pathogenic': 1,
    'Benign': 0,
    'Pathogenic/Likely_pathogenic': 1,
}

calibration_data['ClinVar_annotation'] = calibration_data['ClinVar_annotation'].map(ClinVar_ann)
calibration_data = calibration_data[calibration_data['ClinVar_annotation'].notna()]
# calibration_data = calibration_data[calibration_data['hg19_chr']!='MT']
# .copy() turns the filtered selection into an independent frame, so the
# column assignments in the loop below write to it unambiguously instead of
# to a slice of the previous frame (which raises SettingWithCopyWarning).
calibration_data = calibration_data[select_column].copy()

print(len(calibration_data))

for col in select_column[:-1]:
    # Non-numeric placeholders are turned into NaN rather than raising.
    calibration_data[col] = pd.to_numeric(calibration_data[col], errors='coerce')
    if col in ['SIFT_score', 'FATHMM_score']:
        # Sign-flip these two scores (presumably so that larger values point
        # in the same direction as the other tools - confirm).
        calibration_data[col] *= -1

calibration_data = calibration_data.reset_index(drop=True)

display(calibration_data)
# Fraction of missing values per column.
print(calibration_data.isna().sum()/len(calibration_data))

In [None]:
# Class balance of the calibration set (label 1 = pathogenic, 0 = benign).
for class_name, class_value in (("Pathogenic", 1), ("Benign", 0)):
    print(class_name, (calibration_data['ClinVar_annotation'] == class_value).sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.feature_selection import mutual_info_classif
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

def calculate_roc_and_auc(y_true, y_score, label):
    """Compute the ROC curve and its area for one score, ignoring missing values.

    Args:
        y_true: Array-like of labels; NaN entries are dropped.
        y_score: Array-like of scores, same length as ``y_true``; NaN entries
            are dropped.
        label: Passed through unchanged so the caller can tag the curve.

    Returns:
        Tuple ``(fpr, tpr, roc_auc, label)``.
    """
    labels = np.array(y_true)
    scores = np.array(y_score)

    # Keep only the positions where both the score and the label are present.
    keep = ~(np.isnan(scores) | np.isnan(labels))

    fpr, tpr, _ = roc_curve(labels[keep], scores[keep])
    return fpr, tpr, auc(fpr, tpr), label

def calculate_MI(y_true, y_score):
    """Estimate the mutual information between one score and the labels.

    Positions where either the score or the label is NaN are discarded before
    calling ``mutual_info_classif``.

    Args:
        y_true: Array-like of labels.
        y_score: Array-like of scores, same length as ``y_true``.

    Returns:
        The first (and only) entry of the array returned by
        ``mutual_info_classif`` for the single feature column.
    """
    labels = np.array(y_true)
    scores = np.array(y_score)

    keep = ~(np.isnan(scores) | np.isnan(labels))

    # mutual_info_classif expects a 2-D feature matrix: one column here.
    feature_matrix = scores.reshape(-1, 1)[keep]

    return mutual_info_classif(feature_matrix, labels[keep])[0]

    
roc_curves = []
AUC = []
MI = []

# One ROC curve and one mutual-information estimate per predictor column
# (the trailing label column is excluded).
for col in select_column[:-1]:
    roc_curves.append(
        calculate_roc_and_auc(calibration_data['ClinVar_annotation'], calibration_data[col], col)
    )
    MI.append(
        calculate_MI(calibration_data['ClinVar_annotation'], calibration_data[col])
    )

# ROC curves of all predictors on one figure; AUC values are collected in the
# same order as MI so the two lists can be scattered against each other below.
plt.figure(figsize=(8, 6))
for fpr, tpr, roc_auc, label in roc_curves:
    AUC.append(roc_auc)
    plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

# Chance-level reference diagonal.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves',
)
plt.legend(loc='lower right')
plt.show()

# Mutual information against AUC, one point per predictor.
plt.figure(figsize=(8, 6))
plt.scatter(MI, AUC)
plt.gca().set(xlabel='Mutual Information', ylabel='AUC')

## ClinVar 2020

In [None]:
import pandas as pd

# Load the ClinVar 2020 table from disk; the bare name on the last line makes
# the notebook render the frame as the cell output.
ClinVar_2020 = pd.read_csv(
    filepath_or_buffer="/gpfs/home/pl2948/VariantInterpretation/Data/ClinVar2020Set.csv",
)
ClinVar_2020

In [None]:
# Build the test table from the ClinVar 2020 export with the same steps used
# for the calibration table: rename the label column, binarise it, keep only
# the selected columns and coerce every predictor score to a number.
test_data = ClinVar_2020.rename(columns={'clnsig': 'ClinVar_annotation'})
del ClinVar_2020  # the raw name is no longer needed once the renamed frame exists

# Clinical-significance strings -> binary label (0 for the benign classes,
# 1 for the pathogenic classes). Any value not listed here becomes NaN under
# .map() and is removed by the notna() filter below.
ClinVar_ann = {
    'Benign/Likely_benign': 0,
    'Likely_benign': 0,
    'Pathogenic': 1,
    'Likely_pathogenic': 1,
    'Benign': 0,
    'Pathogenic/Likely_pathogenic': 1,
}

test_data['ClinVar_annotation'] = test_data['ClinVar_annotation'].map(ClinVar_ann)
test_data = test_data[test_data['ClinVar_annotation'].notna()]
# .copy() turns the filtered selection into an independent frame, so the
# column assignments in the loop below write to it unambiguously instead of
# to a slice of the previous frame (which raises SettingWithCopyWarning).
test_data = test_data[select_column].copy()

print(len(test_data))

for col in select_column[:-1]:
    # Non-numeric placeholders are turned into NaN rather than raising.
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
    if col in ['SIFT_score', 'FATHMM_score']:
        # Sign-flip these two scores (presumably so that larger values point
        # in the same direction as the other tools - confirm).
        test_data[col] *= -1

test_data = test_data.reset_index(drop=True)

display(test_data)
# Fraction of missing values per column.
print(test_data.isna().sum()/len(test_data))

In [None]:
# Class balance of the test set (label 1 = pathogenic, 0 = benign).
for class_name, class_value in (("Pathogenic", 1), ("Benign", 0)):
    print(class_name, (test_data['ClinVar_annotation'] == class_value).sum())

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# ROC curve of every predictor column on the test set (label column excluded).
roc_curves = [
    calculate_roc_and_auc(test_data['ClinVar_annotation'], test_data[col], col)
    for col in select_column[:-1]
]

plt.figure(figsize=(8, 6))
for fpr, tpr, roc_auc, label in roc_curves:
    plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

# Chance-level reference diagonal.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.gca().set(
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC Curves',
)
plt.legend(loc='lower right')
plt.show()

## gnomAD set

In [None]:
import pandas as pd

# Load the gnomAD table from disk; the bare name on the last line makes the
# notebook render the frame as the cell output.
gnomAD_set = pd.read_csv(
    filepath_or_buffer="/gpfs/home/pl2948/VariantInterpretation/Data/GnomADSet.csv",
)
gnomAD_set

In [None]:
# Keep only the predictor columns (gnomAD rows carry no label column here).
# .copy() makes the selection an independent frame so the per-column
# assignments below do not operate on a slice of the loaded table
# (which raises SettingWithCopyWarning).
gnomAD_set = gnomAD_set[select_column[:-1]].copy()

# print(len(gnomAD_set))

for col in select_column[:-1]:
    # Non-numeric placeholders are turned into NaN rather than raising.
    gnomAD_set[col] = pd.to_numeric(gnomAD_set[col], errors='coerce')
    # gnomAD_set = gnomAD_set[(gnomAD_set[col]!='.') & (gnomAD_set[col].notna())]
    # print(col, len(gnomAD_set))
    if col in ['SIFT_score', 'FATHMM_score']:
        # Same sign-flip that is applied to the calibration and test tables.
        gnomAD_set[col] *= -1

gnomAD_set = gnomAD_set.reset_index(drop=True)

display(gnomAD_set)
# Fraction of missing values per column.
print(gnomAD_set.isna().sum()/len(gnomAD_set))

# Observing silhouette score

In [None]:
import copy
import numpy as np

# Predictor columns followed by the label column (label must remain last).
select_column = [
    'SIFT_score', 'FATHMM_score', 'VEST4_score', 'REVEL_score',
    'GERP++_RS', 'phyloP100way_vertebrate', 'EA_1.0',
    'BayesDel_nsfp33a_noAF', 'MutPred2.0_score', 'CADDv1.6_PHRED',
    'pph2_prob', 'MPC_score', 'PrimateAI_score',
    'ClinVar_annotation',
]

# Feature matrices (rows = variants, columns = predictors) for the three sets.
calibration_feature, test_feature, regularization_feature = (
    frame[select_column[:-1]].to_numpy()
    for frame in (calibration_data, test_data, gnomAD_set)
)

# Label vectors, plus untouched backups: later cells overwrite
# calibration_label / test_label with per-tool subsets and rebuild them from
# the *_bk copies.
calibration_label = calibration_data[select_column[-1]].to_numpy()
test_label = test_data[select_column[-1]].to_numpy()
calibration_label_bk = copy.deepcopy(calibration_label)
test_label_bk = copy.deepcopy(test_label)

print(calibration_feature.shape, test_feature.shape, regularization_feature.shape)

In [None]:
import torch
from P_KNN_GPU import silhouette_score_1d_torch, get_score_rank_torch

# Rank-transform the calibration features against themselves using the project
# helper get_score_rank_torch (its exact ranking semantics live in P_KNN_GPU -
# TODO confirm). Both arguments are the same calibration matrix.
calibration_feature_rank = get_score_rank_torch(torch.tensor(calibration_feature), torch.tensor(calibration_feature))

# For every predictor column: compute a silhouette score on the raw and on the
# rank-transformed values, then plot per-label histograms of both.
for i in range(len(select_column)-1): 
    select_feature = i

    # Rows where this predictor has a value.
    valid_calibration_idx = ~np.isnan(calibration_feature[:,select_feature]) 
    calibration_array = torch.tensor(calibration_feature[valid_calibration_idx][:,select_feature].reshape(-1, 1))
    calibration_array_rank = calibration_feature_rank[valid_calibration_idx][:,select_feature].reshape(-1, 1)
    # NOTE: rebinds calibration_label to a torch tensor holding only the valid
    # rows; the full label vector is always taken from calibration_label_bk.
    calibration_label = torch.tensor(copy.deepcopy(calibration_label_bk[valid_calibration_idx]))

    print(f"Tool {select_column[i]}, calibration size: {calibration_array.shape[0]}")
    silhouette_raw = silhouette_score_1d_torch(calibration_array, calibration_label)
    silhouette_rank = silhouette_score_1d_torch(calibration_array_rank, calibration_label)
    print('silhouette_raw', silhouette_raw)
    print('silhouette_rank', silhouette_rank)

    # Label 1 is drawn in red, label 0 in blue.
    red_mask = calibration_label == 1
    blue_mask = calibration_label == 0

    # Left panel: raw values. Right panel: rank-transformed values.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    axes[0].hist(calibration_array[red_mask], bins=30, color='red', alpha=0.6, label='Label 1')
    axes[0].hist(calibration_array[blue_mask], bins=30, color='blue', alpha=0.6, label='Label 0')
    axes[0].set_title(f"Histogram of {select_column[i]}")
    axes[0].set_xlabel("Calibration Array Values")
    axes[0].set_ylabel("Frequency")
    axes[0].legend()

    axes[1].hist(calibration_array_rank[red_mask], bins=30, color='red', alpha=0.6, label='Label 1')
    axes[1].hist(calibration_array_rank[blue_mask], bins=30, color='blue', alpha=0.6, label='Label 0')
    axes[1].set_title(f"Histogram of Ranked {select_column[i]}")
    axes[1].set_xlabel("Calibration Array Rank Values")
    axes[1].set_ylabel("Frequency")
    axes[1].legend()
    
    plt.tight_layout()
    plt.show()

In [None]:
# Overlay the per-class score densities of the calibration and test sets for
# the four selected tools (indices 2, 3, 7, 8 of select_column) and save each
# figure as SVG.
for i in (2, 3, 7, 8):
    select_feature = i

    # Calibration rows where this tool has a score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:, select_feature])
    calibration_array = calibration_feature[valid_calibration_idx, select_feature].reshape(-1, 1)
    calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

    # Test rows where this tool has a score.
    valid_test_idx = ~np.isnan(test_feature[:, select_feature])
    test_array = test_feature[valid_test_idx, select_feature].reshape(-1, 1)
    test_label = copy.deepcopy(test_label_bk[valid_test_idx])

    print(f"Tool {select_column[i]}, calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    red_mask_calibration = calibration_label == 1
    blue_mask_calibration = calibration_label == 0

    red_mask_test = test_label == 1
    blue_mask_test = test_label == 0

    plt.figure(figsize=(6, 3))
    # (values, colour, opacity, legend label) drawn in this order.
    for values, colour, opacity, legend_label in (
        (calibration_array[red_mask_calibration], 'red', 0.6, 'Calibration-Pathogenic'),
        (calibration_array[blue_mask_calibration], 'blue', 0.6, 'Calibration-Benign'),
        (test_array[red_mask_test], 'orange', 0.3, 'Test-Pathogenic'),
        (test_array[blue_mask_test], 'cyan', 0.3, 'Test-Benign'),
    ):
        plt.hist(values, bins=30, color=colour, alpha=opacity, label=legend_label, density=True)

    plt.title(f"Histogram of calibration dataset {select_column[i]}")
    plt.xlabel("Calibration Array Values")
    plt.ylabel("Frequency")
    # BayesDel gets a differently placed, larger legend.
    if select_column[i] == 'BayesDel_nsfp33a_noAF':
        plt.legend(loc='upper left', fontsize=14)
    else:
        plt.legend(loc='upper center')
    plt.tight_layout()
    plt.savefig(f"/gpfs/home/pl2948/VariantInterpretation/KNNCode/{select_column[i]}_distribution.svg", format="svg")
    plt.show()

# Single Tool 1D calibration

In [None]:
import copy
import numpy as np

# Predictor columns followed by the label column (label must remain last).
select_column = [
    'SIFT_score', 'FATHMM_score', 'VEST4_score', 'REVEL_score',
    'GERP++_RS', 'phyloP100way_vertebrate', 'EA_1.0',
    'BayesDel_nsfp33a_noAF', 'MutPred2.0_score', 'CADDv1.6_PHRED',
    'pph2_prob', 'MPC_score', 'PrimateAI_score',
    'ClinVar_annotation',
]

# Feature matrices (rows = variants, columns = predictors) for the three sets.
calibration_feature, test_feature, regularization_feature = (
    frame[select_column[:-1]].to_numpy()
    for frame in (calibration_data, test_data, gnomAD_set)
)

# Label vectors, plus untouched backups: later cells overwrite
# calibration_label / test_label with per-tool subsets and rebuild them from
# the *_bk copies.
calibration_label = calibration_data[select_column[-1]].to_numpy()
test_label = test_data[select_column[-1]].to_numpy()
calibration_label_bk = copy.deepcopy(calibration_label)
test_label_bk = copy.deepcopy(test_label)

print(calibration_feature.shape, test_feature.shape, regularization_feature.shape)

In [None]:
import torch
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score_1D, evaluate_result_1D
import copy

#Parameter setting
# These values are passed straight through to the P_KNN_GPU helpers below;
# their meaning is defined in that module.
Pprior = 0.0441
w_calibration=None
n_calibration_in_window = 100
frac_regularization_in_window=0.03
batch_size = 512 
normalization= None
impute = False
mi_scaling = False
n_bootstrap = 100

p_value = 0.05
logbase = 1124

# Best-tool tracking. best_tool is initialised explicitly so that a run in
# which no tool exceeds the starting value of 0 (or every mean is NaN) fails
# with a clear message instead of a NameError after the loop.
best_mean_evidence_strength = 0
best_tool = None

combine_data = pd.DataFrame()

# Run the 1-D calibration separately for every predictor column.
for i in range(len(select_column)-1):    
    condition_string = select_column[i]
    select_feature = i

    # Restrict each data set to the rows where this tool has a score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:,select_feature]) 
    calibration_array = calibration_feature[valid_calibration_idx][:,select_feature].reshape(-1, 1)
    calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

    valid_test_idx = ~np.isnan(test_feature[:,select_feature])
    test_array = test_feature[valid_test_idx][:,select_feature].reshape(-1, 1)
    test_label = copy.deepcopy(test_label_bk[valid_test_idx])

    valid_regularization_idx = ~np.isnan(regularization_feature[:,select_feature])
    regularization_array = regularization_feature[valid_regularization_idx][:,select_feature].reshape(-1, 1)

    print("")
    print(f"Tool {condition_string}")
    print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    # Free cached GPU memory and collect garbage before the next bootstrap run.
    torch.cuda.empty_cache()
    gc.collect()

    test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array, 
                                                     calibration_label, Pprior, w_calibration, 
                                                     n_calibration_in_window, frac_regularization_in_window, 
                                                     normalization, impute, mi_scaling, n_bootstrap, batch_size)

    # Persist the bootstrap result and read it back, so everything downstream
    # works from the on-disk copy (the best-tool section below reloads it too).
    np.save(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_{condition_string}.npy', test_results_array)

    test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_{condition_string}.npy')

    P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score_1D(test_results_array, test_array, p_value, Pprior, logbase)

    evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result_1D(test_results_array,
                                                                                                      test_array,
                                                                                                      test_label, 
                                                                                                      p_value, 
                                                                                                      Pprior, 
                                                                                                      logbase, 
                                                                                                      category = condition_string, 
                                                                                                      show_plot=True, 
                                                                                                      save_name=condition_string)

    # Store the per-variant result next to the raw score; rows without a score
    # for this tool keep NaN in the new column.
    test_data.loc[valid_test_idx, f"P-KNN_Pathogenic_{condition_string}"] = P_KNN_pathogenic

    combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

    # Per-class mean scores, computed once and reused for selection and printing.
    pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
    benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()

    mean_evidence_strength = pathogenic_evidence_mean + benign_evidence_mean
    if mean_evidence_strength >  best_mean_evidence_strength:
        best_mean_evidence_strength = mean_evidence_strength
        best_tool = i
        
    print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
    print("benign evidence mean", f"{benign_evidence_mean:.3f}")


if best_tool is None:
    raise RuntimeError(
        "No tool produced a summed mean evidence strength above 0; "
        "cannot select a best tool."
    )

# Recompute the subsets for the winning tool and attach its ACMG scores.
select_feature = best_tool
condition_string = select_column[select_feature]

valid_calibration_idx = ~np.isnan(calibration_feature[:,select_feature]) 
calibration_array = calibration_feature[valid_calibration_idx][:,select_feature].reshape(-1, 1)
calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

valid_test_idx = ~np.isnan(test_feature[:,select_feature])
test_array = test_feature[valid_test_idx][:,select_feature].reshape(-1, 1)
test_label = copy.deepcopy(test_label_bk[valid_test_idx])

valid_regularization_idx = ~np.isnan(regularization_feature[:,select_feature])
regularization_array = regularization_feature[valid_regularization_idx][:,select_feature].reshape(-1, 1)

print("")
print(f"Best Tool {condition_string}")
print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

# Reuse the bootstrap result saved inside the loop for this tool.
test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_{condition_string}.npy')

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score_1D(test_results_array, test_array, p_value, Pprior, logbase)
test_data.loc[valid_test_idx, f"ACMGLLR_{condition_string}"] = ACMG_scores 

## Deal with miscalibration

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Tool whose raw score is compared with its P-KNN posterior; switch by
# un-commenting one of the alternatives.
# tool="MutPred2.0_score"
tool="BayesDel_nsfp33a_noAF"
# tool="VEST4_score"
# tool="REVEL_score"

# Posterior values whose nearest variants are highlighted on the plot.
upper = 0.3
lower = 0.2

x = test_data[tool]
y = test_data[f'P-KNN_Pathogenic_{tool}']

plt.figure(figsize=(6, 6))
plt.scatter(x, y, alpha=0.6, c='purple', label='Variants')
plt.xlabel(tool)
plt.ylabel(f'Posterior Probability Pathogenic_{tool}')
plt.title(f'Scatter Plot of {tool} vs. P-KNN')
plt.grid(True)

# Index labels of the variants whose posterior is closest to each threshold.
# The *_05 / *_06 names are historical; they hold the matches for `upper`
# and `lower` respectively.
idx_05 = (np.abs(y - upper)).idxmin()
idx_06 = (np.abs(y - lower)).idxmin()

x_05 = x.loc[idx_05]
x_06 = x.loc[idx_06]

# Legend labels are derived from the thresholds actually used, so they can
# no longer drift out of sync with `upper` / `lower`.
plt.scatter([x_05], [y.loc[idx_05]], color='orange', label=f'Closest to {upper}', zorder=5)
plt.scatter([x_06], [y.loc[idx_06]], color='cyan', label=f'Closest to {lower}', zorder=5)

# Annotate each highlighted point with its raw tool score.
plt.text(x_05, y.loc[idx_05], f"{x_05:.3f}", fontsize=9, ha='right')
plt.text(x_06, y.loc[idx_06], f"{x_06:.3f}", fontsize=9, ha='right')

plt.legend()
plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import copy

# Density histograms of calibration vs. test scores for tool index 7
# (BayesDel_nsfp33a_noAF in select_column), with a second x-axis whose ticks
# show the P-KNN posterior matched to the raw score.
for i in [7]:
    select_feature = i
    # Calibration rows where this tool has a score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:, select_feature])
    calibration_array = calibration_feature[valid_calibration_idx][:, select_feature].reshape(-1, 1)
    calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

    # Test rows where this tool has a score.
    valid_test_idx = ~np.isnan(test_feature[:, select_feature])
    test_array = test_feature[valid_test_idx][:, select_feature].reshape(-1, 1)
    test_label = copy.deepcopy(test_label_bk[valid_test_idx])

    print(f"Tool {select_column[i]}, calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    red_mask_calibration = calibration_label == 1
    blue_mask_calibration = calibration_label == 0
    red_mask_test = test_label == 1
    blue_mask_test = test_label == 0

    # Shared bin edges computed over all four groups so the histograms are
    # directly comparable.
    combined_values = np.concatenate([
        calibration_array[red_mask_calibration].ravel(),
        calibration_array[blue_mask_calibration].ravel(),
        test_array[red_mask_test].ravel(),
        test_array[blue_mask_test].ravel()
    ])
    bins = np.histogram_bin_edges(combined_values, bins=30)

    colors = {
        'Calibration-Pathogenic': 'red',
        'Calibration-Benign': 'blue',
        'Test-Pathogenic': '#cc6666',
        'Test-Benign': '#339999',
    }

    fig, ax = plt.subplots(figsize=(5, 6))
    ax.hist(calibration_array[red_mask_calibration], bins=bins, color=colors['Calibration-Pathogenic'],
            histtype='stepfilled', alpha=0.6, label='Calibration-pathogenic variants', density=True)
    ax.hist(calibration_array[blue_mask_calibration], bins=bins, color=colors['Calibration-Benign'],
            histtype='stepfilled', alpha=0.6, label='Calibration-benign variants', density=True)
    ax.hist(test_array[red_mask_test], bins=bins, color=colors['Test-Pathogenic'],
            histtype='stepfilled', alpha=0.3, label='Test-pathogenic variants', density=True)
    ax.hist(test_array[blue_mask_test], bins=bins, color=colors['Test-Benign'],
            histtype='stepfilled', alpha=0.3, label='Test-benign variants', density=True)

    ax.set_xlabel("Feature Value")
    ax.set_ylabel("Density")
    # Fixed tick positions on the raw-score axis.
    ax.set_xticks([-1, -0.5, 0, 0.5])
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # BayesDel gets a larger legend anchored above the axes.
    if select_column[i] == 'BayesDel_nsfp33a_noAF':
        ax.legend(loc='upper left', fontsize=14, bbox_to_anchor=(0, 1.5))
    else:
        ax.legend(loc='upper center')

    if select_column[i] == 'BayesDel_nsfp33a_noAF':
        # Pair each raw score with its P-KNN posterior, keeping rows where
        # both are present.
        x_raw = test_data['BayesDel_nsfp33a_noAF'].values
        x_knn = test_data['P-KNN_Pathogenic_BayesDel_nsfp33a_noAF'].values
        valid_mask = ~np.isnan(x_raw) & ~np.isnan(x_knn)
        x_raw = x_raw[valid_mask]
        x_knn = x_knn[valid_mask]

        # For each desired posterior value, take the raw score of the test
        # variant whose posterior is nearest to it.
        desired_knn_scores = np.array([0.0, 0.01, 0.1, 0.4, 0.8])
        matched_raw = []
        for s in desired_knn_scores:
            idx = np.abs(x_knn - s).argmin()
            matched_raw.append(x_raw[idx])

        # Second x-axis sharing the same limits: ticks sit at the matched raw
        # scores but are labelled with the posterior values.
        ax_bottom = ax.twiny()
        ax_bottom.set_xlim(ax.get_xlim())
        ax_bottom.set_xticks(matched_raw)
        ax_bottom.set_xticklabels([f"{v:.2f}" if v < 0.1 else f"{v:.1f}" for v in desired_knn_scores])
        ax_bottom.set_xlabel("P-KNN Pathogenic Probability (BayesDel_nsfp33a_noAF)", fontsize=11)

        # Move the twin axis from the top to below the main x-axis, offset
        # outward by 60 points.
        ax_bottom.xaxis.set_ticks_position('bottom')
        ax_bottom.xaxis.set_label_position('bottom')
        ax_bottom.spines['top'].set_visible(False)
        ax_bottom.spines['right'].set_visible(False)
        ax_bottom.spines['bottom'].set_position(('outward', 60))

    plt.tight_layout()
    plt.savefig(f"/gpfs/home/pl2948/VariantInterpretation/KNNCode/{select_column[i]}_distribution_with_calibrated_ticks.svg", format="svg", bbox_inches='tight')
    plt.show()

In [None]:
import torch
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score_1D, evaluate_result_1D
import copy

# Parameter setting (same values as the single-tool calibration cell; only
# Pprior, p_value and logbase are passed to the helpers in this cell).
Pprior = 0.0441
w_calibration = None
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 512
normalization = None
impute = False
mi_scaling = False
n_bootstrap = 100

p_value = 0.05
logbase = 1124

best_mean_evidence_strength = 0

combine_data = pd.DataFrame()

# Score the calibration set itself for the four selected tools, using result
# arrays that were previously saved to disk.
for i in (2, 3, 7, 8):
    select_feature = i
    condition_string = select_column[i]

    # Calibration rows where this tool has a score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:, select_feature])
    calibration_array = calibration_feature[valid_calibration_idx, select_feature].reshape(-1, 1)
    calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

    # gnomAD rows where this tool has a score.
    valid_regularization_idx = ~np.isnan(regularization_feature[:, select_feature])
    regularization_array = regularization_feature[valid_regularization_idx, select_feature].reshape(-1, 1)

    print("")
    print(f"Tool {condition_string}")
    print(f"calibration size: {calibration_array.shape[0]}")

    torch.cuda.empty_cache()
    gc.collect()

    test_results_array = np.load(
        f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_calibrateset{condition_string}.npy'
    )

    P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score_1D(
        test_results_array, calibration_array, p_value, Pprior, logbase
    )

    evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result_1D(
        test_results_array,
        calibration_array,
        calibration_label,
        p_value,
        Pprior,
        logbase,
        category=condition_string,
        show_plot=True,
        save_name=condition_string,
    )

    # Rows without a score for this tool keep NaN in the new column.
    calibration_data.loc[valid_calibration_idx, f"P-KNN_Pathogenic_{condition_string}"] = P_KNN_pathogenic

    combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

    is_pathogenic_row = evidence_strength_data['Label'] == 'Pathogenic variants'
    is_benign_row = evidence_strength_data['Label'] == 'Benign variants'
    print("pathogenic evidence mean", f"{evidence_strength_data[is_pathogenic_row]['Score'].mean():.3f}")
    print("benign evidence mean", f"{evidence_strength_data[is_benign_row]['Score'].mean():.3f}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import copy

Pprior = 0.0441

# For tool index 7: compare, per posterior-probability bin, the prior-weighted
# share of pathogenic variants in the calibration set with that in the test set.
for i in [7]:
    if i=='P_KNN_pathogenic':
        tool = i
        score_tool = i
    else:    
        tool = select_column[i]
        score_tool = f"P-KNN_Pathogenic_{tool}"

    # P-KNN scores split by data set and label (NaN scores dropped).
    calibration_pathogenic = calibration_data[calibration_data['ClinVar_annotation'] == 1][score_tool].dropna()
    calibration_benign = calibration_data[calibration_data['ClinVar_annotation'] == 0][score_tool].dropna()
    test_pathogenic = test_data[test_data['ClinVar_annotation'] == 1][score_tool].dropna()
    test_benign = test_data[test_data['ClinVar_annotation'] == 0][score_tool].dropna()

    # Ten equal-width bins over [0, 1].
    bins = np.linspace(0,1,11)
    bin_centers = 0.5 * (bins[1:] + bins[:-1])

    cal_path_dens, _ = np.histogram(calibration_pathogenic, bins=bins, density=True)
    test_path_dens, _ = np.histogram(test_pathogenic, bins=bins, density=True)
    cal_ben_dens, _ = np.histogram(calibration_benign, bins=bins, density=True)
    test_ben_dens, _ = np.histogram(test_benign, bins=bins, density=True)

    # Prior-weighted pathogenic share per bin. Each data set uses its OWN
    # pathogenic density in the denominator (the test line previously mixed in
    # the calibration density). Bins that are empty for both classes give
    # 0/0 = NaN, which matplotlib leaves as a gap; the warning is silenced.
    with np.errstate(divide='ignore', invalid='ignore'):
        calibration_prob = cal_path_dens*Pprior/(cal_path_dens*Pprior+cal_ben_dens*(1-Pprior))
        test_prob = test_path_dens*Pprior/(test_path_dens*Pprior+test_ben_dens*(1-Pprior))

    fig, ax = plt.subplots(figsize=(6, 3))
    
    ax.plot(bin_centers, calibration_prob, color='purple', label='Calibration dataset \nnormalized % of pathogenic variants', linewidth=2, alpha=0.7)
    ax.plot(bin_centers, test_prob, color='orchid', label='Test dataset \nnormalized % of pathogenic variants', linewidth=2, alpha=0.7)
    ax.set_ylabel("Normalized % of \npathogenic variants", fontsize=12)
    ax.set_xlim([0, 1.05])
    ax.set_xlabel("Posterior probability (pathogenic)")
    
    ax.plot(bin_centers, calibration_prob-test_prob, color='orange', label='Difference (calibration - test) \nnormalized % of pathogenic variants', linewidth=4, alpha=0.7)
    
    ax.legend(loc='lower center', bbox_to_anchor=(0.5, 1.5), fontsize=14)
    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)
    
    plt.tight_layout()
    # `tool` equals select_column[i] for integer i and also works for the
    # string branch above, where indexing select_column would fail.
    plt.savefig(f"/gpfs/home/pl2948/VariantInterpretation/KNNCode/{tool}_densityratio.svg", format="svg", bbox_inches='tight')

    plt.show()

In [None]:
def weighted_score_with_binom_ci(p_array, n_array, w, p_value=0.05):
    """Re-weighted positive fraction per cell with a binomial interval.

    For every cell with ``t = p + n > 0`` the observed fraction ``p / t`` is
    mapped through ``pi / (pi + w * (1 - pi))``. The interval bounds come from
    ``scipy.stats.binom.interval(1 - p_value, t, p / t)`` (counts), are divided
    by ``t`` and mapped through the same function.

    Args:
        p_array: Array-like of positive counts (any shape, may be empty).
        n_array: Array-like of negative counts, same shape as ``p_array``.
        w: Weight applied to the negative fraction in the re-weighting.
        p_value: One minus the confidence level of the interval.

    Returns:
        Tuple ``(scores, ci_lower, ci_upper)`` of float arrays shaped like
        ``p_array``. Cells whose total count is not positive stay NaN.
    """
    from scipy.stats import binom

    def weighted(pi):
        # Defined once, outside the loop, instead of once per cell.
        return pi / (pi + w * (1 - pi))

    p_array = np.asarray(p_array)
    n_array = np.asarray(n_array)
    shape = p_array.shape

    scores = np.full(shape, np.nan)
    ci_lower = np.full(shape, np.nan)
    ci_upper = np.full(shape, np.nan)

    t_array = p_array + n_array

    # np.ndindex visits every index tuple and simply yields nothing for a
    # zero-sized shape, whereas np.nditer raises on empty operands.
    for idx in np.ndindex(shape):
        t = t_array[idx]
        if not t > 0:
            continue

        pi_hat = p_array[idx] / t
        ci_low, ci_up = binom.interval(1 - p_value, t, pi_hat)

        scores[idx] = weighted(pi_hat)
        ci_lower[idx] = weighted(ci_low / t)
        ci_upper[idx] = weighted(ci_up / t)

    return scores, ci_lower, ci_upper

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="ticks")
import numpy as np
import copy
from scipy.interpolate import interp1d

Pprior = 0.0441
logbase = 1124

# Posterior probabilities obtained by applying odds of logbase**(1/2**j),
# j = 0..3, to the prior Pprior (Post_p) and to 1 - Pprior (Post_b).
# Post_p is drawn as guide lines below; Post_b is computed but not used here.
Post_p = np.zeros(4) 
Post_b = np.zeros(4)

for j in range(4):
    Post_p[j] = logbase ** (1 / 2 ** j) * Pprior / ((logbase ** (1 / 2 ** j) - 1) * Pprior + 1)
    Post_b[j] = (logbase ** (1 / 2 ** j)) * (1 - Pprior) / (((logbase ** (1 / 2 ** j)) - 1) * (1 - Pprior) + 1)

for i in [2, 3, 7, 8]:#, 'P_KNN_pathogenic']:
    if i=='P_KNN_pathogenic':
        tool = i
        score_tool = i
    else:    
        tool = select_column[i]
        score_tool = f"P-KNN_Pathogenic_{tool}"

    # P-KNN scores split by data set and label (NaN scores dropped).
    calibration_pathogenic = calibration_data[calibration_data['ClinVar_annotation'] == 1][score_tool].dropna()
    calibration_benign = calibration_data[calibration_data['ClinVar_annotation'] == 0][score_tool].dropna()
    test_pathogenic = test_data[test_data['ClinVar_annotation'] == 1][score_tool].dropna()
    test_benign = test_data[test_data['ClinVar_annotation'] == 0][score_tool].dropna()

    # Weight that rescales the test set's pathogenic:benign ratio to the prior.
    w_test = (1 - Pprior) * len(test_pathogenic) / (len(test_benign) * Pprior) 

    # Ten equal-width bins over [0, 1].
    bins = np.linspace(0,1,11)
    bin_centers = 0.5 * (bins[1:] + bins[:-1])

    # Densities for THIS tool are computed before anything that reads them;
    # previously the ratios below were evaluated first and therefore used the
    # values left over from an earlier cell or loop iteration.
    cal_path_dens, _ = np.histogram(calibration_pathogenic, bins=bins, density=True)
    test_path_dens, _ = np.histogram(test_pathogenic, bins=bins, density=True)
    cal_ben_dens, _ = np.histogram(calibration_benign, bins=bins, density=True)
    test_ben_dens, _ = np.histogram(test_benign, bins=bins, density=True)

    with np.errstate(divide='ignore', invalid='ignore'):
        # Test/calibration density ratios (not plotted in this cell).
        ratio_path_dens = np.where(cal_path_dens != 0, test_path_dens / cal_path_dens, np.nan)
        ratio_ben_dens = np.where(cal_ben_dens != 0, test_ben_dens / cal_ben_dens , np.nan)

        # Prior-weighted pathogenic share per bin; each data set uses its own
        # pathogenic density in the denominator (the test line previously
        # mixed in the calibration density). Empty bins give NaN.
        calibration_prob = cal_path_dens*Pprior/(cal_path_dens*Pprior+cal_ben_dens*(1-Pprior))
        test_prob = test_path_dens*Pprior/(test_path_dens*Pprior+test_ben_dens*(1-Pprior))

    pathogenic_counts, _ = np.histogram(test_pathogenic, bins=bins, density=False)
    benign_counts, _ = np.histogram(test_benign, bins=bins, density=False)

    pathogenic_ratios, ci_lower, ci_upper = weighted_score_with_binom_ci(pathogenic_counts, benign_counts, w_test, p_value=0.05)
    # Bins with no test variants have NaN score and bounds and are not drawn.
    valid_mask = ~np.isnan(pathogenic_ratios) & ~np.isnan(ci_lower) & ~np.isnan(ci_upper)

    fig, ax1 = plt.subplots(figsize=(6, 5.45))

    ax1.plot(bin_centers[valid_mask], pathogenic_ratios[valid_mask], 
             'o-', color='red', label='Frequency of pathogenic variants')
    ax1.fill_between(bin_centers[valid_mask], 
                     ci_lower[valid_mask], ci_upper[valid_mask], 
                     color='red', alpha=0.2, label='95% Confidence Interval')
    ax1.set_ylabel("Pathogenic Probability", fontsize=12)
    ax1.tick_params(axis='y')
    
    # Guide lines at the Post_p thresholds on both axes.
    for threshold in Post_p:
        ax1.axvline(x=threshold, color='orange', linestyle='--', alpha=0.2)
        ax1.axhline(y=threshold, color='orange', linestyle='--', alpha=0.2)
    
    # Identity line: observed frequency equal to the predicted probability.
    x_vals = np.linspace(0, 1, 100)
    ax1.plot(x_vals, x_vals, color='red', linestyle='--', alpha=0.2)
    
    # Right-hand axis: calibration-minus-test difference of the shares.
    ax2 = ax1.twinx()
    ax2.plot(bin_centers, calibration_prob - test_prob, color='orange', linestyle='-', label='Difference (calibration - test) \nnormalized % of pathogenic variants', linewidth=3, alpha = 0.7)
    ax2.set_ylabel("Normalized % of pathogenic variants", fontsize=12)
    y_max = np.nanmax(calibration_prob - test_prob) 
    ax2.set_yticks(np.arange(0, y_max + 0.1, 0.1))
    ax2.tick_params(axis='y')
    
    ax1.set_xlabel("P-KNN Pathogenic Score", fontsize=12)
    ax1.set_xlim([0, 1.0])

    # One legend combining the entries of both y-axes.
    handles1, labels1 = ax1.get_legend_handles_labels()
    handles2, labels2 = ax2.get_legend_handles_labels()
        
    all_handles = handles1 + handles2
    all_labels = labels1 + labels2
    
    fig.legend(all_handles, all_labels, loc='lower center', bbox_to_anchor=(0.4, 1.0), frameon=True)

    plt.tight_layout()
    plt.savefig(f"/gpfs/home/pl2948/VariantInterpretation/KNNCode/{tool}_combined_dualyaxis.svg", 
            format="svg", bbox_inches='tight')
    plt.show()

# Integrating All 13 tools

In [None]:
import copy
import numpy as np

# Score columns of the 13 tools fed to the model; the label column is appended last
# so that `select_column[:-1]` is the feature set and `select_column[-1]` the label.
select_column = [
    'SIFT_score',
    'FATHMM_score',
    'VEST4_score',
    'REVEL_score',
    'GERP++_RS',
    'phyloP100way_vertebrate',
    'EA_1.0',
    'BayesDel_nsfp33a_noAF',
    'MutPred2.0_score',
    'CADDv1.6_PHRED',
    'pph2_prob',
    'MPC_score',
    'PrimateAI_score',
] + ['ClinVar_annotation']

# One feature matrix per dataset, restricted to the tool-score columns.
calibration_feature, test_feature, regularization_feature = (
    frame[select_column[:-1]].to_numpy()
    for frame in (calibration_data, test_data, gnomAD_set)
)

# Label vectors for the two labelled datasets.
calibration_label, test_label = (
    frame[select_column[-1]].to_numpy()
    for frame in (calibration_data, test_data)
)

# Independent backups of the full-length labels, taken before any masking.
calibration_label_bk = copy.deepcopy(calibration_label)
test_label_bk = copy.deepcopy(test_label)

print(*(matrix.shape for matrix in (calibration_feature, test_feature, regularization_feature)))

In [None]:
import torch
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result
import copy

# ---- Settings passed to the P-KNN scoring and evaluation helpers ----
Pprior = 0.0441
w_calibration = None
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 512
normalization = 'rank'
impute = True
mi_scaling = True
n_bootstrap = 100

p_value = 0.05
logbase = 1124

condition_string = 'P-KNN'

# A row is usable when at least one of its tool scores is present.
valid_calibration_idx = (~np.isnan(calibration_feature)).any(axis=1)
valid_test_idx = (~np.isnan(test_feature)).any(axis=1)
valid_regularization_idx = (~np.isnan(regularization_feature)).any(axis=1)

calibration_array = calibration_feature[valid_calibration_idx]
test_array = test_feature[valid_test_idx]
regularization_array = regularization_feature[valid_regularization_idx]

# Labels are re-derived from the backups so they line up with the masked rows.
calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])
test_label = copy.deepcopy(test_label_bk[valid_test_idx])

print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

torch.cuda.empty_cache()
gc.collect()

test_results_array = get_bootstrap_KNN_score_gpu(
    calibration_array, test_array, regularization_array,
    calibration_label, Pprior, w_calibration,
    n_calibration_in_window, frac_regularization_in_window,
    normalization, impute, mi_scaling, n_bootstrap, batch_size,
)

# Write the bootstrap results to disk, then read them back from the same file.
np.save('/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_All.npy', test_results_array)
test_results_array = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen13_All.npy')

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(
    test_results_array, p_value, Pprior, logbase
)

# Only the rows that were scored receive values; the remaining rows stay NaN.
test_data.loc[valid_test_idx, "ACMGLLR"] = ACMG_scores
test_data.loc[valid_test_idx, "P_KNN_pathogenic"] = P_KNN_pathogenic

evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(
    test_results_array,
    test_label,
    p_value,
    Pprior,
    logbase,
    category=condition_string,
    show_plot=True,
    save_name="Clingen13_All",
)

# Append this run's per-variant evidence rows to the table shared across runs.
combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

print(
    "pathogenic evidence mean",
    f"{evidence_strength_data.loc[evidence_strength_data['Label'] == 'Pathogenic variants', 'Score'].mean():.3f}",
)
print(
    "benign evidence mean",
    f"{evidence_strength_data.loc[evidence_strength_data['Label'] == 'Benign variants', 'Score'].mean():.3f}",
)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_curve, auc
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# One ROC entry per individual tool score plus the combined "ACMGLLR" column;
# the label column itself is excluded via select_column[:-1].
roc_curves = [
    calculate_roc_and_auc(test_data['ClinVar_annotation'], test_data[col], col)
    for col in select_column[:-1] + ["ACMGLLR"]
]

plt.figure(figsize=(8, 7))
for fpr, tpr, roc_auc, label in roc_curves:
    plt.plot(fpr, tpr, label=f'{label} (AUC = {roc_auc:.2f})')

# Dashed diagonal as the reference line.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
plt.title('ROC Curves')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right')
plt.show()

## Comparison

In [None]:
import pandas as pd
from scipy import stats

# Paired one-sided t-test of the combined score ("ACMGLLR") against the
# single-tool BayesDel score ("ACMGLLR_BayesDel_nsfp33a_noAF"), run separately
# for each ClinVar_annotation value (0 and 1).
# For class 1 the alternative is "ACMGLLR is greater"; for class 0 it is
# "ACMGLLR is less".
p_value_cutoff = 0.05  # significance level; constant across both classes

for i in [0, 1]:
    print(i)
    # Keep only variants that have BOTH scores. Either column can hold NaN for
    # variants that were not scored, and ttest_rel's default nan_policy
    # ('propagate') would then return NaN for the statistic and the p-value,
    # which this cell would silently report as "no significant difference".
    df = test_data.loc[
        test_data['ClinVar_annotation'] == i,
        ['ACMGLLR', 'ACMGLLR_BayesDel_nsfp33a_noAF'],
    ].dropna()
    display(df)

    side = 'greater' if i == 1 else 'less'

    t_stat, p_value = stats.ttest_rel(df['ACMGLLR'], df['ACMGLLR_BayesDel_nsfp33a_noAF'], alternative=side)

    # The sign of the statistic tells which direction the significant shift is in.
    if p_value < p_value_cutoff and t_stat > 0:
        result = "ACMGLLR is significantly higher to ACMGLLR_BayesDel_nsfp33a_noAF"
    elif p_value < p_value_cutoff and t_stat < 0:
        result = "ACMGLLR is significantly lower to ACMGLLR_BayesDel_nsfp33a_noAF"
    else:
        result = "No significant difference between the two groups"

    print(f"T-statistic: {t_stat}")
    print(f"P-value: {np.format_float_scientific(p_value, precision=5)}")
    print(result)

In [None]:
import scipy.stats as stats

# Two-sided Wilcoxon signed-rank test of the combined score ("ACMGLLR") against
# the single-tool BayesDel score, run separately for each ClinVar_annotation
# value (0 and 1).
for i in [0, 1]:
    print(i)
    # The test is paired, so only variants carrying BOTH scores can take part.
    # Dropping incomplete pairs keeps NaN values from propagating into the
    # statistic and the p-value.
    df = test_data.loc[
        test_data['ClinVar_annotation'] == i,
        ['ACMGLLR', 'ACMGLLR_BayesDel_nsfp33a_noAF'],
    ].dropna()
    display(df)

    stat, p_value = stats.wilcoxon(df['ACMGLLR'], df['ACMGLLR_BayesDel_nsfp33a_noAF'])

    print(f"Wilcoxon Statistic: {stat}")
    print(f"P-value: {p_value}")

    if p_value < 0.05:
        print("Significant difference between the two tools.")
    else:
        print("No significant difference between the two tools.")

In [None]:
# Categories shown in the comparison figure, in display order.
columns_to_keep = ['P-KNN',
                   'BayesDel_nsfp33a_noAF',
                   'REVEL_score',
                   'MutPred2.0_score',
                   'VEST4_score']

# .copy() makes the filtered table an independent frame: the column assignment
# below would otherwise write into a slice of the original combine_data and
# raise pandas' SettingWithCopyWarning.
combine_data = combine_data[combine_data['Category'].isin(columns_to_keep)].copy()

# An ordered categorical makes sorting (and plotting) follow columns_to_keep
# instead of alphabetical order.
combine_data['Category'] = pd.Categorical(combine_data['Category'], categories=columns_to_keep, ordered=True)
combine_data = combine_data.sort_values(by='Category')
combine_data

In [None]:
import seaborn as sns

# Split violin plot of evidence strength per tool, with a narrow box plot
# overlaid on each half. The figure is saved twice: under the KNNgraph
# directory (named after condition_string) and as Fig3A.svg.
sns.set(style="ticks")
plt.figure(figsize=(12, 3.5))

# The violin layer and the box layer must use the same hue order; otherwise the
# side a violin half is drawn on (derived from the order labels appear in the
# data) can differ from the side its dodged box is drawn on.
ax = sns.violinplot(
    x="Category",
    y="Score",
    hue="Label",
    data=combine_data,
    split=True,
    inner="box",
    palette={"Pathogenic variants": "red", "Benign variants": "blue"},
    alpha=0.6,
    density_norm='area',
    hue_order=['Pathogenic variants', 'Benign variants'],
)

dark_gray = '0.3'

sns.boxplot(
    x="Category", y="Score", hue="Label", data=combine_data,
    showcaps=True,
    showfliers=False,
    palette={"Pathogenic variants": "pink", "Benign variants": "#bae0f5"},
    width=0.1, dodge=True, ax=ax,
    whiskerprops={'color': dark_gray, 'linewidth': 1.5, 'zorder': 2},
    capprops={'color': dark_gray, 'linewidth': 1.5},
    medianprops={'color': dark_gray, 'linewidth': 1.5},
    boxprops={'zorder': 2, 'edgecolor': dark_gray, 'linewidth': 1.5},
    hue_order=['Pathogenic variants', 'Benign variants'],
)

# Both layers register legend entries; keep only the first set (one entry per
# label) so the legend is not duplicated.
handles, labels = ax.get_legend_handles_labels()
n_hue = combine_data["Label"].nunique()
ax.legend(handles[:n_hue], labels[:n_hue], loc="upper right", fontsize=12)

# Short display names, listed in the same order as the plotted categories.
category_labels = ["P-KNN", "BayesDel", "REVEL", "MutPred2", "VEST4"]
plt.xticks(ticks=range(len(category_labels)), labels=category_labels, fontsize=14)

plt.xlabel("")
plt.ylabel("Evidence strength (LLR)", fontsize=14)
sns.despine(top=True, right=True)

plt.savefig(f"/gpfs/home/pl2948/VariantInterpretation/KNNgraph/{condition_string}_violin.svg", format="svg")
plt.savefig('Fig3A.svg', format="svg")

plt.show()

# Calibrate VUS

In [None]:
import torch

# Report the name and current memory usage (in MB) of every visible CUDA device.
if not torch.cuda.is_available():
    print("CUDA is not available.")
else:
    gpu_count = torch.cuda.device_count()
    for i in range(gpu_count):
        print(
            f"GPU {i}: {torch.cuda.get_device_name(i)}",
            f"Memory Usage for GPU {i}:",
            f"  Allocated: {torch.cuda.memory_allocated(i) / 1024**2:.2f} MB",
            f"  Cached: {torch.cuda.memory_reserved(i) / 1024**2:.2f} MB",
            sep="\n",
        )

In [None]:
import torch
import gc

# Free GPU memory, then report what is still in use.
# Run the garbage collector first: tensors that are only kept alive by
# reference cycles are released by gc.collect(), and their memory can be handed
# back by empty_cache() only after that. In the opposite order that memory would
# still be counted as reserved in the figures printed below.
gc.collect()
torch.cuda.empty_cache()

print(f"Allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
print(f"Reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")

In [None]:
import pandas as pd
# Load the Clinvar_dbNSFP51a_VUS.csv table into a DataFrame.
clinvar_dbNSFP_VUS = pd.read_csv(
    filepath_or_buffer="/gpfs/home/pl2948/VariantInterpretation/ClinVarBenchmark/Clinvar_dbNSFP51a_VUS.csv",
)

# Last expression of the cell: renders the loaded table.
clinvar_dbNSFP_VUS

In [None]:
# Tool-score columns used for the VUS run, followed by the label column.
# Two tools are left out of this list (kept here as disabled entries):
#   'EA_1.0' and 'MutPred2.0_score'.
select_column_Clingen = [
    'SIFT_score',
    'FATHMM_score',
    'VEST4_score',
    'REVEL_score',
    'GERP++_RS',
    'phyloP100way_vertebrate',
    # 'EA_1.0',
    'BayesDel_nsfp33a_noAF',
    # 'MutPred2.0_score',
    'CADDv1.6_PHRED',
    'pph2_prob',
    'MPC_score',
    'PrimateAI_score',
] + ['ClinVar_annotation']

In [None]:
# Build the VUS test table: rename the source columns to the names used for
# calibration, keep only the tool-score columns, coerce them to numbers and
# flip the sign of SIFT and FATHMM.
test_data = clinvar_dbNSFP_VUS.rename(columns={'fathmm-XF_coding_score': 'FATHMM_score',
                                               'BayesDel_noAF_score': 'BayesDel_nsfp33a_noAF',
                                               'Polyphen2_HVAR_score': 'pph2_prob',
                                               'clnsig': 'ClinVar_annotation',
                                              })
del clinvar_dbNSFP_VUS

# .copy() turns the column selection into an independent frame; the in-place
# column updates below would otherwise operate on a slice of the renamed table
# and raise pandas' SettingWithCopyWarning.
# The label column (last entry of select_column_Clingen) is not kept here.
test_data = test_data[select_column_Clingen[:-1]].copy()

print(len(test_data))

for col in select_column_Clingen[:-1]:
    # Non-numeric entries become NaN rather than raising.
    test_data[col] = pd.to_numeric(test_data[col], errors='coerce')
    # Same sign flip that is applied to the calibration set for these two tools.
    if col in ['SIFT_score', 'FATHMM_score']:
        test_data[col] *= -1

test_data = test_data.reset_index(drop=True)

display(test_data)
# Fraction of missing values per column.
print(test_data.isna().sum()/len(test_data))

## Calibration

In [None]:
# Preview the calibration set restricted to the tool-score columns (label excluded).
calibration_data.loc[:, select_column_Clingen[:-1]]

In [None]:
# Preview the test set restricted to the tool-score columns (label excluded).
test_data.loc[:, select_column_Clingen[:-1]]

In [None]:
import copy

# One feature matrix per dataset, restricted to the tool-score columns
# (every entry of select_column_Clingen except the trailing label column).
calibration_feature, test_feature, regularization_feature = (
    frame[select_column_Clingen[:-1]].to_numpy()
    for frame in (calibration_data, test_data, gnomAD_set)
)

# Calibration labels, plus an independent full-length backup taken before masking.
calibration_label = calibration_data[select_column_Clingen[-1]].to_numpy()
calibration_label_bk = copy.deepcopy(calibration_label)

print(*(matrix.shape for matrix in (calibration_feature, test_feature, regularization_feature)))

In [None]:
import torch
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result
import copy

# ---- Settings passed to the P-KNN scoring helpers ----
Pprior = 0.0441  # Prior probability of pathogenicity (changes w/ c)
w_calibration = None
n_calibration_in_window = 100  # minimum number of clinvar variants in a local window
frac_regularization_in_window = 0.03
batch_size = 4096  # This is for A100, if V100, use 512
normalization = 'rank'
impute = True
mi_scaling = True
n_bootstrap = 100

p_value = 0.05
logbase = 1124

condition_string = 'P-KNN_VUS'

# A row is usable when at least one of its tool scores is present.
valid_calibration_idx = (~np.isnan(calibration_feature)).any(axis=1)
valid_test_idx = (~np.isnan(test_feature)).any(axis=1)
valid_regularization_idx = (~np.isnan(regularization_feature)).any(axis=1)

calibration_array = calibration_feature[valid_calibration_idx]
test_array = test_feature[valid_test_idx]
regularization_array = regularization_feature[valid_regularization_idx]

# Labels are re-derived from the backup so they line up with the masked rows.
calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

torch.cuda.empty_cache()
gc.collect()

test_results_array = get_bootstrap_KNN_score_gpu(
    calibration_array, test_array, regularization_array,
    calibration_label, Pprior, w_calibration,
    n_calibration_in_window, frac_regularization_in_window,
    normalization, impute, mi_scaling, n_bootstrap, batch_size,
)

# Write the bootstrap results to disk, then read them back from the same file.
np.save('/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen11_VUS.npy', test_results_array)
test_results_array = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen11_VUS.npy')

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(
    test_results_array, p_value, Pprior, logbase
)

# Only the rows that were scored receive a value; the remaining rows stay NaN.
test_data.loc[valid_test_idx, "ACMGLLR"] = ACMG_scores

In [None]:
import torch
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score_1D, evaluate_result_1D
import copy

# Single-tool (one-dimensional) calibration of the VUS set using BayesDel only.
# The resulting scores are stored in test_data["ACMGLLR_<tool column name>"].

# Parameter setting
Pprior = 0.0441
w_calibration = None
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 4096
normalization = 'z'
impute = False
mi_scaling = False
n_bootstrap = 100

p_value = 0.05
logbase = 1124

# Look the feature column up by name instead of hard-coding its position: the
# position depends on which entries of select_column_Clingen are enabled, so a
# fixed index would silently select a different tool if that list changes.
# list.index raises ValueError if the column is missing.
condition_string = 'BayesDel_nsfp33a_noAF'
i = select_column_Clingen.index(condition_string)
select_feature = i

# With a single feature, a row is usable only if that one score is present.
valid_calibration_idx = ~np.isnan(calibration_feature[:, select_feature])
calibration_array = calibration_feature[valid_calibration_idx, select_feature].reshape(-1, 1)
calibration_label = copy.deepcopy(calibration_label_bk[valid_calibration_idx])

valid_test_idx = ~np.isnan(test_feature[:, select_feature])
test_array = test_feature[valid_test_idx, select_feature].reshape(-1, 1)

valid_regularization_idx = ~np.isnan(regularization_feature[:, select_feature])
regularization_array = regularization_feature[valid_regularization_idx, select_feature].reshape(-1, 1)

print("")
print(f"Tool {condition_string}")
print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

torch.cuda.empty_cache()
gc.collect()

test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                 calibration_label, Pprior, w_calibration,
                                                 n_calibration_in_window, frac_regularization_in_window,
                                                 normalization, impute, mi_scaling, n_bootstrap, batch_size)

# Write the bootstrap results to disk, then read them back from the same file.
np.save(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen11_VUS_{condition_string}.npy', test_results_array)

test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen11_VUS_{condition_string}.npy')

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score_1D(test_results_array, test_array, p_value, Pprior, logbase)

# Only the rows that were scored receive a value; the remaining rows stay NaN.
test_data.loc[valid_test_idx, f"ACMGLLR_{condition_string}"] = ACMG_scores

In [None]:
import pandas as pd
from scipy import stats

# Paired one-sided t-test on the VUS set: is the absolute combined score
# ("ACMGLLR") greater than the absolute single-tool BayesDel score?
# Only variants that carry both scores are compared.
df = test_data[['ACMGLLR', 'ACMGLLR_BayesDel_nsfp33a_noAF']].dropna()
display(df)

p_value_cutoff = 0.05

t_stat, p_value = stats.ttest_rel(
    df['ACMGLLR'].abs(),
    df['ACMGLLR_BayesDel_nsfp33a_noAF'].abs(),
    alternative='greater',
)

print(f'ACMGLLR mean {df["ACMGLLR"].abs().mean()}')
print(f'ACMGLLR_BayesDel_nsfp33a_noAF mean {df["ACMGLLR_BayesDel_nsfp33a_noAF"].abs().mean()}')

# Start from the "not significant" verdict and override it only when the
# p-value clears the cutoff and the statistic has a definite sign.
result = "No significant difference between the two groups"
if p_value < p_value_cutoff:
    if t_stat > 0:
        result = "ACMGLLR is significantly higher to ACMGLLR_BayesDel_nsfp33a_noAF"
    elif t_stat < 0:
        result = "ACMGLLR is significantly lower to ACMGLLR_BayesDel_nsfp33a_noAF"

print(f"T-statistic: {t_stat}")
print(f"P-value: {p_value}")
print(result)

In [None]:
import numpy as np

# Reload the saved VUS bootstrap results and bin every variant into one of four
# evidence tiers (weights 8, 4, 2, 1), first on the pathogenic side and then on
# the benign side.
test_results_array = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Clingen11_VUS.npy')

logbase = 1124   # LR+ constant from Tavtigian et al. (changes w/ alpha)
Pprior = 0.0441  # Prior probability of pathogenicity (changes w/ c)
p_value = 0.05

# Posterior-probability thresholds for the four tiers; tier j uses the
# likelihood ratio logbase ** (1 / 2 ** (j - 1)).
Post_p = np.zeros(4)
Post_b = np.zeros(4)

for j in range(1, 5):
    tier_lr = logbase ** (1 / 2 ** (j - 1))
    Post_p[j-1] = tier_lr * Pprior / ((tier_lr - 1) * Pprior + 1)
    Post_b[j-1] = tier_lr * (1 - Pprior) / ((tier_lr - 1) * (1 - Pprior) + 1)

# Sort each variant's bootstrap values once, then take the value at rank
# `index` from the bottom (pathogenic) and from the top (benign).
index = int(np.ceil(p_value*test_results_array.shape[1]))
test_results_sorted = np.sort(test_results_array, axis=1)
P_KNN_pathogenic = test_results_sorted[:, index-1]
P_KNN_benign = 1 - test_results_sorted[:, -index]

P_KNN_evidence_array = np.zeros(len(P_KNN_pathogenic))
weights = [8, 4, 2, 1]

print("Pathogenic evidence")
proportion_old = 0
for i, threshold in enumerate(Post_p):
    above_threshold = P_KNN_pathogenic > threshold
    pathogenic_count = above_threshold.sum()
    proportion = pathogenic_count / len(P_KNN_pathogenic)
    print(f"Threshold {threshold:.3f}: {pathogenic_count} ({proportion:.2%}) pathogenic variants", f"{(proportion-proportion_old):.2%}")
    proportion_old = proportion
    # A variant keeps the first (strongest) tier it reaches.
    P_KNN_evidence_array[above_threshold & (P_KNN_evidence_array == 0)] = weights[i]

print("Benign evidence")
proportion_old = 0
for i, threshold in enumerate(Post_b):
    above_threshold = P_KNN_benign > threshold
    benign_count = above_threshold.sum()
    proportion = benign_count / len(P_KNN_benign)
    print(f"Threshold {threshold:.3f}: {benign_count} ({proportion:.2%}) benign variants", f"{(proportion-proportion_old):.2%}")
    proportion_old = proportion
    P_KNN_evidence_array[above_threshold & (P_KNN_evidence_array == 0)] = weights[i]

In [None]:
import numpy as np
# Per-tool score thresholds, four entries per tool. An infinite entry
# (np.inf on the pathogenic side, -np.inf on the benign side) can never be
# met by a finite score.
pathogenic_threshold_dict = dict(
    REVEL_score=[np.inf, 0.932, 0.773, 0.644],
    VEST4_score=[np.inf, 0.965, 0.861, 0.764],
    BayesDel_nsfp33a_noAF=[np.inf, 0.5, 0.27, 0.13],
)

benign_threshold_dict = dict(
    REVEL_score=[0.003, 0.016, 0.183, 0.290],
    VEST4_score=[-np.inf, -np.inf, 0.302, 0.449],
    BayesDel_nsfp33a_noAF=[-np.inf, -np.inf, -0.36, -0.18],
)

In [None]:
# Bin every variant into an evidence tier for each single tool, using the fixed
# per-tool thresholds from pathogenic_threshold_dict / benign_threshold_dict.
# Column p of SingleTool_evidence_array holds the tier weight for tool p
# (0 = no threshold met). Variants with a missing score never meet a threshold
# because comparisons against NaN are False, but they still count in the
# denominators of the printed proportions (len(score)).
SingleTool_evidence_array = np.zeros((len(test_data), 3))
weights = [8, 4, 2, 1]  # weight assigned for the 1st..4th threshold of each list

for p, tool in enumerate(['REVEL_score', 'VEST4_score', 'BayesDel_nsfp33a_noAF']):
    Post_p = pathogenic_threshold_dict[tool]
    Post_b = benign_threshold_dict[tool]
    score = test_data[tool]
    print(tool)
    print("Pathogenic evidence")
    proportion_old = 0

    for i, threshold in enumerate(Post_p):
        # Pathogenic intervals include their lower bound (score >= threshold).
        pathogenic_count = (score >= threshold).sum()
        proportion = pathogenic_count / len(score)
        print(f"Threshold {threshold:.3f}: {pathogenic_count} ({proportion:.2%}) pathogenic variants", f"{(proportion-proportion_old):.2%}")
        proportion_old = proportion
        # A variant keeps the first (strongest) tier it reaches.
        SingleTool_evidence_array[(score >= threshold) & (SingleTool_evidence_array[:, p] == 0), p] = weights[i]
    print("Benign evidence")

    proportion_old = 0
    for i, threshold in enumerate(Post_b):
        # Benign intervals include their upper bound (score <= threshold), e.g.
        # REVEL supporting is (0.183, 0.290]. A strict "<" would drop variants
        # whose score equals a threshold into a weaker tier or leave them
        # unassigned.
        benign_count = (score <= threshold).sum()
        proportion = benign_count / len(score)
        print(f"Threshold {threshold:.3f}: {benign_count} ({proportion:.2%}) benign variants", f"{(proportion-proportion_old):.2%}")
        proportion_old = proportion
        SingleTool_evidence_array[(score <= threshold) & (SingleTool_evidence_array[:, p] == 0), p] = weights[i]

In [None]:
import scipy.stats as stats

# Wilcoxon signed-rank test between the P-KNN evidence weights and the weights
# stored in column 0 of SingleTool_evidence_array.
stat, p_value = stats.wilcoxon(
    np.abs(P_KNN_evidence_array),
    np.abs(SingleTool_evidence_array[:, 0]),
)

print(f"Wilcoxon Statistic: {stat}")
print(f"P-value: {p_value}")

print(
    "Significant difference between the two tools."
    if p_value < 0.05
    else "No significant difference between the two tools."
)

In [None]:
import numpy as np

# Compare the median evidence weight of P-KNN with that of column 2 of
# SingleTool_evidence_array.
median_A, median_B = (
    np.median(np.abs(evidence))
    for evidence in (P_KNN_evidence_array, SingleTool_evidence_array[:, 2])
)

print(f"Median of P_KNN evidence: {median_A:.3f}")
print(f"Median of SingleTool evidence: {median_B:.3f}")

if median_A > median_B:
    print("P_KNN_evidence_array is generally stronger.")
elif median_B > median_A:
    print("SingleTool_evidence_array is generally stronger.")
else:
    print("Both tools have similar evidence assignment strength.")