# Generate feature and label

In [None]:
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split

# Seed NumPy and PyTorch before any random draw so that re-running the
# notebook regenerates the same synthetic data set.
for _seed_fn in (np.random.seed, torch.manual_seed):
    _seed_fn(42)

# Size of the synthetic data set (rows, columns).
num_samples, num_features = 145000, 17

# Independent standard-normal features, stored as float32.
features = np.random.normal(0, 1, (num_samples, num_features))
features = features.astype(np.float32)

# Pass the features through a random MLP to generate the labels
class ComplexMLP(nn.Module):
    """Fully connected network: input_dim -> 32 -> 16 -> 8 -> 1.

    A ReLU follows each of the three hidden layers; the single output unit
    is left linear.
    """

    def __init__(self, input_dim):
        super().__init__()
        # Creation order matters for reproducibility: each nn.Linear draws
        # its initial weights from the torch RNG as it is constructed.
        self.hidden1 = nn.Linear(input_dim, 32)
        self.hidden2 = nn.Linear(32, 16)
        self.hidden3 = nn.Linear(16, 8)
        self.output = nn.Linear(8, 1)
        self.activation = nn.ReLU()

    def forward(self, x):
        """Apply the three ReLU hidden layers, then the linear output layer."""
        for hidden_layer in (self.hidden1, self.hidden2, self.hidden3):
            x = self.activation(hidden_layer(x))
        return self.output(x)

# A randomly initialised (never trained) network defines the ground truth.
model = ComplexMLP(num_features)
features_tensor = torch.tensor(features)
# Only a forward pass is needed, so skip autograd bookkeeping for the
# whole data set.
with torch.no_grad():
    outputs = model(features_tensor).numpy().flatten()

# Binarise the network output at its median.
threshold = np.median(outputs)
labels = (outputs > threshold).astype(int)

# Add noise to the features after the labels have been derived from the
# clean values.
features += np.random.normal(loc=0, scale=0.01, size=(num_samples, num_features)).astype(np.float32)

# Split for model training, regularization, calibration and test.
# Absolute (integer) sizes are passed so the partition sizes never depend on
# floating-point rounding of a fraction:
#   145000 -> 50000 model       + 95000 remainder
#    95000 -> 35000 regularization (labels discarded) + 60000 remainder
#    60000 -> 10000 calibration + 50000 test
X_model, X_calibration, y_model, y_calibration = train_test_split(features, labels, test_size=95000, random_state=42)
X_regularization, X_calibration, _, y_calibration = train_test_split(X_calibration, y_calibration, test_size=60000, random_state=42)
X_calibration, X_test, y_calibration, y_test = train_test_split(X_calibration, y_calibration, test_size=50000, random_state=42)

print("Model train feature:", X_model.shape, "Model train label:", y_model.shape)
print("Regularization feature:", X_regularization.shape)
print("Calibrate feature:", X_calibration.shape, "Calibrate label:", y_calibration.shape)
print("Test feature:", X_test.shape, "Test label:", y_test.shape)

# Train elementary tools

In [None]:
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import roc_auc_score

# Exclude 2 features from training, so no tool ever sees them.
excluded_features = [0, 1]
available_features = [i for i in range(num_features) if i not in excluded_features]

# Re-seed so the feature subsets drawn below are reproducible.
np.random.seed(42)
# models = []
# Per-split lists of class-1 probabilities; one entry per trained tool.
model_feature = []
regularization_feature = []
calibration_feature = []
test_feature = []

# Model parameters: the six model types cycled through in every round.
model_classes = [GaussianNB, RandomForestClassifier, GaussianNB, RandomForestClassifier, LogisticRegression, LogisticRegression]
model_params = [
    {},
    {"n_estimators": 100, "min_samples_split": 10, "n_jobs": -1, "random_state": 42},
    {},
    {"n_estimators": 100, "min_samples_split": 10, "n_jobs": -1, "random_state": 42},
    {"random_state": 42},
    {"random_state": 42}
]

num_tools = 50
num_model_types = len(model_classes)
# Ceiling division, so that no empty or surplus round is scheduled when
# num_tools is an exact multiple of the number of model types.
num_rounds = -(-num_tools // num_model_types)

# A single flat loop trains exactly num_tools models.
for tool_idx in range(num_tools):
    round_idx, i = divmod(tool_idx, num_model_types)
    if i == 0:
        print(f"Training round {round_idx + 1}/{num_rounds}")
    model_class, params = model_classes[i], model_params[i]

    # Later rounds get fewer input features (one fewer every two rounds).
    selected_features = np.sort(np.random.choice(available_features, 12 - round_idx//2, replace=False))
    # selected_features = np.sort(np.random.choice(available_features, 14 - round_idx//2, replace=False))
    X_model_subset = X_model[:, selected_features]
    X_regularization_subset = X_regularization[:, selected_features]
    X_calibration_subset = X_calibration[:, selected_features]
    X_test_subset = X_test[:, selected_features]

    model = model_class(**params)
    model.fit(X_model_subset, y_model)

    # The tool's score is its predicted probability of class 1.
    y_model_prob = model.predict_proba(X_model_subset)[:, 1]
    y_regularization_prob = model.predict_proba(X_regularization_subset)[:, 1]
    y_calibration_prob = model.predict_proba(X_calibration_subset)[:, 1]
    y_test_prob = model.predict_proba(X_test_subset)[:, 1]

    # No AUC for the regularization split: its labels were discarded.
    auc_model = roc_auc_score(y_model, y_model_prob)
    auc_calibration = roc_auc_score(y_calibration, y_calibration_prob)
    auc_test = roc_auc_score(y_test, y_test_prob)

    print(f"Model {tool_idx} ({model_class.__name__}) trained with {len(selected_features)} features {selected_features}.")
    print(f"Model train AUC: {auc_model:.4f},  Calibrate AUC: {auc_calibration:.4f}, Calibrate test AUC: {auc_test:.4f}")
    model_feature.append(y_model_prob)
    regularization_feature.append(y_regularization_prob)
    calibration_feature.append(y_calibration_prob)
    test_feature.append(y_test_prob)
    # models.append(model)

# Stack into (n_samples, num_tools) matrices: one column per tool.
model_feature = np.column_stack(model_feature)
regularization_feature = np.column_stack(regularization_feature)
calibration_feature = np.column_stack(calibration_feature)
test_feature = np.column_stack(test_feature)

print("Tool calibration", calibration_feature.shape, "Tool test", test_feature.shape, "Tool regularization", regularization_feature.shape)

## silhouette score and mutual information of each tool

In [None]:
import torch 
import copy
from P_KNN_GPU import silhouette_score_1d_torch, get_score_rank_torch
from sklearn.feature_selection import mutual_info_classif
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# Rank-transformed calibration scores. Both arguments are the calibration
# matrix; NOTE(review): presumably each score is ranked within the
# calibration set itself - confirm against P_KNN_GPU.get_score_rank_torch.
calibration_feature_rank = get_score_rank_torch(torch.tensor(calibration_feature), torch.tensor(calibration_feature))
calibration_label_bk = y_calibration

# Per-tool metrics, in tool (column) order.
silhouette_raw_list = []
silhouette_rank_list = []
MI_raw_list = []
MI_rank_list = []
AUC_list = []

for i in range(calibration_feature_rank.shape[1]): 
    select_feature = i

    # Drop rows where this tool has no score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:,select_feature]) 
    calibration_array = torch.tensor(calibration_feature[valid_calibration_idx][:,select_feature].reshape(-1, 1))
    calibration_array_rank = calibration_feature_rank[valid_calibration_idx][:,select_feature].reshape(-1, 1)
    calibration_label = torch.tensor(copy.deepcopy(calibration_label_bk[valid_calibration_idx]))

    print(f"Tool {i}, calibration size: {calibration_array.shape[0]}")
    silhouette_raw = silhouette_score_1d_torch(calibration_array, calibration_label)
    silhouette_rank = silhouette_score_1d_torch(calibration_array_rank, calibration_label)
    # random_state pins the estimator's internal randomness so the MI values
    # do not depend on whatever state the global NumPy RNG is in.
    MI_raw = mutual_info_classif(calibration_array, calibration_label, random_state=42)
    MI_rank = mutual_info_classif(calibration_array_rank, calibration_label, random_state=42)
    AUC = roc_auc_score(calibration_label, calibration_array_rank)
    
    print('silhouette_raw', silhouette_raw)
    print('silhouette_rank', silhouette_rank)
    print('mutual information raw', MI_raw[0])
    print('mutual information rank', MI_rank[0])
    print('AUC', AUC)

    silhouette_raw_list.append(silhouette_raw) 
    silhouette_rank_list.append(silhouette_rank) 
    MI_raw_list.append(MI_raw[0])
    MI_rank_list.append(MI_rank[0])
    AUC_list.append(AUC)

    # Class masks used to colour the histograms.
    red_mask = calibration_label == 1
    blue_mask = calibration_label == 0

    # Left panel: raw scores; right panel: rank-transformed scores.
    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    axes[0].hist(calibration_array[red_mask], bins=30, color='red', alpha=0.6, label='Label 1')
    axes[0].hist(calibration_array[blue_mask], bins=30, color='blue', alpha=0.6, label='Label 0')
    axes[0].set_title(f"Histogram of tool {i}")
    axes[0].set_xlabel("Calibration Array Values")
    axes[0].set_ylabel("Frequency")
    axes[0].legend()

    axes[1].hist(calibration_array_rank[red_mask], bins=30, color='red', alpha=0.6, label='Label 1')
    axes[1].hist(calibration_array_rank[blue_mask], bins=30, color='blue', alpha=0.6, label='Label 0')
    axes[1].set_title(f"Histogram of Ranked tool {i}")
    axes[1].set_xlabel("Calibration Array Rank Values")
    axes[1].set_ylabel("Frequency")
    axes[1].legend()
    
    plt.tight_layout()
    plt.show()

# Summary scatter over all tools: black edges = raw scores, magenta edges =
# rank-transformed scores.
plt.figure(figsize=(8, 6))
plt.scatter(MI_raw_list, silhouette_raw_list, alpha=0.7, edgecolors='k')
plt.scatter(MI_rank_list, silhouette_rank_list, alpha=0.7, edgecolors='m')
plt.xlabel("Mutual Information")
plt.ylabel("Silhouette Score")
plt.title("Scatter Plot of Mutual Information vs. Silhouette Score")
plt.grid(True)
plt.show()

In [None]:
# Tool indices ordered by calibration AUC, highest first.
print("strongest to weakest")
_auc_ascending = np.argsort(AUC_list)
print(_auc_ascending[::-1])

In [None]:
# Mutual information (x) against silhouette score (y), one point per tool,
# drawn once for the raw scores and once for the rank-transformed scores.
plt.figure(figsize=(8, 6))
for _mi_values, _silhouette_values, _series in (
    (MI_raw_list, silhouette_raw_list, "raw"),
    (MI_rank_list, silhouette_rank_list, "rank"),
):
    plt.scatter(_mi_values, _silhouette_values, alpha=0.7, label=_series)
plt.title("Scatter Plot of Mutual Information vs. Silhouette Score")
plt.xlabel("Mutual Information")
plt.ylabel("Silhouette Score")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
# Mutual information (x) against calibration AUC (y), one point per tool.
# The AUC values are shared by both series; only the MI estimate differs.
plt.figure(figsize=(8, 6))
for _mi_values, _series in ((MI_raw_list, "raw"), (MI_rank_list, "rank")):
    plt.scatter(_mi_values, AUC_list, alpha=0.7, label=_series)
plt.title("Scatter Plot of Mutual Information vs. AUC")
plt.xlabel("Mutual Information")
plt.ylabel("AUC")
plt.grid(True)
plt.legend()
plt.show()

In [None]:
import numpy as np
import os

# Single definition of the output directory; np.save does not create missing
# directories, so make sure it exists first.
SIMULATION_DIR = '/gpfs/home/pl2948/VariantInterpretation/KNNsimulation'
os.makedirs(SIMULATION_DIR, exist_ok=True)

# Persist every tool-score matrix and the label vectors that go with them.
# The regularization split has no labels to save.
for _stem, _array in (
    ('model_feature', model_feature),
    ('model_label', y_model),
    ('calibration_feature', calibration_feature),
    ('calibration_label', y_calibration),
    ('test_feature', test_feature),
    ('test_label', y_test),
    ('regularization_feature', regularization_feature),
):
    np.save(os.path.join(SIMULATION_DIR, f'{_stem}.npy'), _array)

# Single Tool 1D calibration

In [None]:
import numpy as np

# Reload the saved tool scores and labels so this section can run without
# re-training the tools.
calibration_feature, y_calibration = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_label.npy'),
)
test_feature, y_test = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_label.npy'),
)
regularization_feature = np.load(
    '/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/regularization_feature.npy'
)

# Sanity check: report the shape of everything that was loaded.
for _caption, _loaded in (
    ("calibration_feature.shape:", calibration_feature),
    ("calibration_label.shape:", y_calibration),
    ("test_feature.shape:", test_feature),
    ("test_label.shape:", y_test),
    ("regularization_feature.shape:", regularization_feature),
):
    print(_caption, _loaded.shape)

In [None]:
import torch
import pandas as pd
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result_1D
import copy

# ---- Settings passed to get_bootstrap_KNN_score_gpu ----
Pprior = 0.0441  # prior probability of pathogenicity
w_calibration = None
n_calibration_in_window, frac_regularization_in_window = 100, 0.03
batch_size = 512  # 4096 for A100, if V100, use 512
n_bootstrap = 100
# Single-tool runs use the raw score: no normalisation, imputation or
# mutual-information scaling.
normalization = None
impute = mi_scaling = False

# ---- Settings passed to the scoring / evaluation helpers ----
p_value, logbase = 0.05, 1124

# ---- Accumulators filled by the per-tool loop ----
best_mean_evidence_strength = 0
mean_evidence_strength_list = list()
combine_data = pd.DataFrame()

# Calibrate and evaluate every tool on its own (one score column at a time).
for i in range(calibration_feature.shape[1]):
    select_feature = i
    condition_string = i

    # Keep only the rows where this tool has a score. Boolean-mask indexing
    # already returns fresh arrays, so no explicit copy is needed.
    valid_calibration_idx = ~np.isnan(calibration_feature[:, select_feature])
    calibration_array = calibration_feature[valid_calibration_idx, select_feature].reshape(-1, 1)
    calibration_label = y_calibration[valid_calibration_idx]

    valid_test_idx = ~np.isnan(test_feature[:, select_feature])
    test_array = test_feature[valid_test_idx, select_feature].reshape(-1, 1)
    test_label = y_test[valid_test_idx]

    valid_regularization_idx = ~np.isnan(regularization_feature[:, select_feature])
    regularization_array = regularization_feature[valid_regularization_idx, select_feature].reshape(-1, 1)

    print("")
    print(f"Tool {condition_string}")
    print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    # Free cached GPU memory left over from the previous tool.
    torch.cuda.empty_cache()
    gc.collect()

    test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                     calibration_label, Pprior, w_calibration,
                                                     n_calibration_in_window, frac_regularization_in_window,
                                                     normalization, impute, mi_scaling, n_bootstrap, batch_size)

    # Persist the bootstrap results, then read them back so the evaluation
    # below runs on exactly what is stored on disk.
    result_path = f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_singletool_{condition_string}.npy'
    np.save(result_path, test_results_array)
    test_results_array = np.load(result_path)

    P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

    evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result_1D(test_results_array,
                                                                                                      test_array,
                                                                                                      test_label,
                                                                                                      p_value,
                                                                                                      Pprior,
                                                                                                      logbase,
                                                                                                      category = condition_string,
                                                                                                      show_plot=True,
                                                                                                      save_name=None)

    combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

    # Summarise the tool by the average of its mean pathogenic and mean
    # benign evidence scores.
    pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
    benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
    mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

    print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
    print("benign evidence mean", f"{benign_evidence_mean:.3f}")
    mean_evidence_strength_list.append(mean_evidence_strength)

In [None]:
# Tool indices ordered by mean evidence strength, highest first.
print("strongest to weakest")
_strength_ascending = np.argsort(mean_evidence_strength_list)
print(_strength_ascending[::-1])

# Different number of tools

In [None]:
import numpy as np

# Reload the saved tool scores and labels so this section can run without
# re-training the tools.
calibration_feature, y_calibration = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_label.npy'),
)
test_feature, y_test = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_label.npy'),
)
regularization_feature = np.load(
    '/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/regularization_feature.npy'
)

# Sanity check: report the shape of everything that was loaded.
for _caption, _loaded in (
    ("calibration_feature.shape:", calibration_feature),
    ("calibration_label.shape:", y_calibration),
    ("test_feature.shape:", test_feature),
    ("test_label.shape:", y_test),
    ("regularization_feature.shape:", regularization_feature),
):
    print(_caption, _loaded.shape)

In [None]:
# ---- Calibration settings ----
Pprior = 0.0441  # Prior probability of pathogenicity (changes w/ c)
w_calibration = None
# minimum number of clinvar variants in a local window
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 512  # 4096 for A100, if V100, use 512
n_bootstrap = 100

# ---- Scoring settings ----
p_value, logbase = 0.05, 1124

# (pipeline name, normalization, mi_scaling); none of them imputes.
_PIPELINE_SPECS = (
    ("rank_MI", "rank", True),
    ("rank_noMI", "rank", False),
    ("z_MI", "z", True),
    ("z_noMI", "z", False),
)

# Keyword settings handed to the calibration call for each pipeline.
pipeline_dict = {
    _name: {"normalization": _norm, "mi_scaling": _use_mi, "impute": False}
    for _name, _norm, _use_mi in _PIPELINE_SPECS
}

# One independent, initially empty result list per pipeline.
ACMG_score_dict = {_name: [] for _name, _, _ in _PIPELINE_SPECS}

# Legend text for each pipeline.
category_dict = dict(
    rank_MI="ranking + MI",
    rank_noMI="ranking, no MI",
    z_MI="z-score + MI",
    z_noMI="z-score, no MI",
)

In [None]:
import random
import torch
import pandas as pd
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result
import copy

combine_data = pd.DataFrame()

# Fixed random ordering of the 50 tools; the first i entries form the tool
# set for the experiment with i tools, so the sets are nested.
random.seed(42)
sequence=random.sample(range(50), 50)
print(sequence)
num_tools = [2,3,4,6,8,10,13,15,20,25,30,40,50]

for i in num_tools:
    select_feature = sequence[:i]
    condition_string = i

    # Keep only rows where every selected tool has a score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:,select_feature]).any(axis=1)
    calibration_array = calibration_feature[valid_calibration_idx][:,select_feature]
    calibration_label = copy.deepcopy(y_calibration[valid_calibration_idx])

    valid_test_idx = ~np.isnan(test_feature[:,select_feature]).any(axis=1)
    test_array = test_feature[valid_test_idx][:,select_feature]
    test_label = y_test[valid_test_idx]

    valid_regularization_idx = ~np.isnan(regularization_feature[:,select_feature]).any(axis=1)
    regularization_array = regularization_feature[valid_regularization_idx][:,select_feature]

    print(f"{i} tools:, {select_feature}")
    print(f"train size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    # Reset the winner for this tool count. Starting from -inf (not 0) and
    # from None guarantees that a non-positive result is still reported and
    # that a winner from a previous tool count can never leak into this one.
    best_mean_evidence_strength = float("-inf")
    best_pipeline = None

    for pipeline_name, config in pipeline_dict.items():
        print(pipeline_name)
        # Free cached GPU memory left over from the previous run.
        torch.cuda.empty_cache()
        gc.collect()

        test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                         calibration_label, Pprior, w_calibration,
                                                         n_calibration_in_window, frac_regularization_in_window,
                                                         config["normalization"], config["impute"], config["mi_scaling"],
                                                         n_bootstrap, batch_size)

        # Persist the bootstrap results, then read them back so the
        # evaluation runs on exactly what is stored on disk.
        result_path = f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}_{pipeline_name}.npy'
        np.save(result_path, test_results_array)
        test_results_array = np.load(result_path)

        P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

        evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(test_results_array,
                                                                                                        test_label,
                                                                                                        p_value,
                                                                                                        Pprior,
                                                                                                        logbase,
                                                                                                        category = f"{condition_string} tools\n{pipeline_name}",
                                                                                                        show_plot=True,
                                                                                                        save_name=None)

        combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

        # Summarise the run by the average of its mean pathogenic and mean
        # benign evidence scores.
        pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
        benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
        mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

        if mean_evidence_strength > best_mean_evidence_strength:
            best_mean_evidence_strength = mean_evidence_strength
            best_pipeline = pipeline_name

        print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
        print("benign evidence mean", f"{benign_evidence_mean:.3f}")
        ACMG_score_dict[pipeline_name].append(mean_evidence_strength)


    print('Best pipeline', best_pipeline)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# One colour per pipeline, matched by position to ACMG_score_dict's keys.
colors = ["#006400", "#90EE90", "#FFA500", "#FFDAB9"]

plt.figure(figsize=(6, 3.5))
sns.set_theme(style="ticks")

# One line per pipeline: x = number of tools, y = mean evidence strength.
for _idx, _pipeline in enumerate(ACMG_score_dict):
    plt.plot(
        num_tools,
        ACMG_score_dict[_pipeline],
        marker='o',
        linestyle='-',
        color=colors[_idx],
        label=category_dict[_pipeline],
    )

plt.xlabel("Number of underlying tools", fontsize=14)
plt.ylabel("Mean evidence strength (LLR)", fontsize=14)
plt.legend(loc='lower right', fontsize=14)
plt.tight_layout()
sns.despine(top=True, right=True)
# Write the figure to disk before displaying it.
plt.savefig("Fig2A.svg", format="svg")
plt.show()

# Fraction of highly informative tools

In [None]:
import numpy as np

# Reload the saved tool scores and labels so this section can run without
# re-training the tools.
calibration_feature, y_calibration = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_label.npy'),
)
test_feature, y_test = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_label.npy'),
)
regularization_feature = np.load(
    '/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/regularization_feature.npy'
)

# Sanity check: report the shape of everything that was loaded.
for _caption, _loaded in (
    ("calibration_feature.shape:", calibration_feature),
    ("calibration_label.shape:", y_calibration),
    ("test_feature.shape:", test_feature),
    ("test_label.shape:", y_test),
    ("regularization_feature.shape:", regularization_feature),
):
    print(_caption, _loaded.shape)

In [None]:
# ---- Calibration settings ----
Pprior = 0.0441  # Prior probability of pathogenicity (changes w/ c)
w_calibration = None
# minimum number of clinvar variants in a local window
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 512  # 4096 for A100, if V100, use 512
n_bootstrap = 100

# ---- Scoring settings ----
p_value, logbase = 0.05, 1124

# (pipeline name, normalization, mi_scaling); none of them imputes.
_PIPELINE_SPECS = (
    ("rank_MI", "rank", True),
    ("rank_noMI", "rank", False),
    ("z_MI", "z", True),
    ("z_noMI", "z", False),
)

# Keyword settings handed to the calibration call for each pipeline.
pipeline_dict = {
    _name: {"normalization": _norm, "mi_scaling": _use_mi, "impute": False}
    for _name, _norm, _use_mi in _PIPELINE_SPECS
}

# One independent, initially empty result list per pipeline.
ACMG_score_dict = {_name: [] for _name, _, _ in _PIPELINE_SPECS}

# Legend text for each pipeline.
category_dict = dict(
    rank_MI="ranking + MI",
    rank_noMI="ranking, no MI",
    z_MI="z-score + MI",
    z_noMI="z-score, no MI",
)

In [None]:
import torch
import pandas as pd
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result
import copy

combine_data = pd.DataFrame()

# Hand-picked column indices of 15 strong and 15 weak tools.
good_tool = [9, 3, 21, 1, 15, 19, 13, 14, 45, 12, 0, 33, 2, 22, 30]
bad_tool=[24, 11, 29, 39, 44, 32, 48, 10, 43, 41, 34, 40, 35, 26, 16]

for i in [0,1,3,6,9,12,14,15]:
    # Always 15 tools in total: the first i good ones plus the weak ones
    # from position i onwards.
    select_feature = good_tool[:i]+bad_tool[i:]
    condition_string = f"{i}_goodtools"

    # Keep only rows where every selected tool has a score.
    valid_calibration_idx = ~np.isnan(calibration_feature[:,select_feature]).any(axis=1)
    calibration_array = calibration_feature[valid_calibration_idx][:,select_feature]
    calibration_label = copy.deepcopy(y_calibration[valid_calibration_idx])

    valid_test_idx = ~np.isnan(test_feature[:,select_feature]).any(axis=1)
    test_array = test_feature[valid_test_idx][:,select_feature]
    test_label = y_test[valid_test_idx]

    valid_regularization_idx = ~np.isnan(regularization_feature[:,select_feature]).any(axis=1)
    regularization_array = regularization_feature[valid_regularization_idx][:,select_feature]

    print(f"{i} good tools/15:, {select_feature}")
    print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    # Reset the winner for this setting. Starting from -inf (not 0) and from
    # None guarantees that a non-positive result is still reported and that
    # a winner from a previous setting can never leak into this one.
    best_mean_evidence_strength = float("-inf")
    best_pipeline = None

    for pipeline_name, config in pipeline_dict.items():
        print(pipeline_name)
        # Free cached GPU memory left over from the previous run.
        torch.cuda.empty_cache()
        gc.collect()

        test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                         calibration_label, Pprior, w_calibration,
                                                         n_calibration_in_window, frac_regularization_in_window,
                                                         config["normalization"], config["impute"], config["mi_scaling"],
                                                         n_bootstrap, batch_size)

        # Persist the bootstrap results, then read them back so the
        # evaluation runs on exactly what is stored on disk.
        result_path = f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}_{pipeline_name}.npy'
        np.save(result_path, test_results_array)
        test_results_array = np.load(result_path)

        P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

        evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(test_results_array,
                                                                                                        test_label,
                                                                                                        p_value,
                                                                                                        Pprior,
                                                                                                        logbase,
                                                                                                        category = f"{condition_string} tools\n{pipeline_name}",
                                                                                                        show_plot=True,
                                                                                                        save_name=None)

        combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

        # Summarise the run by the average of its mean pathogenic and mean
        # benign evidence scores.
        pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
        benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
        mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

        if mean_evidence_strength > best_mean_evidence_strength:
            best_mean_evidence_strength = mean_evidence_strength
            best_pipeline = pipeline_name

        print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
        print("benign evidence mean", f"{benign_evidence_mean:.3f}")
        ACMG_score_dict[pipeline_name].append(mean_evidence_strength)


    print('Best pipeline', best_pipeline)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# One colour per pipeline, matched by position to ACMG_score_dict's keys.
colors = ["#006400", "#90EE90", "#FFA500", "#FFDAB9"]

plt.figure(figsize=(6, 3.5))
sns.set_theme(style="ticks")

# One line per pipeline. The x positions are the numbers of good tools used
# in the experiment loop and must stay in the same order as that loop.
for i, (key, y_values) in enumerate(ACMG_score_dict.items()):
    plt.plot([0,1,3,6,9,12,14,15], y_values, marker='o', linestyle='-', color=colors[i], label=category_dict[key])

# Axis label fixed: the original text ended with a stray second ")".
plt.xlabel("Number of highly informative tools (Out of 15 total)", fontsize = 14)
plt.ylabel("Mean evidence strength (LLR)", fontsize=14)
plt.legend(loc='lower right', fontsize = 14)
plt.tight_layout()
sns.despine(top=True, right=True)
# Write the figure to disk before displaying it.
plt.savefig("Fig2B.svg", format="svg")
plt.show()

# Different patterns of missing values

In [None]:
import numpy as np

# Reload the saved tool scores and labels so this section can run without
# re-training the tools.
calibration_feature, y_calibration = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_label.npy'),
)
test_feature, y_test = (
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_feature.npy'),
    np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_label.npy'),
)
regularization_feature = np.load(
    '/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/regularization_feature.npy'
)

# Sanity check: report the shape of everything that was loaded.
for _caption, _loaded in (
    ("calibration_feature.shape:", calibration_feature),
    ("calibration_label.shape:", y_calibration),
    ("test_feature.shape:", test_feature),
    ("test_label.shape:", y_test),
    ("regularization_feature.shape:", regularization_feature),
):
    print(_caption, _loaded.shape)

## Function for generating missing values

In [None]:
import numpy as np

def apply_missing_values(data, missing_rates, method="random", protected_fraction=0.5, random_state=42):
    """Return a copy of ``data`` in which some entries are replaced by NaN.

    A random subset of rows (``protected_fraction`` of all rows) is never
    altered. Then, independently for every column ``col``, up to
    ``int(missing_rates[col] * n_rows)`` of the remaining rows are set to
    NaN. Which of those rows are candidates depends on ``method``:

    * ``"random"``   - every unprotected row.
    * ``"largest"``  - unprotected rows in the upper half of the column's
      ascending sort order.
    * ``"smallest"`` - unprotected rows in the lower half of that order.
    * ``"bulk"``     - unprotected rows inside a randomly positioned window
      spanning half of the sort order.

    When fewer candidates exist than requested, all candidates are blanked.

    Args:
        data: 2-D array-like of shape (n_rows, n_cols). It is not modified.
            Input whose dtype cannot hold NaN (e.g. integers) is returned
            as float; floating-point input keeps its dtype.
        missing_rates: per-column fraction of rows to blank, indexable by
            column number.
        method: one of ``"random"``, ``"largest"``, ``"smallest"``,
            ``"bulk"``.
        protected_fraction: fraction of rows that must stay complete.
        random_state: seed for ``numpy.random.default_rng``; the same seed
            and arguments give the same result.

    Returns:
        A new array with the same shape as ``data`` containing the NaNs.

    Raises:
        ValueError: if ``method`` is not one of the supported names.
    """
    # Reject a bad method before doing any work rather than in the column loop.
    if method not in ("random", "largest", "smallest", "bulk"):
        raise ValueError(f"Invalid method: {method}")

    rng = np.random.default_rng(random_state)

    data = np.asarray(data)
    if np.issubdtype(data.dtype, np.inexact):
        data_with_nan = data.copy()
    else:
        # NaN cannot be stored in integer/bool arrays; work on a float copy.
        data_with_nan = data.astype(float)
    num_rows, num_cols = data.shape
    half_rows = int(num_rows * 0.5)

    # Rows that are guaranteed to stay complete. The draw is made even when
    # it is empty so the random stream matches for every argument set.
    all_indices = np.arange(num_rows)
    num_protected = int(num_rows * protected_fraction)
    protected_indices = rng.choice(all_indices, size=num_protected, replace=False)
    non_protected_indices = np.setdiff1d(all_indices, protected_indices)

    for col in range(num_cols):
        num_missing = int(missing_rates[col] * num_rows)

        if method == "random":
            eligible_indices = non_protected_indices
        else:
            # Row indices ordered by this column's value, ascending.
            sorted_indices = np.argsort(data[:, col])
            if method == "largest":
                candidate_rows = sorted_indices[half_rows:]
            elif method == "smallest":
                candidate_rows = sorted_indices[:half_rows]
            else:  # "bulk"
                # Window of half the rows starting at a random quantile in
                # [0, 0.5), so it always fits inside the sort order.
                q_low = rng.uniform(0, 0.5)
                q_high = q_low + 0.5
                candidate_rows = sorted_indices[int(q_low * num_rows):int(q_high * num_rows)]
            eligible_indices = np.intersect1d(non_protected_indices, candidate_rows)

        # Never ask for more rows than are eligible.
        missing_indices = rng.choice(
            eligible_indices,
            size=min(num_missing, len(eligible_indices)),
            replace=False
        )

        data_with_nan[missing_indices, col] = np.nan

    return data_with_nan

## Calibration

In [None]:
import random

# ---- Calibration settings ----
Pprior = 0.0441  # Prior probability of pathogenicity (changes w/ c)
w_calibration = None
# minimum number of clinvar variants in a local window
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 512  # 4096 for A100, if V100, use 512
n_bootstrap = 100

# ---- Scoring settings ----
p_value, logbase = 0.05, 1124

# ---- Missing-value simulation settings ----
protected_fraction, random_state = 0, 42

# One missing rate per tool column, drawn uniformly from [0, 0.25).
np.random.seed(random_state)
_n_tool_columns = calibration_feature.shape[1]
missing_rate_array = np.random.uniform(0, 0.25, size=_n_tool_columns)
print("missing_rate_array")
print(missing_rate_array)

# Random subset of 15 of the 50 tool columns used in this experiment.
random.seed(random_state)
select_feature = random.sample(range(50), 15)
print("select_feature")
print(select_feature)

# Both pipelines rank-normalise with MI scaling; they differ only in whether
# missing values are imputed.
pipeline_dict = {
    _name: {"normalization": "rank", "mi_scaling": True, "impute": _impute}
    for _name, _impute in (
        ("rank_ImputeMissing_MI", True),
        ("rank_ExcludeMissing_MI", False),
    )
}

# One independent, initially empty result list per pipeline.
ACMG_score_dict = {_name: [] for _name in ("rank_ImputeMissing_MI", "rank_ExcludeMissing_MI")}

# Legend text for each pipeline.
category_dict = dict(
    rank_ExcludeMissing_MI="Exclude variants with missing values",
    rank_ImputeMissing_MI="Impute missing values",
)

In [None]:
import torch
import pandas as pd
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result
import copy

condition_list = ['random', "largest", "smallest", "bulk"]
combine_data = pd.DataFrame()

for missing_method in condition_list:
    condition_string = f"{missing_method}missing"

    # Start from -inf / None so the "best pipeline" is always decided inside
    # this condition: with the previous start value of 0 a non-positive or NaN
    # mean left `best_pipeline` undefined (first condition) or stale.
    best_mean_evidence_strength = float("-inf")
    best_pipeline = None

    # The simulated-missing arrays depend only on the missing pattern, not on
    # the pipeline, so they are generated once per condition.
    masked_calibration = apply_missing_values(calibration_feature, missing_rate_array, method=missing_method,
                                              protected_fraction=protected_fraction, random_state=random_state)
    masked_test = apply_missing_values(test_feature, missing_rate_array, method=missing_method,
                                       protected_fraction=protected_fraction, random_state=random_state)
    masked_regularization = apply_missing_values(regularization_feature, missing_rate_array, method=missing_method,
                                                 protected_fraction=protected_fraction, random_state=random_state)

    # Keep rows that still have at least one observed value among the selected columns.
    valid_calibration_idx = ~np.isnan(masked_calibration[:, select_feature]).all(axis=1)
    valid_test_idx = ~np.isnan(masked_test[:, select_feature]).all(axis=1)
    valid_regularization_idx = ~np.isnan(masked_regularization[:, select_feature]).all(axis=1)

    for pipeline_name, config in pipeline_dict.items():
        print(condition_string, pipeline_name)

        # Boolean/fancy indexing returns new arrays, so every pipeline receives
        # its own fresh copies of the inputs.
        calibration_array = masked_calibration[valid_calibration_idx][:, select_feature]
        calibration_label = copy.deepcopy(y_calibration[valid_calibration_idx])

        test_array = masked_test[valid_test_idx][:, select_feature]
        test_label = y_test[valid_test_idx]

        regularization_array = masked_regularization[valid_regularization_idx][:, select_feature]

        print(f"Missing Pattern: {missing_method}")
        print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

        test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                         calibration_label, Pprior, w_calibration,
                                                         n_calibration_in_window, frac_regularization_in_window,
                                                         config["normalization"], config["impute"], config["mi_scaling"],
                                                         n_bootstrap, batch_size)

        # Persist the raw result, then read it back so everything downstream
        # works from the file that was actually saved.
        np.save(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}_{pipeline_name}.npy', test_results_array)

        test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}_{pipeline_name}.npy')

        P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

        evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(
            test_results_array,
            test_label,
            p_value,
            Pprior,
            logbase,
            category=f"{condition_string} tools\n{pipeline_name}",
            show_plot=True,
            save_name=None)

        combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

        # Summary statistic: average of the per-label mean scores.
        pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
        benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
        mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

        if mean_evidence_strength > best_mean_evidence_strength:
            best_mean_evidence_strength = mean_evidence_strength
            best_pipeline = pipeline_name

        print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
        print("benign evidence mean", f"{benign_evidence_mean:.3f}")
        # NOTE: this appends to the lists created in the configuration cell;
        # re-run that cell before re-running this one to avoid duplicated entries.
        ACMG_score_dict[pipeline_name].append(mean_evidence_strength)

    print('Best pipeline', best_pipeline)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import matplotlib.cm as cm

# Grouped bar chart of the mean evidence strength: one group per missing-value
# pattern in `condition_list`, one bar per pipeline in `ACMG_score_dict`.
colors = cm.tab10(np.linspace(0, 1, len(ACMG_score_dict))) 
x = np.arange(len(condition_list))

bar_width = 0.2

plt.figure(figsize=(6, 3.5)) 
sns.set_theme(style="ticks")

# Each pipeline's bars are shifted right by one bar width per pipeline index.
# plt.bar needs exactly one value per condition, so every list in
# ACMG_score_dict must have len(condition_list) entries.
for i, (key, y_values) in enumerate(ACMG_score_dict.items()):
    plt.bar(x + i * bar_width, y_values, width=bar_width, color=colors[i], alpha=0.7, label=category_dict[key])

# NOTE: these tick labels are replaced by the second plt.xticks call below.
plt.xticks(x + bar_width / 2, condition_list) 
plt.xlabel("Simulation of Missing Values", fontsize=14)
plt.ylabel("Mean evidence strength (LLR)", fontsize=14)
plt.legend(loc='upper left', fontsize=13)
plt.ylim([1,3])
# Display names, listed in the same order as condition_list
# ("bulk" is shown as "Midrange").
new_labels = ["Random\nvalues", "Largest\nvalues", "Smallest\nvalues", "Midrange\nvalues"] 
# Ticks sit midway between the first and second bar of each group.
plt.xticks(x + bar_width / 2, new_labels) 

plt.tight_layout()
sns.despine(top=True, right=True)
plt.savefig("Fig2D.svg", format="svg")
plt.show()


# Different fractions of missing values

In [None]:
import numpy as np

# Load the saved simulation arrays (features and labels for the calibration and
# test sets, features only for the regularization set) from disk.
# NOTE(review): presumably reloaded here so this section can run on its own - confirm.
calibration_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_feature.npy')
y_calibration = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_label.npy')
test_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_feature.npy')
y_test = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_label.npy')
regularization_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/regularization_feature.npy')

# Print the array shapes as a quick sanity check of what was loaded.
print("calibration_feature.shape:", calibration_feature.shape)
print("calibration_label.shape:", y_calibration.shape)
print("test_feature.shape:", test_feature.shape)
print("test_label.shape:", y_test.shape)
print("regularization_feature.shape:", regularization_feature.shape)

## Function for generating missing values

In [None]:
import numpy as np

def apply_missing_values(data, missing_rates, method="random", protected_fraction=0.5, random_state=42):
    """Return a copy of ``data`` with simulated missing values (NaN) in every column.

    Args:
        data: 2-D array-like of shape (rows, columns). It is never modified.
            Integer and boolean input is promoted to float so NaN can be
            stored; other dtypes are kept as they are.
        missing_rates: Sequence with one missing rate per column. Column
            ``c`` receives up to ``int(missing_rates[c] * rows)`` NaNs.
        method: Which rows of a column may become missing:
            ``"random"``   - any non-protected row;
            ``"largest"``  - rows in the upper half of the column's sorted values;
            ``"smallest"`` - rows in the lower half of the column's sorted values;
            ``"bulk"``     - rows in a randomly placed window covering half of
            the column's sorted values.
        protected_fraction: Fraction of rows (sampled once, shared by all
            columns) that never receive a missing value.
        random_state: Seed for ``numpy.random.default_rng``.

    Returns:
        A new array of the same shape as ``data`` containing the NaNs. When
        fewer rows are eligible than requested, all eligible rows are set to NaN.

    Raises:
        ValueError: If ``method`` is not one of the four supported names.
    """
    # Validate up front so an invalid method is reported even for zero columns.
    if method not in ("random", "largest", "smallest", "bulk"):
        raise ValueError(f"Invalid method: {method}")

    rng = np.random.default_rng(random_state)
    data = np.asanyarray(data)

    # NaN cannot be written into bool/int arrays; promote those to float.
    if data.dtype.kind in "biu":
        data_with_nan = data.astype(float)
    else:
        data_with_nan = data.copy()

    num_rows, num_cols = data.shape
    half = int(num_rows * 0.5)

    all_indices = np.arange(num_rows)
    num_protected = int(num_rows * protected_fraction)
    protected_indices = rng.choice(all_indices, size=num_protected, replace=False)
    non_protected_indices = np.setdiff1d(all_indices, protected_indices)

    for col in range(num_cols):
        num_missing = int(missing_rates[col] * num_rows)
        eligible_indices = non_protected_indices

        if method != "random":
            # Row indices ordered by this column's value, ascending.
            sorted_indices = np.argsort(data[:, col])
            if method == "largest":
                candidates = sorted_indices[half:]
            elif method == "smallest":
                candidates = sorted_indices[:half]
            else:  # "bulk": window of half the rows starting at a random quantile
                q_low = rng.uniform(0, 0.5)
                q_high = q_low + 0.5
                candidates = sorted_indices[int(q_low * num_rows):int(q_high * num_rows)]
            eligible_indices = np.intersect1d(eligible_indices, candidates)

        # Finally draw, from the eligible rows, the indices to set to NaN.
        missing_indices = rng.choice(
            eligible_indices,
            size=min(num_missing, len(eligible_indices)),
            replace=False
        )

        data_with_nan[missing_indices, col] = np.nan

    return data_with_nan


## Calibration

In [None]:
import random
import numpy as np

# ---- P-KNN calibration hyper-parameters ----
Pprior = 0.0441  # prior probability of pathogenicity (changes w/ c)
w_calibration = None
n_calibration_in_window = 100  # minimum number of clinvar variants in a local window
frac_regularization_in_window = 0.03
batch_size = 512  # 4096 for A100, if V100, use 512
n_bootstrap = 100

# ---- Evidence-strength evaluation settings ----
p_value, logbase = 0.05, 1124

# ---- Missing-value simulation settings ----
protected_fraction, random_state = 0, 42

# One missing rate per feature column, drawn uniformly from [0, 0.25).
np.random.seed(random_state)
missing_rate_array = np.random.uniform(low=0, high=0.25, size=calibration_feature.shape[1])
print("missing_rate_array", missing_rate_array, sep="\n")

# Reproducibly pick 15 distinct column indices out of 0..49.
random.seed(random_state)
select_feature = random.sample(range(50), k=15)
print("select_feature", select_feature, sep="\n")

# Pipelines under comparison: they differ only in the "impute" flag.
pipeline_dict = {
    "rank_ImputeMissing_MI": dict(normalization="rank", mi_scaling=True, impute=True),
    "rank_ExcludeMissing_MI": dict(normalization="rank", mi_scaling=True, impute=False),
}

# Mean evidence strength collected per pipeline (one entry per simulated condition).
ACMG_score_dict = {pipeline_name: [] for pipeline_name in pipeline_dict}

# Legend text used when plotting each pipeline.
category_dict = {
    "rank_ExcludeMissing_MI": "Exclude variants with missing values",
    "rank_ImputeMissing_MI": "Impute missing values",
}

In [None]:
import torch
import pandas as pd
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result
import copy

missing_fraction = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.4]
combine_data = pd.DataFrame()

for max_missing_rate in missing_fraction:
    condition_string = f"{max_missing_rate}missing"

    # Per-column missing rates, drawn uniformly from [0, max_missing_rate);
    # reseeding makes every condition use the same underlying random draw.
    np.random.seed(42)
    missing_rate_array = np.random.uniform(0, max_missing_rate, size=calibration_feature.shape[1])

    # Start from -inf / None so the "best pipeline" is always decided inside
    # this condition: with the previous start value of 0 a non-positive or NaN
    # mean left `best_pipeline` undefined (first condition) or stale.
    best_mean_evidence_strength = float("-inf")
    best_pipeline = None

    # The simulated-missing arrays depend only on the missing rates, not on
    # the pipeline, so they are generated once per condition.
    masked_calibration = apply_missing_values(calibration_feature, missing_rate_array, method="random",
                                              protected_fraction=protected_fraction, random_state=random_state)
    masked_test = apply_missing_values(test_feature, missing_rate_array, method="random",
                                       protected_fraction=protected_fraction, random_state=random_state)
    masked_regularization = apply_missing_values(regularization_feature, missing_rate_array, method="random",
                                                 protected_fraction=protected_fraction, random_state=random_state)

    # Keep rows that still have at least one observed value among the selected columns.
    valid_calibration_idx = ~np.isnan(masked_calibration[:, select_feature]).all(axis=1)
    valid_test_idx = ~np.isnan(masked_test[:, select_feature]).all(axis=1)
    valid_regularization_idx = ~np.isnan(masked_regularization[:, select_feature]).all(axis=1)

    for pipeline_name, config in pipeline_dict.items():
        print(condition_string, pipeline_name)

        # Boolean/fancy indexing returns new arrays, so every pipeline receives
        # its own fresh copies of the inputs.
        calibration_array = masked_calibration[valid_calibration_idx][:, select_feature]
        calibration_label = copy.deepcopy(y_calibration[valid_calibration_idx])

        test_array = masked_test[valid_test_idx][:, select_feature]
        test_label = y_test[valid_test_idx]

        regularization_array = masked_regularization[valid_regularization_idx][:, select_feature]

        print(f"Missing rate: {max_missing_rate}")
        print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

        test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                         calibration_label, Pprior, w_calibration,
                                                         n_calibration_in_window, frac_regularization_in_window,
                                                         config["normalization"], config["impute"], config["mi_scaling"],
                                                         n_bootstrap, batch_size)

        # Persist the raw result, then read it back so everything downstream
        # works from the file that was actually saved.
        np.save(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}_{pipeline_name}.npy', test_results_array)

        test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}_{pipeline_name}.npy')

        P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

        evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(
            test_results_array,
            test_label,
            p_value,
            Pprior,
            logbase,
            category=f"{condition_string} tools\n{pipeline_name}",
            show_plot=True,
            save_name=None)

        combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

        # Summary statistic: average of the per-label mean scores.
        pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
        benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
        mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

        if mean_evidence_strength > best_mean_evidence_strength:
            best_mean_evidence_strength = mean_evidence_strength
            best_pipeline = pipeline_name

        print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
        print("benign evidence mean", f"{benign_evidence_mean:.3f}")
        # NOTE: this appends to the lists created in the configuration cell;
        # re-run that cell before re-running this one to avoid duplicated entries.
        ACMG_score_dict[pipeline_name].append(mean_evidence_strength)

    print('Best pipeline', best_pipeline)

In [None]:
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import seaborn as sns

# Line chart of the mean evidence strength against the maximum per-column
# missing rate, one line per pipeline in `ACMG_score_dict`.
colors = cm.tab10(np.linspace(0, 1, len(ACMG_score_dict))) 

# NOTE: bar_width is not used anywhere in this cell.
bar_width = 0.2

plt.figure(figsize=(6, 3.3)) 
sns.set_theme(style="ticks")

# The x positions are hard-coded percentages; plt.plot needs each y_values
# list to hold exactly seven entries, one per position.
# NOTE(review): they mirror `missing_fraction` from the loop cell (as %) - keep both in sync.
for i, (key, y_values) in enumerate(ACMG_score_dict.items()):
    plt.plot([5, 10, 15, 20, 25, 30, 40], y_values, marker='o', linestyle='-', color=colors[i], alpha=0.7, label=category_dict[key])
    
plt.xlabel("Max % of Missing Values per Tool", fontsize=14)
plt.ylabel("Mean evidence strength (LLR)", fontsize=14)
# plt.ylim([3.5,4.5])
plt.legend(loc='lower left', fontsize=13)
# plt.grid(True) 

plt.tight_layout()
sns.despine(top=True, right=True)
plt.savefig("Fig2C.svg", format="svg")
plt.show()

# Contamination with tool training data
Testing the final pipeline: rank normalization, KNN imputation, and mutual-information scaling

In [None]:
import numpy as np

# Load the saved model-set features and labels; rows from this set are later
# mixed into the calibration set.
model_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/model_feature.npy')
y_model = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/model_label.npy')
print("model_feature.shape:", model_feature.shape)
print("model_label.shape:", y_model.shape)

# Load the saved calibration / test / regularization arrays
# (the regularization set has features only).
calibration_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_feature.npy')
y_calibration = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/calibration_label.npy')
test_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_feature.npy')
y_test = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/test_label.npy')
regularization_feature = np.load('/gpfs/home/pl2948/VariantInterpretation/KNNsimulation/regularization_feature.npy')

# Print the array shapes as a quick sanity check of what was loaded.
print("calibration_feature.shape:", calibration_feature.shape)
print("calibration_label.shape:", y_calibration.shape)
print("test_feature.shape:", test_feature.shape)
print("test_label.shape:", y_test.shape)
print("regularization_feature.shape:", regularization_feature.shape)

## Generate data with missing values

In [None]:
import numpy as np
import random

def apply_missing_values(data, missing_rates, method="random", protected_fraction=0.5, random_state=42):
    """Return a copy of ``data`` with simulated missing values (NaN) in every column.

    Args:
        data: 2-D array-like of shape (rows, columns). It is never modified.
            Integer and boolean input is promoted to float so NaN can be
            stored; other dtypes are kept as they are.
        missing_rates: Sequence with one missing rate per column. Column
            ``c`` receives up to ``int(missing_rates[c] * rows)`` NaNs.
        method: Which rows of a column may become missing:
            ``"random"``   - any non-protected row;
            ``"largest"``  - rows in the upper half of the column's sorted values;
            ``"smallest"`` - rows in the lower half of the column's sorted values;
            ``"bulk"``     - rows in a randomly placed window covering half of
            the column's sorted values.
        protected_fraction: Fraction of rows (sampled once, shared by all
            columns) that never receive a missing value.
        random_state: Seed for ``numpy.random.default_rng``.

    Returns:
        A new array of the same shape as ``data`` containing the NaNs. When
        fewer rows are eligible than requested, all eligible rows are set to NaN.

    Raises:
        ValueError: If ``method`` is not one of the four supported names.
    """
    # Validate up front so an invalid method is reported even for zero columns.
    if method not in ("random", "largest", "smallest", "bulk"):
        raise ValueError(f"Invalid method: {method}")

    rng = np.random.default_rng(random_state)
    data = np.asanyarray(data)

    # NaN cannot be written into bool/int arrays; promote those to float.
    if data.dtype.kind in "biu":
        data_with_nan = data.astype(float)
    else:
        data_with_nan = data.copy()

    num_rows, num_cols = data.shape
    half = int(num_rows * 0.5)

    all_indices = np.arange(num_rows)
    num_protected = int(num_rows * protected_fraction)
    protected_indices = rng.choice(all_indices, size=num_protected, replace=False)
    non_protected_indices = np.setdiff1d(all_indices, protected_indices)

    for col in range(num_cols):
        num_missing = int(missing_rates[col] * num_rows)
        eligible_indices = non_protected_indices

        if method != "random":
            # Row indices ordered by this column's value, ascending.
            sorted_indices = np.argsort(data[:, col])
            if method == "largest":
                candidates = sorted_indices[half:]
            elif method == "smallest":
                candidates = sorted_indices[:half]
            else:  # "bulk": window of half the rows starting at a random quantile
                q_low = rng.uniform(0, 0.5)
                q_high = q_low + 0.5
                candidates = sorted_indices[int(q_low * num_rows):int(q_high * num_rows)]
            eligible_indices = np.intersect1d(eligible_indices, candidates)

        # Finally draw, from the eligible rows, the indices to set to NaN.
        missing_indices = rng.choice(
            eligible_indices,
            size=min(num_missing, len(eligible_indices)),
            replace=False
        )

        data_with_nan[missing_indices, col] = np.nan

    return data_with_nan

## Calibration

In [None]:
import random
import numpy as np

# ---- P-KNN calibration hyper-parameters ----
Pprior = 0.0441  # prior probability of pathogenicity (changes w/ c)
w_calibration = None
n_calibration_in_window = 100  # minimum number of clinvar variants in a local window
frac_regularization_in_window = 0.03
batch_size = 512  # 4096 for A100, if V100, use 512
n_bootstrap = 100

# ---- Single pipeline used in this section ----
normalization, impute, mi_scaling = "rank", True, True

# ---- Evidence-strength evaluation settings ----
p_value, logbase = 0.05, 1124

# Reproducibly pick 15 distinct column indices out of 0..49.
random.seed(42)
select_feature = random.sample(range(50), k=15)
print("select_feature", select_feature, sep="\n")

# ---- Missing-value simulation settings ----
protected_fraction, random_state = 0, 42

# One missing rate per feature column, drawn uniformly from [0, 0.25).
np.random.seed(42)
missing_rate_array = np.random.uniform(low=0, high=0.25, size=calibration_feature.shape[1])
print("missing_rate_array", missing_rate_array, sep="\n")

In [None]:
# Import everything this cell uses so the section does not depend on imports
# made by cells of earlier sections.
import numpy as np
import pandas as pd
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result

combine_data = pd.DataFrame()

# The simulated-missing arrays and the row filters do not depend on the
# contamination level, so they are computed once, outside the loop.
masked_calibration = apply_missing_values(calibration_feature, missing_rate_array, method="random",
                                          protected_fraction=protected_fraction, random_state=random_state)
masked_test = apply_missing_values(test_feature, missing_rate_array, method="random",
                                   protected_fraction=protected_fraction, random_state=random_state)
masked_regularization = apply_missing_values(regularization_feature, missing_rate_array, method="random",
                                             protected_fraction=protected_fraction, random_state=random_state)
masked_model = apply_missing_values(model_feature, missing_rate_array, method="random",
                                    protected_fraction=0, random_state=random_state)

# Keep rows that still have at least one observed value among the selected columns.
valid_idx_model = ~np.isnan(masked_model[:, select_feature]).all(axis=1)
valid_idx = ~np.isnan(masked_calibration[:, select_feature]).all(axis=1)
valid_idx_regularization = ~np.isnan(masked_regularization[:, select_feature]).all(axis=1)
valid_idx_test = ~np.isnan(masked_test[:, select_feature]).all(axis=1)

for i in range(7):
    # Number of model-set rows added, as a multiple of the calibration-set size
    # (0.0, 0.5, ..., 3.0).
    contamination_ratio = i / 2
    print(contamination_ratio)

    condition_string = f"{contamination_ratio:.1f}_modeltrain"
    print(condition_string)

    # Boolean/fancy indexing returns new arrays, so every iteration works on
    # fresh copies of the inputs.
    model_array = masked_model[valid_idx_model][:, select_feature]
    model_label = y_model[valid_idx_model]

    calibration_array = masked_calibration[valid_idx][:, select_feature]
    calibration_label = y_calibration[valid_idx]

    # Add part of the model-training data to the calibration set. Sampling is
    # without replacement, so np.random.choice raises if more rows are
    # requested than the model set contains.
    np.random.seed(42)
    add_count = int(contamination_ratio * len(calibration_array))
    add_model_idx = np.random.choice(len(model_array), add_count, replace=False)

    selected_model_array = model_array[add_model_idx]
    selected_model_label = model_label[add_model_idx]

    calibration_array = np.vstack((calibration_array, selected_model_array))
    calibration_label = np.hstack((calibration_label, selected_model_label))

    regularization_array = masked_regularization[valid_idx_regularization][:, select_feature]

    test_array = masked_test[valid_idx_test][:, select_feature]
    test_label = y_test[valid_idx_test]

    print(f"calibration size: {calibration_array.shape[0]}, test size: {test_array.shape[0]}")

    test_results_array = get_bootstrap_KNN_score_gpu(calibration_array, test_array, regularization_array,
                                                     calibration_label, Pprior, w_calibration,
                                                     n_calibration_in_window, frac_regularization_in_window,
                                                     normalization, impute, mi_scaling,
                                                     n_bootstrap, batch_size)

    # Persist the raw result, then read it back so everything downstream works
    # from the file that was actually saved.
    np.save(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}.npy', test_results_array)

    test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_numtool_{condition_string}.npy')

    P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

    evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result(
        test_results_array,
        test_label,
        p_value,
        Pprior,
        logbase,
        category=f"{contamination_ratio:.1f}",
        show_plot=True,
        save_name=condition_string)

    combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

    # Summary statistic: average of the per-label mean scores.
    pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
    benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
    mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

    print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
    print("benign evidence mean", f"{benign_evidence_mean:.3f}")

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
import colorsys

from mpl_toolkits.axes_grid1.inset_locator import inset_axes

def lighten_color(color, amount=0.5):
    """Return ``color`` blended towards white, as an ``(r, g, b)`` tuple.

    ``amount`` scales the gap between the color's lightness and white:
    ``amount=1`` keeps the original lightness, ``amount=0`` gives white, so
    smaller values produce a brighter result (values above 1 darken).

    Args:
        color: A key of ``matplotlib.colors.cnames`` or anything accepted by
            ``matplotlib.colors.to_rgb`` (e.g. an RGB tuple).
        amount: Lightness scaling factor described above.
    """
    try:
        c = mcolors.cnames[color]
    except (KeyError, TypeError):
        # Not a named color (KeyError) or not hashable, e.g. a list or array
        # (TypeError): let to_rgb interpret the value directly.
        c = color
    h, l, s = colorsys.rgb_to_hls(*mcolors.to_rgb(c))
    return colorsys.hls_to_rgb(h, 1 - amount * (1 - l), s)

# Ridge plot: one transparent-background facet row per category, each holding
# a filled KDE of the scores with a white outline drawn on top.
sns.set_theme(style="white", rc={"axes.facecolor": (0, 0, 0, 0)})
pal = sns.color_palette("dark", n_colors=len(combine_data["Category"].unique()))
g = sns.FacetGrid(combine_data, row="Category", hue="Category", aspect=12, height=0.6, palette=pal)

g.map(sns.kdeplot, "Score", bw_adjust=.6, clip_on=False, fill=True, alpha=0.8, linewidth=1.5)
g.map(sns.kdeplot, "Score", clip_on=False, color="w", lw=1, bw_adjust=.6)
g.refline(y=0, linewidth=2, linestyle="-", color=None, clip_on=False)

dark_gray = '0.3'
# Position of each category in order of first appearance; used to look up its palette color.
category_order = list(combine_data["Category"].unique())

# Iterate over the grid's own facet-name -> axes mapping so every boxplot is
# drawn in the facet of its category. (Pairing the axes with groupby keys
# relied on the sorted order matching the facet order.)
for cat, ax in g.axes_dict.items():
    inset_ax = inset_axes(
        ax,
        width="100%", height="30%",
        bbox_to_anchor=(0, 0.15, 1, 0.5),  # (x0, y0, width, height) in axes fraction
        bbox_transform=ax.transAxes,
        loc="lower left",
        borderpad=0
    )

    orig_color = pal[category_order.index(cat)]
    light_color = lighten_color(orig_color, 0.2)

    sns.boxplot(
        x="Score",
        data=combine_data[combine_data["Category"] == cat],
        ax=inset_ax,
        color=light_color,
        showcaps=True,
        showfliers=False,
        whiskerprops={'color': dark_gray, 'linewidth': 1.5, 'zorder': 2},
        capprops={'color': dark_gray, 'linewidth': 1.5},
        medianprops={'color': dark_gray, 'linewidth': 1.5},
        boxprops={'alpha': 0.7, 'zorder': 2, 'edgecolor': dark_gray, 'linewidth': 1.5},
    )
    # Share the facet's x range so the box lines up with the density above it.
    inset_ax.set_xlim(ax.get_xlim())
    inset_ax.axis("off")

def label(x, color, label):
    """Write ``label`` followed by "x" in bold near the lower-left corner of the current axes.

    The text is drawn in ``color``; ``x`` is accepted but not used.
    """
    current_axes = plt.gca()
    text_style = dict(fontweight="bold", color=color, fontsize=14, ha="left", va="center")
    # Coordinates are axes fractions because of the transAxes transform.
    current_axes.text(0.05, 0.15, f"{label}x", transform=current_axes.transAxes, **text_style)

# Annotate every facet with its category label.
g.map(label, "Category")

# A negative hspace makes neighbouring facet rows overlap.
g.figure.subplots_adjust(hspace=-.6)
# Remove the per-facet titles and y axes; only the shared x label remains.
g.set_titles("")
g.set(yticks=[], ylabel="")
g.set_xlabels("Evidence strength (LLR)", fontsize=14)
g.despine(bottom=True, left=True)

plt.savefig("Fig2H_with_box.svg", format="svg", bbox_inches="tight")
plt.show()

# P-KNN vs best underlying tool

In [None]:
import torch
import pandas as pd
import gc
from P_KNN_GPU import get_bootstrap_KNN_score_gpu, get_P_KNN_ACMG_score, evaluate_result_1D
import copy

# Parameter setting
Pprior = 0.0441
w_calibration = None
n_calibration_in_window = 100
frac_regularization_in_window = 0.03
batch_size = 512 # 4096 for A100, if V100, use 512
normalization = None
impute = False
mi_scaling = False
n_bootstrap = 100

p_value = 0.05
logbase = 1124

best_mean_evidence_strength = 0
mean_evidence_strength_list = []

# Evaluate the single tool stored in column 1.
# NOTE: `select_feature` is rebound from the list of columns used by the
# earlier cells to a single column index.
i=1
select_feature = i
condition_string = i

# Keep the test rows that have a value for this tool, as an (n, 1) array.
valid_test_idx = ~np.isnan(test_feature[:,select_feature])
test_array = test_feature[valid_test_idx][:,select_feature].reshape(-1, 1)
test_label = copy.deepcopy(y_test[valid_test_idx])

# Count the calibration rows for this tool the same way as the test rows.
# The previous code printed the size of `calibration_array`, a leftover of
# whichever earlier cell ran last, not the calibration set of this tool.
# NOTE(review): assumes the stored single-tool result was produced with this
# same row filter - confirm.
n_calibration_rows = int((~np.isnan(calibration_feature[:, select_feature])).sum())

print("")
print(f"Tool {condition_string}")
print(f"calibration size: {n_calibration_rows}, test size: {test_array.shape[0]}")

# The scores for this tool are read from a previously saved result file.
test_results_array = np.load(f'/gpfs/home/pl2948/VariantInterpretation/KNNTestResultArrays/Simulation_singletool_{condition_string}.npy')

P_KNN_pathogenic, P_KNN_benign, ACMG_scores = get_P_KNN_ACMG_score(test_results_array, p_value, Pprior, logbase)

evidence_strength_data, pathogenic_calibration_dict, benign_calibration_dict = evaluate_result_1D(
    test_results_array,
    test_array,
    test_label,
    p_value,
    Pprior,
    logbase,
    category=condition_string,
    show_plot=True,
    save_name=None)

# Append to the table built by the earlier cells so both can be plotted together.
combine_data = pd.concat([combine_data, evidence_strength_data], ignore_index=True)

# Summary statistic: average of the per-label mean scores.
pathogenic_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Pathogenic variants']['Score'].mean()
benign_evidence_mean = evidence_strength_data[evidence_strength_data['Label']=='Benign variants']['Score'].mean()
mean_evidence_strength = (pathogenic_evidence_mean + benign_evidence_mean)/2

print("pathogenic evidence mean", f"{pathogenic_evidence_mean:.3f}")
print("benign evidence mean", f"{benign_evidence_mean:.3f}")
mean_evidence_strength_list.append(mean_evidence_strength)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keep the two groups being compared: category 1 (the single-tool evaluation)
# and category '0.0' (the multi-tool run without added model-training data).
combine_data = combine_data[(
    combine_data['Category']==1) | (
    combine_data['Category']=='0.0')]

# Fix the x-axis order explicitly so the hand-written tick labels below always
# match the data, whatever order the rows were appended in.
category_order = [1, '0.0']
category_labels = ["Best individual tool", "P-KNN"]
hue_order = ['Pathogenic variants', 'Benign variants']

## violin plot
sns.set(style="ticks")
plt.figure(figsize=(4, 3.5))
ax = sns.violinplot(
    x="Category",
    y="Score",
    hue="Label",   # Pathogenic and Benign
    data=combine_data,
    order=category_order,
    split=True,   # on each side of violin
    inner="box",
    palette={"Pathogenic variants": "red", "Benign variants": "blue"},
    alpha=0.6,
    density_norm='area',
    hue_order=hue_order
)

dark_gray = '0.3'

# Narrow box plots drawn on top of the violins, in the same category order.
sns.boxplot(
    x="Category", y="Score", hue="Label", data=combine_data,
    order=category_order,
    showcaps=True,
    showfliers=False,
    palette={"Pathogenic variants": "pink", "Benign variants": "#bae0f5"},
    width=0.1, dodge=True, ax=ax,
    whiskerprops={'color': dark_gray, 'linewidth': 1.5, 'zorder': 2},
    capprops={'color': dark_gray, 'linewidth': 1.5},
    medianprops={'color': dark_gray, 'linewidth': 1.5},
    boxprops={'zorder': 2, 'edgecolor': dark_gray, 'linewidth': 1.5},
    hue_order=hue_order
)

# Both layers register legend entries; keep one handle per label class (the
# first ones returned) so handles and labels have the same length.
handles, labels = ax.get_legend_handles_labels()
plt.legend(handles[:len(hue_order)], hue_order, fontsize=12, loc='lower center', bbox_to_anchor=(0.45, -0.02))

plt.xticks(ticks=range(len(category_labels)), labels=category_labels, fontsize=14)
plt.xlabel("")
plt.ylabel("Evidence strength (LLR)", fontsize=14)
plt.yticks([-8, -4, 0, 4, 8], fontsize=14)
sns.despine(top=True, right=True)

plt.savefig("Fig2E.svg", format="svg")
plt.show()