# Fig 1

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.patches import Rectangle, Circle

# Index, within the pathogenic sample, of the point drawn in black in both panels.
i = 4
np.random.seed(0)

# Benign points: centred at (20, -20) with standard deviation 15 on each axis.
x_small = np.random.normal(loc=20, scale=15, size=50)
y_small = np.random.normal(loc=-20, scale=15, size=50)

# Pathogenic points: centred at (40, 20) with standard deviation 20 on each axis.
x_large = np.random.normal(loc=40, scale=20, size=50)
y_large = np.random.normal(loc=20, scale=20, size=50)

# Uniform vertical positions used only to spread the points in the upper
# (one-dimensional) panel; they carry no meaning.
y_small_random = np.random.uniform(0, 100, size=50)
y_large_random = np.random.uniform(0, 100, size=50)

# Pool the two classes, always benign first and pathogenic second.
x_all = np.concatenate([x_small, x_large])
y_all = np.concatenate([y_small, y_large])
y_random_all = np.concatenate([y_small_random, y_large_random])
points_all = np.column_stack((x_all, y_all))

fig, axes = plt.subplots(
    2, 1,
    figsize=(4.1, 5.5),
    gridspec_kw={'height_ratios': [1.5, 4]},
)
ax_top, ax_bottom = axes

# ---- Upper panel: only the x coordinate is informative ----
ax_top.scatter(x_small, y_small_random, color='blue', label='Benign')
ax_top.scatter(x_large, y_large_random, color='red', label='Pathogenic')
ax_top.scatter(x_large[i], y_large_random[i], color='black', s=30, zorder=5)
ax_top.set_yticks([])

# The window spans the x range of the 10 points closest (in x) to the
# highlighted point, and the full current height of the axes.
center_x = x_large[i]
x_distances = np.abs(x_all - center_x)
closest_indices = np.argsort(x_distances)[:10]
x_selected = x_all[closest_indices]

x_min, x_max = x_selected.min(), x_selected.max()
y_bottom, y_top = ax_top.get_ylim()

rect = Rectangle(
    (x_min, y_bottom),
    width=x_max - x_min,
    height=y_top - y_bottom,
    linewidth=1,
    facecolor='orange',
    edgecolor='orange',
    alpha=0.5,
)
ax_top.add_patch(rect)

# ---- Lower panel: both coordinates are used ----
center_xy = np.array([x_large[i], y_large[i]])
dists = np.linalg.norm(points_all - center_xy, axis=1)
closest_indices_2d = np.argsort(dists)[:10]
selected_points = points_all[closest_indices_2d]

# Radius of the circle = distance from the highlighted point to the farthest
# of its 10 nearest points (the highlighted point itself is among them).
r = dists[closest_indices_2d].max()

ax_bottom.scatter(x_small, y_small, color='blue', label='Benign')
ax_bottom.scatter(x_large, y_large, color='red', label='Pathogenic')
circle = Circle(
    center_xy,
    radius=r,
    linewidth=1,
    facecolor='orange',
    edgecolor='orange',
    alpha=0.5,
)
ax_bottom.add_patch(circle)
ax_bottom.scatter(center_xy[0], center_xy[1], color='black', s=30, zorder=5)

ax_bottom.set_xlabel('Tool 1 Score', fontsize=14)
ax_bottom.set_ylabel('Tool 2 Score', fontsize=14)

plt.tight_layout()
plt.savefig("Fig1BC1.svg", format="svg")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

np.random.seed(0)

# Number of nearest neighbours inspected around every query location.
K_NEIGHBORS = 10
# Width of one heatmap cell, in score units, along both axes.
GRID_STEP = 0.2

# x_all and points_all hold the benign points first and the pathogenic points
# second (that is the order in which they were concatenated), so the class of
# a point follows from its position. Looking the class up by position avoids
# matching float values against x_large, which would miscount whenever a
# benign and a pathogenic point happen to share the same x value.
assert len(points_all) == len(x_all) == len(x_small) + len(x_large)
is_pathogenic = np.arange(len(x_all)) >= len(x_small)

# 1-D estimate: for every observed x, the fraction of pathogenic points among
# its K_NEIGHBORS nearest neighbours along x (the point itself included).
red_ratios = []
for x_center in x_all:
    x_distances = np.abs(x_all - x_center)
    closest_indices = np.argsort(x_distances)[:K_NEIGHBORS]
    red_ratios.append(np.sum(is_pathogenic[closest_indices]) / K_NEIGHBORS)

# 2-D estimate on a regular grid covering the observed score range.
x_bins = np.arange(min(x_all), max(x_all) + GRID_STEP, GRID_STEP)
y_bins = np.arange(min(y_all), max(y_all) + GRID_STEP, GRID_STEP)
bin_centers_x = (x_bins[:-1] + x_bins[1:]) / 2
bin_centers_y = (y_bins[:-1] + y_bins[1:]) / 2

# Rows follow y and columns follow x, the layout expected by imshow.
heatmap_values = np.zeros((len(bin_centers_y), len(bin_centers_x)))

# The loop indices are deliberately not called `i`/`j`: `i` is already used in
# this notebook as the index of the highlighted point.
for col, x_center in enumerate(bin_centers_x):
    for row, y_center in enumerate(bin_centers_y):
        center_point = np.array([x_center, y_center])

        distances = np.linalg.norm(points_all - center_point, axis=1)
        closest_indices = np.argsort(distances)[:K_NEIGHBORS]

        heatmap_values[row, col] = np.sum(is_pathogenic[closest_indices]) / K_NEIGHBORS

In [None]:
fig, axes = plt.subplots(
    2, 1,
    figsize=(4.3, 6.55),
    gridspec_kw={'height_ratios': [1.5, 5.05]},
)
ax_ratio, ax_heat = axes

# Upper panel: one marker per point, placed and coloured by the fraction of
# pathogenic points among its nearest neighbours along x.
sc = ax_ratio.scatter(
    x_all, red_ratios,
    c=red_ratios, cmap='coolwarm',
    s=50, edgecolor='black',
)
ax_ratio.set_xlabel("Tool 1 Score", fontsize=14)

# Bin edges along x; the first and last edge give the horizontal extent of
# the heatmap below.
x_bins = np.arange(min(x_all), max(x_all) + 0.2, 0.2)
bin_centers = (x_bins[:-1] + x_bins[1:]) / 2

# Lower panel: 2-D heatmap of the same fraction evaluated on the grid.
# origin="lower" puts row 0 (smallest y) at the bottom of the image.
heat_extent = [min(x_bins), max(x_bins), min(y_bins), max(y_bins)]
pcm = ax_heat.imshow(
    heatmap_values,
    origin="lower",
    aspect="auto",
    extent=heat_extent,
    cmap="coolwarm",
)
ax_heat.set_xlabel("Tool 1 Score", fontsize=14)
ax_heat.set_ylabel("Tool 2 Score", fontsize=14)

# The colorbar is intentionally left without a label.
cbar = plt.colorbar(pcm, ax=ax_heat, location="top")

plt.tight_layout()
plt.savefig("Fig1BC2.svg", format="svg")
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.feature_selection import mutual_info_classif

np.random.seed(0)
size = 100  # number of points per class

# Benign points: normal on x, 1.4 raised to a normal variate on y.
x_small = np.random.normal(loc=2, scale=1.5, size=size)
y_small = 1.4 ** np.random.normal(loc=2, scale=1.5, size=size)

# Pathogenic points: same construction with larger location and spread.
x_large = np.random.normal(loc=6, scale=2.0, size=size)
y_large = 1.4 ** np.random.normal(loc=4.5, scale=2.0, size=size)

# Pool the two classes, benign first and pathogenic second.
x_all = np.concatenate([x_small, x_large])
y_all = np.concatenate([y_small, y_large])

# Rank normalisation: argsort applied twice yields the 0-based rank of each value.
x_ranks = x_all.argsort().argsort()
y_ranks = y_all.argsort().argsort()

points_all = np.column_stack((x_ranks, y_ranks))
# 0.0 for the first `size` (benign) rows, 1.0 for the remaining (pathogenic) rows.
label = np.repeat([0.0, 1.0], size)

# One mutual-information estimate per column (tool) against the class label.
MI = mutual_info_classif(points_all, label)

# Stretch every rank axis by the mutual information of its tool.
x_MI = MI[0] * x_ranks
y_MI = MI[1] * y_ranks

fig, axes = plt.subplots(1, 3, figsize=(12, 4))

benign = slice(None, size)
pathogenic = slice(size, None)
panels = (
    (x_all, y_all, "Raw Score"),
    (x_ranks, y_ranks, "Ranking Normalization"),
    (x_MI, y_MI, "Mutual Information Scaling"),
)
for ax, (xs, ys, title) in zip(axes, panels):
    ax.scatter(xs[benign], ys[benign], color='blue', label='Benign', s=10)
    ax.scatter(xs[pathogenic], ys[pathogenic], color='red', label='Pathogenic', s=10)
    ax.set_xlabel("Tool 1 Score")
    ax.set_ylabel("Tool 2 Score")
    ax.set_title(title)

# Only the MI-scaled panel uses equal axis units and fixed limits, so that the
# different stretch of the two axes is visible.
axes[2].set_aspect("equal")
axes[2].set_xlim(-5, 85)
axes[2].set_ylim(-25, 65)

plt.tight_layout()
plt.savefig("Fig1C.svg", format="svg")
plt.show()

# Silhouette score

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, roc_auc_score
from sklearn.feature_selection import mutual_info_classif

size = 1000  # number of samples per class

# The labels do not depend on the mean shift: `size` zeros followed by `size` ones.
labels = np.repeat([0, 1], size)

# For each separation between the two class means, report how the silhouette
# score reacts to monotone transforms of the score (exp, rank), alongside the
# AUC and the mutual information, and show the three class histograms.
for mean_shift in np.arange(2.5, 6, 0.5):
    print("mean difference size", mean_shift)
    # Re-seed on every iteration so each mean shift starts from the same
    # random stream.
    np.random.seed(42)
    data1 = np.random.normal(loc=0, scale=1, size=size)           # class 0: mean=0, std=1
    data2 = np.random.normal(loc=mean_shift, scale=1, size=size)  # class 1: mean=mean_shift, std=1

    data = np.concatenate((data1, data2))

    score_original = silhouette_score(data.reshape(-1, 1), labels)
    print(f"Silhouette Score (raw score): {score_original:.4f}")

    data_exp = np.exp(data)
    score_exp = silhouette_score(data_exp.reshape(-1, 1), labels)
    print(f"Silhouette Score (exp score): {score_exp:.4f}")

    # argsort applied twice yields the 0-based rank of every value.
    data_rank = np.argsort(np.argsort(data))
    score_rank = silhouette_score(data_rank.reshape(-1, 1), labels)
    print(f"Silhouette Score (rank score): {score_rank:.4f}")

    auc_score_original = roc_auc_score(labels, data)
    print(f"AUC (raw score): {auc_score_original:.4f}")

    # This value is computed on the raw scores, so it is reported as
    # "raw score" (it used to be printed as "exp score").
    mutual_information_original = mutual_info_classif(data.reshape(-1, 1), labels)[0]
    print(f"Mutual information (raw score): {mutual_information_original:.4f}")
    mutual_information_rank = mutual_info_classif(data_rank.reshape(-1, 1), labels)[0]
    print(f"Mutual information (rank score): {mutual_information_rank:.4f}")

    # One histogram panel per representation; the first `size` entries of each
    # array belong to class 0 and the rest to class 1.
    fig, axes = plt.subplots(1, 3, figsize=(12, 5))
    panels = (
        (data, "Raw score"),
        (data_exp, "Exp score"),
        (data_rank, "Rank score"),
    )
    for ax, (values, title) in zip(axes, panels):
        ax.hist(values[:size], bins=20, alpha=0.6, color='blue', label='Cluster 0')
        ax.hist(values[size:], bins=20, alpha=0.6, color='red', label='Cluster 1')
        ax.legend()
        ax.set_title(title)

    fig.tight_layout()
    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score, roc_auc_score

np.random.seed(42)

# Class 0 is drawn once and reused for every mean of class 1.
data1 = np.random.normal(loc=0, scale=1, size=100)
labels_fixed = np.zeros(100, dtype=int)

# One entry per tested mean, filled in the loop below.
silhouette_scores_raw = []
silhouette_scores_exp = []
silhouette_scores_rank = []
auc_scores = []
means = np.arange(2.5, 6.5, 0.1)

for mean in means:
    # Class 1: fresh normal sample centred at the current mean (std=1).
    data2 = np.random.normal(loc=mean, scale=1, size=100)
    labels = np.ones(100, dtype=int)

    # Column vector of all scores, class 0 first, with matching labels.
    data = np.concatenate((data1, data2))[:, np.newaxis]
    all_labels = np.concatenate((labels_fixed, labels))

    # Two monotone transforms of the same scores: exponential and 0-based rank.
    data_exp = np.exp(data)
    data_rank = data.ravel().argsort().argsort()[:, np.newaxis]

    auc_scores.append(roc_auc_score(all_labels, data))
    silhouette_scores_raw.append(silhouette_score(data, all_labels))
    silhouette_scores_exp.append(silhouette_score(data_exp, all_labels))
    silhouette_scores_rank.append(silhouette_score(data_rank, all_labels))


# Silhouette score of each representation against the AUC of the raw score.
series = (
    (silhouette_scores_raw, 'blue', 'raw score'),
    (silhouette_scores_exp, 'red', 'exp score'),
    (silhouette_scores_rank, 'orange', 'rank score'),
)
for scores, colour, name in series:
    plt.scatter(auc_scores, scores, color=colour, label=name)
plt.ylabel("Silhouette Score")
plt.xlabel("AUC")
plt.legend()
plt.title("Silhouette Score vs. AUC")
plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import skewnorm
from sklearn.metrics import silhouette_score, roc_auc_score

np.random.seed(42)

# Opposite skew parameters for the two classes.
skew_param1 = 4
skew_param2 = -4

# Class 0 is drawn once and reused for every location of class 1.
data1 = skewnorm.rvs(a=skew_param1, loc=0, scale=1, size=100)
labels_fixed = np.zeros(100, dtype=int)

# NOTE: this sample is overwritten on the first loop iteration and never used,
# but the draw is kept so the samples drawn inside the loop stay unchanged.
data2 = skewnorm.rvs(a=skew_param2, loc=3, scale=1, size=100)

# One entry per tested location, filled in the loop below.
silhouette_scores_raw = []
silhouette_scores_exp = []
silhouette_scores_rank = []
auc_scores = []
means = np.arange(2.5, 6.5, 0.1)

for mean in means:
    # Class 1: fresh skew-normal sample whose `loc` is the current value.
    data2 = skewnorm.rvs(a=skew_param2, loc=mean, scale=1, size=100)
    labels = np.ones(100, dtype=int)

    # Column vector of all scores, class 0 first, with matching labels.
    data = np.concatenate((data1, data2))[:, np.newaxis]
    all_labels = np.concatenate((labels_fixed, labels))

    # Two monotone transforms of the same scores: exponential and 0-based rank.
    data_exp = np.exp(data)
    data_rank = data.ravel().argsort().argsort()[:, np.newaxis]

    auc_scores.append(roc_auc_score(all_labels, data))
    silhouette_scores_raw.append(silhouette_score(data, all_labels))
    silhouette_scores_exp.append(silhouette_score(data_exp, all_labels))
    silhouette_scores_rank.append(silhouette_score(data_rank, all_labels))

# Silhouette score of each representation against the AUC of the raw score.
plt.figure(figsize=(6, 4))
series = (
    (silhouette_scores_raw, 'blue', 'raw score'),
    (silhouette_scores_exp, 'red', 'exp score'),
    (silhouette_scores_rank, 'orange', 'rank score'),
)
for scores, colour, name in series:
    plt.scatter(auc_scores, scores, color=colour, label=name)
plt.ylabel("Silhouette Score")
plt.xlabel("AUC")
plt.title("Silhouette Score vs. AUC in skew normal distribution")
plt.legend()
plt.show()