In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle

df = pd.read_csv("/gpfs/commons/home/atalukder/Contrastive_Learning/files/RECOMB_26/tsne_4exon_2025_11_11__17_27_21/UMAP_n15_d0.2_embedding.csv")

In [2]:
cluster_labels = df['exon_name']

# --- UMAP coordinates ---
X = df[['x', 'y']].values

# --- Compute the observed silhouette score ---
obs_silhouette = silhouette_score(X, cluster_labels)
print(f"Observed silhouette score: {obs_silhouette:.4f}")

# --- Permutation test for significance ---
n_perm = 1000
perm_scores = []
for i in range(n_perm):
    shuffled = shuffle(cluster_labels, random_state=i)
    perm_score = silhouette_score(X, shuffled)
    perm_scores.append(perm_score)

perm_scores = np.array(perm_scores)

# --- Compute one-sided p-value ---
p_value = (np.sum(perm_scores >= obs_silhouette) + 1) / (n_perm + 1)
print(f"P-value (one-sided): {p_value:.4f}")


Observed silhouette score: 0.3313
P-value (one-sided): 0.0010


In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle
from scipy.stats import mannwhitneyu

# --- Load your dataframe (already has x, y, tissue) ---
df = pd.read_csv("/gpfs/commons/home/atalukder/Contrastive_Learning/files/RECOMB_26/tsne_4exon_2025_11_11__17_27_21/UMAP_n15_d0.2_embedding.csv")
# --- Ensure consistent column names ---
df.columns = [c.lower() for c in df.columns]

from scipy.stats import kruskal

groups_x = [g["x"].values for _, g in df.groupby("exon_name")]
groups_y = [g["y"].values for _, g in df.groupby("exon_name")]

H_x, p_x = kruskal(*groups_x)
H_y, p_y = kruskal(*groups_y)
print(f"UMAP-x: H = {H_x:.2f}, p = {p_x:.4e}")
print(f"UMAP-y: H = {H_y:.2f}, p = {p_y:.4e}")



UMAP-x: H = 129.08, p = 8.5455e-28
UMAP-y: H = 106.94, p = 5.0088e-23


In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle
from scipy.stats import mannwhitneyu

# --- Load your dataframe (already has x, y, tissue) ---
df = pd.read_csv("/gpfs/commons/home/atalukder/Contrastive_Learning/files/RECOMB_26/Brain_All_embeddings_2025_11_08__02_21_57/UMAP_n30_d0.3_embedding.csv")

# # --- Extract coordinates and labels ---
# X = df[['x', 'y']].values
# labels = df['tissue']

# # --- Compute observed silhouette score ---
# obs_sil = silhouette_score(X, labels)
# print(f"Observed silhouette: {obs_sil:.4f}")

# --- Split into groups ---
group1 = df[df['tissue'] == 'high']
group2 = df[df['tissue'] == 'low']

# --- Mann–Whitney tests for each UMAP axis ---
u_x, p_x = mannwhitneyu(group1['x'], group2['x'], alternative='two-sided')
u_y, p_y = mannwhitneyu(group1['y'], group2['y'], alternative='two-sided')

print("=== Mann–Whitney U Test ===")
print(f"UMAP-x: U = {u_x:.2f},  p = {p_x:.4e}")
print(f"UMAP-y: U = {u_y:.2f},  p = {p_y:.4e}")

# --- Optional effect size (rank-biserial correlation) ---
n1, n2 = len(group1), len(group2)
rbc_x = 1 - (2 * u_x) / (n1 * n2)
rbc_y = 1 - (2 * u_y) / (n1 * n2)
print(f"Effect size (rank-biserial):  x = {rbc_x:.3f},  y = {rbc_y:.3f}")


=== Mann–Whitney U Test ===
UMAP-x: U = 76222.00,  p = 4.7857e-17
UMAP-y: U = 126992.00,  p = 2.0938e-04
Effect size (rank-biserial):  x = 0.316,  y = -0.140


In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from sklearn.utils import shuffle
from scipy.stats import mannwhitneyu

# --- Load your dataframe (already has x, y, tissue) ---
df = pd.read_csv("/gpfs/commons/home/atalukder/Contrastive_Learning/files/RECOMB_26/tsne_highLowExpr_2025_11_08__01_12_14/UMAP_n10_d0.1_embedding.csv")

# # --- Extract coordinates and labels ---
# X = df[['x', 'y']].values
# labels = df['tissue']

# # --- Compute observed silhouette score ---
# obs_sil = silhouette_score(X, labels)
# print(f"Observed silhouette: {obs_sil:.4f}")

# --- Split into groups ---
group1 = df[df['tissue'] == 1.0]
group2 = df[df['tissue'] == 0.0]

# --- Mann–Whitney tests for each UMAP axis ---
u_x, p_x = mannwhitneyu(group1['x'], group2['x'], alternative='two-sided')
u_y, p_y = mannwhitneyu(group1['y'], group2['y'], alternative='two-sided')

print("=== Mann–Whitney U Test ===")
print(f"UMAP-x: U = {u_x:.2f},  p = {p_x:.4e}")
print(f"UMAP-y: U = {u_y:.2f},  p = {p_y:.4e}")

# --- Optional effect size (rank-biserial correlation) ---
n1, n2 = len(group1), len(group2)
rbc_x = 1 - (2 * u_x) / (n1 * n2)
rbc_y = 1 - (2 * u_y) / (n1 * n2)
print(f"Effect size (rank-biserial):  x = {rbc_x:.3f},  y = {rbc_y:.3f}")


=== Mann–Whitney U Test ===
UMAP-x: U = 99368.00,  p = 1.9911e-08
UMAP-y: U = 120858.00,  p = 3.6446e-01
Effect size (rank-biserial):  x = 0.205,  y = 0.033


In [4]:
n_perm = 1000
perm_scores = []

for i in range(n_perm):
    shuffled = shuffle(labels, random_state=i)
    perm_score = silhouette_score(X, shuffled)
    perm_scores.append(perm_score)

perm_scores = np.array(perm_scores)
p_val = (np.sum(perm_scores >= obs_sil) + 1) / (n_perm + 1)
print(f"P-value (one-sided): {p_val:.4f}")


P-value (one-sided): 0.0010


In [None]:
from hyppo.ksample import Energy
import numpy as np
import pandas as pd

X = df.loc[df['tissue'] == 'high', ['x','y']].values
Y = df.loc[df['tissue'] == 'low', ['x','y']].values

stat, p_val = Energy().test(X, Y)
print(f"Energy distance test: stat = {stat:.4f}, p = {p_val:.4f}")


ModuleNotFoundError: No module named 'skbio'

In [None]:
import matplotlib.pyplot as plt
plt.hist(perm_scores, bins=30, alpha=0.7, label='Permuted')
plt.axvline(obs_silhouette, color='r', linestyle='--', label='Observed')
plt.xlabel('Silhouette Score')
plt.ylabel('Count')
plt.legend()
plt.show()