計算 Silhouette Score 獲得最佳分群數(依照Ward分群)

In [2]:
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from sklearn.metrics import silhouette_score
from scipy.spatial.distance import squareform
import pandas as pd
import numpy as np

In [None]:
# Read the distance matrix and keep every row but drop the first CSV column.
# NOTE(review): the dropped column is presumably a saved row index / label
# column — confirm against how distance_matrix.csv was written.
data = pd.read_csv("../distance_matrix.csv").iloc[:, 1:]
data

In [None]:
# Clean the distance matrix before clustering:
#   1. missing entries (NaN) are replaced with 1,
#   2. entries equal to 0 are replaced with 1,
#   3. the diagonal (distance of an item to itself) is reset to 0.
# NOTE(review): 1 is presumably the largest distance this matrix can hold,
# i.e. unknown/zero pairs are treated as maximally distant — confirm.
dist_values = data.fillna(1).replace(0, 1).to_numpy(dtype=float, copy=True)
np.fill_diagonal(dist_values, 0.0)

# Keep `data` as a DataFrame with the original labels: later cells pass it to
# silhouette_score(metric='precomputed') and read the names from data.columns.
data = pd.DataFrame(dist_values, index=data.index, columns=data.columns)

# Convert the square matrix to the condensed vector form expected by linkage().
# checks=False disables squareform's validation of the input matrix.
# NOTE(review): with validation off, an asymmetric matrix would not be
# reported here — confirm the CSV is symmetric.
condensed_data = squareform(dist_values, force='tovector', checks=False)

# NOTE(review): Ward linkage is applied to precomputed distances; verify that
# these distances are appropriate for the Ward criterion.
linkage_matrix = linkage(condensed_data, method='ward')

# Draw the tree; color_threshold=1.9 sets the height used to colour branches.
# Leaf labels and ticks on the x axis are hidden.
dendrogram(linkage_matrix, color_threshold=1.9)
plt.tick_params(axis='x', which='both', bottom=False, top=False, labelbottom=False)
plt.xlabel('Genes')
plt.title('Dendrogram using Ward Method')
plt.show()

In [None]:
# Candidate numbers of clusters, shared by the scoring step and the x axis.
cluster_counts = range(2, 11)

# For each candidate, cut the Ward tree with criterion='maxclust' and score the
# resulting flat clustering against the precomputed distance matrix.
silhouette_scores = [
    silhouette_score(
        data,
        fcluster(linkage_matrix, t=n_clusters, criterion='maxclust'),
        metric='precomputed',
    )
    for n_clusters in cluster_counts
]

fig, ax1 = plt.subplots(figsize=(8, 6))
ax1.plot(cluster_counts, silhouette_scores, marker='o', color='green')
ax1.set(
    title='Silhouette Score for Optimal K using Ward Method',
    xlabel='Number of Clusters (K)',
    ylabel='Silhouette Score',
)

fig.tight_layout()
plt.show()

In [20]:
# Cut the Ward tree with criterion='maxclust' and t=8, then write one row per
# column name of `data` (stored as 'Gene') with its cluster id to CSV.
labels = fcluster(linkage_matrix, criterion='maxclust', t=8)
df = pd.DataFrame({'Gene': data.columns, 'Cluster': labels})
df.to_csv('ward_labels.csv', index=False)

使用Elbow method計算K-means最佳分群數 (outdated)

In [4]:
from sklearn.metrics import silhouette_score
from ipywidgets import interact

In [5]:
# Read the table and drop its first CSV column.
# Note: this rebinds `data`, replacing the distance matrix loaded earlier.
# NOTE(review): the dropped column is presumably a saved index — confirm.
data = pd.read_csv("../result/all_beta_normalized_train_0.35.csv").iloc[:, 1:]

In [None]:
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Embed the rows of `data` into 3 dimensions with t-SNE, then group the
# embedded points with K-means. Both estimators are seeded (same seed as the
# K sweep further down) so the figure is reproducible across kernel restarts.
tsne = TSNE(n_components=3, random_state=42)
reduced_data_tsne = tsne.fit_transform(data)

kmeans = KMeans(n_clusters=4, random_state=42)
clusters = kmeans.fit_predict(reduced_data_tsne)

# Initial camera angles in degrees; used as the sliders' starting values.
elev_init, azim_init = 20, 30

@interact(elev=(0, 90, 5), azim=(0, 360, 5))
def visualize_3d_scatter(elev=elev_init, azim=azim_init):
    """Plot the 3D t-SNE embedding coloured by K-means cluster.

    Parameters
    ----------
    elev : int
        Elevation angle of the camera, passed to ``ax.view_init``.
    azim : int
        Azimuth angle of the camera, passed to ``ax.view_init``.
    """
    fig = plt.figure(figsize=(8, 6))
    ax = fig.add_subplot(111, projection='3d')
    # Draw the embedded points; without this call the axes would be empty.
    ax.scatter(
        reduced_data_tsne[:, 0],
        reduced_data_tsne[:, 1],
        reduced_data_tsne[:, 2],
        c=clusters,
        cmap='viridis',
    )
    ax.set_xlabel('t-SNE Component 1')
    ax.set_ylabel('t-SNE Component 2')
    ax.set_zlabel('t-SNE Component 3')
    ax.set_title('t-SNE 3D Visualization with KMeans Clustering')
    ax.view_init(elev=elev, azim=azim)
    plt.show()


In [None]:
# Sweep K for K-means on `data`, collecting two diagnostics per K.
k_values = range(2, 11)
inertia = []
silhouette_scores = []

for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(data)
    # Inertia: sum of squared distances from each sample to its assigned
    # cluster centre; it measures how compact the clusters are.
    inertia.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(data, labels))

# Left panel: inertia (elbow curve). Right panel: silhouette score.
fig, (ax_inertia, ax_silhouette) = plt.subplots(1, 2, figsize=(12, 5))

ax_inertia.plot(k_values, inertia, marker='o')
ax_inertia.set(
    title='Elbow Method for Optimal K (Inertia)',
    xlabel='Number of Clusters (K)',
    ylabel='Inertia',
)

ax_silhouette.plot(k_values, silhouette_scores, marker='o', color='green')
ax_silhouette.set(
    title='Silhouette Score for Optimal K',
    xlabel='Number of Clusters (K)',
    ylabel='Silhouette Score',
)

fig.tight_layout()
plt.show()
