In [3]:
# 导入必要的库
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Read the two-feature sample set. The path is relative to the working
# directory and points into the clusterData/ sub-folder.
url = "clusterData/ex7data2.csv"
data = pd.read_csv(url, names=["X1", "X2"], header=None)

# Show the first rows as a quick check that the file parsed as expected.
print("数据的前 5 行:", data.head(), sep="\n")

# Standardise both features so neither dominates the distance computation.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Fit K-Means with three clusters; the fixed seed keeps the run repeatable.
kmeans = KMeans(random_state=42, n_clusters=3).fit(scaled_data)

# Per-sample cluster ids and the cluster centres. The centres are expressed in
# the standardised feature space because the model was fitted on scaled_data.
labels, centroids = kmeans.labels_, kmeans.cluster_centers_

# Keep each sample's cluster id next to its raw features.
data["Cluster"] = labels

# Scatter the samples in their original units, coloured by cluster id.
fig = px.scatter(
    data,
    x="X1",
    y="X2",
    color="Cluster",
    title="K-Means 聚类结果",
    color_continuous_scale="Viridis",
    labels={"X1": "特征 1", "X2": "特征 2", "Cluster": "簇"},
)

# The model was fitted on standardised features, so `centroids` is expressed in
# standardised coordinates. Map it back to the original units first; otherwise
# the markers are drawn in the wrong place relative to the X1/X2 scatter.
centroids_original = scaler.inverse_transform(centroids)

# Overlay the cluster centres as large red crosses.
fig.add_scatter(
    x=centroids_original[:, 0],
    y=centroids_original[:, 1],
    mode="markers",
    marker=dict(
        size=20,
        color="red",
        symbol="x",
        line=dict(width=2, color="DarkSlateGrey"),
    ),
    name="聚类中心",
)

# Render the figure.
fig.show()

# Clustering quality of the fitted model (mean silhouette coefficient).
silhouette_avg = silhouette_score(scaled_data, labels)
print(f"聚类的轮廓系数: {silhouette_avg:.3f}")

# Elbow method: record the inertia for k = 1..10.
# Each candidate model is a throw-away object inside the comprehension, so the
# fitted `kmeans` that produced `labels` and `centroids` is not overwritten.
cluster_counts = list(range(1, 11))
inertia = [
    KMeans(n_clusters=n, random_state=42).fit(scaled_data).inertia_
    for n in cluster_counts
]

# Plot inertia against the number of clusters.
fig_elbow = px.line(
    x=cluster_counts,
    y=inertia,
    title="肘部法则：选择最佳簇数",
    labels={"x": "簇数", "y": "惯性"},
)
fig_elbow.show()

数据的前 5 行:
         X1        X2
0  1.842080  4.607572
1  5.658583  4.799964
2  6.352579  3.290854
3  2.904017  4.612204
4  3.231979  4.939894


聚类的轮廓系数: 0.702


In [8]:
# 导入必要的库
import pandas as pd  # 用于数据处理和分析
import numpy as np  # 用于数值计算
import plotly.express as px  # 用于创建交互式可视化图表
from sklearn.cluster import DBSCAN  # 导入DBSCAN聚类算法
from sklearn.preprocessing import StandardScaler  # 用于数据标准化
from sklearn.metrics import silhouette_score  # 用于评估聚类效果
from sklearn.cluster import KMeans  # 用于计算聚类中心

# Load the data from the same relative location the other cells of this
# notebook read it from, so the notebook depends on a single copy of the file.
url = "clusterData/ex7data2.csv"
data = pd.read_csv(url, header=None, names=["X1", "X2"])  # columns named X1 and X2

# Show the first rows to confirm the file was loaded as expected.
print("数据的前 5 行:")
print(data.head())

# Standardise the features so both are on the same scale before clustering.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# DBSCAN hyper-parameter search: try every (eps, min_samples) pair and keep the
# one with the highest mean silhouette coefficient.
eps_range = np.linspace(0.1, 1.0, 20)
min_samples_range = range(2, 11)

# Starting values; they are kept if no combination scores above -1.
best_silhouette = -1
best_eps = 0.5
best_min_samples = 5

for eps in eps_range:
    for min_samples in min_samples_range:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples).fit(scaled_data)
        labels = dbscan.labels_

        # The silhouette coefficient needs at least two distinct labels.
        if len(set(labels)) <= 1:
            continue

        # The score is computed over all labels as returned by DBSCAN.
        silhouette_avg = silhouette_score(scaled_data, labels)
        if silhouette_avg > best_silhouette:
            best_silhouette, best_eps, best_min_samples = (
                silhouette_avg,
                eps,
                min_samples,
            )

# Refit once with the winning parameters and keep the final labels.
dbscan = DBSCAN(eps=best_eps, min_samples=best_min_samples).fit(scaled_data)
labels = dbscan.labels_

# Keep each sample's cluster id next to its raw features.
data["Cluster"] = labels

# Compute one centre per cluster as the mean of its member points.
# The centres are taken from the raw X1/X2 columns (not the standardised array)
# because they are overlaid on a scatter plot drawn in the original units.
# A plain mean replaces the previous single-cluster K-Means fit, whose only
# centre is that same mean.
feature_values = data[["X1", "X2"]].to_numpy()
centers = np.array(
    [
        feature_values[labels == cluster].mean(axis=0)
        for cluster in np.unique(labels)
        if cluster != -1  # label -1 marks noise points, which have no centre
    ]
).reshape(-1, 2)  # stays two-dimensional even when no cluster was found

# Scatter the samples in their original units, coloured by cluster id.
fig = px.scatter(
    data,
    x="X1",
    y="X2",
    color="Cluster",
    title="DBSCAN 聚类结果",
    color_continuous_scale="Viridis",
    labels={"X1": "特征 1", "X2": "特征 2", "Cluster": "簇"},
)

# Marker style shared by all cluster-centre points: large red crosses.
center_marker = dict(
    size=20,
    color="red",
    symbol="x",
    line=dict(width=2, color="DarkSlateGrey"),
)

# Overlay the centres only when at least one cluster was found.
# NOTE(review): the markers line up with the scatter only if `centers` is
# expressed in the same units as X1/X2 - confirm where `centers` is computed.
if len(centers) > 0:
    fig.add_scatter(
        x=centers[:, 0],
        y=centers[:, 1],
        mode="markers",
        marker=center_marker,
        name="聚类中心",
    )

# Render the figure.
fig.show()

# Clustering quality of the final labelling (mean silhouette coefficient).
# silhouette_score raises ValueError unless the number of distinct labels is
# between 2 and n_samples - 1, which can happen when the parameter search fell
# back to its defaults; report N/A in that case instead of crashing the cell.
n_distinct_labels = len(set(labels))
if 1 < n_distinct_labels < len(labels):
    silhouette_avg = silhouette_score(scaled_data, labels)
    print(f"聚类的轮廓系数: {silhouette_avg:.3f}")
else:
    silhouette_avg = None
    print("聚类的轮廓系数: N/A")

# k-distance plot to help choose eps.
# Row i of `distances` holds the distances from point i to every point, sorted
# ascending; column 0 is the point's distance to itself (0).
pairwise = np.linalg.norm(
    scaled_data[:, None, :] - scaled_data[None, :, :], axis=2
)
distances = np.sort(pairwise, axis=1)

k = 5  # use the k-th nearest neighbour
# Sort the k-th neighbour distances across points. Plotting them in raw point
# order gives a noisy line with no readable elbow; the sorted curve is what the
# plot title describes.
k_distances = np.sort(distances[:, k])

fig_dist = px.line(
    x=np.arange(len(k_distances)),
    y=k_distances,
    title="距离排序图（选择 eps）",
    labels={"x": "点索引", "y": f"第 {k} 个最近邻距离"},
)
fig_dist.show()

数据的前 5 行:
         X1        X2
0  1.842080  4.607572
1  5.658583  4.799964
2  6.352579  3.290854
3  2.904017  4.612204
4  3.231979  4.939894


聚类的轮廓系数: 0.700


In [11]:
# 导入必要的库
import pandas as pd
import numpy as np
import plotly.express as px
import hdbscan  # 改进：使用更先进的密度聚类算法
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.model_selection import ParameterGrid
from scipy.spatial.distance import pdist, squareform
from kneed import KneeLocator  # 用于自动寻找拐点

# Load the data: the CSV is read with header=None and its two columns are
# named X1 and X2.
url = "clusterData/ex7data2.csv"
data = pd.read_csv(url, header=None, names=["X1", "X2"])

# Preprocessing: standardise the features before clustering.
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)

# Improvement 1: estimate the eps parameter automatically
def find_optimal_eps(data, min_samples=5, percentile=95):
    """Estimate eps from the knee of the sorted k-distance curve.

    Args:
        data: Array-like of shape (n_points, n_features).
        min_samples: Which neighbour's distance to use. Index 0 of each sorted
            row is the point itself, so index ``min_samples`` is the
            ``min_samples``-th nearest other point. Clamped to ``n_points - 1``
            so small inputs do not raise IndexError.
        percentile: Percentile of the k-distances returned when no knee is
            detected.

    Returns:
        The k-distance at the detected knee, or the requested percentile of
        the k-distances when the knee detector finds none.
    """
    dist_matrix = squareform(pdist(data, 'euclidean'))

    # Never index past the last column of the distance matrix.
    k = min(min_samples, dist_matrix.shape[1] - 1)
    k_distances = np.sort(dist_matrix, axis=1)[:, k]
    sorted_k_distances = np.sort(k_distances)

    # Let kneed locate the knee of the increasing, convex k-distance curve.
    kneedle = KneeLocator(
        x=range(len(sorted_k_distances)),
        y=sorted_k_distances,
        S=1.0,
        curve="convex",
        direction="increasing"
    )

    # Compare against None explicitly: a knee located at x == 0 is falsy and
    # would otherwise be treated as "no knee found".
    if kneedle.knee is not None:
        return kneedle.knee_y
    return np.percentile(k_distances, percentile)

# Improvement 2: grid search over the HDBSCAN parameters
param_grid = {
    'min_cluster_size': [5, 10, 15],
    'min_samples': [3, 5, 7],
    'cluster_selection_epsilon': [0.1, 0.2, 0.3]
}

best_score = -1
best_params = {}
best_labels = None

# Try every combination and keep the one with the highest silhouette score.
for params in ParameterGrid(param_grid):
    # The grid keys are exactly the HDBSCAN keyword names, so unpack directly.
    clusterer = hdbscan.HDBSCAN(**params)
    labels = clusterer.fit_predict(scaled_data)

    # Score only the points that were assigned to a cluster (label != -1).
    is_clustered = labels != -1
    filtered_data = scaled_data[is_clustered]
    filtered_labels = labels[is_clustered]

    if len(np.unique(filtered_labels)) > 1 and len(filtered_data) > 0:
        score = silhouette_score(filtered_data, filtered_labels)
        if score > best_score:
            best_score = score
            best_params = params
            best_labels = labels

# Without this check an unsuccessful search leaves best_params empty and the
# refit below fails with an uninformative KeyError.
if not best_params:
    raise RuntimeError(
        "HDBSCAN grid search found no parameter combination producing at "
        "least two clusters; widen param_grid or inspect the data."
    )

# Refit with the best parameters.
final_clusterer = hdbscan.HDBSCAN(
    min_cluster_size=best_params['min_cluster_size'],
    min_samples=best_params['min_samples'],
    cluster_selection_epsilon=best_params['cluster_selection_epsilon']
).fit(scaled_data)

# Final labels, stored next to the raw features.
labels = final_clusterer.labels_
data["Cluster"] = labels

# Improvement 3: visualise the clustering result
# Plotly Express treats a numeric colour column as continuous and then ignores
# color_discrete_sequence. Cast the cluster ids to strings (on a copy, so
# `data` keeps its integer labels) so each cluster gets one palette colour.
plot_frame = data.assign(Cluster=data["Cluster"].astype(str))
cluster_order = [str(cluster_id) for cluster_id in sorted(data["Cluster"].unique())]

fig = px.scatter(
    plot_frame,
    x="X1",
    y="X2",
    color="Cluster",
    title="HDBSCAN 聚类结果",
    color_discrete_sequence=px.colors.qualitative.Vivid,
    category_orders={"Cluster": cluster_order},  # legend in numeric id order
    labels={"X1": "特征 1", "X2": "特征 2", "Cluster": "簇"},
    hover_data=["X1", "X2"]
)

# Marker styling only: slightly larger, semi-transparent points with a thin
# dark outline. This does not draw probability contours.
fig.update_traces(
    marker=dict(
        size=8,
        opacity=0.8,
        line=dict(width=1, color='DarkSlateGrey')
    ),
    selector=dict(mode='markers')
)

# Render the figure.
fig.show()

# Improvement 4: report several evaluation metrics
def evaluate_clustering(labels, data):
    """Summarise a clustering, ignoring points labelled as noise (-1).

    Args:
        labels: Per-sample cluster ids (array-like); -1 marks noise.
        data: The samples the labels refer to (array-like, one row per label).

    Returns:
        On success a dict with ``silhouette_score`` (computed on non-noise
        points only), ``n_clusters`` and ``noise_ratio``. When fewer than two
        clusters remain after dropping noise (including empty input), returns
        ``{"status": "failed", "reason": "Not enough clusters"}``.
    """
    # Coerce to arrays so boolean-mask indexing works for lists as well;
    # a plain list would be indexed with a single bool instead of a mask.
    labels = np.asarray(labels)
    data = np.asarray(data)

    is_clustered = labels != -1
    valid_data = data[is_clustered]
    valid_labels = labels[is_clustered]

    n_clusters = len(np.unique(valid_labels))
    if n_clusters < 2:
        return {"status": "failed", "reason": "Not enough clusters"}

    return {
        "silhouette_score": silhouette_score(valid_data, valid_labels),
        "n_clusters": n_clusters,
        "noise_ratio": np.sum(labels == -1) / len(labels)
    }

metrics = evaluate_clustering(labels, scaled_data)


def _format_metric(name, spec=""):
    """Format metrics[name] with spec, or return 'N/A' when the key is absent."""
    value = metrics.get(name)
    return "N/A" if value is None else format(value, spec)


# Applying a float format spec directly to the 'N/A' fallback string raises
# ValueError when evaluate_clustering returns its failure dict, so the fallback
# is resolved before any formatting is applied.
print("聚类评估结果:")
print(f"- 轮廓系数: {_format_metric('silhouette_score', '.3f')}")
print(f"- 簇数量: {_format_metric('n_clusters')}")
print(f"- 噪声比例: {_format_metric('noise_ratio', '.2%')}")

# Improvement 5: colour every sample by the clusterer's probabilities_ values.
# The plot is skipped when the clusterer does not expose that attribute.
membership_strength = getattr(final_clusterer, 'probabilities_', None)
if membership_strength is not None:
    prob_fig = px.scatter(
        data,
        x="X1",
        y="X2",
        color=membership_strength,
        title="聚类概率可视化",
        color_continuous_scale="Viridis",
        labels={"color": "归属概率"},
    )
    prob_fig.show()

# Improvement 6: parallel-coordinates view of the searched parameter grid.
# NOTE(review): the colour channel is filled with unseeded random numbers, so
# it carries no information about clustering quality and changes on every run.
search_space = ParameterGrid(param_grid)
grid_frame = pd.DataFrame(search_space)
line_colors = np.random.rand(len(search_space))

param_visualization = px.parallel_coordinates(
    grid_frame,
    color=line_colors,
    title="参数空间探索可视化",
)
param_visualization.show()

聚类评估结果:
- 轮廓系数: 0.707
- 簇数量: 3
- 噪声比例: 1.33%
