In [1]:
# main_process.py
# 主程序入口，进行数据加载、预处理、模型训练与测试，并评估分类准确率

import os
import pandas as pd
import numpy as np
import random
import torch
import torch.nn.functional as F
from torch.distributions import Categorical
from torch import nn
from data_loader import *
from sklearn.metrics import *
from sklearn.preprocessing import *
from collections import Counter
from autoencoder import *
from evt import *
import argparse
import time
from datetime import timedelta
from sklearn.cluster import KMeans, DBSCAN
import warnings

start_time = time.time()

In [2]:
# Fix the random seeds so that runs are reproducible.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Dataset name. The argparse variant is kept for running this file as a script;
# inside a notebook the default is used directly.
default_data_name = 'demo'
# parser = argparse.ArgumentParser()
# parser.add_argument('--data_name', type = str, default = default_data_name, help = 'data name')
# args = parser.parse_args()
# data_name = args.data_name
data_name = default_data_name

# Load the dataset and standardise the features.
# data_load: feature matrix, label_load: one label per row
# (assumes both are NumPy arrays, since they are fancy-indexed below -- TODO confirm).
data_load, label_load = read_dataset2017(data_name)
data_load = StandardScaler().fit_transform(data_load)

# Per-class sample counts drive the train/test split sizes.
count_number = Counter(label_load)
min_num = min(count_number.values())        # size of the smallest class
test_per_class = 300                        # stream (test) samples taken per class
num_per_class = min_num - test_per_class    # training samples taken per class
if num_per_class <= 0:
    # Without this check the training arrays would be empty (or np.zeros would
    # fail on a negative size) far away from the actual cause.
    raise ValueError(
        f"smallest class has {min_num} samples; more than "
        f"test_per_class={test_per_class} are required to leave training data"
    )
dim = data_load.shape[1]    # feature dimension
b_size = test_per_class     # stream block size: one class worth of samples
loss_func = nn.MSELoss()    # reconstruction loss

sum_num = len(count_number)          # total number of classes
train_num = 14                       # number of classes treated as known
newclass_num = sum_num - train_num   # remaining classes appear only in the stream
if not 0 <= train_num <= sum_num:
    raise ValueError(
        f"train_num={train_num} must lie between 0 and the number of classes ({sum_num})"
    )

# Shuffle the rows (features and labels with the same permutation).
shun = list(range(data_load.shape[0]))
random.shuffle(shun)
data_load = data_load[shun]
label_load = label_load[shun]

# Random ordering of the class ids. The values are compared directly with the
# labels later on, so labels are expected to be 0 .. sum_num - 1.
allIndex = np.random.permutation(train_num + newclass_num)

# Wall-clock time spent on loading and the setup above.
data_loading_end = time.time()
data_loading_time = data_loading_end - start_time
print("=== Data Loading Time ===")
print(f"Data loading time: {str(timedelta(seconds=data_loading_time))}")

Original data size: 34096, After 10% sampling: 34096
Class distribution after oversampling:
0.1
9     2347
4     2322
0     2322
12    2300
6     2288
11    2287
1     2286
3     2283
2     2273
5     2261
14    2258
8     2247
13    2233
10    2210
7     2179
Name: count, dtype: int64
=== Data Loading Time ===
Data loading time: 0:00:37.100098


  data = data.replace([np.inf, -np.inf], np.nan)


In [3]:
# Training set: the first `num_per_class` rows of every known class, stored
# class after class in the order given by allIndex.
train_rows = num_per_class * train_num
data = np.zeros((train_rows, dim))
label = np.zeros(train_rows)
for pos in range(train_num):
    i = allIndex[pos]
    rows = slice(pos * num_per_class, (pos + 1) * num_per_class)
    data[rows] = data_load[label_load == i][:num_per_class]
    label[rows] = i


# Streaming test set: the last `test_per_class` rows of every class.
# gtlabel always holds the true class; streamlabel holds the true class for
# known classes and 999 for the classes that were not part of training.
stream_classes = train_num + newclass_num
stream_rows = test_per_class * stream_classes
streamdata = np.zeros((stream_rows, dim))
streamlabel = np.zeros(stream_rows)
gtlabel = np.zeros(stream_rows)
for pos in range(stream_classes):
    i = allIndex[pos]
    rows = slice(pos * test_per_class, (pos + 1) * test_per_class)
    streamdata[rows] = data_load[label_load == i][-test_per_class:]
    gtlabel[rows] = i
    streamlabel[rows] = i if pos < train_num else 999

# Distinct ground-truth labels in order of first appearance in the stream.
unique_labels = pd.Series(gtlabel).unique()
print("Distinct labels in 'gtlabel':", unique_labels)


def make_lab(label):
    """Return the known class ids that have more than 50 samples in ``label``.

    A class id is "known" when it is one of the first ``train_num`` entries of
    the module-level ``allIndex``. The result follows the ordering of
    ``pandas.DataFrame.value_counts`` (most frequent class first).

    Args:
        label: 1-D array-like of class labels.

    Returns:
        list: qualifying class ids, taken from the value-count index.
    """
    known_ids = set(allIndex[:train_num].tolist())
    counts = pd.DataFrame(label).value_counts()
    # Index entries are 1-tuples because value_counts is taken on a DataFrame.
    return [
        key[0]
        for key, n_samples in counts.items()
        if n_samples > 50 and key[0] in known_ids
    ]

def train(data, label, curr_lab):
    """Train one autoencoder per class and pick a reconstruction-error threshold for it.

    For every class in ``curr_lab`` an ``Autoencoder`` is fitted on that class's
    rows (10 epochs, mini-batches of 10, Adam). The per-sample reconstruction
    errors on the same rows are then passed to SPOT to obtain the threshold.

    Args:
        data: 2-D float64 NumPy array of features, one row per sample.
        label: 1-D NumPy array with the class of every row of ``data``.
        curr_lab: iterable of class ids to build models for.

    Returns:
        tuple: ``(mod_ls, thred_ls, class_ls)`` -- models (in eval mode),
        thresholds and class ids, aligned by position.
    """
    mod_ls = []
    thred_ls = []
    class_ls = []
    batch = 10
    epoch = 10
    q = 5e-2  # SPOT risk parameter (tunable)
    # Input width comes from the data actually passed in, not from a global.
    y_in, y1, y2, y3, y4 = data.shape[1], 256, 128, 64, 32
    for cls in curr_lab:
        class_ls.append(cls)
        # Select this class's rows once instead of re-masking for every batch.
        class_data = data[label == cls]
        n_rows = class_data.shape[0]

        model = Autoencoder(y_in, y1, y2, y3, y4)
        optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=5e-4)
        # Train the autoencoder; a trailing partial batch is dropped.
        for _ in range(epoch):
            order = list(range(n_rows))
            random.shuffle(order)
            shuffled = class_data[order]
            for b in range(n_rows // batch):
                data_input = torch.from_numpy(shuffled[b * batch:(b + 1) * batch]).float()
                pred = model(data_input)
                loss = loss_func(pred, data_input)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        model.eval()
        mod_ls.append(model)

        if n_rows == 0:
            # No samples: nothing to estimate a threshold from.
            thred_ls.append(0)
            continue

        # Per-sample reconstruction error on the training rows (all rows,
        # including the remainder that did not fill a training batch).
        mse_ls = []
        with torch.no_grad():
            for start in range(0, n_rows, batch):
                data_input = torch.from_numpy(class_data[start:start + batch]).float()
                pred = model(data_input)
                mse_ls.extend(
                    float(loss_func(pred[row], data_input[row]))
                    for row in range(pred.shape[0])
                )
        loss_list_use = np.array(mse_ls)

        try:
            s = SPOT(q)
            s.fit(loss_list_use, loss_list_use)
            s.initialize()
            results = s.run_simp()
            # Use the SPOT threshold when positive, otherwise the 85th
            # percentile of the initial data.
            if results['thresholds'][0] > 0:
                thred_ls.append(results['thresholds'][0])
            else:
                thred_ls.append(np.sort(s.init_data)[int(0.85 * s.init_data.size)])
        except Exception as e:
            # Best-effort fallback: accept everything seen during training,
            # but report why SPOT could not be used.
            print(f"SPOT failed for class {cls} ({e}); using max training error as threshold")
            thred_ls.append(np.max(loss_list_use) + 1e-3)
    return mod_ls, thred_ls, class_ls

# Time elapsed between the end of data loading and this point.
preprocessing_end = time.time()
preprocessing_time = preprocessing_end - data_loading_end

print("\n=== Data Preprocessing Time ===")
print(f"Preprocessing time: {timedelta(seconds=preprocessing_time)}")

Distinct labels in 'gtlabel': [ 9. 11.  0. 13.  5.  8.  2.  1. 14.  4.  7. 10. 12.  3.  6.]

=== Data Preprocessing Time ===
Preprocessing time: 0:00:00.048612


In [4]:
# Main entry point: initial training followed by sample-by-sample stream inference.
if __name__ == '__main__':
    print('=== Initializing ===')
    # Known classes with enough training samples.
    curr_lab = make_lab(label)
    # One autoencoder and one threshold per known class.
    mod_ls, thred_ls, class_ls = train(data, label, curr_lab)

    res_ls = []  # predicted label for every stream sample; 999 marks "unknown"
    for i5 in range(streamdata.shape[0]):
        # At a class-block boundary after the first new class has been streamed,
        # retrain on the initial training data plus everything streamed so far
        # (using the ground-truth labels).
        if i5 % b_size == 0 and i5 // b_size > train_num:
            updatedata = np.concatenate([data, streamdata[:i5]], axis=0)
            updatelabel = np.concatenate([label, gtlabel[:i5]], axis=0)
            curr_lab = make_lab(updatelabel)
            print('Current labels:', curr_lab)
            mod_ls, thred_ls, class_ls = train(updatedata, updatelabel, curr_lab)
            print('*** Update model ***')
            print(f'Number of models: {len(mod_ls)}')
            print(f'Thresholds: {thred_ls}')
            print(f'Class labels: {class_ls}')

        # Reconstruction error of the current sample under every class model.
        data_input = torch.from_numpy(streamdata[i5]).float()
        mse_test = []
        with torch.no_grad():  # inference only, no gradients needed
            for model in mod_ls:
                model.eval()
                pred = model(data_input)
                mse_test.append(float(loss_func(pred, data_input)))

        # A model "accepts" the sample when its error is below that model's own
        # threshold. Among the accepting models pick the smallest error; the
        # winner is located by position so that a rejecting model with an equal
        # error value can never be selected.
        errors = np.array(mse_test)
        accepted = np.flatnonzero(errors < np.array(thred_ls))
        if accepted.size == 0:
            res_ls.append(999)
        else:
            winner = accepted[np.argmin(errors[accepted])]
            res_ls.append(class_ls[winner])

    # Time for initial training plus the whole streaming loop (including updates).
    training_end = time.time()
    training_time = training_end - preprocessing_end

    print("\n=== Training Time ===")
    print(f"Training time: {str(timedelta(seconds=training_time))}")

    # print()
    # print("Number of models:", len(mod_ls))
    # print("Thresholds:", thred_ls)
    # print("Class labels:", class_ls)
    # Output complete res_ls results
    # print("=== Prediction Results ===")
    # print("res_ls contents:", res_ls)

    # Show data distribution in res_ls
    # res_distribution = Counter(res_ls)
    # print("=== Data Distribution ===")
    # for class_label, count in res_distribution.items():
    #     print(f"Class {class_label}: {count} samples")

    # 对新类样本，将999替换为真实标签
    # for ii in range(train_num + newclass_num):
    #     if ii >= train_num:
    #         rep_npy = np.array(res_ls[test_per_class * ii : test_per_class * (ii + 1)])
    #         rep_npy2 = rep_npy.copy()
    #         rep_npy[rep_npy2==999] = allIndex[ii]
    #         res_ls[test_per_class * ii:test_per_class * (ii + 1)] = list(rep_npy)
    
    # 计算准确率
    # y_pred = np.array(res_ls)
    # y_true = gtlabel[:len(res_ls)].copy()
    # acc = accuracy_score(y_true, y_pred)
    # print('Dataset:', data_name)
    # print('Accuracy:', acc)

    # Split the stream into the pool evaluated as "known" and the pool that
    # will be clustered as "unknown".
    known_stream_len = test_per_class * train_num
    later_positions = range(known_stream_len, len(res_ls))

    # All samples drawn from known classes are evaluated as known, whatever was
    # predicted for them. Samples from the new classes join this pool only when
    # they were NOT flagged as 999; the flagged ones form the unknown pool.
    predict_known_indices = list(range(known_stream_len))
    predict_known_indices.extend(idx for idx in later_positions if res_ls[idx] != 999)
    unknown_positions = [idx for idx in later_positions if res_ls[idx] == 999]

    # Materialise both pools as numpy arrays.
    predict_known_features = np.array([streamdata[idx] for idx in predict_known_indices])
    predict_known_labels = np.array([res_ls[idx] for idx in predict_known_indices])
    predict_known_true_labels = np.array([gtlabel[idx] for idx in predict_known_indices])

    predict_unknown_features = np.array([streamdata[idx] for idx in unknown_positions])
    predict_unknown_labels = np.array([999] * len(unknown_positions))
    predict_unknown_true_labels = np.array([gtlabel[idx] for idx in unknown_positions])

    # Accuracy over the known pool.
    if len(predict_known_labels) == 0:
        print("\nNo known class predictions to evaluate.")
    else:
        known_accuracy = accuracy_score(predict_known_true_labels, predict_known_labels)
        print("\n=== Known Class Predictions ===")
        print(f"Number of samples: {len(predict_known_labels)}")
        print(f"Accuracy: {known_accuracy:.4f}")
    
    # Ensemble clustering of the samples that were flagged as unknown (999).
    print("\n=== Ensemble Clustering on Unknown Features ===")
    true_labels = []
    predicted_clusters = []

    def _add_co_membership(matrix, cluster_labels, ignore_label=None):
        """Add 1 to matrix[i, j] for every pair i != j placed in the same cluster.

        Samples labelled ``ignore_label`` (e.g. DBSCAN noise, -1) never count
        as co-members. ``matrix`` is updated in place; its diagonal is untouched.
        """
        cluster_labels = np.asarray(cluster_labels)
        together = cluster_labels[:, None] == cluster_labels[None, :]
        if ignore_label is not None:
            kept = cluster_labels != ignore_label
            together &= kept[:, None] & kept[None, :]
        np.fill_diagonal(together, False)
        matrix += together

    # Skip clustering if there are no unknown predictions
    if len(predict_unknown_features) == 0:
        print("No unknown samples detected for clustering.")
    else:
        print(f"Clustering {len(predict_unknown_features)} unknown samples...")

        # 1. Consensus (co-association) matrix: entry (i, j) counts in how many
        #    base clusterings samples i and j ended up in the same cluster.
        n_samples = len(predict_unknown_features)
        consensus_matrix = np.zeros((n_samples, n_samples))

        # Re-standardise the unknown samples before clustering them.
        scaler = StandardScaler()
        normalized_features = scaler.fit_transform(predict_unknown_features)

        # 2. Base clusterings with several algorithms and parameter settings.
        warnings.filterwarnings('ignore')

        # 2.1 K-means: k = 2..11, five seeds each (up to 50 runs).
        #     k is capped at n_samples because KMeans rejects n_clusters > n_samples.
        print("Running K-means clustering...")
        for k in range(2, min(12, n_samples + 1)):
            for random_state in range(5):
                kmeans = KMeans(n_clusters=k, random_state=random_state)
                labels = kmeans.fit_predict(normalized_features)
                _add_co_membership(consensus_matrix, labels)

        # 2.2 DBSCAN: 7 eps values x 5 min_samples values (35 runs).
        print("Running DBSCAN clustering...")
        for eps in [0.1, 0.2, 0.3, 0.5, 0.7, 1.0, 1.2]:
            for min_samples in [3, 5, 7, 10, 15]:
                dbscan = DBSCAN(eps=eps, min_samples=min_samples)
                labels = dbscan.fit_predict(normalized_features)

                # Skip runs in which (almost) everything is noise (label -1).
                if np.count_nonzero(labels != -1) < 2:
                    continue

                # Noise points never count as co-members.
                _add_co_membership(consensus_matrix, labels, ignore_label=-1)

        # 2.3 Autoencoder + K-means on the latent codes: 6 x 2 x 3 = 36 runs.
        #     Only reconstruction training is performed here; there is no
        #     separate clustering-loss refinement step.
        print("Running DEC clustering...")

        class DECAutoencoder(nn.Module):
            """Two-layer encoder / two-layer decoder; forward returns (reconstruction, latent)."""

            def __init__(self, input_dim, hidden_dim, latent_dim):
                super().__init__()
                self.encoder = nn.Sequential(
                    nn.Linear(input_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, latent_dim)
                )
                self.decoder = nn.Sequential(
                    nn.Linear(latent_dim, hidden_dim),
                    nn.ReLU(),
                    nn.Linear(hidden_dim, input_dim)
                )

            def forward(self, x):
                z = self.encoder(x)
                x_recon = self.decoder(z)
                return x_recon, z

        def run_dec(features, n_clusters, hidden_dim, latent_dim, epochs=30):
            """Train an autoencoder on ``features`` (full batch) and K-means its latent codes.

            Returns the cluster label of every row of ``features``.
            """
            input_dim = features.shape[1]
            model = DECAutoencoder(input_dim, hidden_dim, latent_dim)
            optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
            criterion = nn.MSELoss()

            # Train the autoencoder on the reconstruction loss.
            X_tensor = torch.FloatTensor(features)
            model.train()
            for _ in range(epochs):
                x_recon, _ = model(X_tensor)
                loss = criterion(x_recon, X_tensor)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Extract latent features without tracking gradients.
            model.eval()
            with torch.no_grad():
                _, z = model(X_tensor)
            latent_features = z.numpy()

            # Cluster the latent features.
            kmeans = KMeans(n_clusters=n_clusters)
            return kmeans.fit_predict(latent_features)

        for n_clusters in [3, 4, 5, 6, 7, 8]:
            for hidden_dim in [64, 128]:
                for latent_dim in [32, 48, 64]:
                    try:
                        labels = run_dec(normalized_features, n_clusters, hidden_dim, latent_dim)
                        _add_co_membership(consensus_matrix, labels)
                    except Exception as e:
                        # Best effort: a failing configuration is reported and skipped.
                        print(f"DEC failed with parameters: {n_clusters}, {hidden_dim}, {latent_dim}. Error: {e}")

        # 3. The matrix is left as raw co-occurrence counts
        #    (it is not divided by the number of clustering runs).

        # Save consensus matrix
        os.makedirs('./output', exist_ok=True)
        pd.DataFrame(consensus_matrix).to_csv('./output/Adjacency_Matrix.csv', index=False)
        print("Consensus matrix saved to ./output/Adjacency_Matrix.csv")

        # 4. Choose the number of clusters by silhouette score, treating each
        #    row of the consensus matrix as the feature vector of a sample.
        print("\nFinding optimal number of clusters...")
        max_silhouette = -1
        best_k = min(2, n_samples)  # default; never more clusters than samples
        silhouette_scores = []

        for k in range(2, min(41, n_samples)):
            kmeans = KMeans(n_clusters=k, random_state=42)
            labels = kmeans.fit_predict(consensus_matrix)

            try:
                silhouette = silhouette_score(consensus_matrix, labels)
            except Exception:
                print(f"Could not compute silhouette score for k={k}")
                continue

            silhouette_scores.append(silhouette)
            print(f"K={k}, Silhouette Score={silhouette:.4f}")
            if silhouette > max_silhouette:
                max_silhouette = silhouette
                best_k = k

        print(f"\nOptimal number of clusters: {best_k}")

        # Final clustering with the selected k.
        final_kmeans = KMeans(n_clusters=best_k, random_state=42)
        predicted_clusters = final_kmeans.fit_predict(consensus_matrix)

        # 5. Evaluate against the ground-truth classes of the flagged samples.
        true_labels = predict_unknown_true_labels

        # ARI and AMI do not need matching label ids; a differing number of
        # clusters is only reported.
        if len(np.unique(true_labels)) != len(np.unique(predicted_clusters)):
            print(f"Note: Number of true clusters ({len(np.unique(true_labels))}) differs from predicted clusters ({len(np.unique(predicted_clusters))})")

        ari = adjusted_rand_score(true_labels, predicted_clusters)
        ami = adjusted_mutual_info_score(true_labels, predicted_clusters)

        print("\n=== Clustering Evaluation ===")
        print(f"Number of samples clustered: {len(true_labels)}")
        print(f"True number of clusters: {len(np.unique(true_labels))}")
        print(f"Predicted number of clusters: {len(np.unique(predicted_clusters))}")
        print(f"Adjusted Rand Index (ARI): {ari:.4f}")
        print(f"Adjusted Mutual Information (AMI): {ami:.4f}")

        # Cluster sizes, ordered by cluster id.
        print("\nCluster distribution:")
        cluster_counts = pd.Series(predicted_clusters).value_counts().sort_index()
        for cluster, count in cluster_counts.items():
            print(f"Cluster {cluster}: {count} samples")

    # Duration of the clustering stage and of the whole run.
    clustering_end = time.time()
    clustering_time = clustering_end - training_end
    total_time = clustering_end - start_time

    print("\n=== Clustering Time ===")
    print(f"Clustering time: {timedelta(seconds=clustering_time)}")

    print("\n=== Total Time ===")
    print(f"Total execution time: {timedelta(seconds=total_time)}")

    # Joint evaluation: known-pool predictions together with the cluster
    # assignments of the unknown pool.

    # Cluster ids are shifted by a fixed offset so they cannot collide with
    # the class ids used in the known pool.
    offset = 100

    # Make sure everything is a numpy array (lists are possible when the
    # clustering stage was skipped).
    predict_known_labels_np = np.array(predict_known_labels)
    predict_known_true_labels_np = np.array(predict_known_true_labels)
    predicted_clusters_np = np.array(predicted_clusters)
    true_labels_np = np.array(true_labels)
    predicted_clusters_offset = offset + predicted_clusters_np

    # Known pool first, clustered pool second -- same order for both arrays.
    combined_predictions = np.concatenate((predict_known_labels_np, predicted_clusters_offset))
    combined_true_labels = np.concatenate((predict_known_true_labels_np, true_labels_np))

    # Partition-agreement metrics over the combined labelling.
    combined_ari = adjusted_rand_score(combined_true_labels, combined_predictions)
    combined_ami = adjusted_mutual_info_score(combined_true_labels, combined_predictions)

    summary_lines = (
        "\n=== Combined Evaluation (Known Classes + Clusters) ===",
        f"Class ratio - Known:Unknown = {train_num}:{newclass_num}",
        f"Total samples evaluated: {len(combined_predictions)}",
        f"Known class samples: {len(predict_known_labels_np)}",
        f"Clustered unknown samples: {len(predicted_clusters_np)}",
        f"Combined Adjusted Mutual Information (AMI): {combined_ami:.4f}",
        f"Combined Adjusted Rand Index (ARI): {combined_ari:.4f}",
    )
    for summary_line in summary_lines:
        print(summary_line)

=== Initializing ===

=== Training Time ===
Training time: 0:01:55.704786

=== Known Class Predictions ===
Number of samples: 4316
Accuracy: 0.8047

=== Ensemble Clustering on Unknown Features ===
Clustering 184 unknown samples...
Running K-means clustering...
Running DBSCAN clustering...
Running DEC clustering...
Consensus matrix saved to ./output/Adjacency_Matrix.csv

Finding optimal number of clusters...
K=2, Silhouette Score=0.7571
K=3, Silhouette Score=0.7671
K=4, Silhouette Score=0.6837
K=5, Silhouette Score=0.5285
K=6, Silhouette Score=0.3915
K=7, Silhouette Score=0.3869
K=8, Silhouette Score=0.0622
K=9, Silhouette Score=0.0372
K=10, Silhouette Score=0.0372
K=11, Silhouette Score=0.0269
K=12, Silhouette Score=0.0378
K=13, Silhouette Score=0.0378
K=14, Silhouette Score=0.0515
K=15, Silhouette Score=0.0515
K=16, Silhouette Score=0.0458
K=17, Silhouette Score=0.0458
K=18, Silhouette Score=0.0520
K=19, Silhouette Score=0.0520
K=20, Silhouette Score=0.0520
K=21, Silhouette Score=0.06

In [5]:
# Write a plain-text summary of the run.
result_dir = './output/整理'
# The directory is not created anywhere else, so make sure it exists.
os.makedirs(result_dir, exist_ok=True)
result_path = f'{result_dir}/2017_{train_num}-{newclass_num}.txt'

# best_k / ari / ami / cluster_counts only exist when the clustering stage
# actually ran, i.e. when at least one sample was flagged as unknown.
clustering_ran = newclass_num != 0 and len(predict_unknown_features) > 0

with open(result_path, 'w', encoding='utf-8') as f:
    # Header
    f.write(f"=== Experiment Results: Known Classes ({train_num}) vs Unknown Classes ({newclass_num}) ===\n\n")

    # Timing information
    f.write("=== Timing Information ===\n")
    f.write(f"Data loading time: {data_loading_time:.4f} seconds\n")
    f.write(f"Preprocessing time: {preprocessing_time:.4f} seconds\n")
    f.write(f"Training time: {training_time:.4f} seconds\n")
    f.write(f"Clustering time: {clustering_time:.4f} seconds\n")
    f.write(f"Total execution time: {total_time:.4f} seconds\n\n")

    # Model information
    f.write("=== Model Information ===\n")
    f.write(f"Number of models: {len(mod_ls)}\n")
    f.write(f"Class labels: {class_ls}\n")
    f.write(f"Thresholds: {thred_ls}\n\n")

    # Classification results
    f.write("=== Classification Results ===\n")
    if len(predict_known_labels) > 0:
        f.write(f"Known class accuracy: {known_accuracy:.4f}\n")
    else:
        # known_accuracy is only computed when the known pool is non-empty.
        f.write("Known class accuracy: N/A\n")
    f.write(f"Total samples: {len(combined_predictions)}\n")
    f.write(f"Known class samples: {len(predict_known_labels_np)}\n")
    f.write(f"Unknown class samples: {len(predict_unknown_features)}\n\n")

    if clustering_ran:
        # Clustering results
        f.write("=== Clustering Results ===\n")
        f.write(f"Best number of clusters: {best_k}\n")
        f.write(f"True number of clusters: {len(np.unique(predict_unknown_true_labels))}\n")
        f.write(f"Adjusted Rand Index (ARI): {ari:.4f}\n")
        f.write(f"Adjusted Mutual Information (AMI): {ami:.4f}\n\n")

    # Combined evaluation
    f.write("=== Combined Evaluation ===\n")
    f.write(f"Combined Adjusted Rand Index (ARI): {combined_ari:.4f}\n")
    f.write(f"Combined Adjusted Mutual Information (AMI): {combined_ami:.4f}\n\n")

    if clustering_ran:
        # Cluster distribution
        f.write("=== Cluster Distribution ===\n")
        for cluster, count in cluster_counts.items():
            f.write(f"Cluster {cluster}: {count} samples\n")

print("Results have been saved to txt")

Results have been saved to txt
