In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

In [3]:
# --- Step 2: Clustering Models and Evaluation ---
def evaluate_clustering(X, labels):
    """Score a single clustering of ``X`` with three internal validity metrics.

    Parameters
    ----------
    X : array-like
        Feature matrix that was clustered; passed unchanged to each scorer.
    labels : array-like
        Cluster assignment for every row of ``X``.

    Returns
    -------
    dict
        Metric name -> score, in this order: "Silhouette Score",
        "Davies-Bouldin Index", "Calinski-Harabasz Score".
    """
    # (report name, scoring function) pairs; insertion order fixes the order of the result
    scorers = (
        ("Silhouette Score", silhouette_score),
        ("Davies-Bouldin Index", davies_bouldin_score),
        ("Calinski-Harabasz Score", calinski_harabasz_score),
    )
    return {metric_name: scorer(X, labels) for metric_name, scorer in scorers}

def compare_models(results, lower_is_better=("Davies-Bouldin Index",)):
    """Pick the best-scoring model for every metric, print a summary and return it.

    Parameters
    ----------
    results : dict
        Maps model name -> {metric name: score}. The set of metrics is taken
        from the first entry; every other entry must provide the same metrics.
    lower_is_better : str or collection of str, optional
        Names of the metrics for which the smallest score wins. Every other
        metric is treated as "higher is better". Defaults to the
        Davies-Bouldin index only.

    Returns
    -------
    dict
        Maps metric name -> (best model name, best score). On ties the model
        that appears first in ``results`` wins. When the call is the last
        statement of a notebook cell, end it with ``;`` if the echoed return
        value is not wanted.

    Raises
    ------
    ValueError
        If ``results`` is empty.
    KeyError
        If a model lacks one of the metrics reported by the first model.
    """
    if not results:
        # Without this guard next(iter(...)) would leak a bare StopIteration.
        raise ValueError("compare_models() needs the scores of at least one model")

    # A lone string would otherwise be matched by substring in the `in` test below.
    if isinstance(lower_is_better, str):
        lower_is_better = (lower_is_better,)
    minimised = frozenset(lower_is_better)

    metric_names = next(iter(results.values())).keys()

    best_models = {}
    for metric in metric_names:
        metric_values = {model: scores[metric] for model, scores in results.items()}
        pick = min if metric in minimised else max
        best_model = pick(metric_values, key=metric_values.get)
        best_models[metric] = (best_model, metric_values[best_model])

    print("\n✅ Best model per metric:")
    for metric, (model, value) in best_models.items():
        print(f"  • {metric}: {model} (Score: {value:.4f})")

    return best_models

In [4]:
# Load the preprocessed feature table that every model below is fitted on.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
# NOTE(review): the preview of this frame later in the notebook shows raw-scale columns
# (e.g. cholesterol in the hundreds next to 0/1 flags); distance-based clustering weights
# such columns heavily - confirm that fitting on unscaled features is intended.
X = pd.read_excel('/home/abdeldjalil-hani/Desktop/emergency-sorting-system/data-sets/1. PreProcessed DATA.xlsx')

N_CLUSTERS = 5      # cluster / component count shared by all three models
RANDOM_STATE = 42   # seed for the models that accept one (KMeans, GMM)

# The estimators keep their own names so they stay reachable after this cell.
# NOTE(review): KMeans is built without an explicit n_init, so the number of
# initialisations follows the installed scikit-learn default - confirm.
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=RANDOM_STATE)
agg = AgglomerativeClustering(n_clusters=N_CLUSTERS)
gmm = GaussianMixture(n_components=N_CLUSTERS, random_state=RANDOM_STATE)

# Report name -> estimator; dict order is the order of fitting and reporting.
models = {
    'KMeans': kmeans,
    'Agglomerative': agg,
    'GMM': gmm,
}

# Fit each model on X, keep its labels, and score them right away.
results = {}
cluster_labels = {}
for name, estimator in models.items():
    cluster_labels[name] = estimator.fit_predict(X)
    results[name] = evaluate_clustering(X, cluster_labels[name])

# Per-model label arrays under the names the following cells use.
kmeans_labels = cluster_labels['KMeans']
agg_labels = cluster_labels['Agglomerative']
gmm_labels = cluster_labels['GMM']

# Display results
print("Clustering Evaluation Scores:")
for model, metrics in results.items():
    print(f"\n🔹 {model} Results:")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

# Compare and pick best models
compare_models(results)

Clustering Evaluation Scores:

🔹 KMeans Results:
  Silhouette Score: 0.2151
  Davies-Bouldin Index: 1.4405
  Calinski-Harabasz Score: 1869.6042

🔹 Agglomerative Results:
  Silhouette Score: 0.1672
  Davies-Bouldin Index: 1.5782
  Calinski-Harabasz Score: 1574.2988

🔹 GMM Results:
  Silhouette Score: 0.0416
  Davies-Bouldin Index: 8.9972
  Calinski-Harabasz Score: 439.9255

✅ Best model per metric:
  • Silhouette Score: KMeans (Score: 0.2151)
  • Davies-Bouldin Index: KMeans (Score: 1.4405)
  • Calinski-Harabasz Score: KMeans (Score: 1869.6042)


In [6]:
# Build a labelled copy of the feature table: each row gets its KMeans cluster id
# in a new 'Cluster' column, while X itself is left unmodified.
df_processed_with_labels = X.assign(Cluster=kmeans_labels)

# Write the labelled table to disk without the index column.
df_processed_with_labels.to_excel(
    "/home/abdeldjalil-hani/Desktop/emergency-sorting-system/data-sets/2. Clustered Preprocessed DATA.xlsx",
    index=False,
)

print("\n📌 First 5 rows with cluster labels:")
# Last expression of the cell, so the notebook renders the preview table.
df_processed_with_labels.head()


📌 First 5 rows with cluster labels:


Unnamed: 0,age,gender,chest pain type,cholesterol,exercise angina,plasma glucose,skin_thickness,bmi,hypertension,heart_disease,...,Respiratory Rate (breaths/min),SpO2 (%),Glasgow Score,Consciousness,Massive Bleeding,Respiratory Distress,Risk Factors,blood_pressure,heart_pressure,Cluster
0,40,1,2,294,0,108.0,43,19.0,0,0,...,12,99,14,0,0,1,24,123,78,2
1,49,0,3,180,0,75.0,47,18.0,0,0,...,24,86,9,1,0,0,20,93,58,4
2,37,1,2,294,0,98.0,53,23.0,0,0,...,14,95,13,0,0,1,6,130,82,2
3,48,0,4,214,1,72.0,51,18.0,0,0,...,28,88,10,1,0,0,0,136,71,4
4,54,1,3,195,0,108.0,90,21.0,0,0,...,20,99,13,0,0,0,0,122,79,1


In [7]:
# Summarise the labelled frame (column names, non-null counts, dtypes) as a sanity check.
df_processed_with_labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6962 entries, 0 to 6961
Data columns (total 25 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   age                             6962 non-null   int64  
 1   gender                          6962 non-null   int64  
 2   chest pain type                 6962 non-null   int64  
 3   cholesterol                     6962 non-null   int64  
 4   exercise angina                 6962 non-null   int64  
 5   plasma glucose                  6962 non-null   float64
 6   skin_thickness                  6962 non-null   int64  
 7   bmi                             6962 non-null   float64
 8   hypertension                    6962 non-null   int64  
 9   heart_disease                   6962 non-null   int64  
 10  Residence_type                  6962 non-null   int64  
 11  smoking_status                  6962 non-null   int64  
 12  Symptom                         69