In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

def simple_matching_distance(point1, point2):
    return np.sum(point1 != point2)

def initialize_prototypes( preprocessed_data, n_clusters):
    indices = np.random.choice(len(numerical_cols), n_clusters, replace=False)
    numerical_prototypes = preprocessed_data[whichnum].iloc[indices].values
    categorical_prototypes = preprocessed_data[whichcat].iloc[indices].values
    print(numerical_prototypes)
    print(categorical_prototypes)
    return numerical_prototypes, categorical_prototypes

def assign_clusters( preprocessed_data , numerical_prototypes, categorical_prototypes):
    assigned_clusters = []
    
    for _, row in preprocessed_data.iterrows():
        distances = []
        for num_proto, cat_proto in zip(numerical_prototypes, categorical_prototypes): # here we iterate through the numerical and categorical prototypes
            num_distance = euclidean_distance(row[whichnum], num_proto)
            cat_distance = simple_matching_distance(row[whichcat], cat_proto)
            total_distance = num_distance + gamma * cat_distance
            distances.append(total_distance)

        assigned_clusters.append(np.argmin(distances))
        
    return assigned_clusters

def update_prototypes(preprocessed_data, assigned_clusters, n_clusters):
    new_numerical_prototypes = []
    new_categorical_prototypes = []
    
    for cluster in range(n_clusters):
        cluster_indices = [i for i, c in enumerate(assigned_clusters) if c == cluster]
        
        cluster_numerical_data = preprocessed_data[whichnum].iloc[cluster_indices]
        new_numerical_prototype = cluster_numerical_data.mean(axis=0)
        new_numerical_prototypes.append(new_numerical_prototype)
        
        cluster_categorical_data = preprocessed_data[whichcat].iloc[cluster_indices]
        new_categorical_prototype = cluster_categorical_data.mode(axis=0).iloc[0]

        new_categorical_prototypes.append(new_categorical_prototype)
    
    return np.array(new_numerical_prototypes), np.array(new_categorical_prototypes)

def k_prototypes(preprocessed_data, n_clusters, max_iterations=100):
    numerical_prototypes, categorical_prototypes = initialize_prototypes(preprocessed_data , n_clusters)
    
    for _ in range(max_iterations):
        assigned_clusters = assign_clusters(preprocessed_data, numerical_prototypes, categorical_prototypes)
        
        new_numerical_prototypes, new_categorical_prototypes = update_prototypes(preprocessed_data, assigned_clusters, n_clusters)
        
        new_categorical_prototypes_str = np.array([str(cat_proto) for cat_proto in new_categorical_prototypes])
        categorical_prototypes_str = np.array([str(cat_proto) for cat_proto in categorical_prototypes])
        
        if np.allclose(numerical_prototypes, new_numerical_prototypes) and np.all(categorical_prototypes_str == new_categorical_prototypes_str):
            break
        
        numerical_prototypes = new_numerical_prototypes
        categorical_prototypes = new_categorical_prototypes
    
    return assigned_clusters, numerical_prototypes, categorical_prototypes

def align_clusters(clusters, true_labels):
    unique_clusters = np.unique(clusters)
    aligned_clusters = np.zeros_like(clusters)
    for cluster in unique_clusters:
        true_label = pd.Series(true_labels[clusters == cluster]).mode()[0]
        aligned_clusters[clusters == cluster] = true_label
    return aligned_clusters


data = pd.read_csv('Heart.csv')
truelabels =data['target']
data.drop(['target'], axis=1, inplace=True)

# Separating numerical and categorical columns
numerical_cols = data[['age','trestbps','chol','thalach','oldpeak','ca']].copy()
categorical_cols = data.drop(['age','trestbps','chol','thalach','oldpeak','ca',], axis=1).copy()

preprocessed_data = pd.concat([numerical_cols, categorical_cols], axis=1)

whichnum = ['age','trestbps','chol','thalach','oldpeak','ca']
whichcat= [col for col in data.columns if col not in numerical_cols]

n_clusters = 2
gamma = 0.5
accuracy_scores = []
rand_index_scores = []
num_runs = 50

for _ in range(num_runs):
    assignedclusters, numerical_prototypes, categorical_prototypes = k_prototypes(preprocessed_data, n_clusters)
    
    aligned_clusters = align_clusters(assignedclusters, truelabels)
    accuracy = accuracy_score(truelabels, aligned_clusters)
    accuracy_scores.append(accuracy)
    
    adjusted_rand_index = adjusted_rand_score(truelabels, aligned_clusters)
    rand_index_scores.append(adjusted_rand_index)

mean_accuracy_score = np.mean(accuracy_scores)
mean_rand_index_score = np.mean(rand_index_scores)

print("Mean Accuracy Score:", mean_accuracy_score)
print("Mean Adjusted Rand Index Score:", mean_rand_index_score)

[[ 57.  140.  192.  148.    0.4   0. ]
 [ 59.  126.  218.  134.    2.2   1. ]]
[[1 0 0 1 0 1 1]
 [1 2 1 1 0 1 1]]
[[ 39.  118.  219.  140.    1.2   0. ]
 [ 70.  160.  269.  112.    2.9   1. ]]
[[1 0 0 1 0 1 3]
 [1 2 0 1 1 1 3]]
[[ 58.  140.  211.  165.    0.    0. ]
 [ 57.  140.  192.  148.    0.4   0. ]]
[[1 2 1 0 0 2 2]
 [1 0 0 1 0 1 1]]
[[ 58.  105.  240.  154.    0.6   0. ]
 [ 47.  110.  275.  118.    1.    1. ]]
[[1 2 0 0 1 1 3]
 [1 0 0 0 1 1 2]]
[[ 39.  140.  321.  182.    0.    0. ]
 [ 35.  138.  183.  182.    1.4   0. ]]
[[1 2 0 0 0 2 2]
 [0 0 0 1 0 2 2]]
[[ 62.  140.  268.  160.    3.6   2. ]
 [ 64.  170.  227.  155.    0.6   0. ]]
[[0 0 0 0 0 0 2]
 [1 3 0 0 0 1 3]]
[[ 51.  100.  222.  143.    1.2   0. ]
 [ 57.  120.  354.  163.    0.6   0. ]]
[[1 2 0 1 1 1 2]
 [0 0 0 1 1 2 2]]
[[ 70.  130.  322.  109.    2.4   3. ]
 [ 39.  118.  219.  140.    1.2   0. ]]
[[1 0 0 0 0 1 2]
 [1 0 0 1 0 1 3]]
[[ 58. 136. 319. 152.   0.   2.]
 [ 67. 120. 237.  71.   1.   0.]]
[[0 1 1 0 0 2 2]
 [1 

In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score

data = pd.read_csv('Heart.csv')
truelabels = LabelEncoder().fit_transform(data['target'])
data.drop(['target'], axis=1, inplace=True)

# One-hot encoding the categorical columns
data_encoded = pd.get_dummies(data)

numerical_cols = data[['age','trestbps','chol','thalach','oldpeak','ca']]
preprocessed_data = pd.concat([numerical_cols, data_encoded], axis=1)

n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, n_init=50,verbose=2)
kmeans.fit_predict(preprocessed_data)
accuracy = accuracy_score(truelabels, kmeans.labels_)
adjusted_rand_index = adjusted_rand_score(truelabels, kmeans.labels_)

print("Mean Accuracy Score:", mean_accuracy_score)
print("Mean Adjusted Rand Index Score:", mean_rand_index_score)

Initialization complete
Iteration 0, inertia 6643212.419999996.
Iteration 1, inertia 4219066.079482127.
Iteration 2, inertia 4082344.5427724156.
Iteration 3, inertia 4041530.447848799.
Iteration 4, inertia 4032854.4897886897.
Iteration 5, inertia 4029614.968326894.
Iteration 6, inertia 4028914.698899923.
Converged at iteration 6: strict convergence.
Initialization complete
Iteration 0, inertia 6280574.900000002.
Iteration 1, inertia 4144364.541274579.
Iteration 2, inertia 4043262.5580136892.
Iteration 3, inertia 4032175.197856026.
Iteration 4, inertia 4028045.2238663672.
Iteration 5, inertia 4027631.842486128.
Iteration 6, inertia 4026729.6746572.
Iteration 7, inertia 4026480.696523622.
Converged at iteration 7: strict convergence.
Initialization complete
Iteration 0, inertia 5825158.0600000005.
Iteration 1, inertia 4078321.058363896.
Iteration 2, inertia 4027704.5720674167.
Iteration 3, inertia 4026729.6746572.
Iteration 4, inertia 4026480.696523622.
Converged at iteration 4: strict c

In [8]:
#Discritizing the numerical columns to use KModes on the heterogeneous data
from sklearn.preprocessing import KBinsDiscretizer
from kmodes.kmodes import KModes
from sklearn.metrics import accuracy_score

# Discretizing numerical columns
numerical_cols = data[['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']].copy()
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
numerical_cols_discretized = kbins.fit_transform(numerical_cols)

# Combining the discretized numerical columns with categorical columns
preprocessed_data = pd.concat([pd.DataFrame(numerical_cols_discretized, columns=['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']), categorical_cols], axis=1)

km = KModes(n_clusters=n_clusters, init='Huang', n_init=50, verbose=2)
km.fit_predict(preprocessed_data)
accuracy_kmodes = accuracy_score(truelabels, km.labels_)
adjusted_rand_index = adjusted_rand_score(truelabels, km.labels_)

print("Mean Kmodes Accuracy Score:", accuracy_kmodes)
print("Mean Kmodes Adjusted Rand Index Score:", adjusted_rand_index)





Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 165, cost: 5038.0
Run 1, iteration: 2/100, moves: 98, cost: 5038.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 249, cost: 4850.0
Run 2, iteration: 2/100, moves: 91, cost: 4829.0
Run 2, iteration: 3/100, moves: 26, cost: 4829.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 154, cost: 4881.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 281, cost: 4996.0
Run 4, iteration: 2/100, moves: 126, cost: 4925.0
Run 4, iteration: 3/100, moves: 15, cost: 4925.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 290, cost: 5024.0
Run 5, iteration: 2/100, moves: 111, cost: 5024.0
Init: initializing centroids
Init: initializing cluste