In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

def simple_matching_distance(point1, point2):
    return np.sum(point1 != point2)

def initialize_prototypes(preprocessed_data, n_clusters):
    numerical_indices = np.random.choice(len(preprocessed_data), n_clusters, replace=False)
    categorical_indices = np.random.choice(len(preprocessed_data), n_clusters, replace=False)
    numerical_prototypes = preprocessed_data[whichnum].iloc[numerical_indices].values
    categorical_prototypes = preprocessed_data[whichcat].iloc[categorical_indices].values
    return numerical_prototypes, categorical_prototypes

def assign_clusters(preprocessed_data, numerical_prototypes, categorical_prototypes, gamma):
    assigned_clusters = []
    for _, row in preprocessed_data.iterrows():
        distances = []
        for num_proto, cat_proto in zip(numerical_prototypes, categorical_prototypes):
            num_distance = euclidean_distance(row[whichnum], num_proto)
            cat_distance = simple_matching_distance(row[whichcat], cat_proto)
            total_distance = num_distance + gamma * cat_distance
            distances.append(total_distance)
        assigned_clusters.append(np.argmin(distances))
    return assigned_clusters

def update_prototypes(preprocessed_data, assigned_clusters, n_clusters):
    new_numerical_prototypes = []
    new_categorical_prototypes = []
    
    for cluster in range(n_clusters):
        cluster_indices = [i for i, c in enumerate(assigned_clusters) if c == cluster]
        cluster_data = preprocessed_data.iloc[cluster_indices]
        new_numerical_prototype = cluster_data[whichnum].mean(axis=0)
        new_categorical_prototype = cluster_data[whichcat].mode(axis=0).iloc[0]
        new_numerical_prototypes.append(new_numerical_prototype)
        new_categorical_prototypes.append(new_categorical_prototype)
    
    return np.array(new_numerical_prototypes), np.array(new_categorical_prototypes)

def k_prototypes(preprocessed_data, n_clusters, max_iterations=100, tol=1e-5):
    numerical_prototypes, categorical_prototypes = initialize_prototypes(preprocessed_data, n_clusters)
    
    for _ in range(max_iterations):
        assigned_clusters = assign_clusters(preprocessed_data, numerical_prototypes, categorical_prototypes, gamma)
        
        new_numerical_prototypes, new_categorical_prototypes = update_prototypes(preprocessed_data, assigned_clusters, n_clusters)
        
        if np.allclose(numerical_prototypes, new_numerical_prototypes, atol=tol) and np.allclose(categorical_prototypes, new_categorical_prototypes, atol=tol):
            break
        
        numerical_prototypes = new_numerical_prototypes
        categorical_prototypes = new_categorical_prototypes
    
    return assigned_clusters, numerical_prototypes, categorical_prototypes

def align_clusters(clusters, true_labels):
    unique_clusters = np.unique(clusters)
    aligned_clusters = np.zeros_like(clusters)
    for cluster in unique_clusters:
        true_label = pd.Series(true_labels[clusters == cluster]).mode()[0]
        aligned_clusters[clusters == cluster] = true_label
    return aligned_clusters


data = pd.read_csv('Dermatology.csv')
truelabels =data['class']
data.drop(['class'], axis=1, inplace=True)

# Separating numerical and categorical columns
numerical_cols = data[['age','erythema','scaling','definite_borders','itching','koebner_phenomenon','polygonal_papules','follicular_papules','oral_mucosal_involvement','knee_and_elbow_involvement','scalp_involvement','melanin_incontinence','eosinophils_in_the_infiltrate','pnl_infiltrate','fibrosis_of_the_papillary_dermis','exocytosis','acanthosis','hyperkeratosis','parakeratosis','clubbing_of_the_rete_ridges','elongation_of_the_rete_ridges','thinning_of_the_suprapapillary_epidermis','spongiform_pustule','munro_microabcess','focal_hypergranulosis','disappearance_of_the_granular_layer','vacuolisation_and_damage_of_basal_layer','spongiosis','saw-tooth_appearance_of_retes','follicular_horn_plug','perifollicular_parakeratosis','inflammatory_monoluclear_inflitrate','band-like_infiltrate']].copy()
categorical_cols = data.drop(['age','erythema','scaling','definite_borders','itching','koebner_phenomenon','polygonal_papules','follicular_papules','oral_mucosal_involvement','knee_and_elbow_involvement','scalp_involvement','melanin_incontinence','eosinophils_in_the_infiltrate','pnl_infiltrate','fibrosis_of_the_papillary_dermis','exocytosis','acanthosis','hyperkeratosis','parakeratosis','clubbing_of_the_rete_ridges','elongation_of_the_rete_ridges','thinning_of_the_suprapapillary_epidermis','spongiform_pustule','munro_microabcess','focal_hypergranulosis','disappearance_of_the_granular_layer','vacuolisation_and_damage_of_basal_layer','spongiosis','saw-tooth_appearance_of_retes','follicular_horn_plug','perifollicular_parakeratosis','inflammatory_monoluclear_inflitrate','band-like_infiltrate'], axis=1).copy()

preprocessed_data = pd.concat([numerical_cols, categorical_cols], axis=1)

whichnum = ['age','erythema','scaling','definite_borders','itching','koebner_phenomenon','polygonal_papules','follicular_papules','oral_mucosal_involvement','knee_and_elbow_involvement','scalp_involvement','melanin_incontinence','eosinophils_in_the_infiltrate','pnl_infiltrate','fibrosis_of_the_papillary_dermis','exocytosis','acanthosis','hyperkeratosis','parakeratosis','clubbing_of_the_rete_ridges','elongation_of_the_rete_ridges','thinning_of_the_suprapapillary_epidermis','spongiform_pustule','munro_microabcess','focal_hypergranulosis','disappearance_of_the_granular_layer','vacuolisation_and_damage_of_basal_layer','spongiosis','saw-tooth_appearance_of_retes','follicular_horn_plug','perifollicular_parakeratosis','inflammatory_monoluclear_inflitrate','band-like_infiltrate']
whichcat= [col for col in data.columns if col not in whichnum]
n_clusters = 6
accuracy_scores = []
rand_index_scores = []
num_runs = 50
gamma=0.5
for _ in range(num_runs):
    assignedclusters, numerical_prototypes, categorical_prototypes = k_prototypes(preprocessed_data, n_clusters)
    
    aligned_clusters = align_clusters(assignedclusters, truelabels)
    accuracy = accuracy_score(truelabels, aligned_clusters)
    accuracy_scores.append(accuracy)
    
    adjusted_rand_index = adjusted_rand_score(truelabels, aligned_clusters)
    rand_index_scores.append(adjusted_rand_index)

mean_accuracy_score = np.mean(accuracy_scores)
mean_rand_index_score = np.mean(rand_index_scores)

print("Mean Accuracy Score:", mean_accuracy_score)
print("Mean Adjusted Rand Index Score:", mean_rand_index_score)

Mean Accuracy Score: 0.37038251366120223
Mean Adjusted Rand Index Score: 0.06425682926716242


In [20]:
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score

data = pd.read_csv('dermatology.csv')
data['family_history'] = data['family_history'].astype('category')
data_encoded = pd.get_dummies(data)
preprocessed_data = data_encoded.copy()
preprocessed_data.drop('class', axis=1, inplace=True)

numerical_cols = data[['age','erythema','scaling','definite_borders','itching','koebner_phenomenon','polygonal_papules','follicular_papules','oral_mucosal_involvement','knee_and_elbow_involvement','scalp_involvement','melanin_incontinence','eosinophils_in_the_infiltrate','pnl_infiltrate','fibrosis_of_the_papillary_dermis','exocytosis','acanthosis','hyperkeratosis','parakeratosis','clubbing_of_the_rete_ridges','elongation_of_the_rete_ridges','thinning_of_the_suprapapillary_epidermis','spongiform_pustule','munro_microabcess','focal_hypergranulosis','disappearance_of_the_granular_layer','vacuolisation_and_damage_of_basal_layer','spongiosis','saw-tooth_appearance_of_retes','follicular_horn_plug','perifollicular_parakeratosis','inflammatory_monoluclear_inflitrate','band-like_infiltrate']]
preprocessed_data = pd.concat([numerical_cols, data_encoded], axis=1)

n_clusters = 6

kmeans = KMeans(n_clusters=n_clusters, n_init=50,verbose=2)
kmeans.fit_predict(preprocessed_data)

accuracy = accuracy_score(truelabels, kmeans.labels_)

adjusted_rand_index = adjusted_rand_score(truelabels, kmeans.labels_)

print("Mean Accuracy Score:", mean_accuracy_score)
print("Mean Adjusted Rand Index Score:", mean_rand_index_score)

Initialization complete
Iteration 0, inertia 46449.0.
Iteration 1, inertia 26201.451105145494.
Iteration 2, inertia 25252.101884546268.
Iteration 3, inertia 24948.729687126233.
Iteration 4, inertia 24816.495214006165.
Iteration 5, inertia 24758.49455539891.
Iteration 6, inertia 24727.32852747242.
Iteration 7, inertia 24717.466379527665.
Iteration 8, inertia 24710.515810395227.
Converged at iteration 8: strict convergence.
Initialization complete
Iteration 0, inertia 41638.0.
Iteration 1, inertia 29274.053186290395.
Iteration 2, inertia 28164.240089937663.
Iteration 3, inertia 27868.691944124083.
Iteration 4, inertia 27590.987693673815.
Iteration 5, inertia 27425.57361877284.
Iteration 6, inertia 27197.184557448272.
Iteration 7, inertia 26816.89571155611.
Iteration 8, inertia 26671.549427430182.
Iteration 9, inertia 26648.240832387302.
Iteration 10, inertia 26620.65542623319.
Iteration 11, inertia 26590.656886916746.
Iteration 12, inertia 26509.27582159204.
Iteration 13, inertia 26193.1

In [15]:
#Discritizing the numerical columns to use KModes on the heterogeneous data
from sklearn.preprocessing import KBinsDiscretizer
from kmodes.kmodes import KModes
from sklearn.metrics import accuracy_score

# Discretize numerical columns
numerical_cols = data[['age','erythema','scaling','definite_borders','itching','koebner_phenomenon','polygonal_papules','follicular_papules','oral_mucosal_involvement','knee_and_elbow_involvement','scalp_involvement','melanin_incontinence','eosinophils_in_the_infiltrate','pnl_infiltrate','fibrosis_of_the_papillary_dermis','exocytosis','acanthosis','hyperkeratosis','parakeratosis','clubbing_of_the_rete_ridges','elongation_of_the_rete_ridges','thinning_of_the_suprapapillary_epidermis','spongiform_pustule','munro_microabcess','focal_hypergranulosis','disappearance_of_the_granular_layer','vacuolisation_and_damage_of_basal_layer','spongiosis','saw-tooth_appearance_of_retes','follicular_horn_plug','perifollicular_parakeratosis','inflammatory_monoluclear_inflitrate','band-like_infiltrate']].copy()
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
numerical_cols_discretized = kbins.fit_transform(numerical_cols)

# Combine discretized numerical columns with categorical columns
preprocessed_data = pd.concat([pd.DataFrame(numerical_cols_discretized, columns=['age','erythema','scaling','definite_borders','itching','koebner_phenomenon','polygonal_papules','follicular_papules','oral_mucosal_involvement','knee_and_elbow_involvement','scalp_involvement','melanin_incontinence','eosinophils_in_the_infiltrate','pnl_infiltrate','fibrosis_of_the_papillary_dermis','exocytosis','acanthosis','hyperkeratosis','parakeratosis','clubbing_of_the_rete_ridges','elongation_of_the_rete_ridges','thinning_of_the_suprapapillary_epidermis','spongiform_pustule','munro_microabcess','focal_hypergranulosis','disappearance_of_the_granular_layer','vacuolisation_and_damage_of_basal_layer','spongiosis','saw-tooth_appearance_of_retes','follicular_horn_plug','perifollicular_parakeratosis','inflammatory_monoluclear_inflitrate','band-like_infiltrate']), categorical_cols], axis=1)

km = KModes(n_clusters=n_clusters, init='Huang', n_init=50, verbose=2)
km.fit_predict(preprocessed_data)

accuracy_kmodes = accuracy_score(truelabels, km.labels_)
adjusted_rand_index = adjusted_rand_score(truelabels, km.labels_)

print("Mean Kmodes Accuracy Score:", accuracy_kmodes)
print("Mean Kmodes Adjusted Rand Index Score:", adjusted_rand_index)


Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 136, cost: 3183.0
Run 1, iteration: 2/100, moves: 26, cost: 3178.0
Run 1, iteration: 3/100, moves: 0, cost: 3178.0




Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 107, cost: 3206.0
Run 2, iteration: 2/100, moves: 43, cost: 3201.0
Run 2, iteration: 3/100, moves: 4, cost: 3201.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 152, cost: 3301.0
Run 3, iteration: 2/100, moves: 82, cost: 3219.0
Run 3, iteration: 3/100, moves: 20, cost: 3219.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 143, cost: 3289.0
Run 4, iteration: 2/100, moves: 97, cost: 3204.0
Run 4, iteration: 3/100, moves: 49, cost: 3196.0
Run 4, iteration: 4/100, moves: 8, cost: 3196.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 158, cost: 3210.0
Run 5, iteration: 2/100, moves: 49, cost: 3210.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 6, ite