In [7]:
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, adjusted_rand_score

def euclidean_distance(point1, point2):
    return np.sqrt(np.sum((point1 - point2) ** 2))

def simple_matching_distance(point1, point2):
    return np.sum(point1 != point2)

def initialize_prototypes( preprocessed_data, n_clusters):
    indices = np.random.choice(len(numerical_cols), n_clusters, replace=False)
    numerical_prototypes = preprocessed_data[whichnum].iloc[indices].values
    categorical_prototypes = preprocessed_data[whichcat].iloc[indices].values
    print(numerical_prototypes)
    print(categorical_prototypes)
    return numerical_prototypes, categorical_prototypes

def assign_clusters( preprocessed_data , numerical_prototypes, categorical_prototypes):
    assigned_clusters = []
    for _, row in preprocessed_data.iterrows():
        distances = []
        for num_proto, cat_proto in zip(numerical_prototypes, categorical_prototypes): # here we iterate through the numerical and categorical prototypes
            num_distance = euclidean_distance(row[whichnum], num_proto)
            cat_distance = simple_matching_distance(row[whichcat], cat_proto)
            total_distance = num_distance + gamma * cat_distance
            distances.append(total_distance)

        assigned_clusters.append(np.argmin(distances))
        
    return assigned_clusters


def update_prototypes(preprocessed_data, assigned_clusters, n_clusters):
    new_numerical_prototypes = []
    new_categorical_prototypes = []
    
    for cluster in range(n_clusters):
        cluster_indices = [i for i, c in enumerate(assigned_clusters) if c == cluster]
        
        cluster_numerical_data = preprocessed_data[whichnum].iloc[cluster_indices]
        new_numerical_prototype = cluster_numerical_data.mean(axis=0)
        new_numerical_prototypes.append(new_numerical_prototype)
        
        cluster_categorical_data = preprocessed_data[whichcat].iloc[cluster_indices]
        new_categorical_prototype = cluster_categorical_data.mode(axis=0).iloc[0]
        new_categorical_prototypes.append(new_categorical_prototype)
    
    return np.array(new_numerical_prototypes), np.array(new_categorical_prototypes)

def align_clusters(clusters, true_labels):
    unique_clusters = np.unique(clusters)
    aligned_clusters = np.zeros_like(clusters)
    for cluster in unique_clusters:
        true_label = pd.Series(true_labels[clusters == cluster]).mode()[0]
        aligned_clusters[clusters == cluster] = true_label
    return aligned_clusters

def k_prototypes(preprocessed_data, n_clusters, max_iterations=100):
    numerical_prototypes, categorical_prototypes = initialize_prototypes(preprocessed_data , n_clusters)
    
    for _ in range(max_iterations):
        assigned_clusters = assign_clusters(preprocessed_data, numerical_prototypes, categorical_prototypes)
        new_numerical_prototypes, new_categorical_prototypes = update_prototypes(preprocessed_data, assigned_clusters, n_clusters)
     
        if np.allclose(numerical_prototypes, new_numerical_prototypes) and np.all(categorical_prototypes == new_categorical_prototypes):
            break
        
        numerical_prototypes = new_numerical_prototypes
        categorical_prototypes = new_categorical_prototypes
    
    return assigned_clusters, numerical_prototypes, categorical_prototypes

data = pd.read_csv('Autism dataset.csv')
truelabels = np.where(data.iloc[:, -1] == 'No', 0, 1)
data.drop(['Case_No', 'Class/ASD Traits '], axis=1, inplace=True)

# Separating numerical and categorical columns
numerical_cols = data[['Age_Mons', 'Qchat-10-Score']].copy()
categorical_cols = data.drop(['Age_Mons', 'Qchat-10-Score'], axis=1).copy()

preprocessed_data = pd.concat([numerical_cols, categorical_cols], axis=1)

whichnum = ['Age_Mons', 'Qchat-10-Score']
whichcat= [col for col in data.columns if col not in numerical_cols]

n_clusters = 2
gamma = 0.5
accuracy_scores = []
rand_index_scores = []
num_runs = 50

for _ in range(num_runs):
    assignedclusters, numerical_prototypes, categorical_prototypes = k_prototypes(preprocessed_data, n_clusters)
    aligned_clusters = align_clusters(assignedclusters, truelabels)
    
    accuracy = accuracy_score(truelabels, aligned_clusters)
    accuracy_scores.append(accuracy)
    
    adjusted_rand_index = adjusted_rand_score(truelabels, aligned_clusters)
    rand_index_scores.append(adjusted_rand_index)

mean_accuracy_score = np.mean(accuracy_scores)
mean_rand_index_score = np.mean(rand_index_scores)

print("Mean Accuracy Score:", mean_accuracy_score)
print("Mean Adjusted Rand Index Score:", mean_rand_index_score)

[[31  5]
 [28  7]]
[[0 0 1 1 1 1 1 0 0 0 'm' 'White European' 'yes' 'no' 'family member']
 [1 1 0 0 1 1 1 1 1 0 'm' 'south asian' 'no' 'no' 'family member']]
[[36  2]
 [36  8]]
[[0 1 0 0 0 0 1 0 0 0 'm' 'middle eastern' 'no' 'yes' 'family member']
 [1 1 0 1 1 1 1 1 0 1 'm' 'White European' 'yes' 'yes' 'family member']]
[[34  1]
 [30  3]]
[[0 0 0 0 0 1 0 0 0 0 'f' 'White European' 'no' 'yes' 'family member']
 [1 0 0 0 0 0 0 1 0 1 'm' 'middle eastern' 'no' 'no'
  'Health Care Professional']]
[[36  3]
 [15  6]]
[[0 0 0 0 0 0 1 0 1 1 'm' 'south asian' 'no' 'no' 'family member']
 [0 0 1 1 1 0 1 1 1 0 'm' 'White European' 'yes' 'no' 'family member']]
[[26  9]
 [36  3]]
[[1 0 1 1 1 1 1 1 1 1 'm' 'Others' 'no' 'yes' 'family member']
 [0 0 0 1 0 0 1 0 0 1 'm' 'south asian' 'no' 'no' 'family member']]
[[26  4]
 [30  1]]
[[0 0 0 0 1 1 1 0 1 0 'm' 'asian' 'no' 'yes' 'family member']
 [0 0 0 0 0 0 0 1 0 0 'm' 'middle eastern' 'no' 'no' 'family member']]
[[13  0]
 [19  3]]
[[0 0 0 0 0 0 0 0 0 0 'f' 

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, adjusted_rand_score

data = pd.read_csv('Autism dataset.csv')
truelabels = LabelEncoder().fit_transform(data['Class/ASD Traits '])
data.drop(['Case_No', 'Class/ASD Traits '], axis=1, inplace=True)

# One-hot encoding the categorical columns
data_encoded = pd.get_dummies(data)

numerical_cols = data[['Age_Mons', 'Qchat-10-Score']]
preprocessed_data = pd.concat([numerical_cols, data_encoded], axis=1)

n_clusters = 2
kmeans = KMeans(n_clusters=n_clusters, n_init=50,verbose=2)
kmeans.fit_predict(preprocessed_data)

kmaccuracy = accuracy_score(truelabels, kmeans.labels_)

kmadjusted_rand_index = adjusted_rand_score(truelabels, kmeans.labels_)

print("Mean Accuracy Score:", kmaccuracy)
print("Mean Adjusted Rand Index Score:", kmadjusted_rand_index)

Initialization complete
Iteration 0, inertia 101077.0.
Iteration 1, inertia 61684.2627350383.
Iteration 2, inertia 56087.28034624159.
Iteration 3, inertia 54907.56425126303.
Iteration 4, inertia 54694.32389184797.
Converged at iteration 4: strict convergence.
Initialization complete
Iteration 0, inertia 78274.0.
Iteration 1, inertia 57523.80663536931.
Iteration 2, inertia 55272.83828887521.
Iteration 3, inertia 54984.814626963365.
Converged at iteration 3: strict convergence.
Initialization complete
Iteration 0, inertia 110547.0.
Iteration 1, inertia 62742.75942089175.
Iteration 2, inertia 58366.76793712363.
Iteration 3, inertia 55272.83828887521.
Iteration 4, inertia 54984.81462696336.
Converged at iteration 4: strict convergence.
Initialization complete
Iteration 0, inertia 93402.0.
Iteration 1, inertia 54747.29771892735.
Iteration 2, inertia 54694.32389184797.
Converged at iteration 2: strict convergence.
Initialization complete
Iteration 0, inertia 77730.0.
Iteration 1, inertia 547

In [9]:
#Discritizing the numerical columns to use KModes on the heterogeneous data
from sklearn.preprocessing import KBinsDiscretizer
from kmodes.kmodes import KModes
from sklearn.metrics import accuracy_score

# Discretize numerical columns
numerical_cols = data[['Age_Mons', 'Qchat-10-Score']].copy()
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform')
numerical_cols_discretized = kbins.fit_transform(numerical_cols)

# Combine discretized numerical columns with categorical columns
preprocessed_data = pd.concat([pd.DataFrame(numerical_cols_discretized, columns=['Age_Mons', 'Qchat-10-Score']), categorical_cols], axis=1)

km = KModes(n_clusters=n_clusters, init='Huang', n_init=50, verbose=2)
km.fit_predict(preprocessed_data)

accuracy_kmodes = accuracy_score(truelabels, km.labels_)
adjusted_rand_index_kmodes = adjusted_rand_score(truelabels, km.labels_)

print("Mean Kmodes Accuracy Score:", accuracy_kmodes)
print("Mean Kmodes Adjusted Rand Index Score:", adjusted_rand_index_kmodes)



Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 1, iteration: 1/100, moves: 181, cost: 5340.0




Run 1, iteration: 2/100, moves: 39, cost: 5340.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 2, iteration: 1/100, moves: 164, cost: 5342.0
Run 2, iteration: 2/100, moves: 28, cost: 5342.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 3, iteration: 1/100, moves: 131, cost: 5344.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 4, iteration: 1/100, moves: 213, cost: 5345.0
Run 4, iteration: 2/100, moves: 7, cost: 5345.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 5, iteration: 1/100, moves: 375, cost: 5368.0
Run 5, iteration: 2/100, moves: 122, cost: 5345.0
Run 5, iteration: 3/100, moves: 38, cost: 5342.0
Run 5, iteration: 4/100, moves: 6, cost: 5342.0
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run 6, iteration: 1/100, moves: 343, cost: 5342.0
Run 6, iteration: 2/100, moves: 37, cost: 5342.0
Init: i