In [84]:
import numpy as np
import pandas as pd

In [119]:
data = pd.read_csv("soybean-small.data", header=None, delimiter=",")

data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,4,0,2,1,1,1,0,1,0,2,...,0,0,4,0,0,0,0,0,0,D1
1,5,0,2,1,0,3,1,1,1,2,...,0,0,4,0,0,0,0,0,0,D1
2,3,0,2,1,0,2,0,2,1,1,...,0,0,4,0,0,0,0,0,0,D1
3,6,0,2,1,0,1,1,1,0,0,...,0,0,4,0,0,0,0,0,0,D1
4,4,0,2,1,0,3,0,2,0,2,...,0,0,4,0,0,0,0,0,0,D1


In [161]:
class KPrototypes:
    def __init__(self, n_clusters=3, max_iter=100, verbose=0, random_state=None):
        self.n_clusters = n_clusters
        self.max_iter = max_iter
        self.verbose = verbose
        self.random_state = random_state

    def fit_predict(self, data, categorical_cols):
        np.random.seed(self.random_state)
        num_data = data[:, :-len(categorical_cols)].astype(float)
        cat_data = self._encode_categorical(data[:, -len(categorical_cols):])
        centroids = self._initialize_centroids(num_data, cat_data)

        for _ in range(self.max_iter):
            labels = self._assign_clusters(num_data, cat_data, centroids)
            new_centroids = self._update_centroids(num_data, cat_data, labels)
            if np.allclose(centroids, new_centroids):
                break
            centroids = new_centroids

        return labels

    def _encode_categorical(self, cat_data):
        unique_cats = []
        for i in range(cat_data.shape[1]):
            unique_cats.append(np.unique(cat_data[:, i]))
        encoded_cats = np.zeros_like(cat_data, dtype=int)
        for i in range(cat_data.shape[1]):
            for j, cat in enumerate(unique_cats[i]):
                encoded_cats[cat_data[:, i] == cat, i] = j
        return encoded_cats

    def _initialize_centroids(self, num_data, cat_data):
        np.random.seed(self.random_state) 
        num_centroids = np.random.rand(self.n_clusters, num_data.shape[1])
        cat_centroids = np.empty((self.n_clusters, cat_data.shape[1]), dtype=int)

        for i in range(cat_data.shape[1]):
            cat_centroids[:, i] = np.random.randint(0, len(np.unique(cat_data[:, i])), size=self.n_clusters)

        centroids = np.hstack((num_centroids, cat_centroids))
        return centroids

    def _assign_clusters(self, num_data, cat_data, centroids):
        labels = np.zeros(num_data.shape[0], dtype=int)

        for i in range(num_data.shape[0]):
            distances = np.zeros(self.n_clusters)

            for j in range(self.n_clusters):
                num_instance = num_data[i]
                num_centroid = centroids[j, :num_data.shape[1]]
                cat_instance = cat_data[i]
                cat_centroid = centroids[j, num_data.shape[1]:]
                num_distance = np.linalg.norm(num_instance - num_centroid)
                cat_distance = np.linalg.norm(cat_instance - cat_centroid)
                distances[j] = num_distance + cat_distance

            labels[i] = np.argmin(distances)

        return labels

    def _update_centroids(self, num_data, cat_data, labels):
        new_num_centroids = np.zeros((self.n_clusters, num_data.shape[1]))
        new_cat_centroids = np.zeros((self.n_clusters, cat_data.shape[1]), dtype=int)

        for k in range(self.n_clusters):
            cluster_indices = np.where(labels == k)[0]
            if len(cluster_indices) > 0:
                new_num_centroids[k] = np.mean(num_data[cluster_indices], axis=0)
                for i in range(cat_data.shape[1]):
                    unique_values, counts = np.unique(cat_data[cluster_indices, i], return_counts=True)
                    new_cat_centroids[k, i] = unique_values[np.argmax(counts)]
            else:
                pass

        new_centroids = np.hstack((new_num_centroids, new_cat_centroids))
        return new_centroids

numerical_cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
categorical_cols = [35]

kproto = KPrototypes(n_clusters=3, max_iter=100, verbose=2, random_state=42)
clusters = kproto.fit_predict(data.values, categorical_cols)

print("Cluster labels:")
print(clusters)

Cluster labels:
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0]


In [162]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import silhouette_score

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data[categorical_cols])
processed_data = np.hstack((data[numerical_cols], encoded_data.toarray()))

silhouette_avg = silhouette_score(processed_data, clusters)
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.3759204535803909


# Scitkit-Learn implementation

In [124]:
from kmodes.kprototypes import KPrototypes as kprototy

In [114]:
data = pd.read_csv("soybean-small.data", header=None, delimiter=",")

numerical_cols = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]
categorical_cols = [35]

In [145]:
kproto = kprototy(n_clusters=3, init='Cao', verbose=2)
clusters = kproto.fit_predict(data.values, categorical=categorical_cols)

print("Numerical centroids:")
print(kproto.cluster_centroids_[0])

print("Categorical centroids:")
print(kproto.cluster_centroids_[1])

print("Cluster labels:")
print(clusters)

Initialization method and algorithm are deterministic. Setting n_init to 1.
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 1, iteration: 1/100, moves: 8, ncost: 338.6945535296232
Run: 1, iteration: 2/100, moves: 8, ncost: 251.16461943764273
Run: 1, iteration: 3/100, moves: 1, ncost: 248.65327133223863
Run: 1, iteration: 4/100, moves: 0, ncost: 248.65327133223863
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 2, iteration: 1/100, moves: 11, ncost: 248.65327133223863
Run: 2, iteration: 2/100, moves: 0, ncost: 248.65327133223863
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 3, iteration: 1/100, moves: 22, ncost: 252.24395828061793
Run: 3, iteration: 2/100, moves: 1, ncost: 248.65327133223863
Run: 3, iteration: 3/100, moves: 0, ncost: 248.65327133223863
Init: initializing centroids
Init: initializing clusters
Starting iterations...
Run: 4, iteration: 1/100, moves: 19, ncost: 3

In [146]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import silhouette_score

encoder = OneHotEncoder()
encoded_data = encoder.fit_transform(data[categorical_cols])
processed_data = np.hstack((data[numerical_cols], encoded_data.toarray()))

silhouette_avg = silhouette_score(processed_data, clusters)
print("Silhouette Score:", silhouette_avg)


Silhouette Score: 0.4598654022670218
