In [313]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [314]:
# Read the comma-separated soybean file. header=None tells pandas not to use
# the first line as column names, so columns get integer labels instead.
dataset = pd.read_csv("soybean-small.data", sep=",", header=None)

# Preview the first rows as the cell output.
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,4,0,2,1,1,1,0,1,0,2,...,0,0,4,0,0,0,0,0,0,D1
1,5,0,2,1,0,3,1,1,1,2,...,0,0,4,0,0,0,0,0,0,D1
2,3,0,2,1,0,2,0,2,1,1,...,0,0,4,0,0,0,0,0,0,D1
3,6,0,2,1,0,1,1,1,0,0,...,0,0,4,0,0,0,0,0,0,D1
4,4,0,2,1,0,3,0,2,0,2,...,0,0,4,0,0,0,0,0,0,D1


In [315]:
# Separating numerical and categorical columns.
# "number" matches every numeric dtype, and the categorical frame is defined as
# its complement, so the two frames always partition the columns completely
# (no column is silently dropped because its dtype is neither int/float nor
# object, e.g. category, bool or a dedicated string dtype).
numerical_df = dataset.select_dtypes(include="number")
categorical_df = dataset.select_dtypes(exclude="number")

# Fit one encoder per categorical column so that every column's fitted
# label -> code mapping stays available (a single shared encoder would be
# re-fitted on each column and only remember the last one).
label_encoders = {column: LabelEncoder() for column in categorical_df.columns}
categorical_df_encoded = categorical_df.apply(
    lambda column: label_encoders[column.name].fit_transform(column)
)

# `label_encoder` stays defined for any code that still refers to it; as
# before, it ends up fitted on the last categorical column.
label_encoder = (
    label_encoders[categorical_df.columns[-1]]
    if len(categorical_df.columns)
    else LabelEncoder()
)

In [316]:
# Heading and the first rows of the numerical frame, one per line.
print("Numerical Dataframe:", numerical_df.head(), sep="\n")

Numerical Dataframe:
   0   1   2   3   4   5   6   7   8   9   ...  25  26  27  28  29  30  31  \
0   4   0   2   1   1   1   0   1   0   2  ...   0   0   0   4   0   0   0   
1   5   0   2   1   0   3   1   1   1   2  ...   0   0   0   4   0   0   0   
2   3   0   2   1   0   2   0   2   1   1  ...   0   0   0   4   0   0   0   
3   6   0   2   1   0   1   1   1   0   0  ...   0   0   0   4   0   0   0   
4   4   0   2   1   0   3   0   2   0   2  ...   0   0   0   4   0   0   0   

   32  33  34  
0   0   0   0  
1   0   0   0  
2   0   0   0  
3   0   0   0  
4   0   0   0  

[5 rows x 35 columns]


In [317]:
# Heading (preceded by a blank line) and the first rows of the encoded frame.
print("\nCategorical encoded Dataframe:", categorical_df_encoded.head(), sep="\n")


Categorical encoded Dataframe:
   35
0   0
1   0
2   0
3   0
4   0


In [318]:
# Put the numerical columns and the encoded categorical columns side by side
# in a single frame (aligned on the row index).
frames_to_join = [numerical_df, categorical_df_encoded]
numcatconcat = pd.concat(frames_to_join, axis="columns")

# Preview the first rows as the cell output.
numcatconcat.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
0,4,0,2,1,1,1,0,1,0,2,...,0,0,4,0,0,0,0,0,0,0
1,5,0,2,1,0,3,1,1,1,2,...,0,0,4,0,0,0,0,0,0,0
2,3,0,2,1,0,2,0,2,1,1,...,0,0,4,0,0,0,0,0,0,0
3,6,0,2,1,0,1,1,1,0,0,...,0,0,4,0,0,0,0,0,0,0
4,4,0,2,1,0,3,0,2,0,2,...,0,0,4,0,0,0,0,0,0,0


In [319]:
def euclidean_distance(point1, point2):
    """Return the Euclidean (L2) distance between two points.

    Both arguments must support element-wise subtraction with each other,
    e.g. NumPy arrays or pandas Series holding the same number of values.
    """
    offsets = point1 - point2
    sum_of_squares = np.sum(offsets ** 2)
    return np.sqrt(sum_of_squares)

def simple_matching_distance(point1, point2):
    """Return how many positions hold different values in the two points.

    The comparison is element-wise, so both arguments are expected to be
    array-like objects (NumPy arrays / pandas Series) of matching length.
    """
    differs = point1 != point2
    return np.sum(differs)

def initialize_prototypes(n_clusters, numerical_df, categorical_df, random_state=None):
    """Pick ``n_clusters`` distinct observations as the initial prototypes.

    The same randomly drawn row positions are used for both frames, so the
    numerical and categorical parts of each prototype come from one observation.

    Parameters
    ----------
    n_clusters : int
        Number of prototypes to draw.
    numerical_df : pandas.DataFrame
        Numerical features, one row per observation.
    categorical_df : pandas.DataFrame
        Categorical features for the same observations, in the same row order.
    random_state : None, int, numpy.random.RandomState or numpy.random.Generator
        ``None`` (default) draws from NumPy's global random state, exactly as
        before. An int seeds a private generator so the draw is reproducible;
        an existing generator object is used as-is.

    Returns
    -------
    tuple of numpy.ndarray
        ``(numerical_prototypes, categorical_prototypes)``, each with
        ``n_clusters`` rows.

    Raises
    ------
    ValueError
        If more prototypes are requested than there are observations.
    """
    n_samples = len(numerical_df)
    if n_clusters > n_samples:
        raise ValueError(
            f"n_clusters ({n_clusters}) cannot exceed the number of observations ({n_samples})"
        )

    if random_state is None:
        rng = np.random  # module-level functions share the global random state
    elif isinstance(random_state, (np.random.RandomState, np.random.Generator)):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Sampling without replacement guarantees the prototypes are distinct rows.
    indices = rng.choice(n_samples, n_clusters, replace=False)
    numerical_prototypes = numerical_df.iloc[indices].values
    categorical_prototypes = categorical_df.iloc[indices].values

    return numerical_prototypes, categorical_prototypes

def assign_clusters(numerical_df, categorical_df_encoded, numerical_prototypes, categorical_prototypes, gamma=0.5):
    """Assign every observation to its nearest prototype.

    The dissimilarity between an observation and a prototype is the Euclidean
    distance over the numerical features plus ``gamma`` times the number of
    mismatching categorical features.

    The observations are taken from the two frames passed in, not from any
    notebook-level variable, so the function works on whatever data it is given.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Numerical features, one row per observation.
    categorical_df_encoded : pandas.DataFrame
        Categorical features for the same observations, in the same row order.
    numerical_prototypes : array-like, shape (n_clusters, n_numerical)
        Numerical part of each prototype.
    categorical_prototypes : array-like, shape (n_clusters, n_categorical)
        Categorical part of each prototype.
    gamma : float, default 0.5
        Weight of the categorical mismatch count relative to the numerical
        distance.

    Returns
    -------
    list of int
        For each observation, in row order, the index of the closest
        prototype. Ties go to the lowest prototype index.
    """
    num_values = numerical_df.to_numpy(dtype=float)
    cat_values = categorical_df_encoded.to_numpy()
    num_protos = np.asarray(numerical_prototypes, dtype=float)
    cat_protos = np.asarray(categorical_prototypes)

    # Broadcast to (n_samples, n_clusters, n_features) and reduce over features
    # to get one distance per (observation, prototype) pair.
    num_distance = np.sqrt(
        ((num_values[:, None, :] - num_protos[None, :, :]) ** 2).sum(axis=2)
    )
    cat_distance = (cat_values[:, None, :] != cat_protos[None, :, :]).sum(axis=2)
    total_distance = num_distance + gamma * cat_distance

    return total_distance.argmin(axis=1).tolist()

def update_prototypes(numerical_df, categorical_df_encoded, assigned_clusters, n_clusters):
    """Recompute every cluster prototype from its current members.

    The numerical part of a prototype is the column-wise mean of the cluster's
    rows; the categorical part is the column-wise mode (the first mode when
    several values tie).

    A cluster that currently has no members cannot be averaged, so it is
    re-seeded with a randomly chosen observation (drawn from NumPy's global
    random state) instead of raising or producing NaN prototypes.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Numerical features, one row per observation.
    categorical_df_encoded : pandas.DataFrame
        Categorical features for the same observations, in the same row order.
    assigned_clusters : sequence of int
        Cluster index of each observation, in row order.
    n_clusters : int
        Total number of clusters; indices run from 0 to ``n_clusters - 1``.

    Returns
    -------
    tuple of numpy.ndarray
        ``(numerical_prototypes, categorical_prototypes)``, each with
        ``n_clusters`` rows.
    """
    labels = np.asarray(assigned_clusters)
    new_numerical_prototypes = []
    new_categorical_prototypes = []

    for cluster in range(n_clusters):
        # Row positions of the observations currently assigned to this cluster.
        member_positions = np.flatnonzero(labels == cluster)

        if member_positions.size == 0:
            # Empty cluster: take both parts from one random observation so the
            # prototype stays a consistent (numerical, categorical) pair.
            seed_position = np.random.randint(len(numerical_df))
            new_numerical_prototypes.append(numerical_df.iloc[seed_position].to_numpy())
            new_categorical_prototypes.append(categorical_df_encoded.iloc[seed_position].to_numpy())
            continue

        # Numerical prototype: mean of each numerical column over the members.
        cluster_numerical_data = numerical_df.iloc[member_positions]
        new_numerical_prototypes.append(cluster_numerical_data.mean(axis=0).to_numpy())

        # Categorical prototype: most frequent value of each column; .iloc[0]
        # selects the first mode when several values are equally frequent.
        cluster_categorical_data = categorical_df_encoded.iloc[member_positions]
        new_categorical_prototypes.append(cluster_categorical_data.mode(axis=0).iloc[0].to_numpy())

    return np.array(new_numerical_prototypes), np.array(new_categorical_prototypes)

def k_prototypes(numerical_df, categorical_df, n_clusters, max_iterations=100):
    """Cluster mixed numerical/categorical data with the k-prototypes scheme.

    Starting from randomly chosen observations as prototypes, the function
    alternates between assigning every observation to its nearest prototype
    and recomputing the prototypes, until the prototypes stop changing or
    ``max_iterations`` rounds have run.

    Only the frames passed as arguments are used; no notebook-level data frame
    is consulted for the categorical features.

    Parameters
    ----------
    numerical_df : pandas.DataFrame
        Numerical features, one row per observation.
    categorical_df : pandas.DataFrame
        Categorical features for the same observations, in the same row order.
    n_clusters : int
        Number of clusters to form.
    max_iterations : int, default 100
        Upper bound on the number of assign/update rounds.

    Returns
    -------
    tuple
        ``(assigned_clusters, numerical_prototypes, categorical_prototypes)``.
        If the iteration limit is reached before convergence, the prototypes
        are the ones recomputed from the returned assignment.

    Raises
    ------
    ValueError
        If ``max_iterations`` is smaller than 1 (no assignment could be made).
    """
    if max_iterations < 1:
        raise ValueError("max_iterations must be at least 1")

    # Here we initialize the prototypes
    numerical_prototypes, categorical_prototypes = initialize_prototypes(n_clusters, numerical_df, categorical_df)

    for _ in range(max_iterations):
        # Here we assign data points to clusters, using the frames this
        # function was called with.
        assigned_clusters = assign_clusters(numerical_df, categorical_df, numerical_prototypes, categorical_prototypes)

        # Here we update the prototypes
        new_numerical_prototypes, new_categorical_prototypes = update_prototypes(numerical_df, categorical_df, assigned_clusters, n_clusters)

        # Here we check for convergence: numerical prototypes are compared with
        # a floating-point tolerance, categorical ones must match exactly.
        numerical_stable = np.allclose(numerical_prototypes, new_numerical_prototypes)
        categorical_stable = np.array_equal(categorical_prototypes, new_categorical_prototypes)
        if numerical_stable and categorical_stable:
            break

        numerical_prototypes = new_numerical_prototypes
        categorical_prototypes = new_categorical_prototypes

    return assigned_clusters, numerical_prototypes, categorical_prototypes

In [320]:
n_clusters = 3

# The initial prototypes are drawn from NumPy's global random state, so seed
# it first: the clustering below is then repeatable after a kernel restart.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

assigned_clusters, numerical_prototypes, categorical_prototypes = k_prototypes(numerical_df, categorical_df_encoded, n_clusters)

print("Assigned Clusters:", assigned_clusters)
print("\nNumerical Prototypes:", numerical_prototypes)
print("\nEncoded Categorical Prototypes:", categorical_prototypes)

Assigned Clusters: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 2, 2, 2, 1, 2, 1, 2, 1, 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1]

Numerical Prototypes: [[4.6        0.         1.         1.3        0.35       1.75
  1.4        1.15       0.5        1.1        1.         1.
  0.         2.         2.         0.         0.         0.
  1.         0.35       1.5        1.8        0.5        0.5
  0.         1.         0.5        0.         4.         0.
  0.         0.         0.         0.         0.        ]
 [1.6875     0.9375     1.8125     0.4375     0.375      2.375
  1.125      1.75       0.4375     1.1875     1.         0.8125
  0.         2.         2.         0.         0.         0.
  1.         0.1875     1.4375     1.8125     0.         0.5
  0.125      0.         0.         3.         4.         0.
  0.         0.         0.         0.         0.8125    ]
 [0.72727273 0.90909091 1.90909091 0.27272727 0.18181818 0.72727273
  1.         1.363636