In [1]:
import pandas as pd
import numpy as np

In [2]:
def read_data(file_name: str) -> pd.DataFrame:
    """Load a header-less, comma-separated file as an all-float32 frame.

    Args:
        file_name: Path (or file-like object) handed straight to
            ``pd.read_csv``.

    Returns:
        A DataFrame whose columns are all ``np.float32``; every cell that
        held the literal string ``'?'`` is NaN.
    """
    return (
        pd.read_csv(file_name, sep=',', header=None)
        .replace('?', np.nan)   # '?' is the missing-value marker in the file
        .astype(np.float32)
    )

In [3]:
def a_distance(centroid_a, object_a, range_a) -> float:
    """Distance between a centroid and an object on a single attribute.

    Rules, applied in order:
      1. Either value is NaN            -> 1.0 (maximal distance).
      2. Values are equal               -> 0.0.
      3. ``range_a`` is 1 (or 0)        -> 1.0.
      4. Otherwise                      -> ``|centroid_a - object_a| / range_a``.

    Args:
        centroid_a: Attribute value of the centroid.
        object_a: Attribute value of the object being compared.
        range_a: max - min of the attribute, used to normalise the difference.

    Returns:
        The per-attribute distance as a float.
    """
    if np.isnan(centroid_a) or np.isnan(object_a):
        return 1.0
    if centroid_a == object_a:
        return 0.0
    # A range of 1 is handled as an all-or-nothing mismatch. A range of 0
    # (constant column whose centroid differs only through rounding) would
    # otherwise divide by zero, so it is given the same bounded result.
    if range_a == 1 or range_a == 0:
        return 1.0
    return abs(centroid_a - object_a) / range_a

In [4]:
def HEOM(centroid, object, ranges) -> float:
    """Combine the per-attribute distances into one Euclidean-style distance.

    For every index ``0 .. len(centroid) - 1`` the attribute distance is
    obtained from ``a_distance``; the result is the square root of the sum
    of the squared attribute distances.

    Args:
        centroid: Indexable of centroid attribute values.
        object: Indexable of object attribute values (looked up with the
            same integer keys as ``centroid``).
        ranges: Indexable of attribute ranges, aligned with ``centroid``.

    Returns:
        The aggregated distance.
    """
    per_attribute = [
        a_distance(centroid[idx], object[idx], ranges[idx])
        for idx in range(len(centroid))
    ]
    squared = np.square(np.array(per_attribute))
    return np.sqrt(np.sum(squared))

In [5]:
def get_extremes(df : pd.DataFrame) -> list:
    """Collect the largest and smallest non-missing value of every column.

    Args:
        df: Frame to scan.

    Returns:
        One ``(max, min)`` tuple per column, in column order. Note the
        order inside the tuple: maximum first, minimum second.
    """
    extremes = []
    for column in df.columns:
        observed = df[column].dropna()  # ignore missing cells
        extremes.append((observed.max(), observed.min()))
    return extremes

In [6]:
def get_ranges(extremes : list) -> np.array:
    """Turn ``(max, min)`` pairs into their spans.

    Args:
        extremes: Sequence of pairs whose first item is the maximum and
            whose second item is the minimum.

    Returns:
        A numpy array holding ``max - min`` for every pair, in order.
    """
    spans = []
    for pair in extremes:
        upper, lower = pair[0], pair[1]
        spans.append(upper - lower)
    return np.array(spans)

In [7]:
def get_centroids(extremes: list, k: int, ranges: np.array) -> np.array:
    """Draw ``k`` random starting centroids inside the observed value bounds.

    Each attribute value is drawn with the global ``np.random`` state:
      * bounds are NaN (no observed values)    -> NaN;
      * range is exactly 1                     -> one of the two bounds;
      * either bound has a fractional part     -> uniform draw between the
        bounds, rounded to 2 decimals;
      * otherwise (integer-valued bounds)      -> random integer between the
        bounds, both ends included.

    Args:
        extremes: One ``(max, min)`` pair per attribute.
        k: Number of centroids to generate.
        ranges: ``max - min`` per attribute, aligned with ``extremes``.

    Returns:
        Array with one row per centroid and one column per attribute.
    """
    centroids = []

    for _ in range(k):
        centroid = []

        for i, (upper, lower) in enumerate(extremes):
            if np.isnan(upper) or np.isnan(lower):
                # No observed value to draw from; randint would raise on NaN.
                value = np.nan
            elif ranges[i] == 1:
                value = np.random.choice([upper, lower])
            elif upper % 1 > 0 or lower % 1 > 0:
                value = round(np.random.uniform(lower, upper), 2)
            else:
                # randint's upper bound is exclusive: +1 makes the maximum
                # reachable and keeps lower == upper from raising.
                value = np.random.randint(int(lower), int(upper) + 1)
            centroid.append(value)

        centroids.append(centroid)

    return np.array(centroids)

In [8]:
def clustering(centroids: np.array, df: pd.DataFrame, ranges: np.array) -> np.array:
    """Assign every row of ``df`` to its nearest centroid.

    Distances are computed with ``HEOM``; ties go to the lowest centroid
    index (``np.argmin`` behaviour).

    Args:
        centroids: One row of attribute values per centroid.
        df: Objects to assign, one per row.
        ranges: Attribute ranges passed through to ``HEOM``.

    Returns:
        Float array of length ``len(df)``; entry ``p`` is the index of the
        centroid closest to the ``p``-th row of ``df``.
    """
    clusters = np.zeros(len(df))

    # iterrows() yields index *labels*; enumerate supplies the row position
    # so the result is correct for any index, not only 0..n-1.
    for position, (_, row) in enumerate(df.iterrows()):
        distances = np.array([HEOM(centroid, row, ranges) for centroid in centroids])
        clusters[position] = np.argmin(distances)

    return clusters

In [9]:
def k_means(df: pd.DataFrame, k: int, max_iter: int = 300, verbose: bool = True) -> pd.DataFrame:
    """Cluster the rows of ``df`` into ``k`` groups.

    Starting centroids come from ``get_centroids``. Each iteration recomputes
    every centroid as the column-wise mean of its members (rounded to two
    decimals) and reassigns all rows with ``clustering``. Iteration stops when
      * the assignment equals the one from two iterations earlier
        (a two-step oscillation), or
      * the assignment did not change, or
      * ``max_iter`` iterations have been performed.

    Args:
        df: Objects to cluster, one per row.
        k: Number of clusters.
        max_iter: Upper bound on the number of refinement iterations, so the
            loop cannot run forever on a longer cycle.
        verbose: When true, print centroids and assignment after every
            iteration that changed the assignment.

    Returns:
        A copy of ``df`` with an added ``cluster`` column.
    """
    extremes     = get_extremes (df)
    ranges       = get_ranges   (extremes)
    centroids    = get_centroids(extremes, k, ranges)
    clusters     = clustering   (centroids, df, ranges)
    df_clustered = df.assign(cluster=clusters)  # assign() already returns a new frame

    # Sliding window over the last three assignments, for oscillation detection.
    history = [clusters]

    for _ in range(max_iter):
        # Mean of each cluster's members; the helper 'cluster' column is
        # removed by label so only attribute means remain.
        new_centroids = np.array([
            df_clustered[df_clustered['cluster'] == i].mean().drop('cluster').round(2)
            for i in range(k)
        ])
        new_clusters = clustering(new_centroids, df, ranges)

        history.append(new_clusters)
        if len(history) > 3:
            history.pop(0)

        # Same assignment as two iterations ago -> flipping between two states.
        if len(history) > 2 and np.array_equal(history[0], history[2]):
            break
        # Nothing moved -> converged.
        if np.array_equal(clusters, new_clusters):
            break

        clusters = new_clusters
        df_clustered['cluster'] = clusters

        if verbose:
            print(f'cemtroids: \n{new_centroids}')
            print(f'clusters: \n{clusters}')

    return df.assign(cluster=clusters)

In [10]:
# Seed the global numpy RNG so the random starting centroids, and therefore
# the resulting clusters, are the same on every Restart & Run All.
np.random.seed(42)

# Forward slash works on Windows, Linux and macOS; a backslash-separated
# path only resolves on Windows.
df = read_data('hepatitis/hepatitis.data')
df_clustered = k_means(df, 3)
df_clustered.to_csv('hepatitis_clustered.csv', index=False)

cemtroids: 
[[  1.57  43.54   1.09   1.76   1.91   1.11   1.43   1.59   1.91   1.26
    1.68   1.34   1.7    1.7    1.87 127.38 114.58   3.56  55.32   1.59]
 [  1.68  38.21   1.32   1.16   1.42   1.11   1.16   1.53   1.89   1.74
    1.68   1.68   2.     1.89   1.61  92.25  89.32   3.92  56.75   1.37]
 [  1.93  40.63   1.07   1.45   1.9    1.53   1.79   1.96   1.77   1.72
    1.89   1.82   1.92   1.97   1.15  97.01  70.31   3.95  67.71   1.4 ]]
clusters: 
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 1. 2. 0. 2. 0. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 0. 1. 1.
 0. 1. 1. 0. 1. 1. 0. 1. 1. 1. 0.]
cemtroids: 
[[  1.22  46.89   1.06   1.56   2.     1.     1.22   1.6