### Data Input

In [12]:
import pandas as pd

# A few rows of the raw file were broken by doubled tab characters; they were
# repaired by hand so that a plain single-tab separator parses every line.

# The file has no header row, so the column names are supplied here.
header_list = [
    "area",
    "perimeter",
    "compactness",
    "length of kernel",
    "width of kernel",
    "asymmetry coefficient",
    "length of kernel groove",
    "label",
]

data_df = pd.read_csv(
    "../data/seeds.txt",
    sep='\t',
    lineterminator='\n',
    names=header_list,
)
data_df

Unnamed: 0,area,perimeter,compactness,length of kernel,width of kernel,asymmetry coefficient,length of kernel groove,label
0,15.26,14.84,0.8710,5.763,3.312,2.221,5.220,1
1,14.88,14.57,0.8811,5.554,3.333,1.018,4.956,1
2,14.29,14.09,0.9050,5.291,3.337,2.699,4.825,1
3,13.84,13.94,0.8955,5.324,3.379,2.259,4.805,1
4,16.14,14.99,0.9034,5.658,3.562,1.355,5.175,1
...,...,...,...,...,...,...,...,...
205,12.19,13.20,0.8783,5.137,2.981,3.631,4.870,3
206,11.23,12.88,0.8511,5.140,2.795,4.325,5.003,3
207,13.20,13.66,0.8883,5.236,3.232,8.315,5.056,3
208,11.84,13.21,0.8521,5.175,2.836,3.598,5.044,3


### Data Preprocessing

In [13]:
from sklearn.utils import shuffle

# Shuffle the rows so later steps do not depend on the order in the file.
# A fixed seed makes the shuffled order the same on every fresh run of the
# notebook; change the number to get a different (but still repeatable) order.
data_df = shuffle(data_df, random_state=42)

### K-means Clustering Implementation

In [14]:
import numpy as np

def sqare_euclidean_distance(v1, v2):
    """Return the squared Euclidean distance between two vectors.

    The final element of each vector is treated as the label and is left out
    of the computation; only the first ``len(v1) - 1`` positions are compared.
    """
    feature_count = len(v1) - 1

    def squared_gap(position):
        gap = v1[position] - v2[position]
        return gap * gap

    return sum(squared_gap(position) for position in range(feature_count))

def which_cluster(v, centroid):
    """Return the index of the centroid closest to ``v``.

    Closeness is measured with ``sqare_euclidean_distance``. When several
    centroids are equally close, the one with the lowest index wins.

    Args:
        v: the vector to assign.
        centroid: sequence of centroid vectors.

    Returns:
        The index into ``centroid`` of the nearest centroid, or -1 when
        ``centroid`` is empty or no distance compares below infinity.
    """
    this_centroid = -1
    # Start from +infinity rather than a fixed large number: with a fixed
    # bound, a vector farther than that bound from every centroid would be
    # left unassigned (-1) instead of going to its nearest centroid.
    min_dis = float("inf")
    for i, candidate in enumerate(centroid):
        temp_dis = sqare_euclidean_distance(v, candidate)
        if temp_dis < min_dis:
            min_dis = temp_dis
            this_centroid = i

    return this_centroid

def get_mean_vector(np_array_2d):
    """Return the column-wise means of a 2-D collection of row vectors.

    Every column is averaged, including the last one. Per the original note,
    the mean of that label column is redundant and is computed only so the
    result has the same length as the input rows.

    Returns:
        A list with one mean per column.
    """
    # Transposing turns each column into a row that can be averaged on its own.
    return [np.mean(column) for column in np.transpose(np_array_2d)]

def is_not_equal(v1, v2):
    """Tell whether two centroids differ by more than a tiny tolerance.

    Returns False when the value from ``sqare_euclidean_distance`` is below
    1e-14 (the centroids are then considered the same), and True otherwise.
    """
    tolerance = 1e-14
    return not (sqare_euclidean_distance(v1, v2) < tolerance)


def _clusters_to_frame(clusters):
    """Stack the rows of every cluster, in cluster order, into one DataFrame."""
    # Empty clusters contribute no rows, so they are skipped; concatenating
    # empty frames is deprecated in recent pandas versions.
    frames = [pd.DataFrame(members) for members in clusters.values() if members]
    # pd.concat raises on an empty list, so fall back to an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()


def _majority_label(members):
    """Return the most frequent label (last element) among non-empty ``members``."""
    counts = {}
    for v in members:
        counts[v[-1]] = counts.get(v[-1], 0) + 1
    # max() keeps the first key that reaches the highest count, so ties go to
    # the label that was encountered first.
    return max(counts, key=counts.get)


def k_means_clustering(df, k_value, random_state=None):
    """Cluster the rows of ``df`` with k-means and relabel each cluster by majority vote.

    The last column of ``df`` is treated as the label: it is carried along with
    every row but ignored by the distance computation.

    Args:
        df: DataFrame whose last column is the label.
        k_value: number of clusters; must not exceed the number of rows.
        random_state: forwarded to ``shuffle`` when picking the initial
            centroids. The default ``None`` keeps the selection unseeded.

    Returns:
        A tuple ``(result_df, centroid, original_df)``:
        result_df holds the rows grouped by cluster, with the label column
        replaced by the majority label of the cluster (as an int);
        centroid is the list of final centroid vectors (their last entry is
        the mean of the labels and carries no meaning);
        original_df holds the same rows in the same order as ``result_df``,
        with their original labels.
    """
    # Random initial centroids: shuffle, then take the first k rows.
    df = shuffle(df, random_state=random_state)
    centroid = [df.iloc[[i]].to_numpy()[0] for i in range(k_value)]

    # The row vectors do not change between iterations, so build them once.
    column_names = list(df.columns)
    rows = [[row[name] for name in column_names] for _, row in df.iterrows()]

    while True:
        # Assign every row to its nearest centroid.
        cluster_dict = {i: [] for i in range(k_value)}
        for v in rows:
            cluster_dict[which_cluster(v, centroid)].append(v)

        # An empty cluster has no mean; keep its previous centroid so the
        # convergence check below still compares vectors of equal length.
        new_centroid = [
            get_mean_vector(members) if members else centroid[i]
            for i, members in cluster_dict.items()
        ]

        converged = not any(
            is_not_equal(old, new) for old, new in zip(centroid, new_centroid)
        )
        centroid = new_centroid
        if converged:
            break

    # Snapshot the rows with their true labels before the labels are
    # overwritten; result_df is built the same way, so the row order matches.
    original_df = _clusters_to_frame(cluster_dict)

    # Relabel every member of a cluster with that cluster's majority label.
    for members in cluster_dict.values():
        if not members:
            continue
        majority = int(_majority_label(members))
        for v in members:
            v[-1] = majority

    result_df = _clusters_to_frame(cluster_dict)

    return result_df, centroid, original_df

# Run the clustering on a copy of the shuffled data, asking for three clusters.
# `rdf` carries the majority-vote labels, `cen` the final centroids and
# `original` the same rows with their true labels.
dfc = data_df.copy()
rdf, cen, original = k_means_clustering(dfc, k_value=3)

### Result

In [15]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score
from prettytable import PrettyTable

# Column 7 is the label column: true labels in `original`, majority-vote
# labels in `rdf`. Both frames list the rows in the same order.
answer = original[7]
predict = rdf[7]

# Recall and precision are macro-averaged over the classes.
cm = confusion_matrix(y_true=answer, y_pred=predict)
acc = accuracy_score(y_true=answer, y_pred=predict)
rc = recall_score(y_true=answer, y_pred=predict, average='macro')
pcs = precision_score(y_true=answer, y_pred=predict, average='macro')

# Show the four results side by side in a one-row table.
x = PrettyTable()
x.field_names = ["Confusion Matrix", "Accuracy", "Recall", "Precision"]
x.add_row([cm, acc, rc, pcs])
print(x)


+------------------+--------------------+--------------------+--------------------+
| Confusion Matrix |      Accuracy      |       Recall       |     Precision      |
+------------------+--------------------+--------------------+--------------------+
|   [[57  1 12]    | 0.8904761904761904 | 0.8904761904761904 | 0.8960037875397106 |
|    [10 60  0]    |                    |                    |                    |
|    [ 0  0 70]]   |                    |                    |                    |
+------------------+--------------------+--------------------+--------------------+
