In [1]:
import pandas as pd
import numpy as np
import math as mt
import random as rd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the wine data set. Forward slashes are used because the original
# backslash path contained the unrecognised escape sequences "\d" and "\w"
# (a warning on current Python versions) and only resolved on Windows;
# forward slashes work on Windows and POSIX alike.
Wine = pd.read_csv('../datafiles/wine/wine.csv', sep=',')
Wine.head()

Unnamed: 0,Class,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [3]:
# Convert the frame to a NumPy matrix and discard its first column, keeping
# only the remaining columns for clustering.
WineData = Wine.to_numpy()[:, 1:]
WineData[:5]

array([[1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
        3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
        1.065e+03],
       [1.320e+01, 1.780e+00, 2.140e+00, 1.120e+01, 1.000e+02, 2.650e+00,
        2.760e+00, 2.600e-01, 1.280e+00, 4.380e+00, 1.050e+00, 3.400e+00,
        1.050e+03],
       [1.316e+01, 2.360e+00, 2.670e+00, 1.860e+01, 1.010e+02, 2.800e+00,
        3.240e+00, 3.000e-01, 2.810e+00, 5.680e+00, 1.030e+00, 3.170e+00,
        1.185e+03],
       [1.437e+01, 1.950e+00, 2.500e+00, 1.680e+01, 1.130e+02, 3.850e+00,
        3.490e+00, 2.400e-01, 2.180e+00, 7.800e+00, 8.600e-01, 3.450e+00,
        1.480e+03],
       [1.324e+01, 2.590e+00, 2.870e+00, 2.100e+01, 1.180e+02, 2.800e+00,
        2.690e+00, 3.900e-01, 1.820e+00, 4.320e+00, 1.040e+00, 2.930e+00,
        7.350e+02]])

In [4]:
def setStartingCentroids(table, k, centroid_list=None):
    """Return the initial centroids for a k-means run.

    Parameters
    ----------
    table : sequence of rows
        The data points; each row is one point.
    k : int
        Number of centroids to draw when none are supplied.
    centroid_list : sequence of rows, optional
        Explicit starting centroids. When it is ``None`` or empty, ``k``
        distinct rows of ``table`` are drawn at random instead.

    Returns
    -------
    The supplied ``centroid_list`` (unchanged) when it is non-empty,
    otherwise a newly created list holding ``k`` rows of ``table``.

    Raises
    ------
    ValueError
        Propagated from ``random.sample`` when ``k`` is negative or larger
        than ``len(table)``.
    """
    if centroid_list is not None and len(centroid_list) > 0:
        return centroid_list
    # Build a fresh list on every call. The previous version appended to the
    # (shared) default argument, so a second call silently reused the
    # centroids of the first call and also mutated any empty list passed in.
    chosen_rows = rd.sample(range(len(table)), k)
    return [table[index] for index in chosen_rows]
    
def assignGroups(distances):
    """Map every point to the index of its nearest centroid.

    ``distances`` holds one list per point, containing that point's distance
    to each centroid. The result is a list with, for each point, the position
    of the smallest distance (the first one when several are tied).
    """
    return [
        point_distances.index(min(point_distances))
        for point_distances in distances
    ]
        
def setDistances(table, centroids):
    """Assign each row of ``table`` to its closest centroid.

    For every row the Euclidean distance (square root of the summed squared
    coordinate differences) to each centroid is computed; the per-row distance
    lists are then handed to ``assignGroups``, whose result - one centroid
    index per row - is returned.
    """
    all_distances = []
    for point in table:
        to_each_centroid = []
        for centroid in centroids:
            squared_sum = 0
            # The number of coordinates is taken from the first table row.
            for axis in range(len(table[0])):
                squared_sum += mt.pow(point[axis] - centroid[axis], 2)
            to_each_centroid.append(mt.sqrt(squared_sum))
        all_distances.append(to_each_centroid)
    return assignGroups(all_distances)

def newCentroids(table, centroids, groups):
    """Recompute every centroid as the mean of the points assigned to it.

    Parameters
    ----------
    table : sequence of rows
        The data points.
    centroids : sequence
        Current centroids; only its length (the number of clusters) is used.
    groups : sequence of int
        ``groups[i]`` is the index of the centroid that row ``i`` of
        ``table`` belongs to.

    Returns
    -------
    list of lists
        One coordinate list per non-empty cluster, in cluster order. A cluster
        without any member is dropped (and a message is printed), so the
        result can be shorter than ``centroids``.
    """
    new_centroids = []
    for k in range(len(centroids)):
        # Collect the rows that were assigned to cluster k.
        members = [table[row] for row, group in enumerate(groups) if group == k]
        if not members:
            # No point belongs to this centroid, so it is not recomputed.
            print("removing useless centroid")
            continue
        # One accumulator slot per coordinate of a table row.
        totals = [0] * len(table[0])
        for member in members:
            # Sum the coordinates of the member row itself. The previous
            # version added table[k] here, which made every "mean" equal to
            # row k of the table instead of the cluster average.
            for z in range(len(totals)):
                totals[z] += member[z]
        new_centroids.append([total / len(members) for total in totals])
    return new_centroids
    
    
def kMeans(table, k, n, centroid_list=None):
    """Run a simple k-means clustering and print its progress.

    Parameters
    ----------
    table : sequence of rows
        The data points to cluster.
    k : int
        Number of starting centroids to draw when none are supplied.
    n : int
        Iteration limit. The update loop runs over ``range(1, n)``, i.e. at
        most ``n - 1`` centroid recalculations are performed.
    centroid_list : sequence of rows, optional
        Explicit starting centroids; ``None`` or empty means "pick ``k``
        random rows of ``table``".

    Returns
    -------
    tuple
        ``(centroids, groups)`` - the final centroids and, for every row of
        ``table``, the index of the centroid it is assigned to.
    """
    # Always hand the helper a private list so that neither a shared default
    # argument nor the caller's own list can be mutated between runs.
    seed_centroids = [] if centroid_list is None else list(centroid_list)
    centroids = setStartingCentroids(table, k, seed_centroids)
    groups = setDistances(table, centroids)
    print("Starting centroids\n", np.array(centroids))
    print(groups)
    for iterations in range(1, n):
        previous_centroids = centroids
        centroids = newCentroids(table, centroids, groups)
        # array_equal also copes with a changed number of centroids (an empty
        # cluster may have been dropped); an element-wise "==" on arrays of
        # different shapes does not yield a usable boolean array.
        if np.array_equal(np.array(centroids), np.array(previous_centroids)):
            print("After "+str(iterations)+" iterations program found an optimum. Leaving the loop...")
            break
        print("Centroids after recalculation, iteration nr "+str(iterations)+":\n", np.array(centroids))
        groups = setDistances(table, centroids)
    return centroids, groups

In [5]:
# Seed Python's random module first: the starting centroids are drawn with
# random.sample, so without a fixed seed every run of this cell clusters
# differently and the printed trace cannot be reproduced.
rd.seed(42)
# The trailing semicolon limits the cell output to the printed trace.
kMeans(WineData, 3, 5);


Starting centroids
 [[1.296e+01 3.450e+00 2.350e+00 1.850e+01 1.060e+02 1.390e+00 7.000e-01
  4.000e-01 9.400e-01 5.280e+00 6.800e-01 1.750e+00 6.750e+02]
 [1.422e+01 3.990e+00 2.510e+00 1.320e+01 1.280e+02 3.000e+00 3.040e+00
  2.000e-01 2.080e+00 5.100e+00 8.900e-01 3.530e+00 7.600e+02]
 [1.434e+01 1.680e+00 2.700e+00 2.500e+01 9.800e+01 2.800e+00 1.310e+00
  5.300e-01 2.700e+00 1.300e+01 5.700e-01 1.960e+00 6.600e+02]]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 0, 2, 2, 2, 2, 0, 2, 2, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 1, 2, 2, 2, 0, 2, 0, 2, 2, 0, 0, 1, 2, 2, 2, 2, 1, 1, 1, 1, 2]
Centroids after recalculation, iteratio

In [6]:
# Reference model from scikit-learn: 3 clusters, 10 initialisations,
# at most 100 iterations each, fixed random_state for repeatable results.
kmeans = KMeans(n_clusters=3, n_init=10, max_iter=100, random_state=42)

In [7]:
# Standardise the features before clustering: fit the scaler on the data,
# apply it, then fit the scikit-learn model on the scaled matrix.
sc = StandardScaler()
sc.fit(WineData)
WineData_scaled = sc.transform(WineData)
kmeans.fit(WineData_scaled)

KMeans(max_iter=100, n_clusters=3, random_state=42)

In [8]:
# Show the fitted model's inertia_ attribute (presumably the within-cluster
# sum of squared distances - confirm against the scikit-learn documentation).
kmeans.inertia_

1277.928488844642