In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from pso import ParticleSwarmOptimizedClustering
from particle import quantization_error, calc_sse
from utils import normalize
from kmeans import KMeans
from sklearn.metrics import silhouette_score

In [3]:
data = pd.read_csv('./data/iris.data', sep=',', header=None)
data.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
x = data.drop([4], axis=1)
x = x.values
x = normalize(x)

# K-Means

In [5]:
kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
kmeans.fit(x)

In [8]:
predicted_kmeans = kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans.SSE)
print('Quantization:', quantization_error(centroids=kmeans.centroid, data=x, labels=predicted_kmeans))

Silhouette: 0.5043188549150883
SSE: 6.998114004826762
Quantization: 0.19524413664147766


In [9]:
kmeans2 = KMeans(n_cluster=3, init_pp=True, seed=2018)
kmeans2.fit(x)
predicted_kmeans2 = kmeans2.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans2.SSE)
print('Quantization:', quantization_error(centroids=kmeans2.centroid, data=x, labels=predicted_kmeans2))

Silhouette: 0.5043188549150883
SSE: 10.90827498962253
Quantization: 0.2049806792871817


# PSO

In [10]:
pso = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=50)

  dist /= len(idx)


In [11]:
hist = pso.run()

Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661
Iteration 0051/2000 current gbest score 0.194827246862884662
Iteration 0101/2000 current gbest score 0.194698369971672669
Iteration 0151/2000 current gbest score 0.194552969232267142
Iteration 0201/2000 current gbest score 0.194541944216088086
Iteration 0251/2000 current gbest score 0.194541628213423995
Iteration 0301/2000 current gbest score 0.194541620158153933
Iteration 0351/2000 current gbest score 0.194541620089732303
Iteration 0401/2000 current gbest score 0.194541620089447614
Iteration 0451/2000 current gbest score 0.194541620089446560
Iteration 0501/2000 current gbest score 0.194541620089446560
Iteration 0551/2000 current gbest score 0.194541620089446560
Iteration 0601/2000 current gbest score 0.194541620089446560
Iteration 0651/2000 current gbest score 0.194541620089446560
Iteration 0701/2000 current gbest score 0.194541620089446560
Iteration 0751/2000 current gbest score

In [12]:
pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)

In [13]:
pso_kmeans.centroid = pso.gbest_centroids.copy()
pso_kmeans.centroid

array([[0.19851845, 0.58080113, 0.08115941, 0.05887771],
       [0.68747887, 0.44691558, 0.78758223, 0.82459663],
       [0.44400504, 0.31794577, 0.57825324, 0.5471383 ]])

In [14]:
predicted_pso = pso_kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_pso))
print('SSE:', calc_sse(centroids=pso.gbest_centroids, data=x, labels=predicted_pso))
print('Quantization:', pso.gbest_score)

Silhouette: 0.5043188549150883
SSE: 7.031103523946592
Quantization: 0.19454162008944656


# Repeated Test

### K-Means++

In [15]:
kmeanspp = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    kmean_rep = KMeans(n_cluster=3, init_pp=True)
    kmean_rep.fit(x)
    predicted_kmean_rep = kmean_rep.predict(x)
    silhouette = silhouette_score(x, predicted_kmean_rep)
    sse = kmean_rep.SSE
    quantization = quantization_error(centroids=kmean_rep.centroid, data=x, labels=predicted_kmean_rep)
    kmeanspp['silhouette'].append(silhouette)
    kmeanspp['sse'].append(sse)
    kmeanspp['quantization'].append(quantization)

### PSO 

In [16]:
%%time
pso_plain = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=False, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_plain['silhouette'].append(silhouette)
    pso_plain['sse'].append(sse)
    pso_plain['quantization'].append(quantization)

Initial global best score 0.2086473926239856
Iteration 0001/2000 current gbest score 0.208647392623985600
Finish with gbest score 0.197997170067414574
Initial global best score 0.20365216341317946
Iteration 0001/2000 current gbest score 0.203652163413179460


  dist /= len(idx)


Finish with gbest score 0.181392139064168095
Initial global best score 0.21849147498327967
Iteration 0001/2000 current gbest score 0.218491474983279671
Finish with gbest score 0.200016981083945694
Initial global best score 0.23406512076938202
Iteration 0001/2000 current gbest score 0.222980998282051840
Finish with gbest score 0.202727582369571113
Initial global best score 0.22415677728120617
Iteration 0001/2000 current gbest score 0.224156777281206171


  dist /= len(idx)


Finish with gbest score 0.188510140638140200
Initial global best score 0.2291942871105539
Iteration 0001/2000 current gbest score 0.218498635415427994


  dist /= len(idx)


Finish with gbest score 0.197200182522942136
Initial global best score 0.20357402359639934
Iteration 0001/2000 current gbest score 0.203574023596399339


  dist /= len(idx)


Finish with gbest score 0.172551183463512725
Initial global best score 0.20116819256884724
Iteration 0001/2000 current gbest score 0.201168192568847243


  dist /= len(idx)


Finish with gbest score 0.173592377210918386
Initial global best score 0.2261045835947644
Iteration 0001/2000 current gbest score 0.226104583594764402


  dist /= len(idx)


Finish with gbest score 0.211636314759352295
Initial global best score 0.2328809764767863
Iteration 0001/2000 current gbest score 0.232880976476786294
Finish with gbest score 0.212799303636352549
Initial global best score 0.1959299208563782
Iteration 0001/2000 current gbest score 0.195929920856378198


  dist /= len(idx)


Finish with gbest score 0.180199653143560456
Initial global best score 0.19320149404272788
Iteration 0001/2000 current gbest score 0.193201494042727878


  dist /= len(idx)


Finish with gbest score 0.176732134149863551
Initial global best score 0.2456264461564982
Iteration 0001/2000 current gbest score 0.245626446156498202


  dist /= len(idx)


Finish with gbest score 0.193695096469532674
Initial global best score 0.24737687284139717
Iteration 0001/2000 current gbest score 0.247376872841397172


  dist /= len(idx)


Finish with gbest score 0.184361830670260934
Initial global best score 0.2195841023352716
Iteration 0001/2000 current gbest score 0.219584102335271591
Finish with gbest score 0.201402518699666994
Initial global best score 0.205873487875695
Iteration 0001/2000 current gbest score 0.205873487875695010


  dist /= len(idx)


Finish with gbest score 0.179009892370034174
Initial global best score 0.2099022285001411
Iteration 0001/2000 current gbest score 0.209902228500141091


  dist /= len(idx)


Finish with gbest score 0.190453122352908788
Initial global best score 0.2280105497756684
Iteration 0001/2000 current gbest score 0.228010549775668397


  dist /= len(idx)


Finish with gbest score 0.190514109685863159
Initial global best score 0.21605080349961145
Iteration 0001/2000 current gbest score 0.216050803499611449


  dist /= len(idx)


Finish with gbest score 0.176692734553735326
Initial global best score 0.20858829674791876
Iteration 0001/2000 current gbest score 0.208588296747918761


  dist /= len(idx)


Finish with gbest score 0.189696407672581474
Wall time: 2min 1s


### PSO Hybrid

In [17]:
%%time
pso_hybrid = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=3, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=3, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_hybrid['silhouette'].append(silhouette)
    pso_hybrid['sse'].append(sse)
    pso_hybrid['quantization'].append(quantization)

Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836
Finish with gbest score 0.192768132383846019
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836


  dist /= len(idx)


Finish with gbest score 0.193396590192720597
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661
Finish with gbest score 0.194619252890615879
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661


  dist /= len(idx)


Finish with gbest score 0.194780035122247258
Initial global best score 0.2049806792871817
Iteration 0001/2000 current gbest score 0.204980679287181694


  dist /= len(idx)


Finish with gbest score 0.171939411148728821
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836
Finish with gbest score 0.192432463921336644
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661
Finish with gbest score 0.194777353342043297
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836
Finish with gbest score 0.193432307356696448
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661


  dist /= len(idx)


Finish with gbest score 0.194520649159762998
Initial global best score 0.19056365811169776
Iteration 0001/2000 current gbest score 0.190563658111697759
Finish with gbest score 0.189041654924837377
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836
Finish with gbest score 0.192872051429266417
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836


  dist /= len(idx)


Finish with gbest score 0.194211595720774399
Initial global best score 0.2049806792871817
Iteration 0001/2000 current gbest score 0.204980679287181694


  dist /= len(idx)


Finish with gbest score 0.175754356733512279
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661


  dist /= len(idx)


Finish with gbest score 0.194655731274969351
Initial global best score 0.2049806792871817
Iteration 0001/2000 current gbest score 0.204980679287181694
Finish with gbest score 0.173010101956820539
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836
Finish with gbest score 0.195488426214421057
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836


  dist /= len(idx)


Finish with gbest score 0.193371444916501600
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661
Finish with gbest score 0.194717670617997690
Initial global best score 0.19618667657047784
Iteration 0001/2000 current gbest score 0.196186676570477836
Finish with gbest score 0.192892407499808155
Initial global best score 0.19524413664147766
Iteration 0001/2000 current gbest score 0.195244136641477661
Finish with gbest score 0.194619453232529938
Wall time: 1min 46s


# Comparison

In [18]:
benchmark = {
    'method' : ['K-Means++', 'PSO', 'PSO Hybrid'],
    'sse_mean' : [
        np.around(np.mean(kmeanspp['sse']), decimals=10),
        np.around(np.mean(pso_plain['sse']), decimals=10),
        np.around(np.mean(pso_hybrid['sse']), decimals=10),
    ],
    'sse_stdev' : [
        np.around(np.std(kmeanspp['sse']), decimals=10),
        np.around(np.std(pso_plain['sse']), decimals=10),
        np.around(np.std(pso_hybrid['sse']), decimals=10),
    ],
    'silhouette_mean' : [
        np.around(np.mean(kmeanspp['silhouette']), decimals=10),
        np.around(np.mean(pso_plain['silhouette']), decimals=10),
        np.around(np.mean(pso_hybrid['silhouette']), decimals=10),
    ],
    'silhouette_stdev' : [
        np.around(np.std(kmeanspp['silhouette']), decimals=10),
        np.around(np.std(pso_plain['silhouette']), decimals=10),
        np.around(np.std(pso_hybrid['silhouette']), decimals=10),
    ],
    'quantization_mean' : [
        np.around(np.mean(kmeanspp['quantization']), decimals=10),
        np.around(np.mean(pso_plain['quantization']), decimals=10),
        np.around(np.mean(pso_hybrid['quantization']), decimals=10),
    ],
    'quantization_stdev' : [
        np.around(np.std(kmeanspp['quantization']), decimals=10),
        np.around(np.std(pso_plain['quantization']), decimals=10),
        np.around(np.std(pso_hybrid['quantization']), decimals=10),
    ],
}

In [19]:
benchmark

{'method': ['K-Means++', 'PSO', 'PSO Hybrid'],
 'sse_mean': [8.4018037743, 11.3315742281, 8.111109324],
 'sse_stdev': [1.8400728479, 2.363883766, 1.888061075],
 'silhouette_mean': [0.4965292348, 0.5014150901, 0.4932304089],
 'silhouette_stdev': [0.0086095693, 0.0143308715, 0.0122563634],
 'quantization_mean': [0.1988875615, 0.1900590437, 0.1906650545],
 'quantization_stdev': [0.0044863841, 0.0117514983, 0.0073324902]}

In [20]:
benchmark_df = pd.DataFrame.from_dict(benchmark)
benchmark_df

Unnamed: 0,method,sse_mean,sse_stdev,silhouette_mean,silhouette_stdev,quantization_mean,quantization_stdev
0,K-Means++,8.401804,1.840073,0.496529,0.00861,0.198888,0.004486
1,PSO,11.331574,2.363884,0.501415,0.014331,0.190059,0.011751
2,PSO Hybrid,8.111109,1.888061,0.49323,0.012256,0.190665,0.007332


In [21]:
benchmark_df.to_excel('benchmark_iris_res.xlsx', index=False)

In [22]:
benchmark_df.to_csv('benchmark_iris_res.csv', index=False)