In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
from pso import ParticleSwarmOptimizedClustering
from particle import quantization_error, calc_sse
from utils import normalize
from kmeans import KMeans
from sklearn.metrics import silhouette_score

In [3]:
data = pd.read_csv('./data/heart_processed.txt', sep=' ', header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,1
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [4]:
x = data.drop([13], axis=1)
x = x.values
x = normalize(x)
x

array([[0.70833333, 1.        , 0.        , ..., 1.        , 0.        ,
        0.85714286],
       [0.79166667, 1.        , 1.        , ..., 0.5       , 1.        ,
        0.42857143],
       [0.79166667, 1.        , 1.        , ..., 0.5       , 0.66666667,
        1.        ],
       ...,
       [0.58333333, 1.        , 1.        , ..., 0.5       , 0.33333333,
        1.        ],
       [0.58333333, 0.        , 0.33333333, ..., 0.5       , 0.33333333,
        0.42857143],
       [0.1875    , 1.        , 0.66666667, ..., 0.        , 0.        ,
        0.42857143]])

# K-Means

In [5]:
kmeans = KMeans(n_cluster=2, init_pp=False, seed=2018)
kmeans.fit(x)

In [6]:
predicted_kmeans = kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans.SSE)
print('Quantization:', quantization_error(centroids=kmeans.centroid, data=x, labels=predicted_kmeans))

Silhouette: 0.1493071878000349
SSE: 343.7566715135276
Quantization: 1.0443919297332505


In [7]:
kmeans2 = KMeans(n_cluster=2, init_pp=True, seed=2018)
kmeans2.fit(x)
predicted_kmeans2 = kmeans2.predict(x)
print('Silhouette:', silhouette_score(x, predicted_kmeans))
print('SSE:', kmeans2.SSE)
print('Quantization:', quantization_error(centroids=kmeans2.centroid, data=x, labels=predicted_kmeans2))

Silhouette: 0.1493071878000349
SSE: 324.86578873437315
Quantization: 1.0144138871665043


# PSO

In [8]:
pso = ParticleSwarmOptimizedClustering(
        n_cluster=2, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=50)

In [9]:
hist = pso.run()

Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.000851146137140546
Iteration 0051/2000 current gbest score 0.999552410265212465
Iteration 0101/2000 current gbest score 0.999399309290100124
Iteration 0151/2000 current gbest score 0.999385281139022474
Iteration 0201/2000 current gbest score 0.999384546256717687
Iteration 0251/2000 current gbest score 0.999384515842823618
Iteration 0301/2000 current gbest score 0.999384515203008528
Iteration 0351/2000 current gbest score 0.999384515196801382
Iteration 0401/2000 current gbest score 0.999384515196776402
Iteration 0451/2000 current gbest score 0.999384515196776402
Iteration 0501/2000 current gbest score 0.999384515196776402
Iteration 0551/2000 current gbest score 0.999384515196776402
Iteration 0601/2000 current gbest score 0.999384515196776402
Iteration 0651/2000 current gbest score 0.999384515196776402
Iteration 0701/2000 current gbest score 0.999384515196776402
Iteration 0751/2000 current gbest score 

In [10]:
pso_kmeans = KMeans(n_cluster=2, init_pp=False, seed=2018)

In [11]:
pso_kmeans.centroid = pso.gbest_centroids.copy()
pso_kmeans.centroid

array([[5.63747577e-01, 8.12099918e-01, 8.84139454e-01, 3.74277820e-01,
        2.82000225e-01, 1.29794694e-01, 5.57032497e-01, 5.14552088e-01,
        1.01140286e+00, 2.37934796e-01, 4.05514992e-01, 2.85100638e-01,
        8.14546811e-01],
       [5.05134106e-01, 6.43551528e-01, 6.43508349e-01, 3.45984645e-01,
        2.73972835e-01, 1.27404271e-01, 4.31465722e-01, 6.62726667e-01,
        3.23459207e-04, 1.26805085e-01, 2.25268388e-01, 1.75526372e-01,
        5.80365617e-01]])

In [13]:
predicted_pso = pso_kmeans.predict(x)
print('Silhouette:', silhouette_score(x, predicted_pso))
print('SSE:', calc_sse(centroids=pso.gbest_centroids, data=x, labels=predicted_pso))
print('Quantization:', pso.gbest_score)

Silhouette: 0.21415960170661166
SSE: 322.12998244727646
Quantization: 0.9993845151967764


# Repeated Test

### K-Means++

In [14]:
kmeanspp = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    kmean_rep = KMeans(n_cluster=2, init_pp=True)
    kmean_rep.fit(x)
    predicted_kmean_rep = kmean_rep.predict(x)
    #print(predicted_kmean_rep)
    silhouette = silhouette_score(x, predicted_kmean_rep)
    sse = kmean_rep.SSE
    quantization = quantization_error(centroids=kmean_rep.centroid, data=x, labels=predicted_kmean_rep)
    kmeanspp['silhouette'].append(silhouette)
    kmeanspp['sse'].append(sse)
    kmeanspp['quantization'].append(quantization)

### PSO 

In [15]:
%%time
pso_plain = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=2, n_particles=10, data=x, hybrid=False, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=2, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_plain['silhouette'].append(silhouette)
    pso_plain['sse'].append(sse)
    pso_plain['quantization'].append(quantization)

Initial global best score 1.2299559995147136
Iteration 0001/2000 current gbest score 1.188959983701518119
Finish with gbest score 0.990836554326653518
Initial global best score 1.2465638316473533
Iteration 0001/2000 current gbest score 1.246563831647353293
Finish with gbest score 1.054751935197026480
Initial global best score 1.1821371165401162
Iteration 0001/2000 current gbest score 1.112487109499767923
Finish with gbest score 1.031184135414821679
Initial global best score 1.2139483034636651
Iteration 0001/2000 current gbest score 1.204041934476347109
Finish with gbest score 1.061136602830366593
Initial global best score 1.1661134543898521
Iteration 0001/2000 current gbest score 1.142906684374604520
Finish with gbest score 1.022750943100667609
Initial global best score 1.1688576338358614
Iteration 0001/2000 current gbest score 1.145342002799194292
Finish with gbest score 0.906775373411234265
Initial global best score 1.167728235019688
Iteration 0001/2000 current gbest score 1.13437078

  dist /= len(idx)


Finish with gbest score 0.947064701450361923
Initial global best score 1.2373688924652329
Iteration 0001/2000 current gbest score 1.174900478287265626
Finish with gbest score 1.036758072185795854
Initial global best score 1.2488149981432355
Iteration 0001/2000 current gbest score 1.204011802624101568
Finish with gbest score 1.029531171613160101
Initial global best score 1.1890194406022296
Iteration 0001/2000 current gbest score 1.189019440602229638


  dist /= len(idx)


Finish with gbest score 0.960225082482218051
Initial global best score 1.2073906235199567
Iteration 0001/2000 current gbest score 1.142048257213426732
Finish with gbest score 0.954022567616964579
Initial global best score 1.28901175916646
Iteration 0001/2000 current gbest score 1.208543436787681280
Finish with gbest score 1.056509937859603898
Initial global best score 1.2511176125358587
Iteration 0001/2000 current gbest score 1.158928548423200589
Finish with gbest score 1.041883702708267911
Initial global best score 1.2252588861166478
Iteration 0001/2000 current gbest score 1.201904111089846294
Finish with gbest score 1.038611777249234702
Initial global best score 1.2206102810630448
Iteration 0001/2000 current gbest score 1.218916672366865406


  dist /= len(idx)


Finish with gbest score 1.059489514003370658
Initial global best score 1.2432391932763063
Iteration 0001/2000 current gbest score 1.209502036772479583


  dist /= len(idx)


Finish with gbest score 1.079255862116273557
Initial global best score 1.2357369368224087
Iteration 0001/2000 current gbest score 1.100221076505336848


  dist /= len(idx)


Finish with gbest score 0.849721640882342699
Initial global best score 1.2224605349037558
Iteration 0001/2000 current gbest score 1.222460534903755836
Finish with gbest score 1.004598320774849896
Initial global best score 1.20207124010029
Iteration 0001/2000 current gbest score 1.195088775935689185
Finish with gbest score 0.984634865226650779
Initial global best score 1.1753217575525199
Iteration 0001/2000 current gbest score 1.175321757552519886
Finish with gbest score 1.017377646401218083
Wall time: 1min 54s


### PSO Hybrid

In [16]:
%%time
pso_hybrid = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=2, n_particles=10, data=x, hybrid=True, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=2, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_hybrid['silhouette'].append(silhouette)
    pso_hybrid['sse'].append(sse)
    pso_hybrid['quantization'].append(quantization)

Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.000851146137140546
Finish with gbest score 0.999214620224922800
Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.000851146137140546
Finish with gbest score 1.000417327817308522
Initial global best score 1.0048541984610915
Iteration 0001/2000 current gbest score 1.004854198461091475
Finish with gbest score 1.002653326547340518
Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.000851146137140546
Finish with gbest score 0.999401866033216457
Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.000851146137140546
Finish with gbest score 0.999254609220285861
Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.000851146137140546
Finish with gbest score 0.999422224253790348
Initial global best score 1.0008511461371405
Iteration 0001/2000 current gbest score 1.0008511

# Comparison

In [17]:
benchmark = {
    'method' : ['K-Means++', 'PSO', 'PSO Hybrid'],
    'sse_mean' : [
        np.around(np.mean(kmeanspp['sse']), decimals=10),
        np.around(np.mean(pso_plain['sse']), decimals=10),
        np.around(np.mean(pso_hybrid['sse']), decimals=10),
    ],
    'sse_stdev' : [
        np.around(np.std(kmeanspp['sse']), decimals=10),
        np.around(np.std(pso_plain['sse']), decimals=10),
        np.around(np.std(pso_hybrid['sse']), decimals=10),
    ],
    'silhouette_mean' : [
        np.around(np.mean(kmeanspp['silhouette']), decimals=10),
        np.around(np.mean(pso_plain['silhouette']), decimals=10),
        np.around(np.mean(pso_hybrid['silhouette']), decimals=10),
    ],
    'silhouette_stdev' : [
        np.around(np.std(kmeanspp['silhouette']), decimals=10),
        np.around(np.std(pso_plain['silhouette']), decimals=10),
        np.around(np.std(pso_hybrid['silhouette']), decimals=10),
    ],
    'quantization_mean' : [
        np.around(np.mean(kmeanspp['quantization']), decimals=10),
        np.around(np.mean(pso_plain['quantization']), decimals=10),
        np.around(np.mean(pso_hybrid['quantization']), decimals=10),
    ],
    'quantization_stdev' : [
        np.around(np.std(kmeanspp['quantization']), decimals=10),
        np.around(np.std(pso_plain['quantization']), decimals=10),
        np.around(np.std(pso_hybrid['quantization']), decimals=10),
    ],
}

In [18]:
benchmark

{'method': ['K-Means++', 'PSO', 'PSO Hybrid'],
 'sse_mean': [330.6852543714, 388.1279087207, 325.2416052599],
 'sse_stdev': [15.4857910012, 48.2412848434, 6.3182686057],
 'silhouette_mean': [0.1993638408, 0.1162906875, 0.206067336],
 'silhouette_stdev': [0.0126193037, 0.0807356425, 0.0113011807],
 'quantization_mean': [1.0238658568, 1.0063560203, 1.0036133244],
 'quantization_stdev': [0.0330283956, 0.0567077927, 0.0080741982]}

In [19]:
benchmark_df = pd.DataFrame.from_dict(benchmark)
benchmark_df

Unnamed: 0,method,sse_mean,sse_stdev,silhouette_mean,silhouette_stdev,quantization_mean,quantization_stdev
0,K-Means++,330.685254,15.485791,0.199364,0.012619,1.023866,0.033028
1,PSO,388.127909,48.241285,0.116291,0.080736,1.006356,0.056708
2,PSO Hybrid,325.241605,6.318269,0.206067,0.011301,1.003613,0.008074


In [20]:
benchmark_df.to_excel('benchmark_heart_res.xlsx', index=False)

In [21]:
benchmark_df.to_csv('benchmark_heart_res.csv', index=False)

# Impact of the inertia $\omega$ on the exploration and exploitation balance.

In [None]:
%%time
pso_hybrid = {
    'silhouette': [],
    'sse' : [],
    'quantization' : [],
}
d=0
for _ in range(20):
    pso_rep = ParticleSwarmOptimizedClustering(
        n_cluster=2, n_particles=10,  data=x, hybrid=True, max_iter=2000, print_debug=2000)
    pso_rep.run()
    pso_kmeans = KMeans(n_cluster=2, init_pp=False, seed=2018)
    pso_kmeans.centroid = pso_rep.gbest_centroids.copy()
    predicted_pso_rep = pso_kmeans.predict(x)
    
    silhouette = silhouette_score(x, predicted_pso_rep)
    sse = calc_sse(centroids=pso_rep.gbest_centroids, data=x, labels=predicted_pso_rep)
    quantization = pso_rep.gbest_score
    pso_hybrid['silhouette'].append(silhouette)
    pso_hybrid['sse'].append(sse)
    pso_hybrid['quantization'].append(quantization)