In [1]:
import sys

sys.path.append('..')

import pandas as pd
import numpy as np

from sklearn.cluster import DBSCAN
from DBOD_avenga import CLTree, Data

from sklearn.datasets import load_iris

from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import davies_bouldin_score

from collections import Counter
import time

In [2]:
# Load the iris dataset and expose its measurements as a DataFrame whose
# columns carry the dataset's own feature names.
iris = load_iris()
test_df = pd.DataFrame(
    data=iris.data,
    columns=iris.feature_names,
)

## DBSCAN clustering

In [3]:
# Grid-search DBSCAN over eps in 0.1..1.0 and min_samples in 1..10, scoring
# each qualifying clustering with the silhouette coefficient (rounded to 2 dp).
results = []
for e in [i / 10 for i in range(1, 11)]:
    for s in range(1, 11):
        dbscan_clustering = DBSCAN(eps=e, min_samples=s).fit(test_df)
        # Only clusterings with more than 2 distinct labels are scored.
        # NOTE(review): if DBSCAN's noise label is among them it counts as a
        # label here and is scored as a cluster of its own - confirm that is intended.
        if len(np.unique(dbscan_clustering.labels_)) > 2:
            score = round(silhouette_score(test_df, dbscan_clustering.labels_), 2)
            results.append({'eps': e, 'min_samples': s, 'score': score})

# Fail loudly instead of silently carrying best_e = best_s = None into the
# following cells when nothing in the grid qualified.
if not results:
    raise ValueError('No DBSCAN parameter combination produced more than 2 distinct labels')

# max() returns the first entry among equal scores, i.e. the earliest
# (eps, min_samples) pair in grid order.
best = max(results, key=lambda res: res['score'])
best_e, best_s, best_score = best['eps'], best['min_samples'], best['score']

print('eps: ', best_e, 'min_samples: ', best_s, 'score: ', best_score)

eps:  0.6 min_samples:  6 score:  0.55


In [4]:
# Time a single DBSCAN fit with the best parameters found by the grid search.
# perf_counter() is used instead of time.time(): it is monotonic and has the
# highest available resolution, which matters for a millisecond-scale fit.
dbscan_start = time.perf_counter()
dbscan_clustering = DBSCAN(eps=best_e, min_samples=best_s).fit(test_df)
dbscan_finish = time.perf_counter()
print('DBSCAN execution time: ', round(dbscan_finish - dbscan_start, 5), 's.')

DBSCAN execution time:  0.00399 s.


In [5]:
# Attach the DBSCAN label of every sample to a copy of the feature table
# (test_df itself is left untouched) and display the result.
dbscan_labels_arr = dbscan_clustering.labels_
dbscan_res_df = test_df.assign(Cluster=dbscan_labels_arr)
dbscan_res_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Cluster
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1


## DBOD clustering

In [6]:
# Wrap the raw feature matrix in the Data container that CLTree.build() consumes.
feature_matrix = test_df.values
cltree_data = Data(feature_matrix)

In [7]:
# Grid-search the CLTree pruning parameters: min_y in 0.01..0.50 and
# min_rd in 0.1..0.4, scoring each qualifying clustering with the silhouette
# coefficient (rounded to 2 dp).
results = []
for y in [i / 100 for i in range(1, 51)]:
    for rd in [i / 10 for i in range(1, 5)]:
        # A fresh tree is built for every combination.
        # NOTE(review): presumably prune() modifies the tree in place - confirm
        # before hoisting the build out of the loops.
        cltree_clustering = CLTree().build(cltree_data)
        cltree_clustering.prune(min_y=y, min_rd=rd)
        clusters = cltree_clustering.get_clusters()
        # Only clusterings with more than 2 distinct labels are scored.
        if len(np.unique(clusters)) > 2:
            score = round(silhouette_score(test_df, clusters), 2)
            results.append({'min_y': y, 'min_rd': rd, 'score': score})

# Fail loudly instead of silently carrying best_min_y = best_rd = None into
# the following cells when nothing in the grid qualified.
if not results:
    raise ValueError('No CLTree parameter combination produced more than 2 distinct labels')

# max() returns the first entry among equal scores, i.e. the earliest
# (min_y, min_rd) pair in grid order.
best = max(results, key=lambda res: res['score'])
best_min_y, best_rd, best_score = best['min_y'], best['min_rd'], best['score']

print('min_y: ', best_min_y, 'min_rd: ', best_rd, 'score: ', best_score)

min_y:  0.23 min_rd:  0.2 score:  0.33


In [8]:
# Rebuild and prune the tree once with the best parameters from the grid
# search, timing the build + prune steps (label extraction is not timed).
params = {'min_y': best_min_y, 'min_rd': best_rd}

# perf_counter() is used instead of time.time(): it is monotonic and has the
# highest available resolution, so short intervals are measured reliably.
cltree_start = time.perf_counter()
cltree_clustering = CLTree().build(cltree_data)
cltree_clustering.prune(**params)
cltree_finish = time.perf_counter()

print('CLTree execution time: ', round(cltree_finish - cltree_start, 5), 's.')
cltree_labels_arr = cltree_clustering.get_clusters()

CLTree execution time:  0.29107 s.


In [9]:
# Attach the CLTree label of every sample to a copy of the feature table
# (test_df itself is left untouched) and display the result.
cltree_res_df = test_df.assign(Cluster=cltree_labels_arr)
cltree_res_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Cluster
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0
146,6.3,2.5,5.0,1.9,0
147,6.5,3.0,5.2,2.0,0
148,6.2,3.4,5.4,2.3,0


In [10]:
# Renaming clusters, because cltree numbers its clusters the other way round
# compared to dbscan: ids 0 and 1 are exchanged so the label columns line up.
#
# The exchange is done as ONE simultaneous mapping. Doing it as three
# sequential assignments through a temporary id 2 would fold any cluster that
# is already labelled 2 into cluster 0; every other label is left as it is here.
# NOTE: this is a swap, so re-running the cell exchanges the ids back again.
cltree_res_df['Cluster'] = cltree_res_df['Cluster'].replace({0: 1, 1: 0})

cltree_labels_arr = cltree_res_df['Cluster']

cltree_res_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),Cluster
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,1
146,6.3,2.5,5.0,1.9,1
147,6.5,3.0,5.2,2.0,1
148,6.2,3.4,5.4,2.3,1


In [11]:
# Row-by-row agreement between the two label columns: `comparison` is a
# boolean Series, so summing it counts the rows where both labels match.
comparison = dbscan_res_df['Cluster'] == cltree_res_df['Cluster']
identical = comparison.sum()
print('Accuracy of cltree towards dbscan: ', round(identical / comparison.size, 2))

Accuracy of cltree towards dbscan:  0.78


In [12]:
# Calinski-Harabasz index of each labelling on the same feature table,
# rounded to two decimals.
chi_dbscan, chi_cltree = (
    round(calinski_harabasz_score(test_df, labels), 2)
    for labels in (dbscan_labels_arr, cltree_labels_arr)
)

print('(Calinski-Harabasz Index)  DBSCAN:', chi_dbscan, ' CLTree: ', chi_cltree )

(Calinski-Harabasz Index)  DBSCAN: 219.72  CLTree:  88.86


In [13]:
# Davies-Bouldin index of each labelling on the same feature table,
# rounded to two decimals.
dbi_dbscan, dbi_cltree = (
    round(davies_bouldin_score(test_df, labels), 2)
    for labels in (dbscan_labels_arr, cltree_labels_arr)
)

print('(Davies-Bouldin Index)  DBSCAN:', dbi_dbscan, ' CLTree: ', dbi_cltree, ' (the lower - the better)' )

(Davies-Bouldin Index)  DBSCAN: 8.41  CLTree:  1.78  (the lower - the better)
