# DBSCAN Parameter Optimisation
### We optimise the DBSCAN parameters `minPts` and `eps` via a grid search, selecting for each dataset the combination with the highest DCSI score.


Necessary imports and installations:

In [None]:
# Install the category_encoders package into the kernel's environment.
# NOTE(review): no cell visible in this notebook imports category_encoders directly;
# presumably it is a dependency of the src.* modules imported below - confirm, and
# consider pinning a version for reproducibility.
%pip install category_encoders

In [1]:
import os
# Move the working directory one level up, so that the `src` package and the
# relative output paths used further down are resolved from the parent of the
# directory the kernel was started in.
# os.path.dirname is used instead of slicing at the last "/" so this also works
# with backslash-separated paths and when the current directory sits directly
# under the filesystem root (where slicing would yield an empty path).
# NOTE: this cell is not idempotent - every re-run moves up one more level.
current_path = os.getcwd()
new_path = os.path.dirname(current_path)
os.chdir(new_path)

import numpy as np
import pandas as pd
from tqdm import tqdm

from src.utils.DataLoader import DataLoader
from sklearn.cluster import DBSCAN
from src.evaluation.dcsi import dcsiscore

In [2]:
# Grid search over the DBSCAN parameters (MinPts, eps) for every dataset.
# Each configuration is scored with DCSI; the highest-scoring one per dataset is kept.

# define all datasets and different configurations
datasets = ['adult','adult4','bank3','communities','diabetes','adult2','adult5','bank','adult_m','adult_g','adult_r']
# candidate neighbourhood radii (eps) that are tried for every MinPts value
eps_candidates = [0.01,0.05,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5,0.6,0.7,0.8,0.9,1,1.1,1.2,1.5,1.6,1.7,1.8,2,2.1,2.2,2.3,2.4,2.5,2.6,2.8,3,3.25,3.5,3.75]
# fixed MinPts candidates; a data-dependent candidate (2 * n_columns - 1) is added per dataset
base_minpts = [4, 5, 10, 15]
# collect all results (one dict per evaluated configuration)
results = []
# collect best parameters (one dict per dataset)
best_params = {}
# loop through all datasets and different configurations
for dataset in datasets:
    # construct a DataLoader
    dataloader = DataLoader(dataset, categorical=False)
    dataloader.load()
    # NOTE(review): judging by the variable name this is the data without the
    # sensitive attribute - confirm against DataLoader.get_data
    data_wo_sensitive = dataloader.get_data()
    data = np.array(data_wo_sensitive)
    # reset the running optimum for each dataset; because max_dcsi starts at 0,
    # only configurations with a strictly positive DCSI can become the best one
    max_dcsi = 0
    max_eps = 0
    max_mu = 0
    num_cl = 0
    labels = 0
    # The data-dependent candidate can coincide with one of the fixed values
    # (e.g. 3 columns -> 5, 8 columns -> 15). De-duplicate while keeping the
    # order so that the same sweep is not computed and recorded twice.
    minpts_candidates = list(dict.fromkeys(base_minpts + [2*data.shape[1]-1]))
    # loop through potential parameters for minpts
    for minpts in tqdm(minpts_candidates):
        # loop through potential parameters for eps
        for eps in eps_candidates:
            # calculate DBSCAN clustering and keep only the per-point labels
            clustering = DBSCAN(eps=eps, min_samples=minpts).fit(data)
            clustering = clustering.labels_
            # extract distribution of the data points across the clusters
            # (np.unique returns the distinct labels in ascending order)
            lab, counts = np.unique(clustering, return_counts=True)
            # number of distinct labels; this includes the noise label -1 if noise is present
            num_cl_current = len(lab)
            # calculate DCSI score
            dcsi = dcsiscore(data, clustering)
            # if new best score save parameters (strict '>' keeps the first of tied configurations)
            if dcsi > max_dcsi:
                max_dcsi = dcsi
                max_eps = eps
                max_mu = minpts
                labels = lab
                num_cl = num_cl_current
            # number of noise points: DBSCAN labels noise as -1, which is the
            # smallest label and therefore sits at index 0 of the sorted output
            num_noise = counts[0] if -1 in lab else 0
            # noise percentage (fraction of all points labelled as noise)
            noise_percent = num_noise / np.sum(counts)
            # add current results to result list
            results.append({'Data':dataset, 'DCSI':dcsi, 'MinPts':minpts, 'Eps':eps, 'N_clusters':num_cl_current, 'labels':lab, 'Noise':noise_percent})
    # save best parameters per dataset
    best_params[dataset] = {'DCSI':max_dcsi, 'EPS':max_eps, 'mu':max_mu, 'N_clusters':num_cl, 'labels':labels}

adult


100%|██████████| 5/5 [00:43<00:00,  8.61s/it]


adult4


100%|██████████| 5/5 [00:27<00:00,  5.56s/it]


bank3


100%|██████████| 5/5 [07:30<00:00, 90.13s/it] 


communities


100%|██████████| 5/5 [00:03<00:00,  1.51it/s]


diabetes


100%|██████████| 5/5 [03:25<00:00, 41.08s/it]


adult2


100%|██████████| 5/5 [00:44<00:00,  8.94s/it]


adult5


100%|██████████| 5/5 [00:26<00:00,  5.33s/it]


bank


100%|██████████| 5/5 [06:52<00:00, 82.53s/it] 


adult_m


100%|██████████| 5/5 [01:10<00:00, 14.06s/it]


adult_g


100%|██████████| 5/5 [00:25<00:00,  5.10s/it]


adult_r


100%|██████████| 5/5 [00:40<00:00,  8.15s/it]


In [4]:
# Turn the list of per-configuration records into a single table.
all_results_df = pd.DataFrame(results)
# Drop every parameter set whose clustering consists of exactly one label
# (N_clusters == 1), i.e. keep only rows with N_clusters different from 1.
more_than_one_label = all_results_df['N_clusters'] != 1
result_df = all_results_df.loc[more_than_one_label]
# Optional: replace the dataset keys by display names, e.g.
#result_df = result_df.replace('adult4', 'Adult (gender)')
#result_df = result_df.replace('adult', 'Adult (race)')
#result_df = result_df.replace('communities', 'Communities')
#result_df = result_df.replace('diabetes', 'Diabetes')
#result_df = result_df.replace('bank3', 'Bank')

In [5]:
# Write the result table to disk as CSV; the path is relative to the current
# working directory and the DataFrame index is written as the first column.
output_path = 'auxiliary/Parameters/DBSCAN_opt.csv'
result_df.to_csv(output_path)

In [6]:
# Print the result table as LaTeX for the manuscript.
# - index=False omits the DataFrame index column
# - floats are formatted with two decimals
# - escape=True escapes LaTeX special characters; without it the underscore in
#   the 'N_clusters' column header is emitted verbatim and the table does not
#   compile in LaTeX text mode
latex_table = result_df.to_latex(
    index=False,
    float_format="{:.2f}".format,
    escape=True,
)
print(latex_table)

\begin{tabular}{lrrrrlr}
\toprule
Data & DCSI & MinPts & Eps & N_clusters & labels & Noise \\
\midrule
adult & 0.00 & 4 & 0.10 & 20 & [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18] & 0.94 \\
adult & 0.00 & 4 & 0.15 & 34 & [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32] & 0.87 \\
adult & 0.00 & 4 & 0.20 & 40 & [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38] & 0.76 \\
adult & 0.00 & 4 & 0.25 & 32 & [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30] & 0.69 \\
adult & 0.00 & 4 & 0.30 & 36 & [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34] & 0.62 \\
adult & 0.00 & 4 & 0.35 & 41 & [-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39] & 0.54 \\
adult & 0.00 & 4 & 0.40 &

In [7]:
# print the best parameters for each dataset
# The loop variable is deliberately not called `data`: that name holds the
# feature matrix built in the grid-search cell and must not be overwritten
# with a dataset key.
for dataset_name, dataset_best in best_params.items():
    print(dataset_name)
    print(dataset_best)

adult
{'DCSI': 0.9920356605285473, 'EPS': 2.1, 'mu': 4, 'N_clusters': 2, 'labels': array([-1,  0,  1])}
adult4
{'DCSI': 0.9675933209146912, 'EPS': 0.15, 'mu': 9, 'N_clusters': 2, 'labels': array([-1,  0,  1])}
bank3
{'DCSI': 0.9856422394930905, 'EPS': 1.5, 'mu': 4, 'N_clusters': 2, 'labels': array([-1,  0,  1])}
communities
{'DCSI': 0.6510929926648031, 'EPS': 3.25, 'mu': 10, 'N_clusters': 1, 'labels': array([-1,  0,  1])}
diabetes
{'DCSI': 0.8801812069091122, 'EPS': 0.45, 'mu': 10, 'N_clusters': 2, 'labels': array([-1,  0,  1,  2,  3])}
adult2
{'DCSI': 0.9920356605285473, 'EPS': 2.1, 'mu': 4, 'N_clusters': 2, 'labels': array([-1,  0,  1])}
adult5
{'DCSI': 0.9675933209146912, 'EPS': 0.15, 'mu': 9, 'N_clusters': 2, 'labels': array([-1,  0,  1])}
bank
{'DCSI': 0.9856422394930905, 'EPS': 1.5, 'mu': 4, 'N_clusters': 2, 'labels': array([-1,  0,  1])}
adult_m
{'DCSI': 0.9927573004357375, 'EPS': 1.2, 'mu': 4, 'N_clusters': 3, 'labels': array([-1,  0,  1])}
adult_g
{'DCSI': 0.9675932166724094, 