# FairDen MinPts Ablation
### We run FairDen with different minPts values on each dataset and record DCSI, balance, number of clusters, and noise ratio.

Necessary imports and installations:

In [None]:
# Install third-party packages into the environment of the running kernel.
# NOTE(review): neither package is imported directly in this notebook; presumably the
# project's `src` modules depend on them — confirm, and consider pinning versions
# (e.g. `pkg==x.y.z`) so the experiment stays reproducible.
%pip install category_encoders
%pip install scikit-learn-extra

In [3]:
import os

# Make the parent of the notebook directory the working directory, so that the
# `src` package imported below and the relative `auxiliary/` output path resolve.
# - os.path.dirname works with any path separator (the previous rfind("/") slicing
#   broke on Windows paths and produced an empty path directly under the root).
# - The guard keeps the cell idempotent: once `src` is visible from the working
#   directory, re-running the cell no longer climbs one level further up.
if not os.path.isdir("src"):
    os.chdir(os.path.dirname(os.getcwd()))

import numpy as np
import pandas as pd

from tqdm import tqdm

from src.utils.DataLoader import DataLoader
from src.evaluation.dcsi import dcsiscore
from src.evaluation.balance import balance_score
from src.utils.ClusteringAlgorithm import ClusteringAlgorithm

In [4]:
# Datasets on which FairDen is evaluated.
datasets = ['adult', 'adult4', 'bank3', 'communities', 'diabetes', 'adult2', 'adult5', 'bank']
# Collected results: one dict (row) per (dataset, MinPts) combination.
results = []
for dataset in datasets:
    # construct a DataLoader and load the dataset
    dataloader = DataLoader(dataset, categorical=False)
    dataloader.load()
    data_wo_sensitive = dataloader.get_data()
    data = np.array(data_wo_sensitive)
    # Candidate MinPts values: four fixed values plus 2 * d - 1, where d is the number
    # of columns of `data` plus the number of sensitive attributes.
    dimension_based_minpts = 2 * (data.shape[1] + len(dataloader.get_sens_attr())) - 1
    # dict.fromkeys removes duplicates while preserving order, so a dimension-based
    # value that coincides with a fixed one does not trigger an identical second run.
    minpts_candidates = list(dict.fromkeys([4, 5, 10, 15, dimension_based_minpts]))
    # loop through potential parameters for minpts
    for minpts in tqdm(minpts_candidates):
        try:
            # calculate FairDen clustering
            # NOTE(review): the argument 2 is presumably the requested number of
            # clusters — confirm against ClusteringAlgorithm.run.
            algorithm = ClusteringAlgorithm('FairDen', dataloader, minpts, dataset)
            labels = algorithm.run(2)
            # calculate DCSI score
            dcsi = dcsiscore(data, labels)
            # extract distribution of the data points across the clusters
            # (np.unique returns the labels sorted, so a noise label -1 comes first)
            lab, counts = np.unique(labels, return_counts=True)
            # calculate balance; b1 and b2 are returned alongside it but not used here
            balance, b1, b2 = balance_score(dataset, dataloader.get_sens_attr(), np.array(labels),
                                            dataloader.get_sensitive_columns(), per_cluster=True)
            # NOTE(review): this counts the noise label -1 as a cluster when it is
            # present — confirm that this is the intended definition of N_clusters.
            num_cl = len(lab)
            # number of noise points
            num_noise = counts[0] if -1 in lab else 0
            # noise percentage (a fraction in [0, 1])
            noise_percent = num_noise / np.sum(counts)
            # add current results to result list
            results.append({'Data':dataset, 'DCSI':dcsi, 'Balance':balance, 'MinPts':minpts, 'N_clusters':num_cl, 'labels':lab, 'Noise':noise_percent})
        except Exception as error:
            # Only ordinary errors are turned into a placeholder row; KeyboardInterrupt
            # and SystemExit propagate so the long-running loop can still be stopped.
            # tqdm.write prints the reason without breaking the progress bar.
            tqdm.write(f"FairDen failed on '{dataset}' with MinPts={minpts}: {error!r}")
            results.append({'Data':dataset, 'DCSI': '-', 'Balance': '-', 'MinPts':minpts, 'N_clusters': '-', 'labels': '-', 'Noise': '-'})

adult


100%|██████████| 5/5 [00:58<00:00, 11.65s/it]


adult4


100%|██████████| 5/5 [00:51<00:00, 10.22s/it]


bank3


100%|██████████| 5/5 [08:57<00:00, 107.43s/it]


communities


100%|██████████| 5/5 [00:59<00:00, 11.90s/it]


diabetes


100%|██████████| 5/5 [07:40<00:00, 92.06s/it]


adult2


100%|██████████| 5/5 [00:46<00:00,  9.34s/it]


adult5


100%|██████████| 5/5 [00:48<00:00,  9.63s/it]


bank


100%|██████████| 5/5 [08:01<00:00, 96.20s/it]


In [5]:
# Turn the list of per-run result dictionaries into a single table.
result_df = pd.DataFrame(data=results)

In [7]:
# Show the results as a LaTeX table. The string has to be printed explicitly:
# a bare `result_df.to_latex()` that is not the last expression of the cell is discarded.
print(result_df.to_latex())
# save the dataframe; create the output directory first so to_csv cannot fail on a fresh checkout
os.makedirs('auxiliary', exist_ok=True)
result_df.to_csv('auxiliary/FairDen_MinPts.csv')