In [None]:
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import davies_bouldin_score
from sklearn.metrics import calinski_harabasz_score
from sklearn.metrics import silhouette_samples, silhouette_score


In [None]:
def clean_dataset(df):
  """ Remove lines with nonsensical values, if any, and cast to float64.
  A line is removed when any of its cells is missing (NaN/None), infinite
  (inf, -inf) or the literal placeholder "*".
  The input dataframe is left untouched; a new dataframe is returned.
   Keyword arguments: df is a Pandas' library dataframe.
   Returns: a new Pandas dataframe holding the remaining lines as np.float64.
   Raises: AssertionError if df is not a pd.DataFrame; astype raises if a
   remaining cell cannot be converted to float.
  """
  assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
  # Work on a filtered copy rather than dropna(inplace=True), so the caller's
  # dataframe is not modified and re-running the cell gives the same result.
  without_missing = df.dropna()
  # axis must be given by keyword: pandas 2.x no longer accepts it positionally.
  has_invalid_cell = without_missing.isin([np.nan, np.inf, -np.inf, "*"]).any(axis=1)
  return without_missing[~has_invalid_cell].astype(np.float64)

In [None]:
def plotValidityIndices(ss, dbs, chs, maximumClusters):
  """ Draws three side-by-side line plots, one per cluster validity index.
  Every index is plotted against the number of clusters, which ranges
  from 2 to maximumClusters, so each score sequence should hold
  maximumClusters - 1 values.
   Keyword arguments:
   - ss: sequence with the average Silhouette Score.
   - dbs: sequence with the Davies Bouldin score.
   - chs: sequence with the Calinski Harabasz score.
   - maximumClusters: largest number of clusters shown on the x axis.
  """
  cluster_counts = range(2, maximumClusters+1)
  # One (scores, name) pair per panel, left to right; the name is used both
  # as the legend label and as the y-axis label.
  panels = (
    (ss, 'Silhouette Score'),
    (dbs, 'Davies Bouldin score'),
    (chs, 'Calinski Harabasz score'),
  )

  fig, axes = plt.subplots(1, 3, figsize=(21,4))
  for ax, (scores, index_name) in zip(axes, panels):
    ax.grid(True)
    ax.plot(cluster_counts, scores, 'b-', label=index_name)
    ax.set_xlabel("Number of cluster")
    ax.set_ylabel(index_name)
    ax.legend()
  plt.show()

In [None]:

def multiCluster(dataFrame, maximumClusters, plot = True):
  """ Runs KMeans on the given data once for every number of clusters
  from 2 to maximumClusters and scores each partition with three
  cluster validity indices.
   Keyword arguments:
   - dataFrame: the data to be clustered (passed as-is to KMeans and to the scores).
   - maximumClusters: largest number of clusters to try.
   - plot: boolean that determines if the indices are to be plotted.
   Returns (one entry per number of clusters, in increasing order):
   - ss: list with the Silhouette Score.
   - dbs: list with the Davies Bouldin score.
   - chs: list with the Calinski Harabasz score.
  """
  silhouette_values, davies_bouldin_values, calinski_harabasz_values = [], [], []

  for k in range(2, maximumClusters+1):
    # Fixed random_state (123) so repeated runs give the same partitions.
    labels = KMeans(n_clusters=k, random_state=123).fit_predict(dataFrame)

    silhouette_values.append(silhouette_score(dataFrame, labels))
    davies_bouldin_values.append(davies_bouldin_score(dataFrame, labels))
    calinski_harabasz_values.append(calinski_harabasz_score(dataFrame, labels))

  if plot:
    plotValidityIndices(silhouette_values, davies_bouldin_values,
                        calinski_harabasz_values, maximumClusters)

  return silhouette_values, davies_bouldin_values, calinski_harabasz_values

In [None]:
""" Load the datasets.
The path, file names and column names below are obfuscated placeholders and should be adapted to your own needs.
In this case we are assuming that C0 always refers to the original data.
For the study at hand C2,C3,C4,C5 are 4 different datasets that result from
4 different anonymization techniques applied to C0.
"""
path = '/a/b/c/'
# Unnecessary or unused columns, removed from every dataset:
dropColumns = ['col_x','col_y', 'col_z']
# The files are expected in xlsx format. All five are loaded the same way:
# read the workbook, then drop the unused columns. os.path.join keeps the
# load working whether or not `path` ends with a separator.
C0, C2, C3, C4, C5 = (
  pd.read_excel(os.path.join(path, fileName)).drop(columns=dropColumns)
  for fileName in ('originalData.xlsx', 'anonym1.xlsx', 'anonym2.xlsx',
                   'anonym3.xlsx', 'anonym4.xlsx')
)

In [None]:
""" Data cleaning.
"""
# Clean every dataset the same way and keep only the numeric values
# (NumPy array), in the order original, anonym1..anonym4.
X_C0, X_C2, X_C3, X_C4, X_C5 = (
  clean_dataset(loaded).to_numpy() for loaded in (C0, C2, C3, C4, C5)
)

In [None]:
""" Data normalization.
"""
scaler = StandardScaler()
# The same scaler object is re-fitted on each dataset in turn, so every
# dataset is standardized with its own statistics; once this cell has run,
# `scaler` holds the fit of the last one (X_C5).
S_C0, S_C2, S_C3, S_C4, S_C5 = (
  np.c_[scaler.fit_transform(cleaned)]
  for cleaned in (X_C0, X_C2, X_C3, X_C4, X_C5)
)

In [None]:
""" Define the maximum number of clusters.
A good first approximation could be the square root of the original dataset cardinality.
"""
maxNumberOfClusters = 40

# Apply the clustering+validity pipeline to the original (S_C0) and
# to the anonymized data.
datasetNames = ['C0 (original)', 'C2', 'C3', 'C4', 'C5']
scaledData = [S_C0, S_C2, S_C3, S_C4, S_C5]
# Scores of every dataset, keyed by dataset name, so they remain available
# for comparison after the loop (ss, dbs, chs alone only keep the last one).
validityIndices = {}
for name, sd in zip(datasetNames, scaledData):
  # The figures carry no title, so label each one in the cell output.
  print(name)
  ss, dbs, chs  =  multiCluster(sd, maxNumberOfClusters, True)
  validityIndices[name] = {'ss': ss, 'dbs': dbs, 'chs': chs}

