### Install gower package

In [2]:
pip install gower

Collecting gower
  Downloading gower-0.1.2-py3-none-any.whl (5.2 kB)
Collecting numpy
  Downloading numpy-1.22.4-cp39-cp39-macosx_10_15_x86_64.whl (17.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.7/17.7 MB[0m [31m625.0 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: numpy, gower
  Attempting uninstall: numpy
    Found existing installation: numpy 1.23.1
    Uninstalling numpy-1.23.1:
      Successfully uninstalled numpy-1.23.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
daal4py 2021.3.0 requires daal==2021.2.3, which is not installed.[0m[31m
[0mSuccessfully installed gower-0.1.2 numpy-1.22.4
Note: you may need to restart the kernel to use updated packages.


### Import required packages

In [3]:
import pandas as pd
import gower
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics



### Select required attributes

In [5]:
columns = ['estimatedp', 'wtr_srcdes', 'wtr_hauled', 'priv_wells', \
               'wc_exists', 'wc_adeq', 'wc_hlth', 'ww_public', 'ppl_nowat_r', 'ppl_noww_r', 'ppl_yeswat_r', 'ppl_yesww_r']

### Specifiy file paths

In [6]:
colonias_Y_path = 'dataset/colonias_Y_norm.csv'
colonias_N_path = 'dataset/colonias_N_norm.csv'

### For colonias with public water services

In [8]:
# Read corresponding csv file
df = pd.read_csv(colonias_Y_path)[columns]
df.head()

Unnamed: 0,estimatedp,wtr_srcdes,wtr_hauled,priv_wells,wc_exists,wc_adeq,wc_hlth,ww_public,ppl_nowat_r,ppl_noww_r,ppl_yeswat_r,ppl_yesww_r
0,157,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
1,72,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
2,147,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
3,86,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
4,41,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0


In [9]:
# Select records in which population rate is not larger than 1 
selected_df = df[(df['ppl_nowat_r'] <= 1) & (df['ppl_noww_r'] <= 1)
                 & (df['ppl_yeswat_r'] <= 1) & (df['ppl_yesww_r'] <= 1)]

### Construct Similarity Metric

In [11]:
# Use gower distance to construct similarity metric
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)

[[0.0000000e+00 2.3611111e-04 2.7777778e-05 ... 7.7777775e-05
  2.5289723e-01 3.3532500e-01]
 [2.3611111e-04 0.0000000e+00 2.0833334e-04 ... 1.5833334e-04
  2.5313333e-01 3.3556110e-01]
 [2.7777778e-05 2.0833334e-04 0.0000000e+00 ... 4.9999999e-05
  2.5292501e-01 3.3535278e-01]
 ...
 [7.7777775e-05 1.5833334e-04 4.9999999e-05 ... 0.0000000e+00
  2.5297499e-01 3.3540279e-01]
 [2.5289723e-01 2.5313333e-01 2.5292501e-01 ... 2.5297499e-01
  0.0000000e+00 8.4238887e-02]
 [3.3532500e-01 3.3556110e-01 3.3535278e-01 ... 3.3540279e-01
  8.4238887e-02 0.0000000e+00]]


In [16]:
# Select median value in the similarity_metric as the preference value (defined in Affinity Propagation algorithm)
preference = np.median(similarity_metric)
print(preference)

-0.25000554


In [17]:
# Replace diagonal value with the preference value
np.fill_diagonal(similarity_metric, preference)
# Multiply with -1 
similarity_metric *= -1
print(similarity_metric)

[[2.5000554e-01 2.3611111e-04 2.7777778e-05 ... 7.7777775e-05
  2.5289723e-01 3.3532500e-01]
 [2.3611111e-04 2.5000554e-01 2.0833334e-04 ... 1.5833334e-04
  2.5313333e-01 3.3556110e-01]
 [2.7777778e-05 2.0833334e-04 2.5000554e-01 ... 4.9999999e-05
  2.5292501e-01 3.3535278e-01]
 ...
 [7.7777775e-05 1.5833334e-04 4.9999999e-05 ... 2.5000554e-01
  2.5297499e-01 3.3540279e-01]
 [2.5289723e-01 2.5313333e-01 2.5292501e-01 ... 2.5297499e-01
  2.5000554e-01 8.4238887e-02]
 [3.3532500e-01 3.3556110e-01 3.3535278e-01 ... 3.3540279e-01
  8.4238887e-02 2.5000554e-01]]


### Apply Affinity Propagation and compare clustering results under different damping factors and iterations

In [18]:
dampings = [0.5, 0.6, 0.7, 0.8, 0.9]
iterations = range(200, 1050, 50)

In [20]:
for damping in dampings:
    for i in iterations:
        af = AffinityPropagation(affinity='precomputed', damping=damping, max_iter=i).fit(similarity_metric) # Apply calculated similarity_metric to AP algorithm
        # Get the number of clusters
        n_clusters = len(np.unique(af.labels_)) 
        # Get corresponding labels
        cluster_labels = af.labels_ 
        # Calculate silhouette score
        silhouette_avg = metrics.silhouette_score(similarity_metric, cluster_labels) 
        print('Damping=', damping, ', iteration=', i, ' n_clusters=', n_clusters,
              'The average SScore=', silhouette_avg)

Damping= 0.5 , iteration= 200  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 250  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 300  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 350  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 400  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 450  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 500  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 550  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 600  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 650  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 700  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 750  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 800  n_clusters= 5 The average SScore= 0.561317
Damping= 0.5 , iteration= 850  n_clust