### Install gower package

In [1]:
pip install gower

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\pycharm\pycharmprojects\ap_test\venv\scripts\python.exe -m pip install --upgrade pip' command.


### Import  required packages

In [2]:
import pandas as pd
import gower
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

### Select required attributes

In [3]:
columns = ['estimatedp', 'wtr_srcdes', 'wtr_hauled', 'priv_wells', \
               'wc_exists', 'wc_adeq', 'wc_hlth', 'ww_public', 'ppl_nowat_r', 'ppl_noww_r', 'ppl_yeswat_r', 'ppl_yesww_r']

### Specify file paths

In [4]:
colonias_Y_path = 'dataset/colonias_Y_norm.csv'
colonias_N_path = 'dataset/colonias_N_norm.csv'

### For Colonias with public water services

In [5]:
# 1. Read corresponding csv file
df = pd.read_csv(colonias_Y_path)[columns]
df.head()

Unnamed: 0,estimatedp,wtr_srcdes,wtr_hauled,priv_wells,wc_exists,wc_adeq,wc_hlth,ww_public,ppl_nowat_r,ppl_noww_r,ppl_yeswat_r,ppl_yesww_r
0,157,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
1,72,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
2,147,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
3,86,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
4,41,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0


In [6]:
# 2. Select records in which population rate is not larger than 1
selected_df = df[(df['ppl_nowat_r'] <= 1) & (df['ppl_noww_r'] <= 1)
                 & (df['ppl_yeswat_r'] <= 1) & (df['ppl_yesww_r'] <= 1)]

#### Construct Similarity Metric

In [7]:
# use gower distance to construct similarity metric
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)

[[0.0000000e+00 2.3611111e-04 2.7777778e-05 ... 7.7777775e-05
  2.5289723e-01 3.3532500e-01]
 [2.3611111e-04 0.0000000e+00 2.0833334e-04 ... 1.5833334e-04
  2.5313333e-01 3.3556110e-01]
 [2.7777778e-05 2.0833334e-04 0.0000000e+00 ... 4.9999999e-05
  2.5292501e-01 3.3535278e-01]
 ...
 [7.7777775e-05 1.5833334e-04 4.9999999e-05 ... 0.0000000e+00
  2.5297499e-01 3.3540279e-01]
 [2.5289723e-01 2.5313333e-01 2.5292501e-01 ... 2.5297499e-01
  0.0000000e+00 8.4238887e-02]
 [3.3532500e-01 3.3556110e-01 3.3535278e-01 ... 3.3540279e-01
  8.4238887e-02 0.0000000e+00]]


In [8]:
# Select median value in the similarity_metrix as the preference value (defined in Affinity Propagation algorithm)
preference = np.median(similarity_metric)
print(preference)

0.25000554


In [9]:
# Replace diagonal value with the preference value
np.fill_diagonal(similarity_metric, preference)
# Multiply with -1
similarity_metric *= -1
print(similarity_metric)

[[-2.5000554e-01 -2.3611111e-04 -2.7777778e-05 ... -7.7777775e-05
  -2.5289723e-01 -3.3532500e-01]
 [-2.3611111e-04 -2.5000554e-01 -2.0833334e-04 ... -1.5833334e-04
  -2.5313333e-01 -3.3556110e-01]
 [-2.7777778e-05 -2.0833334e-04 -2.5000554e-01 ... -4.9999999e-05
  -2.5292501e-01 -3.3535278e-01]
 ...
 [-7.7777775e-05 -1.5833334e-04 -4.9999999e-05 ... -2.5000554e-01
  -2.5297499e-01 -3.3540279e-01]
 [-2.5289723e-01 -2.5313333e-01 -2.5292501e-01 ... -2.5297499e-01
  -2.5000554e-01 -8.4238887e-02]
 [-3.3532500e-01 -3.3556110e-01 -3.3535278e-01 ... -3.3540279e-01
  -8.4238887e-02 -2.5000554e-01]]


#### Applying Affinity Propagation and compare clustering results under different damping factors and iterations

In [10]:
dampings = [0.5, 0.6, 0.7, 0.8, 0.9]
iterations = range(200, 1050, 50)

In [11]:
for damping in dampings:
    for i in iterations:
        # Apply calculated similarity_metric to AP algorithm
        af = AffinityPropagation(affinity='precomputed', damping=damping, max_iter=i).fit(similarity_metric)
        # Get the number of clusters
        n_clusters = len(np.unique(af.labels_))
        # Get corresponding labels
        cluster_labels = af.labels_
        # Calculate silhouette score
        silhouette_avg = metrics.silhouette_score(similarity_metric, cluster_labels)
        print('Damping=', damping, ', iteration=', i, ' n_clusters=', n_clusters,
              'The average SScore=', silhouette_avg)

Damping= 0.5 , iteration= 200  n_clusters= 876 The average SScore= 0.43687734
Damping= 0.5 , iteration= 250  n_clusters= 869 The average SScore= 0.43619883
Damping= 0.5 , iteration= 300  n_clusters= 37 The average SScore= 0.8347605
Damping= 0.5 , iteration= 350  n_clusters= 37 The average SScore= 0.8347605
Damping= 0.5 , iteration= 400  n_clusters= 162 The average SScore= 0.43954504
Damping= 0.5 , iteration= 450  n_clusters= 37 The average SScore= 0.8347605
Damping= 0.5 , iteration= 500  n_clusters= 884 The average SScore= 0.437121
Damping= 0.5 , iteration= 550  n_clusters= 868 The average SScore= 0.4371905
Damping= 0.5 , iteration= 600  n_clusters= 884 The average SScore= 0.437121
Damping= 0.5 , iteration= 650  n_clusters= 22 The average SScore= 0.8310452
Damping= 0.5 , iteration= 700  n_clusters= 22 The average SScore= 0.8310452
Damping= 0.5 , iteration= 750  n_clusters= 22 The average SScore= 0.8310452
Damping= 0.5 , iteration= 800  n_clusters= 37 The average SScore= 0.8347605
Dampi

### For Colonias without public water services

In [12]:
# 1. Read corresponding csv file
df = pd.read_csv(colonias_N_path)[columns]
df.head()

Unnamed: 0,estimatedp,wtr_srcdes,wtr_hauled,priv_wells,wc_exists,wc_adeq,wc_hlth,ww_public,ppl_nowat_r,ppl_noww_r,ppl_yeswat_r,ppl_yesww_r
0,67,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
1,16,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
2,46,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
3,181,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
4,293,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0


In [14]:
# 2. Select records in which population rate is not larger than 1
selected_df = df[(df['ppl_nowat_r'] <= 1) & (df['ppl_noww_r'] <= 1)
                 & (df['ppl_yeswat_r'] <= 1) & (df['ppl_yesww_r'] <= 1)]

#### Construct Similarity Metric

In [15]:
# use gower distance to construct similarity metric
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)

[[0.0000000e+00 5.0595240e-04 2.0833334e-04 ... 3.3365080e-01
  3.3578372e-01 3.3485121e-01]
 [5.0595240e-04 0.0000000e+00 2.9761906e-04 ... 3.3352181e-01
  3.3628967e-01 3.3535713e-01]
 [2.0833334e-04 2.9761906e-04 0.0000000e+00 ... 3.3344245e-01
  3.3599207e-01 3.3505952e-01]
 ...
 [3.3365080e-01 3.3352181e-01 3.3344245e-01 ... 0.0000000e+00
  2.7678572e-03 1.8353175e-03]
 [3.3578372e-01 3.3628967e-01 3.3599207e-01 ... 2.7678572e-03
  0.0000000e+00 9.3253970e-04]
 [3.3485121e-01 3.3535713e-01 3.3505952e-01 ... 1.8353175e-03
  9.3253970e-04 0.0000000e+00]]


In [16]:
# Select median value in the similarity_metrix as the preference value (defined in Affinity Propagation algorithm)
preference = np.median(similarity_metric)
print(preference)

0.3336508


In [17]:
# Replace diagonal value with the preference value
np.fill_diagonal(similarity_metric, preference)
# Multiply with -1
similarity_metric *= -1
print(similarity_metric)

[[-3.3365080e-01 -5.0595240e-04 -2.0833334e-04 ... -3.3365080e-01
  -3.3578372e-01 -3.3485121e-01]
 [-5.0595240e-04 -3.3365080e-01 -2.9761906e-04 ... -3.3352181e-01
  -3.3628967e-01 -3.3535713e-01]
 [-2.0833334e-04 -2.9761906e-04 -3.3365080e-01 ... -3.3344245e-01
  -3.3599207e-01 -3.3505952e-01]
 ...
 [-3.3365080e-01 -3.3352181e-01 -3.3344245e-01 ... -3.3365080e-01
  -2.7678572e-03 -1.8353175e-03]
 [-3.3578372e-01 -3.3628967e-01 -3.3599207e-01 ... -2.7678572e-03
  -3.3365080e-01 -9.3253970e-04]
 [-3.3485121e-01 -3.3535713e-01 -3.3505952e-01 ... -1.8353175e-03
  -9.3253970e-04 -3.3365080e-01]]


#### Applying Affinity Propagation and compare clustering results under different damping factors and iterations

In [18]:
dampings = [0.5, 0.6, 0.7, 0.8, 0.9]
iterations = range(200, 1050, 50)

In [19]:
for damping in dampings:
    for i in iterations:
        # Apply calculated similarity_metric to AP algorithm
        af = AffinityPropagation(affinity='precomputed', damping=damping, max_iter=i).fit(similarity_metric)
        # Get the number of clusters
        n_clusters = len(np.unique(af.labels_))
        # Get corresponding labels
        cluster_labels = af.labels_
        # Calculate silhouette score
        silhouette_avg = metrics.silhouette_score(similarity_metric, cluster_labels)
        print('Damping=', damping, ', iteration=', i, ' n_clusters=', n_clusters,
              'The average SScore=', silhouette_avg)

Damping= 0.5 , iteration= 200  n_clusters= 11 The average SScore= 0.43082762
Damping= 0.5 , iteration= 250  n_clusters= 112 The average SScore= 0.28626224
Damping= 0.5 , iteration= 300  n_clusters= 59 The average SScore= 0.28480664
Damping= 0.5 , iteration= 350  n_clusters= 68 The average SScore= 0.4302451
Damping= 0.5 , iteration= 400  n_clusters= 116 The average SScore= 0.28634334
Damping= 0.5 , iteration= 450  n_clusters= 59 The average SScore= 0.28480664
Damping= 0.5 , iteration= 500  n_clusters= 111 The average SScore= 0.28619343
Damping= 0.5 , iteration= 550  n_clusters= 116 The average SScore= 0.28634334
Damping= 0.5 , iteration= 600  n_clusters= 116 The average SScore= 0.28634334
Damping= 0.5 , iteration= 650  n_clusters= 11 The average SScore= 0.43082762
Damping= 0.5 , iteration= 700  n_clusters= 59 The average SScore= 0.28480664
Damping= 0.5 , iteration= 750  n_clusters= 66 The average SScore= 0.43016735
Damping= 0.5 , iteration= 800  n_clusters= 116 The average SScore= 0.286