### Import  required packages

In [1]:
import pandas as pd
import gower
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

### Select required attributes

In [3]:
columns = ['estimatedp', 'wtr_srcdes', 'wtr_hauled', 'priv_wells', \
               'wc_exists', 'wc_adeq', 'wc_hlth', 'ww_public', 'ppl_nowat_r', 'ppl_noww_r', 'ppl_yeswat_r', 'ppl_yesww_r']

### Specify file paths

In [4]:
colonias_Y_path = 'dataset/colonias_Y_norm.csv'
colonias_N_path = 'dataset/colonias_N_norm.csv'

### For Colonias with public water services

In [7]:
# 1. Read corresponding csv file
df = pd.read_csv(colonias_Y_path)[columns]
df.head()

Unnamed: 0,estimatedp,wtr_srcdes,wtr_hauled,priv_wells,wc_exists,wc_adeq,wc_hlth,ww_public,ppl_nowat_r,ppl_noww_r,ppl_yeswat_r,ppl_yesww_r
0,157,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
1,72,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
2,147,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
3,86,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0
4,41,Public Water System,N,N,Y,Y,N,N,0.0,1.0,1.0,0.0


In [8]:
# 2. Select records in which population rate is not larger than 1
selected_df = df[(df['ppl_nowat_r'] <= 1) & (df['ppl_noww_r'] <= 1)
                 & (df['ppl_yeswat_r'] <= 1) & (df['ppl_yesww_r'] <= 1)]

In [9]:
# 3. Construct Similarity Metric
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)
preference = np.median(similarity_metric)
np.fill_diagonal(similarity_metric, preference)
similarity_metric *= -1

[[0.0000000e+00 2.3611111e-04 2.7777778e-05 ... 7.7777775e-05
  2.5289723e-01 3.3532500e-01]
 [2.3611111e-04 0.0000000e+00 2.0833334e-04 ... 1.5833334e-04
  2.5313333e-01 3.3556110e-01]
 [2.7777778e-05 2.0833334e-04 0.0000000e+00 ... 4.9999999e-05
  2.5292501e-01 3.3535278e-01]
 ...
 [7.7777775e-05 1.5833334e-04 4.9999999e-05 ... 0.0000000e+00
  2.5297499e-01 3.3540279e-01]
 [2.5289723e-01 2.5313333e-01 2.5292501e-01 ... 2.5297499e-01
  0.0000000e+00 8.4238887e-02]
 [3.3532500e-01 3.3556110e-01 3.3535278e-01 ... 3.3540279e-01
  8.4238887e-02 0.0000000e+00]]


In [10]:
"""
Colonias_Y:
Best params: Damping= 0.6 , iteration= 300  n_clusters= 24 The average SScore= 0.8435476
"""
# 4. Apply Affinity Propagation
af = AffinityPropagation(affinity='precomputed', damping = 0.6, max_iter=300).fit(similarity_metric)
n_clusters = len(np.unique(af.labels_))
cluster_labels = af.labels_
print(pd.DataFrame(af.labels_).value_counts())
print("num clusters:", n_clusters)
print(metrics.silhouette_score(similarity_metric, cluster_labels))

with open('dataset/colonias_Y_label.csv', 'w') as f:
    for label in cluster_labels:
        f.write(str(label) + "\n")
f.close()

21    860
19    857
0      45
9      29
15     20
4      17
5      16
20     13
22     12
18      9
1       9
2       8
7       7
13      7
12      5
23      5
14      4
17      4
11      4
6       4
16      1
10      1
8       1
3       1
dtype: int64
num clusters: 24
0.8435476


### For Colonias without public water services

In [13]:
# 1. Read corresponding csv file
df = pd.read_csv(colonias_N_path)[columns]
df.head()

Unnamed: 0,estimatedp,wtr_srcdes,wtr_hauled,priv_wells,wc_exists,wc_adeq,wc_hlth,ww_public,ppl_nowat_r,ppl_noww_r,ppl_yeswat_r,ppl_yesww_r
0,67,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
1,16,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
2,46,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
3,181,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0
4,293,Private Wells,N,Y,N,N,N,N,1.0,1.0,0.0,0.0


In [14]:
# 2. Select records in which population rate is not larger than 1
selected_df = df[(df['ppl_nowat_r'] <= 1) & (df['ppl_noww_r'] <= 1)
                 & (df['ppl_yeswat_r'] <= 1) & (df['ppl_yesww_r'] <= 1)]

In [15]:
# 3. Construct Similarity Metric
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)
preference = np.median(similarity_metric)
np.fill_diagonal(similarity_metric, preference)
similarity_metric *= -1

[[0.0000000e+00 5.0595240e-04 2.0833334e-04 ... 3.3365080e-01
  3.3578372e-01 3.3485121e-01]
 [5.0595240e-04 0.0000000e+00 2.9761906e-04 ... 3.3352181e-01
  3.3628967e-01 3.3535713e-01]
 [2.0833334e-04 2.9761906e-04 0.0000000e+00 ... 3.3344245e-01
  3.3599207e-01 3.3505952e-01]
 ...
 [3.3365080e-01 3.3352181e-01 3.3344245e-01 ... 0.0000000e+00
  2.7678572e-03 1.8353175e-03]
 [3.3578372e-01 3.3628967e-01 3.3599207e-01 ... 2.7678572e-03
  0.0000000e+00 9.3253970e-04]
 [3.3485121e-01 3.3535713e-01 3.3505952e-01 ... 1.8353175e-03
  9.3253970e-04 0.0000000e+00]]


In [17]:
"""
Colonias_N:
Best params: Damping= 0.6 , iteration= 500  n_clusters= 11 The average SScore= 0.6063733
"""
# 4. Apply Affinity Propagation
af = AffinityPropagation(affinity='precomputed', damping = 0.6, max_iter=500).fit(similarity_metric)
n_clusters = len(np.unique(af.labels_))
cluster_labels = af.labels_
print(pd.DataFrame(af.labels_).value_counts())
print("num clusters:", n_clusters)
print(metrics.silhouette_score(similarity_metric, cluster_labels))

with open('dataset/colonias_N_label.csv', 'w') as f:
    for label in cluster_labels:
        f.write(str(label) + "\n")
f.close()

1     62
0     61
10    56
7     11
2      7
6      5
5      4
8      4
3      3
9      3
4      1
dtype: int64
num clusters: 11
0.6063733
