### Install gower package

In [1]:
pip install gower

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'd:\pycharm\pycharmprojects\ap_test\venv\scripts\python.exe -m pip install --upgrade pip' command.


### Import required packages

In [2]:
import pandas as pd
import gower
import numpy as np
from sklearn.cluster import AffinityPropagation
from sklearn import metrics

### Select required attributes

In [3]:
# Names of the CSV columns that are kept for the clustering analysis.
columns = [
    'estimatedp',
    'wtr_srcdes',
    'wtr_hauled',
    'priv_wells',
    'wc_exists',
    'wc_adeq',
    'wc_hlth',
    'ww_public',
    'ppl_nowat_r',
    'ppl_noww_r',
    'ppl_yeswat_r',
    'ppl_yesww_r',
]

### Specify file paths

In [4]:
# Input CSV files, one per group of Colonias analysed below.
colonias_Y_path = 'dataset/colonias_Y_norm.csv'  # Colonias with public water services
colonias_N_path = 'dataset/colonias_N_norm.csv'  # Colonias without public water services

### For Colonias with public water services

In [None]:
# 1. Load the CSV for Colonias with public water services and keep only the
#    attributes listed in `columns`; preview the first rows as the cell output.
df = pd.read_csv(colonias_Y_path).loc[:, columns]
df.head()

In [None]:
# 2. Keep only the records in which none of the four population rates exceeds 1
rate_columns = ['ppl_nowat_r', 'ppl_noww_r', 'ppl_yeswat_r', 'ppl_yesww_r']
rates_within_range = df[rate_columns].le(1).all(axis=1)
selected_df = df[rates_within_range]

#### Construct Similarity Metric

In [None]:
# Compute the pairwise Gower distance matrix of the selected records.
# NOTE: despite its name, `similarity_metric` holds *distances* at this point;
# it only becomes a similarity matrix once it is negated in a later cell.
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)

In [None]:
# Use the median of all entries of the matrix (diagonal included) as the
# preference value (as defined in the Affinity Propagation algorithm)
preference = np.median(similarity_metric)
print(preference)

In [None]:
# Write the preference value onto the diagonal, then negate the whole matrix so
# that larger (less negative) values mean "more similar".
# Both steps modify `similarity_metric` in place: re-running this cell without
# recomputing the matrix first flips the sign back again.
np.fill_diagonal(similarity_metric, preference)
np.multiply(similarity_metric, -1, out=similarity_metric)
print(similarity_metric)

#### Apply Affinity Propagation and compare clustering results under different damping factors and iteration limits

In [None]:
# Parameter grid explored by the clustering loop.
dampings = [0.5, 0.6, 0.7, 0.8, 0.9]  # values passed as `damping`
iterations = range(200, 1050, 50)     # values passed as `max_iter`: 200, 250, ..., 1000

In [None]:
# Grid search over damping factors and iteration limits for Affinity Propagation.
#
# `similarity_metric` holds negated Gower distances with the preference on its
# diagonal. That is the right input for AffinityPropagation(affinity='precomputed'),
# but NOT for silhouette_score, which needs non-negative pairwise distances with a
# zero diagonal and metric='precomputed'. Without that, every row of the matrix is
# treated as a feature vector and compared with the Euclidean metric.
# Rebuild the distance matrix once, outside the loops.
distance_matrix = -similarity_metric
np.fill_diagonal(distance_matrix, 0)

# scikit-learn overwrites the diagonal of a precomputed matrix with its own
# `preference` (the median of the whole matrix when left at None), so hand over
# the diagonal prepared earlier explicitly to make sure that value is applied.
preferences = np.diag(similarity_metric).copy()
n_samples = similarity_metric.shape[0]

for damping in dampings:
    for max_iter in iterations:
        # Apply calculated similarity_metric to AP algorithm
        af = AffinityPropagation(affinity='precomputed', damping=damping,
                                 max_iter=max_iter,
                                 preference=preferences).fit(similarity_metric)
        # Get corresponding labels
        cluster_labels = af.labels_
        # Get the number of clusters (distinct labels)
        n_clusters = len(np.unique(cluster_labels))
        # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range (e.g. when the run did not converge) report NaN
        # instead of letting silhouette_score raise and abort the whole grid.
        if 2 <= n_clusters <= n_samples - 1:
            silhouette_avg = metrics.silhouette_score(distance_matrix, cluster_labels,
                                                      metric='precomputed')
        else:
            silhouette_avg = float('nan')
        print('Damping=', damping, ', iteration=', max_iter, ' n_clusters=', n_clusters,
              'The average SScore=', silhouette_avg)

### For Colonias without public water services

In [None]:
# 1. Load the CSV for Colonias without public water services and keep only the
#    attributes listed in `columns`; preview the first rows as the cell output.
df = pd.read_csv(colonias_N_path).loc[:, columns]
df.head()

In [None]:
# 2. Keep only the records in which none of the four population rates exceeds 1
rate_columns = ['ppl_nowat_r', 'ppl_noww_r', 'ppl_yeswat_r', 'ppl_yesww_r']
rates_within_range = df[rate_columns].le(1).all(axis=1)
selected_df = df[rates_within_range]

#### Construct Similarity Metric

In [None]:
# Compute the pairwise Gower distance matrix of the selected records.
# NOTE: despite its name, `similarity_metric` holds *distances* at this point;
# it only becomes a similarity matrix once it is negated in a later cell.
similarity_metric = gower.gower_matrix(selected_df)
print(similarity_metric)

In [None]:
# Use the median of all entries of the matrix (diagonal included) as the
# preference value (as defined in the Affinity Propagation algorithm)
preference = np.median(similarity_metric)
print(preference)

In [None]:
# Write the preference value onto the diagonal, then negate the whole matrix so
# that larger (less negative) values mean "more similar".
# Both steps modify `similarity_metric` in place: re-running this cell without
# recomputing the matrix first flips the sign back again.
np.fill_diagonal(similarity_metric, preference)
np.multiply(similarity_metric, -1, out=similarity_metric)
print(similarity_metric)

#### Apply Affinity Propagation and compare clustering results under different damping factors and iteration limits

In [None]:
# Parameter grid explored by the clustering loop.
dampings = [0.5, 0.6, 0.7, 0.8, 0.9]  # values passed as `damping`
iterations = range(200, 1050, 50)     # values passed as `max_iter`: 200, 250, ..., 1000

In [None]:
# Grid search over damping factors and iteration limits for Affinity Propagation.
#
# `similarity_metric` holds negated Gower distances with the preference on its
# diagonal. That is the right input for AffinityPropagation(affinity='precomputed'),
# but NOT for silhouette_score, which needs non-negative pairwise distances with a
# zero diagonal and metric='precomputed'. Without that, every row of the matrix is
# treated as a feature vector and compared with the Euclidean metric.
# Rebuild the distance matrix once, outside the loops.
distance_matrix = -similarity_metric
np.fill_diagonal(distance_matrix, 0)

# scikit-learn overwrites the diagonal of a precomputed matrix with its own
# `preference` (the median of the whole matrix when left at None), so hand over
# the diagonal prepared earlier explicitly to make sure that value is applied.
preferences = np.diag(similarity_metric).copy()
n_samples = similarity_metric.shape[0]

for damping in dampings:
    for max_iter in iterations:
        # Apply calculated similarity_metric to AP algorithm
        af = AffinityPropagation(affinity='precomputed', damping=damping,
                                 max_iter=max_iter,
                                 preference=preferences).fit(similarity_metric)
        # Get corresponding labels
        cluster_labels = af.labels_
        # Get the number of clusters (distinct labels)
        n_clusters = len(np.unique(cluster_labels))
        # The silhouette score is only defined for 2 <= n_labels <= n_samples - 1;
        # outside that range (e.g. when the run did not converge) report NaN
        # instead of letting silhouette_score raise and abort the whole grid.
        if 2 <= n_clusters <= n_samples - 1:
            silhouette_avg = metrics.silhouette_score(distance_matrix, cluster_labels,
                                                      metric='precomputed')
        else:
            silhouette_avg = float('nan')
        print('Damping=', damping, ', iteration=', max_iter, ' n_clusters=', n_clusters,
              'The average SScore=', silhouette_avg)