# # Appendix E - Avoid hyperparameters

In [None]:
import pandas as pd
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import SpectralClustering
from sklearn.metrics import pairwise_distances
from sklearn.cluster import KMeans
from numba import jit, prange
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio

# Make 'iframe' the default plotly renderer for every figure shown below.
pio.renderers.default = 'iframe'

In [None]:
# Read only the three clustering features and discard rows with missing values.
data = pd.read_csv(
    'Mall_Customers.csv',
    usecols=['Annual Income (k$)', 'Spending Score (1-100)', 'Age'],
).dropna()

# Standardise each column (subtract its mean, divide by its standard deviation).
# The standardised frame stays reachable under both names.
normalized_df = (data - data.mean()) / data.std()
data = normalized_df

data.head()

In [None]:
# 3-D scatter of the standardised features; every marker gets the same size value.
fig = px.scatter_3d(
    data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    z='Age',
    size=np.ones(len(data)) * 0.1,
    opacity=1,
)
fig.show()

In [None]:
def find_communities(adj):
    """Label every node with the community it can be reached in.

    Nodes are scanned in index order. Each node that has no label yet starts a
    new community, and that label is propagated to every still-unlabelled node
    reachable from it through non-zero entries of ``adj``.

    Args:
        adj: Square 2-D array; a non-zero ``adj[i, j]`` links node ``i`` to
            node ``j``.

    Returns:
        An ``int32`` array with one entry per row of ``adj``. Labels start at 1
        and increase in the order in which communities are discovered.
    """
    n_nodes = adj.shape[0]
    communities = np.zeros(n_nodes, dtype=np.int32)

    current = 0
    for seed in range(n_nodes):
        # Already absorbed into a community found earlier.
        if communities[seed] != 0:
            continue

        # Open a new community rooted at this node.
        current += 1
        communities[seed] = current

        # Flood the label outwards: neighbours, then neighbours of neighbours.
        frontier = list(np.nonzero(adj[seed])[0])
        while frontier:
            node = frontier.pop()

            # Labelled nodes were expanded already (or belong elsewhere).
            if communities[node] != 0:
                continue

            communities[node] = current
            frontier.extend(np.nonzero(adj[node])[0])

    return communities

@jit(nopython=True, nogil=True, parallel=True, fastmath=True)
def compute_modularity(adj, communities):
    """Compute the modularity of a partition of a graph.

    Evaluates ``sum_ij (A_ij - k_i * k_j / S) / S`` over all ordered node pairs
    ``(i, j)`` that share a community, where ``S`` is the sum of all entries of
    ``adj`` and ``k_i`` is the sum of row ``i``.

    Only the lower triangle of ``adj`` (including the diagonal) is read; every
    off-diagonal pair is counted twice, so ``adj`` is treated as symmetric.

    Args:
        adj: Square 2-D adjacency matrix.
        communities: 1-D array holding one community label per node.

    Returns:
        The modularity as a floating point scalar.
    """
    n_edges_doubled = np.sum(adj)
    k_all = np.sum(adj, axis=1)
    n_nodes = adj.shape[0]

    # Accumulate into a scalar instead of filling an N x N float32 scratch
    # matrix: no quadratic allocation and no per-term rounding to float32.
    total = 0.0
    for row_i_idx in prange(n_nodes):
        row_total = 0.0
        # Plain range: numba runs only the outermost prange in parallel.
        for row_j_idx in range(row_i_idx + 1):
            # Compute it only for nodes of the same community
            if communities[row_i_idx] == communities[row_j_idx]:
                A_ij = adj[row_i_idx, row_j_idx]
                P_ij = (k_all[row_i_idx] * k_all[row_j_idx]) / n_edges_doubled

                local_mod = A_ij - P_ij
                if row_j_idx == row_i_idx:
                    # Diagonal term appears once in the full double sum.
                    row_total += local_mod
                else:
                    # (i, j) and (j, i) contribute the same value.
                    row_total += 2.0 * local_mod

        # Scalar reduction across the parallel outer loop.
        total += row_total

    return total / n_edges_doubled

In [None]:
# Distance between every pair of rows of the standardised data
# (no metric is passed, so scikit-learn's default metric is used).
distances = pairwise_distances(data)

In [None]:
# Sweep the distance threshold and record the modularity obtained at each value.
space = np.linspace(0.01, 0.99, 100)
modularities = []
for threshold in space:
    # Two points are linked when their distance is below the threshold.
    binarized = (threshold > distances).astype(np.int32)

    # Partition the thresholded graph, then score that partition.
    communities = find_communities(binarized)
    modularity = compute_modularity(binarized, communities)
    modularities.append(modularity)

In [None]:
# Modularity as a function of the distance threshold.
fig = go.Figure(data=go.Scatter(x=space, y=modularities, mode='lines+markers'))
fig.update_layout(xaxis_title='Threshold',
                   yaxis_title='Modularity')
# Show the figure explicitly, like every other plotly cell, instead of relying
# on the notebook echoing the last expression (nothing renders as a script).
fig.show()

In [None]:
# Threshold the distance matrix into a 0/1 adjacency matrix and display it.
binarized = (0.41 > distances).astype(int)
plt.imshow(binarized)
plt.show()

# Graph Laplacian: diagonal matrix of column sums minus the adjacency matrix.
D = np.diag(binarized.sum(axis=0))
laplacian = D - binarized

# Eigen-decomposition of the Laplacian.
evals, evects = np.linalg.eigh(laplacian)

In [None]:
# Plot the first eigenvalues (at most 60) as a bar chart and save the figure.
# The count is clamped so the bar positions always match the number of
# eigenvalues available, even when there are fewer than 60.
n_shown = min(60, len(evals))
positions = np.arange(n_shown)

plt.figure(figsize=(20, 10))
plt.bar(positions, evals[:n_shown])
# Tick labels are 1-based.
plt.xticks(positions, labels=positions + 1)
plt.xlabel('Eigenvalues')
plt.ylabel('Intensity')
plt.savefig('../figures/mall-eigens.eps')
plt.show()

In [None]:
# Cluster the rows of the first 57 eigenvectors into 57 groups and attach the
# resulting labels to a copy of the data.
kmeans = KMeans(n_clusters=57)
labeled_data = data.assign(labels=kmeans.fit_predict(evects[:, :57]))

In [None]:
# 3-D scatter of the features, coloured by the labels assigned above.
fig = px.scatter_3d(
    labeled_data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    z='Age',
    color='labels',
    size=np.ones(len(data)) * 0.1,
    opacity=1,
)
fig.show()

In [None]:
# Reference run: scikit-learn's SpectralClustering with 5 clusters, fitted
# directly on the standardised features.
clustering = SpectralClustering(n_clusters=5, assign_labels='kmeans')
clustering.fit(data)
labeled_data = data.assign(labels=clustering.labels_)

In [None]:
# 3-D scatter of the features, coloured by the SpectralClustering labels.
fig = px.scatter_3d(
    labeled_data,
    x='Annual Income (k$)',
    y='Spending Score (1-100)',
    z='Age',
    color='labels',
    size=np.ones(len(data)) * 0.1,
    opacity=1,
)
fig.show()