# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 2.3) Clustering Analysis: Hierarchical
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score, davies_bouldin_score
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cut_tree

pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, folder="clustering_hierarchical", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, show and close the current matplotlib figure.

    Args:
        ax: Unused; kept so existing callers that pass an axes/plot object
            as first argument keep working.
        folder: Sub-directory of ``../report/imgs`` where the image is saved.
        filename: Name of the output file. When empty, nothing is saved.
        figsize: ``(width, height)`` in inches applied to the current figure.
    """
    # Local import: the notebook's import cell does not bring `os` into scope.
    import os

    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        # makedirs also creates missing parents and tolerates an existing dir
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

In [None]:
# Load dataset for clustering; the first CSV column is used as the DataFrame index
cdf = pd.read_csv("customer_profilation.csv", index_col=0)

### Definition of attributes employed for clustering

In [None]:
# We chose these 3 attributes, since they usually characterize customers very well
attr_cluster = ['Recency', 'Frequency', 'Monetary']
cdf_cluster = cdf[attr_cluster]

# Normalize values: min-max scale each attribute so that all three are on a
# comparable scale for the distance computations below
scaler = MinMaxScaler()
X = scaler.fit_transform(cdf_cluster.values)

### Clustering using different algorithms

In [None]:
# Heatmap of the full (square) pairwise Euclidean distance matrix of the normalized data
sn.heatmap(squareform(pdist(X, metric='euclidean')))

In [None]:
# 3D scatter plot of the clustered attributes
def cluster_scatter_3d(view_init=None, label=None, filename=""):
    """Draw a 3D scatter of the clustering attributes, coloured by `label`.

    The three columns named in `attr_cluster` are read from `cdf_cluster`
    and used as the x, y and z coordinates.

    Args:
        view_init: Optional sequence of positional arguments forwarded to
            the axes' ``view_init``; skipped when falsy.
        label: Per-point values used both as colours (``c``) and as the
            scatter's ``label``.
        filename: Forwarded to `plot`, which decides whether to save.
    """
    axes = plt.figure().add_subplot(111, projection='3d')

    x_name, y_name, z_name = attr_cluster
    points = axes.scatter3D(
        cdf_cluster[x_name],
        cdf_cluster[y_name],
        cdf_cluster[z_name],
        c=label,
        label=label,
        s=20,
    )

    axes.set_xlabel(x_name)
    axes.set_ylabel(y_name)
    axes.set_zlabel(z_name)

    # Build the legend from the handles/labels derived from the scatter itself
    handles, names = points.legend_elements()
    cluster_legend = axes.legend(handles, names, title="Clusters")
    axes.add_artist(cluster_legend)

    if view_init:
        axes.view_init(*view_init)

    plot(None, figsize=(6,6), filename=filename)

def hierarchical_analysis(data_dist, method, metric, height=None):
    """Run one hierarchical clustering and plot its dendrogram and 3D scatter.

    Args:
        data_dist: Distance data passed straight to ``linkage``.
        method: Linkage method name (also used in the output file names).
        metric: Metric name passed to ``linkage`` (also used in file names).
        height: Height at which the tree is cut to obtain the labels.

    Side effects:
        Overwrites ``cdf['label']`` with the result of the cut.
    """
    linkage_matrix = linkage(data_dist, method=method, metric=metric)

    # Truncated dendrogram of the hierarchy
    dendro = dendrogram(linkage_matrix, truncate_mode='lastp')
    plot(dendro, filename=f"{method}_{metric}")

    # Cut the tree, store the labels and show them in attribute space
    cdf['label'] = cut_tree(linkage_matrix, height=height)
    cluster_scatter_3d(label=cdf['label'], filename=f'{method}_{metric}_Scatter3D')

def analysis(X, metric='euclidean'):
    """Run single, complete and average linkage on `X` with the given metric.

    Args:
        X: Observation matrix passed to ``pdist``.
        metric: Distance metric name used for ``pdist`` and forwarded on.
    """
    condensed = pdist(X, metric=metric)

    # (linkage method, cut height), processed in this order:
    #   single   - Nearest Point Algorithm (MIN)
    #   complete - Voor Hees Algorithm (MAX)
    #   average  - UPGMA Algorithm (Group Average)
    runs = (
        ('single', 0.14),
        ('complete', 1.05),
        ('average', 0.6),
    )
    for linkage_method, cut_height in runs:
        hierarchical_analysis(condensed, method=linkage_method, metric=metric, height=cut_height)

In [None]:
# Single, complete and average linkage on Euclidean distances
analysis(X, metric='euclidean')

In [None]:
# Single, complete and average linkage on city-block (Manhattan) distances
analysis(X, metric='cityblock')

In [None]:
def last_iterations(metric = 'cityblock', method = 'complete', hs=()):
    """Inspect the clusterings obtained by cutting one hierarchy at several heights.

    Builds the hierarchy for the module-level `X`, draws its truncated
    dendrogram, then for every height in `hs` cuts the tree, stores the
    labels in ``cdf['label']`` and draws the 3D scatter.

    Args:
        metric: Distance metric name used for ``pdist``.
        method: Linkage method name.
        hs: Iterable of cut heights, processed in order. Defaults to an
            empty (immutable) tuple, in which case only the dendrogram
            is drawn.

    Note:
        Scatter file names depend only on the position in `hs`
        (``Scatter3D_1``, ``Scatter3D_2``, ...), so repeated calls reuse
        the same names.
    """
    data_dist = pdist(X, metric=metric)
    # `metric` is ignored by linkage when it receives a condensed distance matrix
    data_link = linkage(data_dist, method=method, metric=metric)

    dendrogram(data_link, truncate_mode='lastp')

    for position, h in enumerate(hs, start=1):
        cdf['label'] = cut_tree(data_link, height=h)
        cluster_scatter_3d(label=cdf['label'], filename=f"Scatter3D_{position}")

In [None]:
# Analyze the hierarchy produced by Euclidean distance with complete (max)
# linkage, cutting it at progressively lower heights
last_iterations(metric='euclidean', method = 'complete', hs=[1.4, 1.1, 1, .75, .6, .49])

In [None]:
# Other attempts: city-block distance with average and complete linkage
last_iterations(metric = 'cityblock', method = 'average', hs=[1,.8,.6])
last_iterations(metric = 'cityblock', method = 'complete', hs=[2,1.5,1.3])

In [None]:
# Final clustering: complete linkage on Euclidean distances, cut at height 0.6
data_dist = pdist(X, metric='euclidean')
# `metric` is ignored by linkage when it receives a condensed distance matrix
data_link = linkage(data_dist, method='complete', metric='euclidean')
labels = cut_tree(data_link, height=.6)

# cut_tree returns a column vector of shape (n_samples, 1); the sklearn
# scores expect 1-D labels, so flatten before scoring
labels_1d = labels.ravel()

sil = round(silhouette_score(X, labels_1d), 2)
db = round(davies_bouldin_score(X, labels_1d), 2)
print("Silhouette:", sil)
print("Davies Bouldin:", db)