# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 2.2) Clustering Analysis: DBSCAN
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import os

import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from IPython.display import clear_output
from matplotlib.colors import LinearSegmentedColormap

from sklearn.cluster import DBSCAN
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Turn off pandas' chained-assignment check for the whole notebook, so no
# SettingWithCopyWarning is emitted by the cells below.
pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, folder="clustering_dbscan", filename="", figsize=(6.4, 4.8)):
    """Resize the current figure, optionally save it, then show and close it.

    Args:
        ax: Not used by the function body. It only lets callers write the
            plotting call inline, e.g. ``plot(sn.heatmap(...))``.
        folder: Sub-directory of ``../report/imgs`` where the image is saved.
        filename: Name of the image file. When empty, nothing is saved.
        figsize: ``(width, height)`` of the figure, in inches.
    """
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        # makedirs also creates missing parent directories and does not fail
        # when the target folder already exists.
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

In [None]:
# Read the per-customer profile table; the first CSV column becomes the index
cdf = pd.read_csv(
    "customer_profilation.csv",
    index_col=0,
)

### Definition of the attributes employed for clustering and hyperparameter search

In [None]:
# Recency, Frequency and Monetary are the three attributes used to cluster
# the customers, as they usually separate customers very well.
attr_cluster = ['Recency', 'Frequency', 'Monetary']
cdf_cluster = cdf[attr_cluster]

# Rescale every attribute with a min-max scaler before clustering.
# NOTE(review): the original author left an open question on whether min-max
# is the right scaler here.
scaler = MinMaxScaler()
X = scaler.fit(cdf_cluster.values).transform(cdf_cluster.values)

In [None]:
# Grid search over eps / min_samples: every DBSCAN run is scored and stored in
# a matrix so that the best combination is the one with the highest entry.
eps_list = np.arange(0.01, 0.2, 0.001)
# min_samples has to be at least 1 (recent scikit-learn releases reject 0)
min_pts_list = np.arange(1, 15, 1)
# Rows follow eps_list, columns follow min_pts_list. Combinations that cannot
# be scored keep -inf, so they can never be selected as the maximum.
mat = np.full((eps_list.size, min_pts_list.size), -np.inf)

for i, eps in enumerate(eps_list):
    # Progress indicator, refreshed once per eps value
    clear_output(wait=True)
    print(i, '/', eps_list.size)

    for j, min_pts in enumerate(min_pts_list):
        dbscan = DBSCAN(eps=eps, min_samples=min_pts)
        dbscan.fit(X)

        # Both metrics raise ValueError unless 2 <= n_labels <= n_samples - 1
        n_labels = np.unique(dbscan.labels_).size
        if n_labels < 2 or n_labels >= len(X):
            continue

        # Silhouette is "higher is better" while Davies-Bouldin is "lower is
        # better": subtract the latter so a larger entry always means a better
        # clustering.
        # NOTE(review): noise points (label -1) are scored as if they were a
        # cluster of their own - confirm this is intended.
        sil = silhouette_score(X, dbscan.labels_)
        dbi = davies_bouldin_score(X, dbscan.labels_)
        mat[i, j] = sil - dbi

# Heat map with min_samples on the rows and eps on the columns. Unscored cells
# are turned into NaN, which seaborn leaves blank.
mat1 = pd.DataFrame(mat.transpose(), index=min_pts_list, columns=np.round(eps_list, 3))
mat1 = mat1.replace(-np.inf, np.nan)
plot(sn.heatmap(mat1, cmap='coolwarm'), figsize=(12, 9), filename="Heatmap_Hyperparameters")

In [None]:
# Find every cell holding the top score: row indices select eps values and
# column indices select min_samples values (ties produce several entries).
best_rows, best_cols = np.where(mat == np.amax(mat))
print("BEST EPS:", eps_list[best_rows])
print("BEST MIN_PTS:", min_pts_list[best_cols])

### Clustering and distribution of the chosen attributes

In [None]:
# Final clustering, using eps=0.056 and min_samples=10 as the chosen
# hyperparameters
dbscan = DBSCAN(eps=0.056, min_samples=10)
dbscan.fit(X)

# Number of distinct labels found. DBSCAN labels noise as -1, so noise (when
# present) counts as one of them.
k = len(np.unique(dbscan.labels_))

# Palette used by the visualizations (RGBA, 0-255)
colors = np.array([
    (219, 42, 42, 255), # Red
    (219, 116, 42, 255), # Orange
    (80, 219, 42, 255), # Green
    (42, 219, 213, 255), # Cyan
    (42, 48, 219, 255), # Blue
    (219, 42, 213, 255) # Fuchsia
])
# Keep one colour per label, but never fewer than two: from_list needs at
# least two colours to interpolate between.
n_levels = max(k, 2)
colors = colors[:n_levels]
colors = colors / 255.0
# One colormap level per label, so that each label is drawn with exactly one
# palette colour instead of an interpolated shade (with more labels than
# palette colours the extra levels are interpolated).
cm = LinearSegmentedColormap.from_list('clusters_6', colors, N=n_levels)

In [None]:
# 3D scatter plot of the clustered attributes
def cluster_scatter_3d(view_init=None, filename=""):
    """Draw the customers in the 3-D attribute space, coloured by DBSCAN label.

    Reads the module-level ``attr_cluster``, ``cdf_cluster``, ``dbscan`` and
    ``cm`` objects, then hands the figure over to ``plot``.

    Args:
        view_init: Optional tuple of arguments forwarded to ``ax.view_init``
            (e.g. ``(elev, azim)``). ``None`` keeps the default camera.
        filename: Forwarded to ``plot``; when empty the figure is not saved.
    """
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')

    c1, c2, c3 = attr_cluster
    # No `label=` here: the legend entries come from legend_elements() below
    scatter = ax.scatter3D(
        cdf_cluster[c1], cdf_cluster[c2], cdf_cluster[c3],
        c=dbscan.labels_, s=20, cmap=cm,
    )
    ax.set_xlabel(c1)
    ax.set_ylabel(c2)
    ax.set_zlabel(c3)

    # One legend entry per distinct label value
    legend1 = ax.legend(*scatter.legend_elements(), title="Clusters")
    ax.add_artist(legend1)

    if view_init is not None:
        ax.view_init(*view_init)

    plot(None, figsize=(6, 6), filename=filename)

# Render the default camera view first, then a rotated one
cluster_scatter_3d(view_init=None, filename="Attr_3D")
cluster_scatter_3d(view_init=(30, 60), filename="Attr_3D_rot")