# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 2.1) Clustering Analysis
### *Antonio Strippoli, Valerio Mariani*

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
from math import log, ceil
from natsort import natsorted
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn import metrics
from itertools import combinations

# Turn off pandas' chained-assignment check for the whole notebook.
pd.options.mode.chained_assignment = None

In [None]:
# Load the customer profile table; the first CSV column becomes the index.
cdf = pd.read_csv("customer_profilation.csv", index_col=0)
# HACK: keep only the rows whose Frequency is strictly below 2000.
below_frequency_cap = cdf['Frequency'].lt(2000)
cdf = cdf[below_frequency_cap]

In [None]:
def cluster_score(X, k):
    """Fit K-Means with ``k`` clusters on ``X`` and return one quality score.

    The returned value is meant to be maximised by the caller. It combines
    three indicators so that "higher is better" holds for each term:

    * the negated inertia (SSE) of the fitted model,
    * the silhouette coefficient multiplied by 100,
    * the Davies-Bouldin index multiplied by 100, subtracted because a lower
      Davies-Bouldin index indicates better separated clusters.

    Args:
        X: Feature matrix accepted by ``KMeans.fit``.
        k: Number of clusters to fit.

    Returns:
        The combined score (higher is better).
    """
    kmeans = KMeans(n_clusters=k, n_init=10, max_iter=1000)
    kmeans.fit(X)
    labels = kmeans.labels_

    # Values
    inertia = -1 * kmeans.inertia_
    sil = silhouette_score(X, labels) * 100
    separation = metrics.davies_bouldin_score(X, labels) * 100

    # NOTE(review): the three terms live on different scales (inertia is
    # unbounded), so their relative weight depends on the data -- confirm
    # this weighting is intended.
    return inertia + sil - separation

In [None]:
# Exhaustive search for the 3-attribute subset and the number of clusters k
# (3..9) that maximise cluster_score.
# NOTE(review): the CSV is loaded again here, so any filtering applied to
# `cdf` before this cell is discarded -- confirm this is intended.
cdf = pd.read_csv("customer_profilation.csv", index_col=0)

# Drop the excluded columns BEFORE enumerating the combinations: otherwise the
# combinations would still contain 'PrefItem' / 'MainCountry' and selecting
# them from `cdf` below would raise a KeyError.
cdf.drop(columns=['PrefItem', 'MainCountry'], inplace=True)

combos = list(combinations(cdf.columns.array, 3))

# Start from -inf so the first evaluated candidate is always accepted,
# whatever the magnitude of its score.
score = float('-inf')
best = []
best_k = 0
for combo in combos:
    new_cdf = cdf[list(combo)]
    # Rescale every attribute to [0, 1] so that no attribute dominates the
    # distances only because of its unit.
    scaler = MinMaxScaler()
    X = scaler.fit_transform(new_cdf.values)

    for k in range(3, 10):
        new_score = cluster_score(X, k)
        if new_score > score:
            score = new_score
            best = list(combo)
            best_k = k

print(best)
print(best_k)
print(score)

In [None]:
from math import pi

# Reset the dataframe
# cdf = cdf[cdf['MainCountry'] == 'United Kingdom']

# Attribute sets tried so far (kept for reference; only the active one is used):
#   Best of 3: ['TotItems', 'MaxSale', 'NBaskets']
#   Best of 4: ['TotItems', 'MaxSale', 'MeanItemSale', 'NBaskets']
#   Attempt:   ['MeanItemSale', 'PReturn', 'E-Qta']
#   Attempt 2: ['MeanItemSale', 'TotSale']
# Attempt 3 (active)
attr_cluster = ['Recency', 'Frequency', 'Monetary']

# Explicit copy, so that columns added to cdf_cluster later never alias cdf.
cdf_cluster = cdf[attr_cluster].copy()

# cdf['MainCountry'] = pd.factorize(cdf['MainCountry'])[0]
# cdf['PrefItem'] = pd.factorize(cdf['PrefItem'])[0]

# Normalization (try de-normalizing)
scaler = MinMaxScaler()  # Minmax?
X = scaler.fit_transform(cdf_cluster.values)

# Select the best value of k: collect SSE and silhouette for k = 2..max_k
sse_list = []
sil_list = []
max_k = 30
k_values = range(2, max_k + 1)
for k in k_values:
    kmeans = KMeans(n_clusters=k, init="k-means++", n_init=10, max_iter=100)  # Should we use K-means++?
    kmeans.fit(X)

    sse = kmeans.inertia_
    sil = silhouette_score(X, kmeans.labels_) * 100
    sse_list.append(sse)
    sil_list.append(sil)

plt.plot(k_values, sse_list, marker='o')
plt.ylabel('SSE', fontsize=22)
plt.xlabel('K', fontsize=22)
plt.show()
plt.close()

plt.plot(k_values, sil_list, marker='o')
plt.ylabel('Silhouette Score', fontsize=22)
plt.xlabel('K', fontsize=22)
plt.show()

# Clustering
kmeans = KMeans(n_clusters=7, init="k-means++", n_init=100, max_iter=1000)  # Should we use K-means++?
kmeans.fit(X)
# Centroids mapped back to the original (un-scaled) attribute units
centers = scaler.inverse_transform(kmeans.cluster_centers_)
print(centers)

# 2D plot (disabled)
# plt.scatter(cdf['E'], cdf['SaleRate'], c=kmeans.labels_, s=20)
# plt.scatter(centers[:, 0], centers[:, 1], s=200, marker='*', c='k')
# plt.show()
# quit()

combos = list(combinations(attr_cluster, 3))

for c in combos:
    c1, c2, c3 = c
    # Positions of the plotted attributes inside attr_cluster: the centroid
    # columns follow that order, so they must be picked by position rather
    # than always taking columns 0, 1, 2 (wrong with more than 3 attributes).
    i1, i2, i3 = (attr_cluster.index(name) for name in c)

    # 3D plot
    fig = plt.figure(figsize=(10, 7))
    ax = plt.axes(projection="3d")

    ax.scatter3D(cdf_cluster[c1], cdf_cluster[c2], cdf_cluster[c3], c=kmeans.labels_, s=20)
    ax.scatter3D(centers[:, i1], centers[:, i2], centers[:, i3], s=200, marker='*', c='k')

    ax.set_xlabel(c1)
    ax.set_ylabel(c2)
    ax.set_zlabel(c3)
    plt.show()

# Normalized centers
norm_centers = kmeans.cluster_centers_

# One line per cluster across the clustering attributes
plt.figure(figsize=(8, 4))
for i in range(0, len(norm_centers)):
    plt.plot(norm_centers[i], marker='o', label='Cluster %s' % i)
plt.tick_params(axis='both', which='major', labelsize=22)
plt.xticks(range(0, len(cdf_cluster.columns)), cdf_cluster.columns, fontsize=18)
plt.legend(fontsize=20)
plt.show()

# Radar (spider) plot of the normalized centroids
# number of variables
N = len(cdf_cluster.columns)
# Angle of each axis in the plot (the circle divided by the number of
# variables); it does not depend on the cluster, so compute it once and
# repeat the first angle to close the polygon.
angles = [n / float(N) * 2 * pi for n in range(N)]
angles += angles[:1]
for i in range(0, len(norm_centers)):
    values = norm_centers[i].tolist()
    values += values[:1]
    # Initialise the spider plot
    ax = plt.subplot(polar=True)
    # Draw one axis per variable and add the labels
    plt.xticks(angles[:-1], cdf_cluster.columns, color='grey', size=8)
    # Plot data
    ax.plot(angles, values, linewidth=1, linestyle='solid')
    # Fill area
    ax.fill(angles, values, 'b', alpha=0.1)

plt.show()

# Correlation heatmap and scatter matrix (disabled)
# sn.heatmap(cdf.drop('CustomerID', axis=1).corr(), annot=True)
# plt.show()
# plt.close()
#
# pd.plotting.scatter_matrix(cdf,figsize=(15,15))
# plt.show()
# plt.close()

# TODO
# Fix the plots
# Make a histogram showing the centroid values for the 3 attributes

In [None]:
# Store the cluster index assigned by the fitted K-Means model to each row
# of the clustering frame.
assigned_labels = kmeans.labels_
cdf_cluster['ClusterLabels'] = assigned_labels

In [None]:
# One box plot per clustering attribute, with one box per cluster label.
for attribute in ('Recency', 'Frequency', 'Monetary'):
    sn.boxplot(x='ClusterLabels', y=attribute, data=cdf_cluster)
    plt.show()
    plt.close()

In [None]:
# Store the cluster index assigned by the fitted K-Means model to each row
# of the full customer frame.
assigned_labels = kmeans.labels_
cdf['ClusterLabels'] = assigned_labels

In [None]:
# Scatter plot of E-Sale against E-Qta over all customers.
cdf.plot(kind='scatter', x='E-Qta', y='E-Sale')

In [None]:
# Box plots of the E-Qta and E-Sale columns, with one box per cluster label.
for attribute in ('E-Qta', 'E-Sale'):
    sn.boxplot(x='ClusterLabels', y=attribute, data=cdf)
    plt.show()
    plt.close()

In [None]:
# TODO
# For each cluster, inspect PReturn, PrefItem, MainCountry, WeekDayPref, WeekMonthPref, MeanItemSale