# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 3.1) Predictive Analysis: new customer profile
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score, davies_bouldin_score

# Turn off pandas' chained-assignment check (SettingWithCopyWarning): later
# cells add columns to frames that were obtained by slicing `cdf`.
pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, folder="clustering_kmeans", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, then show and close the current figure.

    Args:
        ax: Result of the plotting call that drew the figure. It is not used
            here; accepting it lets callers write ``plot(df.plot(...))``.
        folder: Sub-folder of ``../report/imgs`` where the image is saved.
        filename: Name of the image file. When empty, nothing is saved.
        figsize: ``(width, height)`` of the figure in inches.
    """
    # Imported locally: the visible import cell does not bring `os` in, so
    # the save branch would otherwise fail with a NameError.
    import os

    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        # makedirs also creates missing parent folders and does not fail
        # when the directory already exists.
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

### Define class using MeanSale

In [None]:
# Read the customer profile table and order it by ascending MeanSale
cdf = pd.read_csv("customer_profilation.csv", index_col=0).sort_values("MeanSale")

# Set aside the customers with MeanSale >= 3000; only the remaining rows
# are kept in `cdf`
above_threshold = cdf['MeanSale'] >= 3000
brutti = cdf[above_threshold]
cdf = cdf[cdf['MeanSale'] < 3000]

In [None]:
# Summary statistics, box plot and 100-bin histogram of MeanSale
mean_sale = cdf['MeanSale']
print(mean_sale.describe())
plot(mean_sale.plot.box())
plot(mean_sale.hist(bins=100))

In [None]:
# Columns that feed the clustering
attr_cluster = ['MeanSale']
cdf_cluster = cdf.loc[:, attr_cluster]

# Min-max normalise the selected columns; the fitted scaler is kept so the
# transformation can be inverted later
scaler = MinMaxScaler()
X = scaler.fit_transform(cdf_cluster.to_numpy())

In [None]:
# We chose 3 as our number of clusters; proceed with the clustering.
# random_state pins the centroid initialisation so that the cluster ids, and
# therefore the colours and counts shown in the following cells, are the
# same on every run.
kmeans = KMeans(
    n_clusters=3,
    init="k-means++",
    n_init=1000,
    max_iter=1000,
    random_state=42,
)
kmeans.fit(X)

# Clustering quality metrics, rounded to 2 decimals for display
sse = round(kmeans.inertia_, 2)
sil = round(silhouette_score(X, kmeans.labels_), 2)
db = round(davies_bouldin_score(X, kmeans.labels_), 2)
print("SSE:", sse)
print("Silhouette:", sil)
print("Davies Bouldin:", db)

# Save centroids, mapped back through the scaler to the original units
centers = scaler.inverse_transform(kmeans.cluster_centers_)

# Define a new colormap to employ in the visualizations
# (RGBA components given in 0-255 and rescaled to 0-1 below)
colors = np.array([
    (219, 42, 42, 255),  # Red
    (219, 116, 42, 255),  # Orange
    (80, 219, 42, 255),  # Green
    (42, 219, 213, 255),  # Cyan
    (42, 48, 219, 255),  # Blue
])
colors = colors / 255.0
cm = LinearSegmentedColormap.from_list('clusters_6', colors, N=5)

In [None]:
# Display the cluster centroids after inverting the min-max scaling
centers

In [None]:
# Number of customers that fell into each cluster
label_counts = pd.Series(kmeans.labels_).value_counts()
print(label_counts)

# Build a plotting frame: attach the labels, discard the original index and
# expose the positional index as an "index" column for the x axis
cdf2 = (
    cdf[["MeanSale", "Monetary"]]
    .assign(Labels=kmeans.labels_)
    .reset_index(drop=True)
    .reset_index()
)
cdf2.plot.scatter(x="index", y="MeanSale", c="Labels", cmap=cm)

In [None]:
# Attach to every clustered customer the cluster id found by k-means
cdf['Labels'] = kmeans.labels_

# The customers held in `brutti` were not clustered: give them the label of
# the first row of `cdf` whose MeanSale exceeds 2000.
high_spender_labels = cdf.loc[cdf['MeanSale'] > 2000, 'Labels']
if high_spender_labels.empty:
    raise ValueError(
        "No customer with MeanSale > 2000: cannot choose a label for the "
        "customers excluded from the clustering"
    )
# .iloc[0] extracts the scalar explicitly; calling int() directly on a
# one-element Series is deprecated in recent pandas versions.
brutti['Labels'] = int(high_spender_labels.iloc[0])

# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
cdf = pd.concat([cdf, brutti])
print(cdf['Labels'].value_counts())

In [None]:
# Plotting frame for all customers: drop the original index, then expose the
# positional index as an "index" column for the x axis
cdf2 = (
    cdf[["MeanSale", "Labels"]]
    .reset_index(drop=True)
    .reset_index()
)
cdf2.plot.scatter(x="index", y="MeanSale", c=cdf2["Labels"], cmap=cm)