# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 3.1) Predictive Analysis: new customer profile
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
from math import log
import seaborn as sn
import matplotlib.pyplot as plt
from datetime import timedelta, datetime
from matplotlib.colors import LinearSegmentedColormap

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import MinMaxScaler
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics import silhouette_score, davies_bouldin_score

pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, folder="predictive_pre", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, then show and close the current figure.

    Args:
        ax: Object returned by the plotting call. It is not used here;
            accepting it lets the plotting expression be passed inline.
        folder: Sub-folder of ``../report/imgs`` in which the image is saved.
        filename: Name of the image file; nothing is saved when it is empty.
        figsize: ``(width, height)`` of the figure, in inches.
    """
    # Imported locally: the notebook's import cell does not import `os`.
    import os

    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        # makedirs also creates missing parent folders and tolerates an
        # already existing target, unlike a bare os.mkdir.
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

### Define class using MeanSale

In [None]:
# Hyperparameters
# Minimum value of 'Frequency' a customer needs to be kept for labelling
min_baskets = 3
# Upper bound of the slice used by select_interesting_tuples when building
# the partial (early-activity) dataset
threshold_baskets = 2

In [None]:
# Read the customer profiles and order them by increasing MeanSale
cdf = pd.read_csv("customer_profilation.csv", index_col=0).sort_values("MeanSale")

In [None]:
# Summary statistics, box plot and histogram for Frequency and MeanSale
for title, column in (
    ("--- NUMBER OF BASKETS ---", "Frequency"),
    ("--- MEAN SALE ---", "MeanSale"),
):
    print(title)
    column_values = cdf[column]
    print(column_values.describe())
    plot(column_values.plot.box())
    plot(column_values.hist(bins=100))

In [None]:
# Keep customers with at least `min_baskets` baskets, printing how many remain
has_enough_baskets = cdf['Frequency'] >= min_baskets
cdf = cdf.loc[has_enough_baskets]
print(len(cdf))
# Then drop the customers whose MeanSale is 3000 or more
below_sale_cap = cdf['MeanSale'] < 3000
cdf = cdf.loc[below_sale_cap]
print(len(cdf))

In [None]:
# Look at MeanSale again after the filtering above
sale_values = cdf['MeanSale']
print(sale_values.describe())
plot(sale_values.plot.box())
plot(sale_values.hist(bins=100))

# Set aside the customers with MeanSale >= 1000 (they are labelled as
# high-spending later) and keep only the remaining ones for clustering
mean_sale_outliers = cdf.loc[sale_values >= 1000]
cdf = cdf.loc[sale_values < 1000]

In [None]:
# Cluster the non-outlier customers on MeanSale alone with K-means (k=3)
cdf_cluster = cdf[['MeanSale']]

# Rescale MeanSale with a min-max scaler before clustering
scaler = MinMaxScaler()
X = scaler.fit_transform(cdf_cluster.values)

# The number of clusters was fixed to 3; fit the model
kmeans = KMeans(n_clusters=3, init="k-means++", n_init=1000, max_iter=1000).fit(X)

# Validation metrics, rounded to two decimals for display
sse = round(kmeans.inertia_, 2)
sil = round(silhouette_score(X, kmeans.labels_), 2)
db = round(davies_bouldin_score(X, kmeans.labels_), 2)
for metric_name, metric_value in (
    ("SSE:", sse),
    ("Silhouette:", sil),
    ("Davies Bouldin:", db),
):
    print(metric_name, metric_value)

# Centroids mapped back to the original MeanSale scale
centers = scaler.inverse_transform(kmeans.cluster_centers_)
print('Centers:', centers, sep="\n")

# Three-colour map (RGBA scaled to [0, 1]) used by the cluster scatter plots
colors = np.array([
    (80, 219, 42, 255),  # Green
    (42, 48, 219, 255),  # Blue
    (219, 42, 42, 255),  # Red
]) / 255.0
cm = LinearSegmentedColormap.from_list('clusters_3', colors, N=3)

# Attach the raw cluster ids to the customers
cdf['Labels'] = kmeans.labels_

In [None]:
# Map labels to always have 0 -> low-spending, 1 -> medium-spending, 2 -> high-spending.
# Clusters are ranked by their average MeanSale rather than by probing fixed
# MeanSale values (500, 980): those probes assumed where the cluster boundaries
# fall, and two roles could resolve to the same cluster, leaving one unmapped.
cluster_means = cdf.groupby('Labels')['MeanSale'].mean().sort_values()
mapping = {int(old_label): rank for rank, old_label in enumerate(cluster_means.index)}
cdf['Labels'] = cdf['Labels'].map(mapping)
print(cdf['Labels'].value_counts())

In [None]:
# Scatter MeanSale against the row position in cdf, coloured by cluster label
cdf2 = cdf[["MeanSale", "Labels"]].reset_index(drop=True).reset_index()
plot(
    cdf2.plot.scatter(x="index", y="MeanSale", c=cdf2["Labels"], cmap=cm),
    filename="clustering_partial",
)

In [None]:
# Assign the labels: clustered customers keep their own label, while the
# outliers set aside earlier are all marked as high-spending (label 2).
# assign() builds a new frame instead of writing into a filtered slice.
mean_sale_outliers = mean_sale_outliers.assign(Labels=2)
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
cdf = pd.concat([cdf, mean_sale_outliers])
print(cdf['Labels'].value_counts())

In [None]:
# Same scatter as before, now including the customers added back as label 2
cdf2 = cdf[["MeanSale", "Labels"]].reset_index(drop=True).reset_index()
plot(
    cdf2.plot.scatter(x="index", y="MeanSale", c=cdf2["Labels"], cmap=cm),
    filename="clustering_full",
)

In [None]:
# Largest MeanSale found in each class (0, 1, 2)
for label in range(3):
    print(cdf.loc[cdf['Labels'] == label, 'MeanSale'].max())

In [None]:
# Persist the labelled customers; a later cell reads this file back as `cdf`.
# NOTE(review): the same filename is written again further down with the
# partial-profile dataset (`cldf`), which overwrites this content.
cdf.to_csv("customer_classification.csv")
# Show the labelled dataframe as the cell output
cdf

# Creazione dataset per allenamento classificatore
## Ricostruzione sulla base dei dati relativi al primo periodo di attività di ogni utente
Una volta assegnate etichette "SICURE" ad ogni utente (calcolate usando tutto il periodo di osservazione) vogliamo creare un nuovo dataset, da usare per allenare il classificatore in modo supervisionato.  
  
In particolare quello che si vuole ottenere e' una serie di coppie  
  
(input)_i --> (desired_output)_i per i = 1...N_utenti  
  
dove (input)_i = (attr1, attr2, ... , attrn)_i e' una tupla di attributi relativi all'utente i-esimo e calcolati su una parte del periodo di osservazione disponibile (ad esempio il primo mese, oppure il primo carrello ), mentre invece (desired_output)_i e' l'etichetta assegnata in precedenza all'utente i.  


## Fase 1: preprocessing del dataset originario
Vogliamo eliminare le tuple relative ad acquisti successivi al primo periodo di attività di ogni utente (ad esempio dopo il primo mese, oppure dopo i primi carrelli).

In [None]:
# Load the purchase records (PurchaseDate parsed as dates) ...
df = pd.read_csv(
    'customer_supermarket_2.csv',
    index_col=0,
    parse_dates=["PurchaseDate"],
)
# ... and the customers that received a label
cdf = pd.read_csv(
    'customer_classification.csv',
    index_col=0,
)

In [None]:
# Restrict the purchases to the customers present in the labelled dataset
good_customers = list(set(cdf.index))
print("Length of df BEFORE deletion:", len(df))
is_labelled_customer = df['CustomerID'].isin(good_customers)
df = df.loc[is_labelled_customer]
print("Length of df AFTER deletion:", len(df))

In [None]:
# Bin the products' sale price using quantiles of the distinct Sale values
# NOTE: could k-means be a better way to categorize the products?
distinct_sales = df['Sale'].unique()
df_sale = pd.Series(distinct_sales, name="Sale")
# NOTE(review): despite their names, q2 and q3 hold the 0.25 and 0.5 quantiles
q2, q3 = df_sale.quantile([0.25, 0.5]).tolist()

def categorize(sale):
    """Return the price category of `sale`.

    0 when `sale` is below `q2`, 1 when it is below `q3`, 2 otherwise.
    """
    for category, upper_bound in enumerate((q2, q3)):
        if sale < upper_bound:
            return category
    return 2

# Tag every purchase row with the category of its Sale value
price_categories = df["Sale"].map(categorize)
df['PriceCategory'] = price_categories
print(df['PriceCategory'])

In [None]:
# Order the purchases chronologically
df = df.sort_values("PurchaseDate")

# Filter entries by keeping only the first baskets of each customer
def select_interesting_tuples(g, n_baskets=None):
    """Return the rows of `g` that belong to its first `n_baskets` baskets.

    Baskets are taken in order of first appearance in `g`, so `g` should
    already be in the desired order (the notebook sorts by PurchaseDate
    before grouping).

    Args:
        g: DataFrame with a 'BasketID' column (the rows of one customer).
        n_baskets: Number of distinct baskets to keep. Defaults to the
            module-level `threshold_baskets`.

    Returns:
        The subset of `g` whose BasketID is among the first `n_baskets`
        distinct BasketID values.
    """
    if n_baskets is None:
        n_baskets = threshold_baskets
    # Deduplicate first: slicing the raw rows would count product lines, not
    # baskets, so the number of baskets kept would depend on basket size.
    good_baskets = g['BasketID'].drop_duplicates().iloc[:n_baskets]
    return g[g['BasketID'].isin(good_baskets)]

# Collect, customer by customer, the rows kept by select_interesting_tuples.
# No customer is dropped here; only each customer's rows are filtered.
# The pieces are concatenated once: DataFrame.append was removed in pandas 2.0
# and appending inside a loop copies the growing frame at every step.
selected_rows = [
    select_interesting_tuples(customer_rows)
    for _, customer_rows in df.groupby("CustomerID")
]
# pd.concat raises on an empty list, so fall back to an empty frame with df's columns
new_df = pd.concat(selected_rows) if selected_rows else df.iloc[0:0]

new_df

## Fase 2: Profilazione parziale
A partire dal Dataset creato, costruisco un dataset per la profilazione utente nel modo esatto in cui lo facevo prima. (Quello che cambia e' che questa volta uso solo le tuple relative al primo periodo di attivita' di ogni utente)

In [None]:
def _basket_amounts(g):
    """Return the amount spent (sum of Sale * Qta) in each basket of `g`."""
    return [sum(rows["Sale"] * rows["Qta"]) for _, rows in g.groupby("BasketID")]


def monetary(g):
    """Total money spent over all rows of `g`, rounded to 2 decimals."""
    return round(sum(g["Sale"] * g["Qta"]), 2)


def max_sale(g):
    """Amount of the most expensive basket in `g`, rounded to 2 decimals."""
    return round(max(_basket_amounts(g)), 2)


def mean_sale(g):
    """Average amount of a basket in `g`, rounded to 2 decimals."""
    return round(np.mean(_basket_amounts(g)), 2)
# Sale entropy
def entropy(g, attr1, attr2, groupby, op):
    """Return the base-2 Shannon entropy of an aggregated attribute.

    The columns `attr1` and `attr2` of `g` are grouped by `groupby` and
    aggregated with `op`; each aggregated value divided by the grand total is
    treated as a probability.

    Args:
        g: DataFrame holding the rows to analyse.
        attr1: First column to select from `g`.
        attr2: Second column to select from `g`.
        groupby: Column (among the selected ones) to group by.
        op: Aggregation passed to ``DataFrame.agg`` (e.g. ``'sum'``).

    Returns:
        The entropy rounded to 2 decimals; 0.0 when the grand total is zero.
    """
    totals = g[[attr1, attr2]].groupby(groupby).agg(op)
    grand_total = totals.values.sum()
    if grand_total == 0:
        # No mass to distribute: avoid dividing by zero (which produced NaN)
        return 0.0
    # Zero-valued groups contribute nothing (0 * log 0 is taken as 0); feeding
    # them to log() would raise "math domain error".
    shares = [value / grand_total for value in totals.values.flatten() if value != 0]
    e = -sum(share * log(share, 2) for share in shares)
    return round(float(e), 2)

def _basket_sizes(g):
    """Return the number of items (sum of Qta) in each basket of `g`."""
    return [sum(rows["Qta"]) for _, rows in g.groupby("BasketID")]


def tot_items(g):
    """Total number of purchased items in `g`."""
    return sum(g["Qta"])


def max_items(g):
    """Largest number of items purchased in a single basket."""
    return max(_basket_sizes(g))


def mean_items(g):
    """Average number of items per basket, truncated to an integer."""
    return int(np.mean(_basket_sizes(g)))


def unique_items(g):
    """Number of distinct products (ProdID) in `g`."""
    return g["ProdID"].nunique()

def mean_item_sale(g):
    """Average of the distinct Sale values in `g`, rounded to 2 decimals."""
    return round(np.mean(g["Sale"].unique()), 2)


def _items_in_category(g, category):
    """Return the total Qta of the rows of `g` whose PriceCategory is `category`."""
    return sum(g.loc[g['PriceCategory'] == category, 'Qta'])


def cat_0(g):
    """Quantity of items purchased in price category 0."""
    return _items_in_category(g, 0)


def cat_1(g):
    """Quantity of items purchased in price category 1."""
    return _items_in_category(g, 1)


def cat_2(g):
    """Quantity of items purchased in price category 2."""
    return _items_in_category(g, 2)

In [None]:
groups = new_df.groupby("CustomerID")

# One row of attributes per customer, in the same order as the columns below
feature_rows = []
for customer_id, customer_rows in groups:
    feature_rows.append([
        customer_id,
        monetary(customer_rows),
        max_sale(customer_rows),
        mean_sale(customer_rows),
        entropy(customer_rows, "ProdID", 'Sale', 'ProdID', 'sum'),
        tot_items(customer_rows),
        max_items(customer_rows),
        mean_items(customer_rows),
        unique_items(customer_rows),
        mean_item_sale(customer_rows),
        cat_0(customer_rows),
        cat_1(customer_rows),
        cat_2(customer_rows),
    ])

cldf = pd.DataFrame(
    data=np.array(feature_rows),
    columns=[
        "CustomerID", "Monetary", "MaxSale", "MeanSale", "E-Sale",
        "TotItems", "MaxItems", "MeanItems", "UniqueItems",
        "MeanItemSale", "Cat0", "Cat1", "Cat2",
    ],
)

# Every attribute except the entropy is cast to int64.
# NOTE(review): this also truncates the fractional part of the monetary columns.
integer_columns = {name: np.int64 for name in cldf.columns if name != "E-Sale"}
cldf = cldf.astype(integer_columns)

# Index by customer, write to disk and read the file back
cldf.set_index('CustomerID', inplace=True)
cldf.to_csv("customer_classification.csv")
cldf = pd.read_csv("customer_classification.csv", index_col=0)
cldf

In [None]:
# Number of rows in the labelled dataset and in the partial-profile dataset
for frame in (cdf, cldf):
    print(len(frame))

In [None]:
# Put both datasets in the same index order
cldf = cldf.sort_index()
cdf = cdf.sort_index()

# Sanity check: the norm of the index difference is printed for inspection
index_gap = np.linalg.norm(cdf.index - cldf.index)
print("NORM BETWEEN THE INDEXES:", index_gap)

# Copy the labels onto the classification dataset and save it
cldf['Labels'] = cdf['Labels']
cldf.to_csv("customer_classification.csv")

In [None]:
# Show the classification dataset (attributes plus Labels) as the cell output
cldf