# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 1.2) Data Preparation
### *Antonio Strippoli, Valerio Mariani*

In [None]:
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
from math import log, ceil
from natsort import natsorted
import matplotlib.pyplot as plt

# Turn off pandas' chained-assignment check, so no SettingWithCopyWarning is
# emitted when a slice of a DataFrame is assigned to further down.
pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, filename="", figsize=(6.4, 4.8)):
    """Resize the current figure, optionally save it, then show and close it.

    Args:
        ax: Not used by this function. Call sites pass the result of the
            plotting call (or None), so the drawing happens before this runs.
        filename: Name of the image written under ``../report/imgs/``.
            Nothing is saved when it is empty.
        figsize: Values unpacked into ``Figure.set_size_inches``.
    """
    plt.gcf().set_size_inches(*figsize)
    plt.tight_layout()

    # Saving is opt-in: an empty filename means "display only".
    target = f"../report/imgs/{filename}" if filename else None
    if target is not None:
        plt.savefig(target)

    plt.show()
    plt.close()

def plt_radar(df: pd.DataFrame, filepath=""):
    """Draw the first two columns of a DataFrame as a radar (spider) plot.

    Every row of `df` becomes one axis of the radar, labelled with the row's
    index value. Each plotted column becomes one closed, lightly filled
    polygon and one legend entry.

    Args:
        df: Data to plot. Rows are the radar axes, columns are the series.
            At most the first two columns are drawn.
        filepath: When empty the figure is shown; otherwise it is saved to
            this path. The figure is closed afterwards in both cases, so a
            saved radar does not leak into the next pyplot call.
    """
    categories = list(df.index)
    n_axes = len(categories)

    # One evenly spaced angle per axis; the first angle is repeated at the end
    # so that every polygon closes on itself.
    angles = [i / float(n_axes) * 2 * np.pi for i in range(n_axes)]
    angles += angles[:1]

    # Polar axes with the first category at the top, going clockwise.
    ax = plt.subplot(111, polar=True)
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # One labelled spoke per category.
    plt.xticks(angles[:-1], categories)

    # Radial labels: a tick every 5 units up to the rounded-up maximum value.
    ax.set_rlabel_position(0)
    ylim = ceil(df.max().max())
    ticks = list(range(0, ylim, 5))
    plt.yticks(ticks, [str(tick) for tick in ticks], color="grey", size=7)
    plt.ylim(0, ylim)

    # First series is filled in blue, second in red.
    for column, fill_color in zip(df.columns[:2], ("b", "r")):
        values = list(df[column])
        values += values[:1]
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=column)
        ax.fill(angles, values, fill_color, alpha=0.1)

    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.tight_layout()

    # Show or save, then always release the figure.
    if filepath:
        plt.savefig(filepath)
    else:
        plt.show()
    plt.close()

In [None]:
# Read the secondary dataset: the first CSV column becomes the index and
# PurchaseDate is parsed into datetimes.
df = pd.read_csv(
    'customer_supermarket_2.csv',
    parse_dates=["PurchaseDate"],
    index_col=0,
)

### Create a new dataset with a profile of each customer.

In [None]:
# Per-customer metrics. Every function takes `g`, the rows of one customer
# (columns used: BasketID, ProdID, Qta, Sale), and returns a single value.
#
# NOTE: sums are computed by pandas, which skips NaN values; the previous
# builtin-sum implementation propagated them.


def _basket_totals(g, values):
    """Sum `values` (a Series sharing the index of `g`) within each BasketID."""
    return values.groupby(g["BasketID"]).sum()


def tot_items(g):
    """Total number of purchased items."""
    return g["Qta"].sum()


def max_items(g):
    """Largest number of items bought in a single basket."""
    return _basket_totals(g, g["Qta"]).max()


def mean_items(g):
    """Mean number of items per basket, truncated to an integer."""
    return int(_basket_totals(g, g["Qta"]).mean())


def unique_items(g):
    """Number of distinct products bought."""
    return g["ProdID"].nunique()


def preferred_item(g):
    """Product with the highest total purchased quantity."""
    # idxmax on the summed Series returns the ProdID label directly, with no
    # positional lookup on a label-indexed result.
    return g.groupby("ProdID")["Qta"].sum().idxmax()


def tot_sale(g):
    """Total amount spent (Sale * Qta over all rows), rounded to 2 decimals."""
    return round((g["Sale"] * g["Qta"]).sum(), 2)


def max_sale(g):
    """Highest amount spent in a single basket, rounded to 2 decimals."""
    return round(_basket_totals(g, g["Sale"] * g["Qta"]).max(), 2)


def mean_sale(g):
    """Mean amount spent per basket, rounded to 2 decimals."""
    return round(_basket_totals(g, g["Sale"] * g["Qta"]).mean(), 2)


def mean_item_sale(g):
    """Mean of the distinct Sale values of the customer, rounded to 2 decimals."""
    return round(np.mean(g["Sale"].unique()), 2)


def n_baskets(g):
    """Number of distinct baskets."""
    return g["BasketID"].nunique()
# Entropy
def entropy(g, attr1, attr2, op):
    """Shannon entropy (base 2) of `attr2` aggregated over the groups of `attr1`.

    The rows of `g` are grouped by `attr1`, `attr2` is aggregated with `op`
    inside each group, and the aggregated values are normalised by their sum
    to form the distribution whose entropy is returned.

    Args:
        g: DataFrame containing the columns `attr1` and `attr2`.
        attr1: Column to group by.
        attr2: Column to aggregate.
        op: Aggregation passed to ``DataFrameGroupBy.agg`` (e.g. ``'sum'``).

    Returns:
        The entropy rounded to 2 decimals. 0.0 when there are no groups and
        NaN when the aggregated values do not add up to a positive number.

    Raises:
        ValueError: If an aggregated value is negative.
    """
    totals = g[[attr1, attr2]].groupby(attr1).agg(op).to_numpy(dtype=float).ravel()
    if totals.size == 0:
        return 0.0
    if (totals < 0).any():
        raise ValueError("entropy is undefined for negative group totals")

    mass = totals.sum()
    if not mass > 0:
        # All-zero (or NaN) totals cannot be normalised into a distribution.
        return float("nan")

    # Groups with zero mass contribute nothing (0 * log 0 is taken as 0), so
    # they are dropped instead of being fed to the logarithm.
    p = totals[totals > 0] / mass
    # "+ 0.0" turns the -0.0 of a single-group distribution into 0.0.
    return round(float(-(p * np.log2(p)).sum()) + 0.0, 2)
# Main country
def main_country(g):
    """Return the CustomerCountry with the most distinct baskets in `g`.

    Args:
        g: Rows of one customer, with the columns BasketID and CustomerCountry.
    """
    # idxmax on the per-country Series returns the country label directly, with
    # no positional lookup on a label-indexed result.
    return g.groupby("CustomerCountry")["BasketID"].nunique().idxmax()

In [None]:
# Build one profile row per customer from purchase rows only (Qta > 0).
# Rows are handed to pandas as dicts instead of going through np.array: an
# array mixing strings and numbers is coerced to strings, which is what used to
# force a CSV write/read round trip to get numeric dtypes back.
profile_columns = [
    "CustomerID", "TotItems", "MaxItems", "MeanItems", "UniqueItems",
    "PrefItem", "TotSale", "MaxSale", "MeanSale", "MeanItemSale", "NBaskets",
    "E-Qta", "E-Sale", "E-Date", "MainCountry",
]
cdf = pd.DataFrame(
    [
        {
            "CustomerID": customer_id,
            "TotItems": tot_items(purchases),
            "MaxItems": max_items(purchases),
            "MeanItems": mean_items(purchases),
            "UniqueItems": unique_items(purchases),
            "PrefItem": preferred_item(purchases),
            "TotSale": tot_sale(purchases),
            "MaxSale": max_sale(purchases),
            "MeanSale": mean_sale(purchases),
            "MeanItemSale": mean_item_sale(purchases),
            "NBaskets": n_baskets(purchases),
            "E-Qta": entropy(purchases, "ProdID", 'Qta', 'sum'),
            "E-Sale": entropy(purchases, "ProdID", 'Sale', 'sum'),
            "E-Date": entropy(purchases, "PurchaseDate", 'Qta', 'sum'),
            "MainCountry": main_country(purchases),
        }
        for customer_id, purchases in df[df["Qta"] > 0].groupby("CustomerID")
    ],
    columns=profile_columns,
)
cdf.set_index('CustomerID', inplace=True)

# Percentage of returned items per customer: returned quantity (rows with
# Qta < 0, excluding the ProdIDs 'M', 'D' and 'BANK CHARGES') over the
# customer's purchased items.
groups = df[ (df["Qta"]<0) & ~(df["ProdID"].isin(['M', 'D', 'BANK CHARGES'])) ][['CustomerID','Qta']].groupby("CustomerID").agg('sum')
# reindex aligns the returns with the profile rows; customers without any
# return come out as NaN and are set to 0.
returned_qta = -groups['Qta'].reindex(cdf.index)
cdf['PReturn'] = (returned_qta / cdf['TotItems'] * 100).round(2).fillna(0)

# Amount spent per purchased item
cdf['SaleRate'] = cdf['TotSale'] / cdf['TotItems']

cdf.to_csv("customer_profilation.csv")
cdf

### Distribution & Statistics

In [None]:
# Reload the customer profiles written by the profiling step
cdf = pd.read_csv("customer_profilation.csv", index_col=0)

print('---------- BASIC INFORMATION ----------')
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would add a stray "None" line).
cdf.info()
print('---------- INDIVIDUAL ATTRIBUTE STATISTICS ----------')
print( cdf.describe() )

In [None]:
# Scatter matrix of the customer profiles, drawn by pandas and saved as
# "cdf_ScatterMatrix"
scatter_axes = pd.plotting.scatter_matrix(cdf)
plot(scatter_axes, filename="cdf_ScatterMatrix", figsize=(20, 20))

In [None]:
# Distribution of numerical attributes with histograms
plot(cdf.hist(bins=50), filename="cdf_Histograms", figsize=(10,10))

# Distribution of numerical attributes with box-plots
plot(cdf.plot.box(), filename="cdf_Box_Plots")

# Pairwise correlations with heatmap on correlation matrix.
# Only numeric columns are correlated: the profile table also holds text
# columns (e.g. MainCountry), which DataFrame.corr() rejects in pandas >= 2.0.
plot(sn.heatmap(cdf.select_dtypes(include="number").corr(), cmap='coolwarm', annot=True), filename="cdf_HeatMap_Pairwise_Correlations", figsize=(10,10))

In [None]:
# NOTE(review): this cell used to read the columns 'l', 'lu' and 'E'. The
# profile table built in this notebook has no such columns (it has TotItems,
# UniqueItems, E-Qta, E-Sale, E-Date, ...), so the cell failed with a KeyError.
# The mapping below is an assumption to be confirmed:
#   'l' -> 'TotItems', 'lu' -> 'UniqueItems', 'E' -> 'E-Qta'
# Output file names are left untouched so existing report references still work.
COL_ITEMS = 'TotItems'
COL_UNIQUE = 'UniqueItems'
COL_ENTROPY = 'E-Qta'
COL_LOG_UNIQUE = f'log({COL_UNIQUE})'

# Unique items vs entropy (linear and logarithmic x axis), coloured by total items
cdf[COL_LOG_UNIQUE] = np.log(cdf[COL_UNIQUE])
_, axes = plt.subplots(nrows=1, ncols=2)
cdf.plot.scatter(COL_UNIQUE, COL_ENTROPY, c=COL_ITEMS, colormap='viridis', ax=axes[0])
cdf.plot.scatter(COL_LOG_UNIQUE, COL_ENTROPY, c=COL_ITEMS, colormap='viridis', ax=axes[1])
plot(None, filename="cdf_lu_vs_E", figsize=(12,3))

# Total items vs TotSale
plot(cdf.plot.scatter(COL_ITEMS, 'TotSale', c=COL_ENTROPY, colormap='viridis'), filename="cdf_l_vs_TotSale")

# Total items vs PReturn (the only plot that reveals hidden information:
# customers who bought many items usually do not return them).
# PReturn is 0 for customers without returns, so their reciprocal is not finite.
cdf['1/PReturn'] = np.reciprocal(cdf['PReturn'])
_, axes = plt.subplots(nrows=1, ncols=2)
cdf.plot.scatter(COL_ITEMS, 'PReturn', c=COL_ENTROPY, colormap='viridis', ax=axes[0])
cdf.plot.scatter(COL_ITEMS, '1/PReturn', c=COL_ENTROPY, colormap='viridis', ax=axes[1])
plot(None, filename="cdf_lu_vs_PReturn", figsize=(12,3))

# TotSale vs MeanSale
plot(cdf.plot.scatter('TotSale', 'MeanSale', c=COL_ENTROPY, colormap='viridis'), filename="cdf_TotSale_vs_MeanSale")

# Total items vs MeanItems
plot(cdf.plot.scatter(COL_ITEMS, 'MeanItems', c=COL_ENTROPY, colormap='viridis'), filename="cdf_l_vs_MeanItems")