# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 1.2) Data Preparation
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import os
import calendar
import numpy as np
import pandas as pd
import seaborn as sn
from math import log, ceil
import matplotlib.pyplot as plt

pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, folder="cdf_plots", filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, show and close the current matplotlib figure.

    Args:
        ax: Return value of the plotting call that drew the figure. It is not
            used here (the function always works on ``plt.gcf()``); accepting
            it lets callers write ``plot(series.plot.box(), ...)``.
        folder: Sub-folder of ``../report/imgs`` in which the image is stored.
        filename: Name of the image file. When empty, nothing is saved.
        figsize: ``(width, height)`` of the figure, in inches.
    """
    fig = plt.gcf()
    fig.set_size_inches(*figsize)
    plt.tight_layout()
    if filename:
        path = os.path.join("..", "report", "imgs", folder)
        # makedirs creates missing parent folders too and does not fail when
        # the folder already exists (os.mkdir did neither).
        os.makedirs(path, exist_ok=True)
        plt.savefig(os.path.join(path, filename))
    plt.show()
    plt.close()

### Create a new dataset with a profile of each customer.

In [None]:
# Read the secondary dataset, parsing PurchaseDate as datetimes
df = pd.read_csv(
    'customer_supermarket_2.csv',
    index_col=0,
    parse_dates=["PurchaseDate"],
)

# Row-level product of Qta and Sale, stored as its own column
df['Qta_Sale'] = df['Qta'] * df['Sale']

# Bin Sale into (at most) 10 quantile-based intervals
df['Sale_discr'] = pd.qcut(df['Sale'], 10, duplicates='drop', retbins=False)

# Per-basket total of Qta_Sale over the rows with a positive quantity,
# together with the (maximum) CustomerID found in the basket
positive_rows = df.loc[df['Qta'] > 0, ['CustomerID', 'BasketID', 'Qta_Sale']]
sale_baskets = positive_rows.groupby('BasketID').agg({'Qta_Sale': 'sum', 'CustomerID': 'max'})
# Replace the basket totals with (at most) 3 quantile-based intervals
sale_baskets['Qta_Sale'] = pd.qcut(sale_baskets['Qta_Sale'], 3, duplicates='drop', retbins=False)

In [None]:
def recency(g):
    """Days between the last purchase in the whole dataset and the customer's last purchase."""
    return (df['PurchaseDate'].max() - g['PurchaseDate'].max()).days


def frequency(g):
    """Number of distinct baskets of the customer."""
    return g['BasketID'].nunique()


def monetary(g):
    """Total money spent: sum of Sale * Qta over the rows, rounded to 2 decimals."""
    return round((g["Sale"] * g["Qta"]).sum(), 2)


def tot_items(g):
    """Total number of purchased items (sum of Qta)."""
    return g["Qta"].sum()


def _basket_items(g):
    """Sum of Qta within each BasketID."""
    return g.groupby("BasketID")["Qta"].sum()


def _basket_amounts(g):
    """Sum of Sale * Qta within each BasketID."""
    amounts = g["Sale"] * g["Qta"]
    return amounts.groupby(g["BasketID"]).sum()


def max_items(g):
    """Maximum number of items purchased in a single basket."""
    return _basket_items(g).max()


def mean_items(g):
    """Mean number of items per basket, truncated to an integer."""
    return int(_basket_items(g).mean())


def unique_items(g):
    """Number of distinct products purchased."""
    return g["ProdID"].nunique()


def preferred_item(g):
    """ProdID with the highest total purchased quantity."""
    # Select the Qta column before aggregating so idxmax() returns the label
    # directly; the former ``.idxmax()[0]`` relied on positional indexing of a
    # label-indexed Series, which pandas has deprecated.
    return g.groupby('ProdID')['Qta'].sum().idxmax()


def max_sale(g):
    """Highest amount spent in a single basket, rounded to 2 decimals."""
    return round(_basket_amounts(g).max(), 2)


def mean_sale(g):
    """Mean amount spent per basket, rounded to 2 decimals."""
    return round(_basket_amounts(g).mean(), 2)


def mean_item_sale(g):
    """Mean of the distinct Sale values of the customer, rounded to 2 decimals."""
    return round(np.mean(g["Sale"].unique()), 2)
# Entropies
def entropy_products(g):
    """Base-2 entropy of the purchased quantity spread over products.

    Qta is summed per ProdID; each product's share of the grand total is
    used as its probability. The result is rounded to 2 decimals.
    """
    qta_per_product = g[["ProdID", 'Qta']].groupby('ProdID').agg('sum')
    grand_total = qta_per_product.values.sum()
    acc = 0
    for quantity in qta_per_product.values.flatten():
        share = quantity / grand_total
        acc += share * log(share, 2)
    return round(-acc, 2)
def entropy_sale(g):
    """Base-2 entropy of the customer's rows over the Sale_discr bins.

    Bins without any row are ignored. The result is rounded to 2 decimals.
    """
    bin_counts = g['Sale_discr'].value_counts()
    occupied = bin_counts[bin_counts > 0]
    n_rows = occupied.values.sum()
    acc = 0
    for count in occupied.values.flatten():
        share = count / n_rows
        acc += share * log(share, 2)
    return round(-acc, 2)
def entropy_baskets(customer_id, g):
    """Base-2 entropy of the customer's baskets over the basket-amount bins.

    Reads the module-level ``sale_baskets`` frame, keeps the baskets whose
    CustomerID equals ``customer_id`` and counts them per Qta_Sale bin; empty
    bins are ignored. ``g`` is accepted but not used. The result is rounded
    to 2 decimals.
    """
    own_baskets = sale_baskets[sale_baskets['CustomerID'] == customer_id]
    bin_counts = own_baskets['Qta_Sale'].value_counts()
    occupied = bin_counts[bin_counts > 0]
    n_baskets = occupied.values.sum()
    acc = 0
    for count in occupied.values.flatten():
        share = count / n_baskets
        acc += share * log(share, 2)
    return round(-acc, 2)
def entropy_intervals(g):
    """Base-2 entropy of the gaps between consecutive purchase days.

    The distinct purchase days (time of day discarded) are put in
    chronological order, the gap in days between consecutive ones is binned
    by week (``ceil(days / 7)``) and the entropy of the bin frequencies is
    returned, rounded to 2 decimals.

    A customer with a single purchase day has no gap at all; the entropy is
    then 0, which is what the former "duplicate the only record" trick
    produced.
    """
    # Distinct purchase days; sorted so that every gap is measured between
    # chronologically adjacent days regardless of the row order of ``g``.
    days = pd.Series(g['PurchaseDate'].dt.normalize().unique()).sort_values()

    # diff() leaves a missing value on the first day, which has no predecessor.
    gaps = days.diff().dropna()
    if gaps.empty:
        return 0.0

    # Bin the gaps by weeks and compute the entropy of the bin frequencies
    week_counts = np.ceil(gaps.dt.days / 7).value_counts()
    n_gaps = week_counts.values.sum()
    e = -sum((c / n_gaps) * log(c / n_gaps, 2) for c in week_counts.values)
    return round(e, 2)
def purchasing_freq(g):
    """Days between first and last purchase divided by the number of baskets, rounded to 2 decimals."""
    span_days = (g['PurchaseDate'].max() - g['PurchaseDate'].min()).days
    return round(span_days / g['BasketID'].nunique(), 2)


def weekday_pref(g):
    """Mean weekday (Monday=0) of the distinct purchase timestamps, truncated to an integer."""
    timestamps = pd.Series(g['PurchaseDate'].unique())
    return int(timestamps.dt.weekday.mean())


def weekmonth_pref(g):
    """Mean week-of-month (``ceil(day / 7)``) of the distinct purchase timestamps, truncated to an integer."""
    timestamps = pd.Series(g['PurchaseDate'].unique())
    return int(np.ceil(timestamps.dt.day / 7.0).mean())


def main_country(g):
    """CustomerCountry with the highest number of distinct baskets."""
    # Select BasketID before aggregating so idxmax() returns the label
    # directly; the former ``.idxmax()[0]`` relied on positional indexing of a
    # label-indexed Series, which pandas has deprecated.
    return g.groupby('CustomerCountry')['BasketID'].nunique().idxmax()

In [None]:
# Build one profile row per customer, using only the rows with a positive quantity
groups = df[df["Qta"] > 0].groupby("CustomerID")
profile_columns = ["CustomerID","Recency","Frequency","Monetary","TotItems","MaxItems","MeanBasketItems","UniqueItems","MaxSale","MeanBasketSale","MeanItemSale","E-Prods","E-Sale","E-Baskets","E-Intervals","PurchasingFreq","WeekDayPref","WeekMonthPref","ItemPref","MainCountry"]
profile_rows = [
    [
        customer_id,
        recency(purchases),
        frequency(purchases),
        monetary(purchases),
        tot_items(purchases),
        max_items(purchases),
        mean_items(purchases),
        unique_items(purchases),
        max_sale(purchases),
        mean_sale(purchases),
        mean_item_sale(purchases),
        entropy_products(purchases),
        entropy_sale(purchases),
        entropy_baskets(customer_id, purchases),
        entropy_intervals(purchases),
        purchasing_freq(purchases),
        weekday_pref(purchases),
        weekmonth_pref(purchases),
        preferred_item(purchases),
        main_country(purchases),
    ]
    for customer_id, purchases in groups
]
# Hand the rows to pandas as plain lists so that each column gets its own
# dtype. Wrapping them in np.array() first turned every value into a string
# (the rows mix numbers with text such as ItemPref/MainCountry), which is why
# the frame previously had to be written to CSV and read back.
cdf = pd.DataFrame(profile_rows, columns=profile_columns)
cdf.set_index('CustomerID', inplace=True)

# Percentage of returned items per customer: total negative Qta (excluding the
# ProdID codes 'M', 'D' and 'BANK CHARGES' -- presumably not real products,
# TODO confirm) relative to the customer's purchased items.
groups = df[ (df["Qta"]<0) & ~(df["ProdID"].isin(['M', 'D', 'BANK CHARGES'])) ][['CustomerID','Qta']].groupby("CustomerID").agg('sum')
# Every sum is negative, so abs() gives the returned quantity; customers with
# no returns get 0.
returned_items = groups['Qta'].abs().reindex(cdf.index, fill_value=0)
cdf.insert(7, 'PReturn', (returned_items / cdf['TotItems'] * 100).round(2).astype('float64'))

print("N. ENTRIES:", len(cdf))
cdf.to_csv("customer_profilation.csv")

### Data Quality

In [None]:
# Load the newly created dataset to do some final polishing
cdf = pd.read_csv("customer_profilation.csv", index_col=0)

# Print a summary of the columns
cdf.info()
# Descriptive statistics; being the last expression of the cell, the notebook displays the result
cdf.describe()

In [None]:
# Outliers in TotItems (users who purchased an abnormal number of items)
cdf_totitems = cdf['TotItems']
plot(cdf_totitems.plot.box(), figsize=(2, 4.8), folder="cdf_outliers", filename="TotItems_BP")
# sn.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE curve is its replacement (needs seaborn >= 0.11).
plot(
    sn.histplot(cdf_totitems[cdf_totitems < 25000], bins=100, kde=True, stat="density", kde_kws={"cut": 3}),
    folder="cdf_outliers",
    filename="TotItems_HIST",
)

# Search for a threshold (box plots shown only, not saved)
plot(cdf_totitems[cdf_totitems < 100000].plot.box(), figsize=(2, 4.8))
plot(cdf_totitems[cdf_totitems < 70000].plot.box(), figsize=(2, 4.8))

# Keep only the customers below the chosen threshold
cdf = cdf[cdf_totitems < 70000]

In [None]:
# Outliers in Monetary (users who spent way too much money)
cdf_monetary = cdf['Monetary']
plot(cdf_monetary.plot.box(), figsize=(2, 4.8), folder="cdf_outliers", filename="Monetary_BP")
# sn.distplot is deprecated; histplot with a density-normalised histogram and
# a KDE curve is its replacement (needs seaborn >= 0.11).
plot(
    sn.histplot(cdf_monetary[cdf_monetary < 25000], bins=100, kde=True, stat="density", kde_kws={"cut": 3}),
    folder="cdf_outliers",
    filename="Monetary_HIST",
)

# Search for a threshold (box plot shown only, not saved)
plot(cdf_monetary[cdf_monetary < 80000].plot.box(), figsize=(2, 4.8))

# Keep only the customers below the chosen threshold
cdf = cdf[cdf_monetary < 80000]

In [None]:
# Report how many rows are left in cdf at this point
print("N. ENTRIES:", len(cdf))

In [None]:
# Overwrite the profile file with the current content of cdf
cdf.to_csv("customer_profilation.csv")

### Distribution & Statistics

In [None]:
# Load the customer profile dataset (first column as index) and start performing some analysis
cdf = pd.read_csv("customer_profilation.csv", index_col=0)

In [None]:
# Pandas' scatter matrix of cdf, drawn on a 20x20 inch figure and saved as "ScatterMatrix"
plot(pd.plotting.scatter_matrix(cdf), figsize=(20,20), filename="ScatterMatrix")

In [None]:
# Distribution of numerical attributes with histograms
plot(cdf.hist(bins=50), figsize=(10,10), filename="Histograms")

# Distribution of numerical attributes with box-plots
plot(cdf.plot.box(), filename="Box_Plots")

# Pairwise correlations with heatmap on correlation matrix.
# Only numeric columns are correlated: since pandas 2.0, DataFrame.corr()
# raises on text columns instead of silently dropping them.
correlations = cdf.select_dtypes(include="number").corr().round(2)
plot(sn.heatmap(correlations, cmap='coolwarm', annot=True), figsize=(10,10), filename="HeatMap_Correlations")

In [None]:
# Retrieve highest correlations (absolute value) as a Series indexed by column pairs.
# Only numeric columns are correlated: since pandas 2.0, DataFrame.corr()
# raises on text columns instead of silently dropping them.
tmp = cdf.select_dtypes(include="number").corr().round(2).abs().unstack()
# Drop the entries equal to 1, i.e. each attribute against itself
# (pairs that round to a perfect correlation are dropped as well)
tmp = tmp[tmp != 1]
tmp.sort_values(ascending=False).head(20)

In [None]:
# Analyze PurchasingFreq attribute: summary statistics, then histogram and
# box plot (no filename is given, so these two are shown but not saved)
print(cdf['PurchasingFreq'].describe())
plot(cdf['PurchasingFreq'].hist(bins=100))
plot(cdf['PurchasingFreq'].plot.box())

# Scatter plot of PurchasingFreq against Frequency, with points colored by PurchasingFreq
plot(cdf.plot.scatter(x='PurchasingFreq', y='Frequency', c='PurchasingFreq', cmap='copper', colorbar=False, sharex=False), filename="Frequency_PurchasingFreq")

In [None]:
# Study and save some particular distributions and correlations
def _density_hist(values, bins=100):
    """Density-normalised histogram with a KDE curve.

    Replacement for the deprecated sn.distplot (needs seaborn >= 0.11).
    """
    return sn.histplot(values, bins=bins, kde=True, stat="density", kde_kws={"cut": 3})


plot(_density_hist(cdf['Recency']), filename="Recency_HIST")
# Bar charts of the value counts; weekday numbers are shown as day names
plot(cdf['WeekDayPref'].value_counts().sort_index().rename(lambda i: calendar.day_name[i]).plot.bar(), filename="WeekDayPref_HIST")
plot(cdf['WeekMonthPref'].value_counts().sort_index().plot.bar(), filename="WeekMonthPref_HIST")
plot(_density_hist(cdf['PReturn']), filename="PReturn_HIST")
plot(_density_hist(cdf['E-Prods']), filename="EProds_HIST")
plot(_density_hist(cdf['E-Sale']), filename="ESale_HIST")
plot(_density_hist(cdf['E-Baskets']), filename="EBaskets_HIST")
plot(_density_hist(cdf['E-Intervals']), filename="EIntervals_HIST")

In [None]:
# Scatter plot to show the correlation of the 2 entropies E-Prods and E-Sale, with points colored by Recency
plot(cdf.plot.scatter('E-Prods', 'E-Sale', c='Recency', colormap='hot', sharex=False), figsize=(8,6), filename="Entropies")

In [None]:
# 3D scatter plot of E-Prods (x), E-Sale (y) and UniqueItems (z), with points colored by Recency
fig = plt.figure(figsize = (10, 7))
ax = plt.axes(projection ="3d")

p = ax.scatter3D(cdf['E-Prods'], cdf['E-Sale'], cdf['UniqueItems'], c=cdf['Recency'], cmap='hot')
# Colorbar for the Recency values used as point colors
cbar = fig.colorbar(p)
ax.set_xlabel('E-Prods')
ax.set_ylabel('E-Sale')
ax.set_zlabel('UniqueItems')
cbar.set_label('Recency')

# plot() resizes the current figure to figsize, so the image is saved at 8x6 inches rather than 10x7
plot(ax, figsize=(8,6), filename="Entropies_3D")

In [None]:
# Scatter plot to show the correlation of PReturn and Frequency, with points colored by Recency
plot(cdf.plot.scatter('PReturn', 'Frequency', c='Recency', colormap='plasma', sharex=False), figsize=(8,6), filename="PReturn_Frequency")

In [None]:
# Check MeanItemSale with a histogram and a box plot (not interesting);
# no filename is given, so the plots are shown but not saved
plot(cdf['MeanItemSale'].hist(bins=100))
plot(cdf['MeanItemSale'].plot.box())

In [None]:
# Check Frequency (not interesting): summary statistics and upper quantiles,
# then a histogram and a box plot (shown but not saved, as no filename is given)
print(cdf['Frequency'].describe())
print(cdf['Frequency'].quantile([.80, .85, .90, .95]))
plot(cdf['Frequency'].hist(bins=100))
plot(cdf['Frequency'].plot.box())

In [None]:
# Scatter plot of Monetary against TotItems for the customers below 20000 on both;
# it does not go through plot(), so the figure is neither resized nor saved
cdf[(cdf['Monetary'] < 20000) & (cdf['TotItems'] < 20000)].plot.scatter(x='Monetary', y='TotItems')