# DATA MINING PROJECT: Analysis of a Supermarket’s Customers
## 1.1) Data Understanding: Distribution, Statistics & Correlation
### *Antonio Strippoli, Valerio Mariani*

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sn
from scipy import stats
from math import log, ceil
from natsort import natsorted
import matplotlib.pyplot as plt

# Silence pandas' chained-assignment warning: the cells below assign into
# frames that were obtained by slicing other frames.
pd.set_option('mode.chained_assignment', None)

In [None]:
def plot(ax, filename="", figsize=(6.4, 4.8)):
    """Resize, optionally save, then display and close the figure of `ax`.

    Args:
        ax: Object exposing the figure to render through its `figure`
            attribute.
        filename: Name of the image written under "../report/imgs/"; when
            empty (the default) nothing is saved.
        figsize: Figure size in inches, forwarded to `set_size_inches`.
    """
    fig = ax.figure
    fig.set_size_inches(*figsize)
    fig.tight_layout()

    # Save first: the figure is closed right after being shown
    if filename:
        target = f"../report/imgs/{filename}"
        fig.savefig(target)

    plt.show()
    plt.close()

def plt_radar(df: pd.DataFrame, filepath=""):
    """Represent a DataFrame using a radar (spider) plot.

    Every row of `df` becomes one axis of the radar, labelled with the row's
    index value, and every column becomes one closed, filled polygon labelled
    with the column name.

    Args:
        df: Values to plot; must contain at least one row and one column.
        filepath: Where to save the figure. When empty (the default) the
            figure is shown and closed instead of being saved.

    Raises:
        ValueError: If `df` is empty.
    """
    if df.empty:
        raise ValueError("plt_radar needs a DataFrame with at least one row and one column")

    # One axis per row of the DataFrame
    categories = list(df.index)
    N = len(categories)

    # Angle of each axis: the circle is split evenly among the variables.
    # The first angle is repeated at the end to close the polygons.
    angles = [n / float(N) * 2 * np.pi for n in range(N)]
    angles += angles[:1]

    # Initialise the spider plot
    ax = plt.subplot(111, polar=True)

    # Put the first axis on top and proceed clockwise
    ax.set_theta_offset(np.pi / 2)
    ax.set_theta_direction(-1)

    # Draw one axis per variable and add its label
    plt.xticks(angles[:-1], categories)

    # Draw the radial labels: one tick every 5 units, up to the rounded-up maximum
    ax.set_rlabel_position(0)
    ylim = ceil(df.max().max())
    ticks = list(range(0, ylim, 5))
    plt.yticks(ticks, [str(t) for t in ticks], color="grey", size=7)
    plt.ylim(0, ylim)

    # One polygon per column. The first two fill colours match the ones
    # historically used for the two-column case; further columns cycle
    # through the remaining colours.
    fill_colors = ('b', 'r', 'g', 'c', 'm', 'y', 'k')
    for i, column in enumerate(df.columns):
        values = list(df[column])
        values += values[:1]  # repeat the first value to close the polygon
        ax.plot(angles, values, linewidth=1, linestyle='solid', label=column)
        ax.fill(angles, values, fill_colors[i % len(fill_colors)], alpha=0.1)

    # Add legend and tighten the layout
    plt.legend(loc='upper right', bbox_to_anchor=(0.1, 0.1))
    plt.tight_layout()

    # Show or save?
    if not filepath:
        plt.show()
        plt.close()
    else:
        # NOTE(review): the figure is left open after saving; presumably the
        # notebook's inline backend still renders it at the end of the cell.
        # Confirm before adding a plt.close() here.
        plt.savefig(filepath)

In [None]:
# Read the secondary dataset: the first CSV column is used as the index and
# "PurchaseDate" is parsed as a date column
df = pd.read_csv(
    'customer_supermarket_2.csv',
    index_col=0,
    parse_dates=["PurchaseDate"],
)

In [None]:
# Print a summary of the data, including the number of non-null values
# for each column, then display a sample of the first rows
df.info()
print("")
df.head()

In [None]:
# Summary statistics of the "Sale" column over all the rows
sale_summary = df["Sale"].describe()
print("SALE DESCRIBE:\n", sale_summary)

# Products catalog: the "Sale" value of every distinct (ProdID, Sale) pair
catalog_pairs = df[["ProdID", "Sale"]].drop_duplicates()
df_products_catalog = catalog_pairs["Sale"]
print("PRODUCTS CATALOG DESCRIBE:\n", df_products_catalog.describe())

# Histogram restricted to the values below 100, box plot of the whole catalog
below_100 = df_products_catalog[df_products_catalog < 100]
plot(below_100.hist(bins=50), filename="Sale_Distribution")
plot(df_products_catalog.plot.box(), filename="Sale_Box_Plot", figsize=(2, 4.8))

In [None]:
# Distribution of buys and returns (rows with positive / negative quantity)
print("RATIO QTA POSITIVE/NEGATIVE:\n", (df["Qta"] > 0).value_counts())
print("STATISTICS QTA > 0:\n", df[df["Qta"] > 0]["Qta"].describe())
print("STATISTICS QTA < 0:\n", df[df["Qta"] < 0]["Qta"].describe())

# Scatter of Sale against Qta, limited to rows with |Qta| < 75.
# The figure size is given to plot(): plot() always resizes the figure, so a
# size passed only to plot.scatter() would be overridden by plot()'s default.
plot(
    df[df['Qta'].abs() < 75].plot.scatter('Qta', 'Sale', c='Sale', colormap='winter', colorbar=False),
    filename="Sale_Qta_Distribution",
    figsize=(10, 7),
)

In [None]:
# === Monthly statistics ===
def year_month(i):
    """Return the "year/month" label of the purchase stored at index label `i`.

    Meant to be used as a `groupby` key function. The date is always looked
    up in the global `df`, so the grouped object must share its index labels
    with `df`.

    The month is not zero-padded, so the labels do not sort chronologically
    as plain strings.

    Args:
        i: An index label of `df`.

    Returns:
        A string of the form "<year>/<month>".
    """
    # `.at` reads the single cell; `df.loc[i]` would first build the whole
    # row as a Series on every call.
    x = df.at[i, 'PurchaseDate']
    return f"{x.year}/{x.month}"

# Number of baskets and profit per month
# Profit of every row is Sale * Qta. Only this column is aggregated: summing
# a frame that still holds the datetime "PurchaseDate" column is rejected by
# recent pandas releases (older ones silently dropped that column).
profit = (df["Sale"] * df["Qta"]).to_frame("Profit")
monthly_stats = profit.groupby(year_month).agg('sum')

# Baskets: number of distinct (PurchaseDate, BasketID) pairs in each month.
# NOTE(review): a basket whose rows carry more than one distinct PurchaseDate
# is counted once per date here, while the per-country statistics count
# unique BasketIDs; confirm this is intended.
monthly_stats["Baskets"] = df[["PurchaseDate", "BasketID"]].drop_duplicates().groupby(year_month).size()
# Natural sort puts the "year/month" labels in chronological order
monthly_stats = monthly_stats.reindex(index=natsorted(monthly_stats.index))

print("MONTHLY STATS:\n", monthly_stats)
print("CORRELAZIONE:")
print(monthly_stats.corr())

# Express both columns as percentages of their yearly total, so that they
# share the same scale in the bar plot
monthly_stats['Baskets'] = monthly_stats['Baskets'] / monthly_stats['Baskets'].sum() * 100
monthly_stats['Profit'] = monthly_stats['Profit'] / monthly_stats['Profit'].sum() * 100

plot(monthly_stats.plot.bar(), filename="Monthly_Baskets_Profit", figsize=(14,7))

In [None]:
# Number of baskets and profit per country
# Profit of every row is Sale * Qta, summed over the rows of each country
row_profit = df[["CustomerCountry"]].assign(Profit=df["Sale"] * df["Qta"])
country_stats = row_profit.groupby("CustomerCountry").agg('sum')

# Baskets: number of distinct BasketIDs of each country
country_stats["Baskets"] = df.groupby('CustomerCountry')['BasketID'].nunique()

print("COUNTRY STATS:\n", country_stats)
print("CORRELAZIONE:")
print(country_stats.corr())

# First plot: UK vs Other countries
is_uk = country_stats.index == 'United Kingdom'
rest_of_world = country_stats[~is_uk].agg('sum')
country_stats1 = country_stats[is_uk]
country_stats1.loc["Others"] = rest_of_world.values
# Normalize values: each column becomes a percentage of its own total
for col in ('Baskets', 'Profit'):
    country_stats1[col] = country_stats1[col] / sum(country_stats1[col]) * 100
# Plot
plot(country_stats1.plot.bar(), filename="Country_Baskets_Profit", figsize=(4,7))

# Second plot: Other countries
country_stats2 = country_stats.drop('United Kingdom')
# Aggregate small values: countries below the basket threshold are merged
# into a single "Others" row
threshold = 25
minor_countries = country_stats2[country_stats2["Baskets"] < threshold]
minor_total = minor_countries.agg('sum')
country_stats2 = country_stats2[country_stats2["Baskets"] >= threshold]
country_stats2.loc["Others"] = minor_total.values
# Normalize values: each column becomes a percentage of its own total
for col in ('Baskets', 'Profit'):
    country_stats2[col] = country_stats2[col] / sum(country_stats2[col]) * 100
# Plot
plt_radar(country_stats2, "../report/imgs/Country_Basket_Profit_No_UK")

In [None]:
# Monthly activity per country
# One value per (country, month) pair that has at least one row; after the
# unstack, countries are the columns and months the rows, with NaN where a
# country has no rows in that month.
ma_country = df.groupby(['CustomerCountry', year_month]).apply(lambda x: sum(x["Qta"] * x["Sale"]))
ma_country = ma_country.unstack(level=0)

# Natural sort puts the "year/month" labels in chronological order
ma_country = ma_country.reindex(index=natsorted(ma_country.index))
# Order the countries by their number of active months, least active first
cols = sorted(ma_country.columns, key=lambda x: ma_country[x].notnull().sum())
ma_country = ma_country[cols]
# Replace every non-null value with the position of its country, so each
# country is drawn as a horizontal line at its own height, with gaps in the
# months where it has no activity.
# A single `.loc` assignment is used: the chained form `frame[c][mask] = i`
# is not guaranteed to write back into the frame.
for i, c in enumerate(ma_country.columns):
    ma_country.loc[ma_country[c].notnull(), c] = i

ax = ma_country.plot.line(legend=False, style='-o')
# One x tick per month (year and month on separate lines) and one y tick per country
ax.set_xticks(range(0, len(ma_country.index)))
ax.set_xticklabels([x.replace('/', '\n') for x in ma_country.index])
ax.set_yticks(range(0, len(ma_country.columns)))
ax.set_yticklabels(list(ma_country.columns))
plot(ax, filename="Monthly_Activity_Country", figsize=(16,8))

In [None]:
# Most popular products
unique_products = df['ProdID'].unique()
print("UNIQUE PRODUCTS:", len(unique_products))

# Total quantity per product description; keep the ten largest totals
qta_by_descr = df.groupby('ProdDescr')[['Qta']].agg('sum')
popular_prods = qta_by_descr.sort_values(by='Qta', ascending=False).head(10)
plot(popular_prods.plot.barh(color='darkred'), filename="Products_Popular", figsize=(10,3))