In [1]:
# Data Manipulation Libraries: Standard dataframes and array libraries
import pandas as pd
import numpy as np
from pandas import ExcelWriter
from pandas import ExcelFile
# from datetime import datetime
import datetime as dt

# Data Visualization Libraries:
import matplotlib.pyplot as plt
import seaborn as sns

# K-means clustering
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Silence every FutureWarning for the rest of the session
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Render matplotlib figures inline in the Jupyter notebook
%matplotlib inline
# Remove pandas' display limits so that all rows and all columns are shown.
# NOTE(review): with max_rows=None, displaying a large frame without .head() prints every row.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Load the "Online Retail" sheet of the Excel workbook into a DataFrame.
# The path is relative, so it is resolved against the kernel's current working directory.
df = pd.read_excel("static/data/online_retail.xlsx", sheet_name="Online Retail")

# Clean Data

 - Inspect Datatypes
 - Drop missing values in key column
 - Change datatypes as needed

In [5]:
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID            float64
Country                object
dtype: object

In [4]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


## <font color="blue">Note: </font>Many missing values in the <code>CustomerID</code> Column

In [6]:
# Keep only the rows whose CustomerID is present
df = df.loc[df['CustomerID'].notna()]

In [9]:
# Convert the CustomerID column to integers.
# `astype` returns a new DataFrame rather than modifying `df`, so the result is
# assigned back; otherwise the conversion is discarded and CustomerID stays float64.
df = df.astype({'CustomerID': 'int32'})
df.dtypes

InvoiceNo              object
StockCode              object
Description            object
Quantity                int64
InvoiceDate    datetime64[ns]
UnitPrice             float64
CustomerID              int32
Country                object
dtype: object

# Customer Segmentation using Recency, Frequency, and Monetary Segmentation

 - Recency (R) is based on the last purchase
 - Frequency (F) is based on how many purchases have been made in the last 12 months
 - Monetary Value (M) is based on how much the customer spent in the last 12 months
 - RFM can be grouped by percentiles

### Percentiles <code>pd.qcut()</code>
 - Sort customers based on metric
 - Break customers into a pre-defined number of groups of equal size
 - Assign a label to each group

# Calculate recency, frequency, and monetary values

# Step 1: Filter Data to 1 year of transactions

In [134]:
# Restrict the data to the most recent year of activity:
# keep rows with an InvoiceDate strictly after 2010-12-10 00:00 and take an independent copy.
subset_df = df.loc[df['InvoiceDate'] > pd.Timestamp('2010-12-10')].copy()

In [135]:
# Confirm the date range covered by the subset.
# Series.min()/.max() are vectorized reductions that skip missing values, unlike the
# Python builtins min()/max(), which iterate over every element one by one.
print(f"Min: {subset_df['InvoiceDate'].min()}; Max: {subset_df['InvoiceDate'].max()}")

Min: 2010-12-10 09:33:00; Max: 2011-12-09 12:50:00


# Step 2: Calculate the Sales Revenue per Transaction

 - This is calculated by <code>Quantity</code> * <code>UnitPrice</code>

In [136]:
# Revenue per row: Quantity * UnitPrice, stored in a new `TotalSum` column.
# NOTE(review): df.describe() on the raw data shows negative Quantity and UnitPrice values,
# so TotalSum may be negative for some rows — confirm how those rows should be treated.
subset_df["TotalSum"] = subset_df["Quantity"]*subset_df["UnitPrice"]

# Step 3: Calculate RFM Metrics

In [137]:
# Snapshot date: one day after the latest invoice in the subset.
# Series.max() is a vectorized reduction that skips missing values, unlike the Python
# builtin max(), which iterates over every element one by one.
snapshot_date = subset_df["InvoiceDate"].max() + dt.timedelta(days=1)

In [141]:
snapshot_date

Timestamp('2011-12-10 12:50:00')

In [138]:
# Per-customer aggregates:
#   InvoiceDate -> whole days between snapshot_date and the customer's latest invoice
#   InvoiceNo   -> number of non-null InvoiceNo rows for the customer
#   TotalSum    -> sum of TotalSum over the customer's rows
rfm_data = (
    subset_df
    .groupby("CustomerID")
    .agg({
        "InvoiceDate": lambda dates: (snapshot_date - dates.max()).days,
        "InvoiceNo": "count",
        "TotalSum": "sum",
    })
)

In [139]:
# Give the aggregated columns their RFM names
rfm_data = rfm_data.rename(columns={
    "InvoiceDate": "Recency",
    "InvoiceNo": "Frequency",
    "TotalSum": "Monetary Value",
})

In [140]:
rfm_data.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary Value
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,326,2,0.0
12347,2,151,3598.21
12348,75,31,1797.24
12349,19,73,1757.55
12350,310,17,334.4


# Building RFM segments

In [None]:
# Recency labels run 4, 3, 2, 1. pd.qcut assigns labels in ascending bin order, so the
# customers with the smallest Recency (most recently active) receive the highest label, 4.
r_labels = range(4,0,-1)

In [None]:
r_quartiles = pd.qcut(rfm_data["Recency"], 4, labels=r_labels)

In [None]:
rfm_data = rfm_data.assign(R = r_quartiles.values)

In [None]:
rfm_data.head()

In [None]:
f_labels = range(1,5)
m_labels = range(1,5)

In [None]:
f_quantiles = pd.qcut(rfm_data["Frequency"], 4, labels=f_labels)

In [None]:
m_quantiles = pd.qcut(rfm_data["Monetary Value"], 4, labels=m_labels)

In [None]:
rfm_data = rfm_data.assign(F = f_quantiles.values)

In [None]:
rfm_data = rfm_data.assign(M = m_quantiles.values)

In [None]:
rfm_data.head()

# Build RFM Segment and RFM Score

 - Concatenate RFM quartile values to RFM_Segment
 - Sum RFM quartiles values to RFM_Score

In [None]:
def join_rfm(x):
    """Concatenate the 'R', 'F' and 'M' values of row `x` into one string, e.g. '431'."""
    return "".join(str(x[key]) for key in ("R", "F", "M"))

In [None]:
rfm_data['RFM_Segment'] = rfm_data.apply(join_rfm, axis=1)

In [None]:
# R, F and M are categorical columns (created by pd.qcut with labels). Cast them to
# integers before summing: categorical data does not support the `sum` reduction in
# current pandas, and the cast makes RFM_Score a plain integer column.
rfm_data['RFM_Score'] = rfm_data[['R','F','M']].astype(int).sum(axis=1)

In [None]:
rfm_data.head()

# Analyze RFM segments

In [None]:
# groupby(...).size() returns the number of customers in each RFM segment;
# sort descending and show the ten largest segments.
rfm_data.groupby('RFM_Segment').size().sort_values(ascending=False).head(10)

# Filtering on RFM Segments
 - select bottom RFM segment '111' and view top 5 rows

In [None]:
# First five customers in the '111' segment
rfm_data.loc[rfm_data["RFM_Segment"] == '111'].head(5)

# Summary Metrics per RFM Score

In [None]:
# For each RFM_Score: mean Recency, mean Frequency, and the mean and count of
# Monetary Value (the count is the number of customers with that score), rounded to 1 decimal.
rfm_data.groupby('RFM_Score').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary Value' : ['mean', 'count']}).round(1)

# Group into named segments

In [None]:
def segment_name(df):
    """Map a row's 'RFM_Score' to a named tier.

    Returns 'Gold' for a score of 9 or more, 'Silver' for a score of at
    least 5 but below 9, and 'Bronze' otherwise.
    """
    score = df['RFM_Score']
    if score >= 9:
        return 'Gold'
    # Scores of 9 and above have already returned, so only the lower bound is checked here.
    if score >= 5:
        return 'Silver'
    return 'Bronze'

In [None]:
rfm_data["General_Segment"] = rfm_data.apply(segment_name, axis=1)

In [None]:
# For each named segment: mean Recency, mean Frequency, and the mean and count of
# Monetary Value (the count is the number of customers in the segment), rounded to 1 decimal.
rfm_data.groupby('General_Segment').agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary Value' : ['mean', 'count']}).round(1)

# K-means clustering

 - Simple and fast
 - One of the most popular unsupervised learning models

### Assumptions
 - Distributions are symmetric
 - Variables have same average values (scale)
 - variables with same variance (scale)

### Evaluate data skewness
 - Apply logarithmic transformation if skewed
 - Only works on positive values

### Calculate statistics of variables

In [None]:
rfm_data.describe()

### Manage data skewness

In [None]:
# Plot the distribution of each raw RFM variable, one panel per variable
rfm_variables = ["Recency", "Frequency", "Monetary Value"]
fig, axes = plt.subplots(1, 3, figsize=(12,4))
# The loop variable is named `ax` so that it does not overwrite the `axes` array
# returned by plt.subplots.
for ax, var in zip(axes.ravel(), rfm_variables):
    ax.set_title(var)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # sns.histplot(..., kde=True) is the documented replacement.
    sns.distplot(rfm_data[var], ax=ax)

plt.tight_layout()
# plt.savefig("filepath/filename.format", bbox_inches='tight')
plt.show()

### Data transformation

In [None]:
frequency_log = np.log(rfm_data.Frequency)
recency_log = np.log(rfm_data.Recency)

In [None]:
# Collect the two log-transformed Series into one DataFrame (one column per Series)
data = {
    "frequency_log": frequency_log,
    "recency_log": recency_log,
}
rf_log = pd.DataFrame(data)

In [None]:
# Plot the distribution of each log-transformed variable, one panel per variable
rfm_log_variables = rf_log.columns
fig, axes = plt.subplots(1, 2, figsize=(8,4))
# The loop variable is named `ax` so that it does not overwrite the `axes` array
# returned by plt.subplots.
for ax, var in zip(axes.ravel(), rfm_log_variables):
    ax.set_title(var)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # sns.histplot(..., kde=True) is the documented replacement.
    sns.distplot(rf_log[var], ax=ax)

plt.tight_layout()
# plt.savefig("filepath/filename.format", bbox_inches='tight')
plt.show()

### Dealing with negative values

# Centering and Scaling Variables

 - Centering variables is done by subtracting average values from each observation

In [None]:
# Rebuild the per-customer aggregates for the clustering workflow:
#   InvoiceDate -> whole days between snapshot_date and the customer's latest invoice
#   InvoiceNo   -> number of non-null InvoiceNo rows for the customer
#   TotalSum    -> sum of TotalSum over the customer's rows
rfm_dataset = (
    subset_df
    .groupby("CustomerID")
    .agg({
        "InvoiceDate": lambda dates: (snapshot_date - dates.max()).days,
        "InvoiceNo": "count",
        "TotalSum": "sum",
    })
)

In [None]:
# Give the aggregated columns their RFM names
rfm_dataset = rfm_dataset.rename(columns={
    "InvoiceDate": "Recency",
    "InvoiceNo": "Frequency",
    "TotalSum": "Monetary Value",
})

In [None]:
# Keep only customers with a strictly positive Monetary Value
# (the log transformation applied later is only defined for positive values).
rfm_dataset = rfm_dataset.loc[rfm_dataset["Monetary Value"].gt(0)]

In [None]:
rfm_dataset.head()

# Combining centering and scaling
 - use scaler from <code>scikit-learn</code>

In [None]:
# Center and scale every column of `rfm_dataset` (the frame prepared in the cells above)
# to zero mean and unit variance. fit_transform learns the per-column statistics and
# applies them in a single call.
scaler = StandardScaler()
rfm_normalized = scaler.fit_transform(rfm_dataset)

In [None]:
print('mean:', rfm_normalized.mean(axis=0).round(2))
print('std:', rfm_normalized.std(axis=0).round(2))

# Sequence of structuring pre-processing steps

 - Unskew the data - log transformation
 - Standardize to the same average values
 - Scale to the same standard deviation
 - Store as a separate array to be used for clustering

In [None]:
# Unskew the data with log transformation
dataset_log = np.log(rfm_dataset)

In [None]:
dataset_log.head()

In [None]:
# Fit a StandardScaler on the log-transformed data; the transform itself is applied in the next cell.
scaler = StandardScaler()
scaler.fit(dataset_log)

In [None]:
# Apply the fitted scaler, then wrap the resulting array in a DataFrame that
# reuses rfm_dataset's index and column names.
dataset_normalized = scaler.transform(dataset_log)
df_normalized = pd.DataFrame(
    data=dataset_normalized,
    index=rfm_dataset.index,
    columns=rfm_dataset.columns,
)

In [None]:
df_normalized.head()

In [None]:
# Plot the distribution of each normalized variable, one panel per variable
rfm_variables = df_normalized.columns
fig, axes = plt.subplots(1, 3, figsize=(12,4))
# The loop variable is named `ax` so that it does not overwrite the `axes` array
# returned by plt.subplots.
for ax, var in zip(axes.ravel(), rfm_variables):
    ax.set_title(var)
    # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
    # sns.histplot(..., kde=True) is the documented replacement.
    sns.distplot(df_normalized[var], ax=ax)

plt.tight_layout()
# plt.savefig("filepath/filename.format", bbox_inches='tight')
plt.show()

# Practical Implementation of k-means clustering

 - Data pre-processing
 - Choosing a number of clusters
 - Running k-means clustering on pre-processed data
 - Analyzing average RFM values of each cluster

# Methods to define cluster numbers

 - Visual methods: Elbow criterion
 - Mathematical methods: silhouette coefficient

In [None]:
kmeans = KMeans(n_clusters=2, random_state=1)

In [None]:
# compute k-means clustering on pre-processed data
kmeans.fit(df_normalized)

In [None]:
# Read the cluster label assigned to each row from the fitted model's `labels_` attribute
cluster_labels = kmeans.labels_

### Analyze average RFM values of each cluster
 - Create a cluster label column in the original dataframe
 - Calculate average RFM values and size for each cluster

In [None]:
rfm_dataset_cluster = rfm_dataset.assign(Cluster = cluster_labels)

In [None]:
rfm_dataset_cluster.groupby(["Cluster"]).agg({
    'Recency': 'mean',
    'Frequency': 'mean',
    'Monetary Value': ["mean", 'count']
}).round(0)

## Elbow criterion method

 - Plot the number of clusters against the within-cluster sum of squared errors (SSE) - the sum of squared distances from every data point to its cluster center
 - The 'Elbow' represents an 'optimal' number of clusters

In [None]:
# Fit k-means for k = 1..10 and record each model's inertia_ (within-cluster sum of
# squared errors) keyed by k. Building the dict in a comprehension keeps the temporary
# models local, so the `kmeans` model fitted in the earlier cell is not overwritten.
sse = {
    k: KMeans(n_clusters=k, random_state=1).fit(df_normalized).inertia_
    for k in range(1, 11)
}

In [None]:
# Plot SSE for each k 'cluster'
plt.title('The Elbow Method')
plt.xlabel('k'); plt.ylabel('SSE')
sns.pointplot(x=list(sse.keys()), y=list(sse.values()))
# plt.savefig("filepath/filename.format", bbox_inches='tight')
plt.show()

### Criteria points to 2 or 3 cluster solutions

## Experimental Approach - Analyze Segments

 - Build clustering at and around elbow solution
 - Analyze average RFM values
 - Compare against other solutions and identify which provides most insight

In [None]:
# Fit a 3-cluster k-means model on the pre-processed data (fit returns the fitted model)
kmeans3 = KMeans(n_clusters=3, random_state=1).fit(df_normalized)
# labels_ holds the cluster assigned to each row
cluster_labels = kmeans3.labels_
rfm_dataset_cluster3 = rfm_dataset.assign(Cluster=cluster_labels)
# Mean Recency/Frequency/Monetary Value and customer count per cluster
(
    rfm_dataset_cluster3
    .groupby("Cluster")
    .agg({
        "Recency": "mean",
        "Frequency": "mean",
        "Monetary Value": ["mean", "count"],
    })
    .round(0)
)

### Profile and Interpret Segments

Approaches to build customer personas
 - Summary statistics for each cluster
 - Snake plots
 - Calculate relative importance of cluster attributes compared to population

As done above

### Snake plots
 - Market research technique to compare different segments
 - Visual representation of each segment's attributes
 - Need to first normalize data (center and scale)
 - Plot each clusters average normalized values of each attribute

In [None]:
# Attach the 3-cluster labels to the normalized frame (both frames share the CustomerID index).
# NOTE(review): this adds a column to df_normalized in place; re-running any earlier
# KMeans(...).fit(df_normalized) cell after this one would treat Cluster as an input feature.
df_normalized['Cluster'] = rfm_dataset_cluster3['Cluster']

In [None]:
# Reshape to long format: one row per (CustomerID, Cluster, Attribute) with its Value
df_melt = df_normalized.reset_index().melt(
    id_vars=['CustomerID', 'Cluster'],
    value_vars=['Recency', 'Frequency', 'Monetary Value'],
    var_name='Attribute',
    value_name='Value',
)

In [None]:
plt.title('Snake plot of standardized values')
sns.lineplot(x='Attribute', y='Value', hue='Cluster', data=df_melt)
# plt.savefig("filepath/filename.format", bbox_inches='tight')
plt.show()

### Relative importance of segment attributes
 - identify relative importance of each segment's attribute
 - calculate average values of each cluster
 - calculate average values of population

In [None]:
cluster_avg = rfm_dataset_cluster3.groupby(['Cluster']).mean()
population_avg = rfm_dataset.mean()

In [None]:
relative_imp = cluster_avg/population_avg - 1

### Analyze and plot relative importance

 - The further a ratio is from 0, the more important that attribute is for a segment, relative to the total population

In [None]:
plt.figure(figsize=(8,8))
plt.title('Relative importance of attributes')
# One row per cluster, one column per RFM attribute; each cell is (cluster mean / population mean) - 1
chart = sns.heatmap(data=relative_imp, annot=True, fmt='.2f', cmap='RdYlGn')
# Set the y-limits explicitly (inverted: bottom = number of rows - 0.25, top = -0.25).
# NOTE(review): presumably a workaround for heatmap rows being cut off in some
# matplotlib versions — confirm it is still needed.
chart.set_ylim(len(relative_imp)-0.25, -0.25)
# plt.savefig("filepath/filename.format", bbox_inches='tight')
plt.show()