In [89]:
# Importing pandas library
import pandas as pd

# Importing datetime library
from datetime import datetime, timedelta, date

# Importing plotly for visualization
import plotly
import plotly.graph_objs as go

# Importing matplotlib for visualization
import matplotlib.pyplot as plt

# Importing objects from sklearn library
from sklearn.cluster import KMeans

# Importing numpy library
import numpy as np

##### Reference

    Model:
    - https://towardsdatascience.com/predicting-next-purchase-day-15fae5548027

    Dataset:
    - https://www.kaggle.com/datasets/vijayuv/onlineretail

In [10]:
# Load the raw transactions from CSV: the first row is the header and the
# bytes are decoded with the 'unicode_escape' codec.
df_raw = pd.read_csv(
    'OnlineRetail.csv',
    header=0,
    encoding='unicode_escape',
)

In [12]:
# Parse the "InvoiceDate" column with pandas and overwrite it with the datetime result
df_raw['InvoiceDate'] = df_raw['InvoiceDate'].pipe(pd.to_datetime)

In [17]:
# Keep only the United Kingdom rows; "Country" is constant afterwards, so it is dropped
is_uk = df_raw['Country'] == 'United Kingdom'
df_uk = (
    df_raw.loc[is_uk]
    .drop(columns=['Country'])
    .reset_index(drop=True)
)

In [23]:
invoice_dt = df_uk['InvoiceDate']

# Feature window: invoices from 2011-03-01 (inclusive) up to the 2011-09-01 cut-off (exclusive)
in_feature_window = (invoice_dt >= datetime(2011, 3, 1)) & (invoice_dt < datetime(2011, 9, 1))
df_uk_6m = df_uk.loc[in_feature_window].reset_index(drop=True)

# Target window: invoices from the 2011-09-01 cut-off (inclusive) up to 2011-12-01 (exclusive)
in_target_window = (invoice_dt >= datetime(2011, 9, 1)) & (invoice_dt < datetime(2011, 12, 1))
df_uk_nxt = df_uk.loc[in_target_window].reset_index(drop=True)

In [21]:
# One row per distinct CustomerID (first occurrence kept, original row index retained)
df_uk_users = df_uk.drop_duplicates(subset=['CustomerID'])[['CustomerID']]

In [33]:
# Target variable: number of days between a customer's last purchase in the
# feature window (df_uk_6m) and their first purchase in the following window
# (df_uk_nxt).
# NOTE: the stray `from pyparsing import col` that used to sit here was never
# used and only added an unnecessary dependency, so it has been removed.

# Earliest invoice date per customer in df_uk_nxt (first purchase after the cut-off)
df_nxt_purch = (
    df_uk_nxt[['CustomerID', 'InvoiceDate']]
    .groupby(['CustomerID']).min()
    .reset_index()
    .rename(columns={'InvoiceDate': 'NextPurchaseDate'})
)

# Latest invoice date per customer in df_uk_6m (last purchase before the cut-off)
df_lst_purch = (
    df_uk_6m[['CustomerID', 'InvoiceDate']]
    .groupby(['CustomerID']).max()
    .reset_index()
    .rename(columns={'InvoiceDate': 'LastPurchaseDate'})
)

# Left-join both dates onto the customer list so every customer keeps a row,
# even when one (or both) of the dates is missing
df_purch_dt = (
    df_uk_users
    .merge(df_nxt_purch, how='left', on=['CustomerID'])
    .merge(df_lst_purch, how='left', on=['CustomerID'])
)

# Whole days from the last purchase to the next one; NaN when either date is missing
df_purch_dt['DayUntilNextPurchase'] = (
    df_purch_dt['NextPurchaseDate'] - df_purch_dt['LastPurchaseDate']
).dt.days

# The two helper date columns are no longer needed
df_purch_dt = df_purch_dt.drop(columns=['NextPurchaseDate', 'LastPurchaseDate'])

# Customers without a computable gap get the sentinel value 9999
df_purch_dt['DayUntilNextPurchase'] = df_purch_dt['DayUntilNextPurchase'].fillna(9999)

##### BUILDING RFM FEATURES

In [55]:
# --- Recency: whole days from each customer's last in-window purchase to 2011-09-01
# (the same date used as the exclusive upper bound of df_uk_6m).
# Note: 'CurrentDay' and 'Recency' are added to df_lst_purch itself.
df_lst_purch['CurrentDay'] = datetime(2011, 9, 1)
days_since_last = df_lst_purch['CurrentDay'] - df_lst_purch['LastPurchaseDate']
df_lst_purch['Recency'] = days_since_last.dt.days
df_recency = df_lst_purch.loc[:, ['CustomerID', 'Recency']]

# --- Frequency: number of distinct invoice numbers per customer in the window
distinct_invoices = df_uk_6m[['CustomerID', 'InvoiceNo']].drop_duplicates()
invoice_counts = distinct_invoices.groupby(['CustomerID']).count()
df_frequency = invoice_counts.reset_index().rename(columns={'InvoiceNo': 'Frequency'})

# --- Monetary: sum of Quantity * UnitPrice per customer in the window
# Note: the line value 'PurchValue' is added to df_uk_6m itself.
df_uk_6m['PurchValue'] = df_uk_6m['Quantity'].mul(df_uk_6m['UnitPrice'])
spend_per_customer = df_uk_6m[['CustomerID', 'PurchValue']].groupby(['CustomerID']).sum()
df_monetary = spend_per_customer.reset_index().rename(columns={'PurchValue': 'Monetary'})

# --- RFM table: only customers present in all three feature tables (inner joins)
recency_frequency = df_recency.merge(df_frequency, how='inner', on=['CustomerID'])
df_rfm = recency_frequency.merge(df_monetary, how='inner', on=['CustomerID'])

In [56]:
# Display the RFM table (CustomerID, Recency, Frequency, Monetary) as the cell output
df_rfm

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12747.0,9,5,1760.09
1,12748.0,1,88,8115.68
2,12749.0,13,6,2532.55
3,12821.0,114,1,92.72
4,12823.0,27,2,688.50
...,...,...,...,...
2563,18280.0,177,1,180.60
2564,18281.0,80,1,80.82
2565,18282.0,22,2,98.76
2566,18283.0,48,6,667.87


##### Looking at recency distribution

In [57]:
# Histogram of the Recency feature
recency_hist = go.Histogram(x=df_rfm['Recency'])
plot_data = [recency_hist]
plot_layout = go.Layout(title='Recency')
fig = go.Figure(data=plot_data, layout=plot_layout)

fig

In [61]:
# Cluster customers into 4 groups on Recency alone.
# random_state is fixed so that a "Restart & Run All" reproduces the same
# clusters (and therefore the same scores and segments further down).
kmeans = KMeans(n_clusters=4, random_state=42)
df_rfm['RecencyCluster'] = kmeans.fit_predict(df_rfm[['Recency']])

In [100]:
def cluster_ordering(table, cluster_column, value_column, reverse_order):
    """Attach an ordinal score to every row based on its cluster's mean value.

    Clusters are ranked by the mean of ``value_column`` and the rank is written
    to a column named after the first letter of ``value_column`` plus
    ``'_score'`` (e.g. ``'Recency'`` -> ``'R_score'``).

    Parameters
    ----------
    table : pandas.DataFrame
        Frame containing ``cluster_column`` and ``value_column``. It is not modified.
    cluster_column : str
        Column holding the (arbitrary) cluster label of each row.
    value_column : str
        Numeric column whose per-cluster mean decides the ranking.
    reverse_order : bool
        If false, the cluster with the lowest mean gets score 0 and the highest
        gets ``n_clusters - 1``. If true, the ranking is inverted so the
        cluster with the highest mean gets score 0.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``table`` inner-merged with the per-cluster score on
        ``cluster_column``. Row order and index are whatever
        ``DataFrame.merge`` produces. If ``table`` already carries the score
        column (e.g. the cell was re-run on an already scored frame), the old
        column is replaced instead of yielding ``*_x`` / ``*_y`` duplicates.
    """
    score_column = value_column[0] + '_score'

    # Discard a stale score so the merge below cannot create suffixed duplicates
    base = table.drop(columns=[score_column], errors='ignore')

    # One row per cluster, sorted by ascending mean of the value column
    cluster_ordered = (
        base[[cluster_column, value_column]]
        .groupby([cluster_column]).mean()
        .reset_index()
        .sort_values(by=value_column)
    )

    n_clusters = cluster_ordered.shape[0]
    ranks = np.arange(n_clusters)
    if reverse_order:
        # Highest mean -> 0, lowest mean -> n_clusters - 1
        ranks = n_clusters - 1 - ranks
    # Positional assignment: ranks follow the sorted row order, not the index
    cluster_ordered[score_column] = ranks

    return base.merge(
        cluster_ordered[[cluster_column, score_column]],
        how='inner',
        on=[cluster_column],
    )

In [98]:
# Score the Recency clusters with the ranking reversed (reverse_order=True)
df_rfm_R = cluster_ordering(
    table=df_rfm,
    cluster_column='RecencyCluster',
    value_column='Recency',
    reverse_order=True,
)

##### Looking at frequency distribution

In [58]:
# Histogram of the Frequency feature
frequency_hist = go.Histogram(x=df_rfm['Frequency'])
plot_data = [frequency_hist]
plot_layout = go.Layout(title='Frequency')
fig = go.Figure(data=plot_data, layout=plot_layout)

fig

In [108]:
# Cluster customers into 4 groups on Frequency alone.
# random_state is fixed so that a "Restart & Run All" reproduces the same
# clusters (and therefore the same scores and segments further down).
kmeans = KMeans(n_clusters=4, random_state=42)
df_rfm_R['FrequencyCluster'] = kmeans.fit_predict(df_rfm_R[['Frequency']])

In [109]:
# Score the Frequency clusters in ascending order of their mean (reverse_order=False)
df_rfm_RF = cluster_ordering(
    table=df_rfm_R,
    cluster_column='FrequencyCluster',
    value_column='Frequency',
    reverse_order=False,
)

##### Looking at monetary distribution

In [59]:
# Histogram of the Monetary feature
monetary_hist = go.Histogram(x=df_rfm['Monetary'])
plot_data = [monetary_hist]
plot_layout = go.Layout(title='Monetary')
fig = go.Figure(data=plot_data, layout=plot_layout)

fig

In [111]:
# Cluster customers into 4 groups on Monetary alone.
# random_state is fixed so that a "Restart & Run All" reproduces the same
# clusters (and therefore the same scores and segments further down).
kmeans = KMeans(n_clusters=4, random_state=42)
df_rfm_RF['MonetaryCluster'] = kmeans.fit_predict(df_rfm_RF[['Monetary']])

In [112]:
# Score the Monetary clusters in ascending order of their mean (reverse_order=False)
df_rfm_RFM = cluster_ordering(
    table=df_rfm_RF,
    cluster_column='MonetaryCluster',
    value_column='Monetary',
    reverse_order=False,
)

In [113]:
# Work on an independent (deep) copy so df_rfm_RFM stays untouched by later edits
RFM = df_rfm_RFM.copy(deep=True)

In [115]:
# The raw KMeans labels are no longer needed once the ordered *_score columns exist
raw_cluster_columns = ['RecencyCluster', 'FrequencyCluster', 'MonetaryCluster']
RFM = RFM.drop(raw_cluster_columns, axis=1)

In [120]:

# Label each customer with a segment derived from the R/F/M scores.
# Rules are applied top to bottom, so a later rule overwrites an earlier one:
# 'Best' and 'New' take precedence over the Lost-*/Active-* labels.
r_is_low = RFM['R_score'].isin([0, 1])
r_is_high = RFM['R_score'].isin([2, 3])
r_is_top = RFM['R_score'] == 3
m_is_high = RFM['M_score'] == 3
m_is_mid = RFM['M_score'] == 2
m_is_low = RFM['M_score'].isin([0, 1])

segment_rules = [
    (r_is_low & m_is_high, 'Lost-High'),
    (r_is_low & m_is_mid, 'Lost-Mid'),
    (r_is_low & m_is_low, 'Lost-Low'),
    (r_is_high & m_is_high, 'Active-High'),
    (r_is_high & m_is_mid, 'Active-Mid'),
    (r_is_high & m_is_low, 'Active-Low'),
    (r_is_top & (RFM['F_score'] == 3) & m_is_high, 'Best'),
    (r_is_top & (RFM['F_score'] == 0), 'New'),
]
for rule_mask, segment_label in segment_rules:
    RFM.loc[rule_mask, 'Cluster'] = segment_label

RFM

Unnamed: 0,CustomerID,Recency,Frequency,Monetary,R_score,F_score,M_score,Cluster
0,12747.0,9,5,1760.09,3,1,0,Active-Low
1,12749.0,13,6,2532.55,3,1,0,Active-Low
2,12839.0,13,6,1591.50,3,1,0,Active-Low
3,12841.0,6,9,1438.52,3,1,0,Active-Low
4,12853.0,7,7,1470.75,3,1,0,Active-Low
...,...,...,...,...,...,...,...,...
2563,16684.0,33,14,23462.68,3,2,2,Active-Mid
2564,17511.0,14,21,37661.72,3,2,2,Active-Mid
2565,17949.0,0,30,37934.22,3,2,2,Active-Mid
2566,17450.0,0,20,64382.90,3,2,3,Active-High


In [122]:
# Overall score: element-wise sum of the three ordinal scores
RFM['overall_score'] = RFM['R_score'].add(RFM['F_score']).add(RFM['M_score'])

In [126]:
# Scatter of Monetary (y) against Frequency (x) for the three "Active-*" segments
tx_graph = RFM.copy()

# (segment label, legend name, marker size, marker colour, marker opacity)
segment_styles = [
    ('Active-Low', 'Low', 7, 'blue', 0.8),
    ('Active-Mid', 'Mid', 9, 'green', 0.5),
    ('Active-High', 'High', 11, 'red', 0.9),
]

plot_data = []
for segment, legend_name, marker_size, marker_colour, marker_opacity in segment_styles:
    members = tx_graph.query(f"Cluster == '{segment}'")
    plot_data.append(
        go.Scatter(
            x=members['Frequency'],
            y=members['Monetary'],
            mode='markers',
            name=legend_name,
            marker=dict(
                size=marker_size,
                line=dict(width=1),
                color=marker_colour,
                opacity=marker_opacity,
            ),
        )
    )

plot_layout = go.Layout(
    yaxis=dict(title="Monetary"),
    xaxis=dict(title="Frequency"),
    title='Clusters',
)
fig = go.Figure(data=plot_data, layout=plot_layout)

fig

In [127]:
# Scatter of Monetary (y) against Recency (x) for the three "Active-*" segments
tx_graph = RFM.copy()

# (segment label, legend name, marker size, marker colour, marker opacity)
segment_styles = [
    ('Active-Low', 'Low', 7, 'blue', 0.8),
    ('Active-Mid', 'Mid', 9, 'green', 0.5),
    ('Active-High', 'High', 11, 'red', 0.9),
]

plot_data = []
for segment, legend_name, marker_size, marker_colour, marker_opacity in segment_styles:
    members = tx_graph.query(f"Cluster == '{segment}'")
    plot_data.append(
        go.Scatter(
            x=members['Recency'],
            y=members['Monetary'],
            mode='markers',
            name=legend_name,
            marker=dict(
                size=marker_size,
                line=dict(width=1),
                color=marker_colour,
                opacity=marker_opacity,
            ),
        )
    )

plot_layout = go.Layout(
    yaxis=dict(title="Monetary"),
    xaxis=dict(title="Recency"),
    title='Clusters',
)
fig = go.Figure(data=plot_data, layout=plot_layout)

fig

In [128]:
# Scatter of Frequency (y) against Recency (x) for the three "Active-*" segments
tx_graph = RFM.copy()

# (segment label, legend name, marker size, marker colour, marker opacity)
segment_styles = [
    ('Active-Low', 'Low', 7, 'blue', 0.8),
    ('Active-Mid', 'Mid', 9, 'green', 0.5),
    ('Active-High', 'High', 11, 'red', 0.9),
]

plot_data = []
for segment, legend_name, marker_size, marker_colour, marker_opacity in segment_styles:
    members = tx_graph.query(f"Cluster == '{segment}'")
    plot_data.append(
        go.Scatter(
            x=members['Recency'],
            y=members['Frequency'],
            mode='markers',
            name=legend_name,
            marker=dict(
                size=marker_size,
                line=dict(width=1),
                color=marker_colour,
                opacity=marker_opacity,
            ),
        )
    )

plot_layout = go.Layout(
    yaxis=dict(title="Frequency"),
    xaxis=dict(title="Recency"),
    title='Clusters',
)
fig = go.Figure(data=plot_data, layout=plot_layout)

fig

In [129]:
# Leftover scratch cell: evaluates the literal 1 and does not touch any analysis variable
1


1