In [1]:
# !pip install squarify

In [2]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
import pandas as pd
import numpy as np

In [3]:
def df_RMF_preprocessing(df):
    """Clean raw invoice lines and aggregate them into one RFM row per customer.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Quantity``, ``CustomerID``, ``UnitPrice``,
        ``InvoiceDate`` and ``InvoiceNo``. The caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``CustomerID`` with the columns
        ``Recency``   - whole days between the customer's latest invoice date and
                        the latest invoice date in the cleaned data,
        ``Frequency`` - number of distinct ``InvoiceNo`` values,
        ``Monetary``  - sum of ``Quantity * UnitPrice`` rounded to 2 decimals,
        sorted by ``Monetary`` in descending order.
    """
    # Keep only rows with non-negative quantity and price and a known customer.
    # A single combined mask selects the same rows as filtering step by step.
    valid_rows = (
        (df['Quantity'] >= 0)
        & df['CustomerID'].notnull()
        & (df['UnitPrice'] >= 0)
    )
    cleaned = df.loc[valid_rows].drop_duplicates()

    # `assign` returns a new frame, so the derived columns are never written onto
    # a filtered slice of the input (which is what raises SettingWithCopyWarning).
    cleaned = cleaned.assign(
        InvoiceDate=pd.to_datetime(cleaned['InvoiceDate']),
        InvoiceNo=cleaned['InvoiceNo'].astype(int),
    )
    cleaned = cleaned.assign(Amount=cleaned['Quantity'] * cleaned['UnitPrice'])

    # Recency is measured in calendar days, so the time of day is stripped from
    # both the reference date and each customer's last purchase.
    latest_day = cleaned['InvoiceDate'].max().normalize()

    per_customer = cleaned.groupby('CustomerID').agg(
        last_purchase=('InvoiceDate', 'max'),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('Amount', 'sum'),
    )

    df_RFM = pd.DataFrame({
        'Recency': (latest_day - per_customer['last_purchase'].dt.normalize()).dt.days,
        'Frequency': per_customer['Frequency'],
        'Monetary': per_customer['Monetary'].round(2),
    })
    return df_RFM.sort_values('Monetary', ascending=False)

In [10]:
def visualization_countplot(count):
    """Show a bar chart of the number of customers per cluster.

    `count` is read through `.index` (bar positions) and `.values` (bar heights);
    presumably a Series such as the output of `value_counts()` - confirm with the caller.
    The chart is displayed with `plt.show()` and nothing is returned.
    """
    figure, axes = plt.subplots()
    axes.bar(count.index, count.values, color='lightskyblue')

    # Write each bar's height above it.
    for bar_group in axes.containers:
        axes.bar_label(bar_group)

    # Vertical tick labels so long cluster names do not overlap.
    plt.xticks(rotation=90)
    axes.set_xlabel('Cluster')
    axes.set_ylabel('Number of Customers')
    plt.show()

In [4]:
# Calculate the average RFM values for each group of `groupby_col`, plus the size (count and percent) of each group
def create_df_agg(df, groupby_col):
    """Summarise an RFM table per group: mean values, group size and share.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Recency``, ``Frequency``, ``Monetary`` and the score columns
        ``R``, ``F``, ``M`` (anything ``astype(int)`` accepts). The frame is left
        untouched: the integer cast is applied to a copy, so calling this function
        repeatedly, or on a slice, has no side effect on the caller's data.
    groupby_col : label or list of labels
        Column(s) identifying the group/segment of each row.

    Returns
    -------
    pandas.DataFrame
        One row per group with ``groupby_col`` followed by ``RecencyMean``,
        ``FrequencyMean``, ``MonetaryMean``, ``RMean``, ``FMean``, ``MMean``
        (all rounded to 0 decimals), ``Count`` (rows in the group) and
        ``Percent`` (share of all rows, 2 decimals).
    """
    # Cast the score columns on a copy instead of overwriting the input columns.
    scored = df.assign(**{col: df[col].astype(int) for col in ('R', 'F', 'M')})

    # Named aggregation yields flat, already-named columns in the required order.
    df_agg = scored.groupby(groupby_col).agg(
        RecencyMean=('Recency', 'mean'),
        FrequencyMean=('Frequency', 'mean'),
        MonetaryMean=('Monetary', 'mean'),
        RMean=('R', 'mean'),
        FMean=('F', 'mean'),
        MMean=('M', 'mean'),
        Count=('M', 'count'),
    ).round(0)

    df_agg['Percent'] = round((df_agg['Count'] / df_agg['Count'].sum()) * 100, 2)

    # Turn the group key(s) back into ordinary column(s).
    return df_agg.reset_index()

In [5]:
# Visualization - Treemap
import matplotlib
def treemap_customer_segmentation(df_agg,font_size):
    """Draw a treemap with one tile per cluster, sized and coloured by `Count`.

    `df_agg` must have a `Count` column. Each tile label is filled positionally
    from the first six values of the corresponding row (extra values are ignored
    by `str.format`).
    NOTE(review): this assumes the row starts with cluster id, recency, frequency,
    monetary, customer count, percent in that order - confirm against the caller.
    `font_size` is the label font size. The figure is shown; nothing is returned.
    """
    # Work on pyplot's current figure: add a subplot to it and enlarge it.
    figure = plt.gcf()
    figure.add_subplot()
    figure.set_size_inches(15, 9)

    # Map every count onto the rainbow colormap, scaled between the smallest
    # and the largest count.
    counts = df_agg['Count']
    scale = matplotlib.colors.Normalize(vmin=min(counts), vmax=max(counts))
    palette = matplotlib.cm.rainbow
    tile_colors = [palette(scale(n)) for n in counts]

    # One text label per row of df_agg.
    template = 'Cluster {:.0f} \n{:.0f} days \n{:.0f} orders \n{:.0f} $ \n{:.0f} customers ({}%)'
    tile_labels = []
    for pos in range(len(df_agg)):
        tile_labels.append(template.format(*df_agg.iloc[pos]))

    squarify.plot(
        sizes=counts,
        label=tile_labels,
        color=tile_colors,
        alpha=0.5,
        text_kwargs={'fontsize':font_size,'weight':'bold', 'fontname':"sans serif"},
    )

    plt.title("Customers Segments",fontsize=26,fontweight="bold", loc='left')
    plt.axis('off')

    # plt.savefig('RFM Segments.png')
    plt.show()

In [None]:
# Visualization - Treemap
import matplotlib
def treemap_customer_segmentation_rfm(df_agg,font_size):
    """Draw a treemap with one tile per segment, sized and coloured by `Count`.

    Same chart as the cluster treemap, except that the first label field is
    printed as-is (`{}`) rather than as a number, so it can hold a segment name.
    Each label is filled positionally from the first six values of the row
    (extra values are ignored by `str.format`).
    NOTE(review): this assumes the row starts with segment, recency, frequency,
    monetary, customer count, percent in that order - confirm against the caller.
    `font_size` is the label font size. The figure is shown; nothing is returned.
    """
    # Work on pyplot's current figure: add a subplot to it and enlarge it.
    figure = plt.gcf()
    figure.add_subplot()
    figure.set_size_inches(15, 9)

    # Map every count onto the rainbow colormap, scaled between the smallest
    # and the largest count.
    counts = df_agg['Count']
    scale = matplotlib.colors.Normalize(vmin=min(counts), vmax=max(counts))
    palette = matplotlib.cm.rainbow
    tile_colors = [palette(scale(n)) for n in counts]

    # One text label per row of df_agg.
    template = '{} \n{:.0f} days \n{:.0f} orders \n{:.0f} $ \n{:.0f} customers ({}%)'
    tile_labels = []
    for pos in range(len(df_agg)):
        tile_labels.append(template.format(*df_agg.iloc[pos]))

    squarify.plot(
        sizes=counts,
        label=tile_labels,
        color=tile_colors,
        alpha=0.5,
        text_kwargs={'fontsize':font_size,'weight':'bold', 'fontname':"sans serif"},
    )

    plt.title("Customers Segments",fontsize=26,fontweight="bold", loc='left')
    plt.axis('off')

    # plt.savefig('RFM Segments.png')
    plt.show()

In [6]:
def drop_outliers(data_RFM):
    """Split an RFM table into regular rows and upper-tail outliers.

    For each of `Monetary`, `Frequency` and `Recency` the cut-off is the column
    mean plus three standard deviations. A row is kept only when it lies strictly
    below all three cut-offs; every other row is reported as an outlier.

    Returns `(kept_rows, outlier_rows, max_m, max_f, max_r)`.
    """
    def upper_limit(column):
        # mean + 3 * std of one column
        values = data_RFM[column]
        return values.mean() + (3*values.std())

    max_m = upper_limit('Monetary')
    max_f = upper_limit('Frequency')
    max_r = upper_limit('Recency')

    below_all_limits = (
        (data_RFM['Monetary'] < max_m)
        & (data_RFM['Frequency'] < max_f)
        & (data_RFM['Recency'] < max_r)
    )
    data_RFM_no = data_RFM[below_all_limits]

    # Outliers are the rows whose index label is absent from the kept rows.
    kept_labels = data_RFM_no.index
    data_outlier = data_RFM[~data_RFM.index.isin(kept_labels)]
    return data_RFM_no, data_outlier, max_m, max_f, max_r

In [7]:
def elite_regular_ghost_group(data_RFM, data_outlier, max_m, max_f, max_r, Monetary_mean):
    """Label the rows beyond the outlier cut-offs as elite, regular or ghost.

    Parameters
    ----------
    data_RFM : pandas.DataFrame
        RFM table with `Recency`, `Frequency` and `Monetary` columns.
    data_outlier : pandas.DataFrame
        Not used by this function; kept so existing calls keep working.
    max_m, max_f, max_r : float
        Cut-offs for Monetary, Frequency and Recency.
    Monetary_mean : float
        Monetary level separating "elite" from "regular" frequent buyers.

    Returns
    -------
    (elite, regular, ghost) : tuple of pandas.DataFrame
        ghost   - rows with Recency >= max_r.
        elite   - rows with Monetary >= max_m, followed by rows with
                  Frequency >= max_f and Monetary >= Monetary_mean that are not
                  already listed; ghost rows are excluded.
        regular - rows with Frequency >= max_f and Monetary < Monetary_mean.
    """
    is_ghost = data_RFM['Recency'] >= max_r
    is_high_spender = data_RFM['Monetary'] >= max_m
    is_frequent = data_RFM['Frequency'] >= max_f
    is_active_spender = is_frequent & (data_RFM['Monetary'] >= Monetary_mean)

    # A customer matching both elite rules must appear once. Selecting the two
    # parts with disjoint masks avoids creating duplicates in the first place;
    # de-duplicating on row *values* afterwards would wrongly merge different
    # customers that happen to share identical Recency/Frequency/Monetary.
    elite = pd.concat([
        data_RFM[is_high_spender & ~is_ghost],
        data_RFM[is_active_spender & ~is_high_spender & ~is_ghost],
    ])
    regular = data_RFM[is_frequent & (data_RFM['Monetary'] < Monetary_mean)]
    ghost = data_RFM[is_ghost]
    return elite, regular, ghost

In [8]:
def drop_outliers_predict(data_RFM, max_m, max_f, max_r):
    """Split an RFM table using cut-offs supplied by the caller.

    A row is kept only when `Monetary`, `Frequency` and `Recency` are each
    strictly below `max_m`, `max_f` and `max_r` respectively; all remaining
    rows are returned as outliers.

    Returns `(kept_rows, outlier_rows)`.
    """
    limits = (('Monetary', max_m), ('Frequency', max_f), ('Recency', max_r))

    # Narrow the table one column at a time.
    data_RFM_no = data_RFM
    for column, limit in limits:
        data_RFM_no = data_RFM_no[data_RFM_no[column] < limit]

    # Outliers are the rows whose index label is absent from the kept rows.
    is_kept = data_RFM.index.isin(data_RFM_no.index)
    data_outlier = data_RFM[~is_kept]
    return data_RFM_no, data_outlier

In [9]:
# Count outliers per column: values more than 3 standard deviations below or above the column mean

def count_outliers(data_RFM):
    """Count, for every column, the values lying outside mean +/- 3 standard deviations.

    Parameters
    ----------
    data_RFM : pandas.DataFrame
        Table whose columns support `mean()`, `std()` and ordering comparisons.

    Returns
    -------
    dict
        Maps each column name to the number of values strictly below
        `mean - 3*std` plus the number strictly above `mean + 3*std`, as a plain
        `int`. Every column gets an entry, including columns with no rows (count 0).
    """
    outliers = {}
    for column in data_RFM.columns:
        values = data_RFM[column]
        center = values.mean()
        spread = 3 * values.std()
        # Vectorised comparisons; NaN never compares as below/above, so missing
        # values are not counted.
        n_low = (values < center - spread).sum()
        n_high = (values > center + spread).sum()
        outliers[column] = int(n_low + n_high)
    return outliers