# RFM Model

In [1]:
import numpy as np
import pandas as pd
from datetime import timedelta
import matplotlib.pyplot as plt
import seaborn as sns
import squarify
import zipfile37 as zipfile

In [2]:
from dateutil.relativedelta import relativedelta

In [3]:
# Load data.csv into a DataFrame, decoding the file as ISO-8859-1
ecomm = pd.read_csv('data.csv', encoding='ISO-8859-1')

In [None]:
# Convert the InvoiceDate column to pandas datetime values
ecomm['InvoiceDate'] = pd.to_datetime(ecomm['InvoiceDate'])

In [None]:
# Size of the raw table
n_rows, n_cols = ecomm.shape
print('{:,} rows; {:,} columns'.format(n_rows, n_cols))

# How many rows have no CustomerID
n_missing_ids = ecomm['CustomerID'].isnull().sum()
print('{:,}, null CustomerIDs'.format(n_missing_ids))

# Time span covered by the invoices
first_invoice = ecomm['InvoiceDate'].min()
last_invoice = ecomm['InvoiceDate'].max()
print('Invoice dates range from {} to {}'.format(first_invoice, last_invoice))

In [None]:
# dropna() returns a new DataFrame rather than modifying in place, so the
# result must be assigned for the rows to actually be removed. Only rows
# lacking a CustomerID are dropped: they cannot be attributed to a customer.
ecomm = ecomm.dropna(subset=['CustomerID'])

## Begin Recency, Frequency, Monetary Sorting

In [None]:
# Snapshot date: one day after the most recent invoice in the data
latest_invoice = ecomm['InvoiceDate'].max()
snapshot_date = latest_invoice + relativedelta(days=1)
print(snapshot_date)

In [None]:
# Line-item revenue: units purchased times price per unit
ecomm['GrossSales'] = ecomm['Quantity'].mul(ecomm['UnitPrice'])

In [None]:
# Duplicate InvoiceDate under a second column name so the dict-style
# aggregation can apply a different function to it under its own key
ecomm['InvoiceDate_2'] = ecomm['InvoiceDate']

In [None]:
# Collapse the line items to one row per CustomerID:
#   InvoiceDate   -> days from the customer's latest invoice to the snapshot
#   InvoiceNo     -> count of non-null InvoiceNo rows
#   GrossSales    -> total gross sales
#   InvoiceDate_2 -> days from the customer's earliest invoice to the snapshot
# NOTE(review): 'count' tallies rows (line items), not distinct invoices --
# confirm this is the intended definition of Frequency.
rfm_aggregations = {
    'InvoiceDate': lambda dates: (snapshot_date - dates.max()).days,
    'InvoiceNo': 'count',
    'GrossSales': 'sum',
    'InvoiceDate_2': lambda dates: (snapshot_date - dates.min()).days,
}
rfm_column_names = {
    'InvoiceDate': 'Recency',
    'InvoiceNo': 'Frequency',
    'GrossSales': 'MonetaryValue',
    'InvoiceDate_2': 'Tenure',
}
rfm_vals = (
    ecomm.groupby(['CustomerID'])
    .agg(rfm_aggregations)
    .rename(columns=rfm_column_names)
)

In [None]:
# Display the per-customer Recency / Frequency / MonetaryValue / Tenure table
rfm_vals

In [None]:
# First five customers, followed by the table's dimensions
print(rfm_vals.head())
n_customers, n_metrics = rfm_vals.shape
print('{:,} rows and {:,} columns'.format(n_customers, n_metrics))

### Take a look at distributions

In [None]:
# Density histogram of Recency with a KDE overlay
sns.histplot(rfm_vals['Recency'], stat='density', kde=True)

In [None]:
# One stacked panel per metric: density histogram plus KDE overlay
fig, axs = plt.subplots(3, figsize=(12, 12))
for panel, metric in zip(axs, ('Recency', 'Frequency', 'MonetaryValue')):
    sns.histplot(rfm_vals[metric], kde=True, stat='density', ax=panel)
plt.show()

The data are highly skewed, particularly Frequency and MonetaryValue.

#### Create quartile scores for Recency, Frequency, and Monetary Value

In [None]:
# Quartile labels. pd.qcut assigns labels in ascending bin order, so the
# smallest-Recency quartile receives 4 and the largest receives 1, while
# Frequency and MonetaryValue are labelled 1 (lowest quartile) to 4 (highest).
r_labels = range(4, 0, -1)
f_labels = range(1, 5)
m_labels = range(1, 5)

# Split each metric into four equal-sized buckets
r_groups = pd.qcut(rfm_vals['Recency'],
                   q=4,
                   labels=r_labels)

f_groups = pd.qcut(rfm_vals['Frequency'],
                   q=4,
                   labels=f_labels)

# Use m_labels (not f_labels) so each metric is scored with its own label set
m_groups = pd.qcut(rfm_vals['MonetaryValue'],
                   q=4,
                   labels=m_labels)

# Attach the three quartile scores as new columns R, F and M
rfm_vals = rfm_vals.assign(R=r_groups.values,
                           F=f_groups.values,
                           M=m_groups.values)

In [None]:
# Preview the first rows, now including the R, F and M quartile columns
rfm_vals.head()

#### Concat to RFM

In [None]:
def concat_rfm(x):
    """Build the three-character RFM code for one row.

    Reads the 'R', 'F' and 'M' entries of ``x``, converts each to an
    integer, and joins their decimal strings in that order.
    """
    return ''.join(str(int(x[key])) for key in ('R', 'F', 'M'))
# Apply concat_rfm row by row (axis=1) to get each customer's RFM code
rfm_vals['RFM_concat'] = rfm_vals.apply(concat_rfm, axis=1)

In [None]:
# Preview the first rows, now including RFM_concat
rfm_vals.head()

In [None]:
# Number of distinct RFM codes present among the customers
rfm_vals.RFM_concat.nunique()

In [None]:
# Total score = R + F + M. The quartile labels are cast to int first so the
# row-wise sum is plain integer arithmetic and does not depend on how pandas
# reduces the label columns' dtype.
rfm_vals['RFM_Score'] = rfm_vals[['R', 'F', 'M']].astype(int).sum(axis=1)
rfm_vals.head()

In [None]:
# Define RFM Levels
def rfm_level(df):
    """Translate a row's total RFM score into a segment name.

    Reads ``df['RFM_Score']`` and returns the name attached to the highest
    threshold the score meets; scores below every threshold (anything
    under 4) fall through to 'Require Activation'.
    """
    score = df['RFM_Score']
    # (inclusive lower bound, segment name), checked from highest to lowest
    tiers = (
        (9, 'Can\'t Lose Them'),
        (8, 'Champions'),
        (7, 'Loyal'),
        (6, 'Potential'),
        (5, 'Promising'),
        (4, 'Needs Attention'),
    )
    for floor, segment in tiers:
        if score >= floor:
            return segment
    return 'Require Activation'
# Label every customer by applying rfm_level row by row (axis=1)
rfm_vals['RFM_Level'] = rfm_vals.apply(rfm_level, axis=1)
rfm_vals.head()

#### Grouping by RFM

In [None]:
# Display the per-customer table, now including RFM_Score and RFM_Level
rfm_vals

In [None]:
# Per-segment summary: mean Recency, mean Frequency, and both the mean and
# the row count of MonetaryValue, rounded to one decimal place. The list
# under MonetaryValue makes the result's columns two-level.
segment_stats = {
    'Recency': 'mean',
    'Frequency': 'mean',
    'MonetaryValue': ['mean', 'count'],
}
rfm_groupings = rfm_vals.groupby('RFM_Level').agg(segment_stats).round(1)

In [None]:
# Display the per-segment summary table
rfm_groupings

Note: customers in Needs Attention have not been seen in a while — offer a promotion to bring them back? Given their low Monetary value, incentivize larger shopping carts?
Potential is very close to Loyal but needs a boost in Monetary value — a promotion based on cart size?
Promising customers need to be brought back to buying — target discounts on previously browsed products.

In [None]:
# Preview the column labels with one level removed; the result is only
# displayed here, not assigned back
rfm_groupings.columns.droplevel()

In [None]:
# Replace the two-level aggregate header with flat, descriptive names.
# Assigning the names directly (with no droplevel() first) gives the same
# final columns and keeps this cell safe to re-run, since droplevel() raises
# once the columns are already single-level.
rfm_groupings.columns = ['RecencyMean',
                         'FrequencyMean',
                         'MonetaryMean',
                         'Count']

# Treemap with one rectangle per segment, sized by the segment's Count.
fig, ax = plt.subplots(figsize=(16, 9))
squarify.plot(sizes=rfm_groupings['Count'],
              label=rfm_groupings.index,
              alpha=.6,
              ax=ax)
ax.set_title("RFM Segments", fontsize=18, fontweight="bold")
ax.axis('off')
# NOTE(review): assumes the docs/ directory already exists -- confirm
fig.savefig('docs/RFM_Capture.png')
plt.show()

## Implement K-Means

Add tenure to the analysis.

In [None]:
# One stacked panel per metric (now including Tenure): density histogram
# plus KDE overlay
fig, axs = plt.subplots(4, figsize=(12, 12))
metrics = ('Recency', 'Frequency', 'MonetaryValue', 'Tenure')
for panel, metric in zip(axs, metrics):
    sns.histplot(rfm_vals[metric], kde=True, stat='density', ax=panel)
plt.show()

Note the high skew of R, F, and M. Log-transform the data.

In [None]:
# Natural-log transform of each metric.
# NOTE(review): np.log returns NaN for negative inputs; if MonetaryValue can
# be negative for some customers, those rows become NaN here -- confirm.
zero_offset = 1e-10  # tiny shift so a MonetaryValue of exactly 0 stays finite
recency_log, frequency_log, tenure_log = (
    np.log(rfm_vals[metric]) for metric in ('Recency', 'Frequency', 'Tenure')
)
monetary_log = np.log(rfm_vals['MonetaryValue'] + zero_offset)

In [None]:
# Same four-panel layout, this time for the log-transformed metrics
fig, axs = plt.subplots(4, figsize=(12, 12))
logged_metrics = (recency_log, frequency_log, monetary_log, tenure_log)
for panel, logged in zip(axs, logged_metrics):
    sns.histplot(logged, kde=True, stat='density', ax=panel)
plt.show()

But note the left skew of Tenure. Next step: standardization.

In [None]:
# Summary statistics for the columns of the per-customer table
rfm_vals.describe()

In [None]:
# Keep only the four raw metrics
kmeans_features = ['Recency', 'Frequency', 'MonetaryValue', 'Tenure']
rfm_vals_kmeans = rfm_vals[kmeans_features]

In [None]:
# Assemble the log-transformed metrics into one frame with columns R, F, M, T.
# Only the underlying arrays (.values) are used, so the result has a fresh
# default index rather than the CustomerID index.
log_columns = 'R F M T'.split()
log_series = [recency_log, frequency_log, monetary_log, tenure_log]
rfm_vals_log = pd.DataFrame(
    {column: series.values for column, series in zip(log_columns, log_series)}
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit a StandardScaler on the log-transformed metrics, then transform the
# same data with it
scaler = StandardScaler().fit(rfm_vals_log)
rfm_vals_norm = scaler.transform(rfm_vals_log)

In [None]:
# Wrap the scaler output in a DataFrame with the R, F, M, T column names
norm_columns = 'R F M T'.split()
rfm_vals_norm_df = pd.DataFrame(data=rfm_vals_norm, columns=norm_columns)