[Reference](https://medium.com/@ugursavci/customer-segmentation-using-rfm-analysis-in-python-218a3255f714)

# Importing Libraries


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import datetime as dt
import warnings
warnings.filterwarnings('ignore')

# Reading the Dataset


In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/ugursavci/Customer_Segmentation_using_RFM_Analysis/main/Online_Retail.csv')

# Understanding Data


In [3]:
def summary(df):
    display(df.head())
    print('-'*100)
    display(df.info())
    print('-'*100)
    display(df.describe([0.01,0.25,0.50,0.75,0.99]))
summary(df)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


----------------------------------------------------------------------------------------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 495478 entries, 0 to 495477
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    495478 non-null  object 
 1   StockCode    495478 non-null  object 
 2   Description  494024 non-null  object 
 3   Quantity     495478 non-null  int64  
 4   InvoiceDate  495478 non-null  object 
 5   UnitPrice    495478 non-null  float64
 6   CustomerID   361878 non-null  float64
 7   Country      495478 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 30.2+ MB


None

----------------------------------------------------------------------------------------------------


Unnamed: 0,Quantity,UnitPrice,CustomerID
count,495478.0,495478.0,361878.0
mean,8.605486,4.532422,15547.871368
std,227.588756,99.315438,1594.40259
min,-80995.0,-11062.06,12346.0
1%,-2.0,0.19,12748.0
25%,1.0,1.25,14194.0
50%,3.0,2.1,15514.0
75%,10.0,4.13,16931.0
99%,100.0,16.95,18223.0
max,80995.0,38970.0,18287.0


# Data Preparation


In [4]:
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])

In [5]:
df = df[(df.Quantity>0) & (df.UnitPrice> 0)]

In [6]:
df = df[~df['StockCode'].str.contains('C')]

In [7]:
df = df.drop_duplicates()

In [8]:
df.isnull().sum()

InvoiceNo           0
StockCode           0
Description         0
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     128535
Country             0
dtype: int64

In [9]:
df.dropna(inplace=True)

In [10]:
df['Total_Price'] = df['UnitPrice']*df['Quantity']

# Creating RFM Dataframe


In [11]:
df['InvoiceDate'].max() # let's see the latest day 

Timestamp('2011-12-09 12:49:00')

In [12]:
now =  dt.datetime(2011,12,10)

In [13]:
rfm = df.groupby('CustomerID').agg({'InvoiceDate' : lambda day : (now - day.max()).days,
                               'InvoiceNo': lambda num : len(num),
                              'Total_Price': lambda price : price.sum()
                             
                             
                             })
col_list = ['Recency','Frequency','Monetary']
rfm.columns = col_list

In [14]:
rfm["R"] = pd.qcut(rfm["Recency"],5,labels=[5,4,3,2,1])
rfm["F"] = pd.qcut(rfm["Frequency"],5,labels=[1,2,3,4,5])
rfm["M"] = pd.qcut(rfm["Monetary"],5,labels=[1,2,3,4,5])
rfm["RFM_Score"] = rfm["R"].astype(str) +rfm["F"].astype(str) + rfm["M"].astype(str)

In [15]:
seg_map = {
    r'[1-2][1-2]': 'Hibernating',
    r'[1-2][3-4]': 'At Risk',
    r'[1-2]5': 'Can\'t Loose',
    r'3[1-2]': 'About to Sleep',
    r'33': 'Need Attention',
    r'[3-4][4-5]': 'Loyal Customers',
    r'41': 'Promising',
    r'51': 'New Customers',
    r'[4-5][2-3]': 'Potential Loyalists',
    r'5[4-5]': 'Champions'
}

In [16]:
rfm['Segment'] = rfm['R'].astype(str) + rfm['F'].astype(str)
rfm['Segment'] = rfm['Segment'].replace(seg_map, regex=True)
rfm.head()

Unnamed: 0_level_0,Recency,Frequency,Monetary,R,F,M,RFM_Score,Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
12346.0,325,1,77183.6,1,1,5,115,Hibernating
12747.0,2,100,4128.71,5,4,5,545,Champions
12748.0,0,4345,32509.54,5,5,5,555,Champions
12749.0,3,196,4014.18,5,5,5,555,Champions
12820.0,3,59,942.34,5,4,4,544,Champions


In [17]:
rfm.groupby('Segment').mean().sort_values('Monetary')

Unnamed: 0_level_0,Recency,Frequency,Monetary
Segment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
About to Sleep,52.219178,15.267123,435.219486
Promising,22.428571,7.183673,445.777347
Hibernating,207.501042,12.985417,537.732126
Need Attention,51.410256,40.523077,815.458821
Potential Loyalists,15.753846,33.876923,879.101826
At Risk,167.796154,55.523077,884.872462
Can't Loose,145.901408,180.098592,2268.05593
Loyal Customers,33.410788,152.181189,2428.635339
New Customers,6.145833,7.0625,4111.827083
Champions,5.471971,275.75226,6129.229024
