In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as pg
import plotly.colors

In [3]:
# Load the raw online-retail transactions from the CSV export.
RETAIL_CSV_PATH = '/content/online_retail.csv'
df_data = pd.read_csv(RETAIL_CSV_PATH)

In [16]:
# Preview the first five rows of the transactions frame.
# NOTE(review): the saved output below shows the renamed columns (Frequency,
# Recency, Monetary), so this cell was last executed after the rename cell
# further down; a fresh top-to-bottom run would show the pre-rename names here.
df_data.head(5)

Unnamed: 0,Frequency,StockCode,Description,Quantity,Recency,UnitPrice,CustomerID,Country,Monetary
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34


In [7]:
# Summarise column dtypes and non-null counts; in the saved output Description
# and CustomerID have fewer non-null entries than the other columns.
df_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
 8   TotalAmount  541909 non-null  float64
dtypes: float64(3), int64(1), object(5)
memory usage: 37.2+ MB


Identify columns for RFM Analysis


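*   InvoiceDate --> Recency (days since the customer's most recent invoice)
*   InvoiceNo   --> Frequency (count of invoice records per customer)
*   UnitPrice * Quantity --> Monetary (total amount per customer)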





In [8]:
# (rows, columns) of the frame at this point, before rows with missing values are dropped.
df_data.shape

(541909, 9)

In [9]:
# Line amount = unit price multiplied by the quantity on that transaction row.
df_data["TotalAmount"] = df_data["UnitPrice"].mul(df_data["Quantity"])

In [10]:
# Keep only fully populated rows: a missing value in any column drops the row.
df_data = df_data.dropna(axis=0, how="any")

In [11]:
# Give the columns used by the RFM computation explicit dtypes.
column_dtypes = {
    "TotalAmount": "float64",
    "InvoiceDate": "datetime64[ns]",
    "InvoiceNo": "string",
    "CustomerID": "int64",
}
df_data = df_data.astype(column_dtypes)

In [12]:
# Rename the source columns to the RFM metric each one feeds.
rfm_column_names = {
    "InvoiceDate": "Recency",
    "InvoiceNo": "Frequency",
    "TotalAmount": "Monetary",
}
df_data = df_data.rename(columns=rfm_column_names)

In [17]:
# Latest timestamp in the Recency column (the renamed InvoiceDate).
df_data["Recency"].max()

Timestamp('2011-12-09 12:50:00')

In [13]:
# Anchor recency one day past the latest invoice timestamp, so that every
# customer's recency comes out as at least one whole day.
latest_invoice_time = df_data["Recency"].max()
reference_date = latest_invoice_time + pd.DateOffset(days=1)

In [19]:
# Show the reference date (one day after the latest invoice timestamp).
reference_date

Timestamp('2011-12-10 12:50:00')

In [14]:
# Collapse the transaction rows into one row per customer:
#   Recency   - whole days between reference_date and the customer's latest invoice
#   Frequency - number of non-null invoice-number entries (one per transaction row)
#   Monetary  - sum of the customer's line amounts
# NOTE(review): "count" tallies every invoice line, not distinct invoices. If
# Frequency is meant to be the number of orders, "nunique" would be needed, and
# the quantile binning downstream would then have to cope with tied bin edges.
# Confirm the intended definition before changing it.
df_rfm=df_data.groupby("CustomerID").agg(
    {
        # Series.max() is vectorised; the builtin max() iterates the values in Python.
        "Recency": lambda x  : (reference_date-x.max()).days ,
        "Frequency":"count",
        "Monetary":"sum"
    }
)

In [15]:
# Display the per-customer RFM table (indexed by CustomerID).
df_rfm

Unnamed: 0_level_0,Recency,Frequency,Monetary
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12346,326,2,0.00
12347,2,182,4310.00
12348,75,31,1797.24
12349,19,73,1757.55
12350,310,17,334.40
...,...,...,...
18280,278,10,180.60
18281,181,7,80.82
18282,8,13,176.60
18283,4,756,2094.88


In [16]:
# Quantile-bin each metric into five groups labelled 1-5. Recency labels run
# 5 -> 1 so that the smallest recency values receive the highest score.
N_BINS = 5
ascending_labels = [1, 2, 3, 4, 5]
descending_labels = [5, 4, 3, 2, 1]
r = pd.qcut(df_rfm["Recency"], q=N_BINS, labels=descending_labels)
f = pd.qcut(df_rfm["Frequency"], q=N_BINS, labels=ascending_labels)
m = pd.qcut(df_rfm["Monetary"], q=N_BINS, labels=ascending_labels)

In [17]:
# Attach the binned scores to the RFM table, one column per metric.
for score_column, binned in (("R_Score", r), ("F_Score", f), ("M_Score", m)):
    df_rfm[score_column] = binned.values

In [18]:
# Inspect the first five customers with their R, F and M scores attached.
df_rfm.head(5)

Unnamed: 0_level_0,Recency,Frequency,Monetary,R_Score,F_Score,M_Score
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
12346,326,2,0.0,1,1,1
12347,2,182,4310.0,5,5,5
12348,75,31,1797.24,2,3,4
12349,19,73,1757.55,4,4,4
12350,310,17,334.4,1,2,2


In [19]:
# Concatenate the three scores as text, e.g. scores 5, 5 and 5 give "555".
r_text = df_rfm["R_Score"].astype(str)
f_text = df_rfm["F_Score"].astype(str)
m_text = df_rfm["M_Score"].astype(str)
df_rfm["RFM_Segment"] = r_text + f_text + m_text

In [20]:
# Total RFM score = R + F + M. The score columns hold Categorical values
# (they are assigned from pd.qcut(...).values), so cast them to integers before
# summing; this guarantees a plain integer column for the threshold comparisons
# that follow instead of relying on how pandas reduces categorical data.
score_columns = ["R_Score", "F_Score", "M_Score"]
df_rfm['RFM_Score'] = df_rfm[score_columns].astype(int).sum(axis=1)

Logic for Visualization of the RFM Segments

In [21]:
def get_Segemt(score):
  """Map a total RFM score to a coarse value tier.

  Returns "Low Value" when score < 5, "Mid Value" when score < 9, and
  "High Value" for anything that satisfies neither comparison.
  """
  # (exclusive upper bound, label), checked in ascending order.
  tiers = ((5, "Low Value"), (9, "Mid Value"))
  for upper_bound, label in tiers:
    if score < upper_bound:
      return label
  return "High Value"


In [22]:
# Label every customer with the value tier that matches their total RFM score.
df_rfm["RFM_Segment_Label"] = df_rfm["RFM_Score"].map(get_Segemt)

In [23]:
# Count the customers in each value tier.
tier_tally = df_rfm["RFM_Segment_Label"].value_counts()

# Turn the tally into a two-column frame with readable column names.
segment_counts = tier_tally.reset_index()
segment_counts.columns = ["Segment", "Count"]

# Order the tiers from the most to the fewest customers.
segment_counts = segment_counts.sort_values("Count", ascending=False)

In [24]:
# Display the customer count per value tier.
segment_counts

Unnamed: 0,Segment,Count
0,High Value,2337
1,Mid Value,1492
2,Low Value,543


In [25]:
# Bar chart of the number of customers in each value tier (High, Mid, Low).
# The x-axis label below is spelled "RFM Segment"; it is shown on the rendered chart.
fig = px.bar(
    data_frame=segment_counts,
    x="Segment",
    y="Count",
    title="Customer Distribution by RFM Segment",
    color="Segment",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    labels={"Segment": "RFM Segment", "Count": "Number of Customers"},
)

fig.show()

In [34]:
# Assign a named customer segment from the total RFM score. Each band is
# (inclusive lower bound, exclusive upper bound, label); None means no upper bound.
# Rows matching no band keep the empty-string default.
df_rfm["RFM_Customer_Segment"] = ''
segment_bands = [
    (9, None, 'Champions'),
    (6, 9, 'Potential Loyalists'),
    (5, 6, 'At Risk Customers'),
    (4, 5, 'Cant Loose'),
    (3, 4, 'Lost'),
]
for lower_bound, upper_bound, segment_name in segment_bands:
    in_band = df_rfm['RFM_Score'] >= lower_bound
    if upper_bound is not None:
        in_band = in_band & (df_rfm['RFM_Score'] < upper_bound)
    df_rfm.loc[in_band, 'RFM_Customer_Segment'] = segment_name


In [35]:
# Count customers for every (value tier, customer segment) pair and list the
# pairs from largest to smallest.
segment_product_counts = (
    df_rfm
    .groupby(["RFM_Segment_Label", 'RFM_Customer_Segment'])
    .size()
    .reset_index(name='Count')
    .sort_values('Count', ascending=False)
)



In [31]:
# Display the customer count per (value tier, customer segment) pair.
segment_product_counts

Unnamed: 0,RFM_Segment_Label,RFM_Customer_Segment,Count
0,High Value,Champions,2337
4,Mid Value,Potential Loyalists,1130
3,Mid Value,At Risk Customers,362
2,Low Value,Lost,287
1,Low Value,Cant Loose,256


In [36]:
# Treemap of customer counts: value tier on the outer level, customer segment
# nested inside. Tiles are sized and coloured by Count. Count is numeric, so the
# colouring is continuous and only color_continuous_scale applies; a discrete
# colour sequence would have no effect here, so none is passed.
fig = px.treemap(
    segment_product_counts,
    path=['RFM_Segment_Label', 'RFM_Customer_Segment'],
    values='Count',
    color='Count',
    color_continuous_scale='RdBu',
    title="RFM Customer Segment by Value",
)
fig.show()

In [52]:
# Select the customers whose segment is 'Champions'.
is_champion = df_rfm['RFM_Customer_Segment'].eq('Champions')
vip_segment = df_rfm.loc[is_champion]

In [53]:
# Display the rows that belong to the 'Champions' segment.
vip_segment

Unnamed: 0_level_0,Recency,Frequency,Monetary,R_Score,F_Score,M_Score,RFM_Segment,RFM_Score,RFM_Segment_Label,RFM_Customer_Segment
CustomerID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
12347,2,182,4310.00,5,5,5,555,15,High Value,Champions
12348,75,31,1797.24,2,3,4,234,9,High Value,Champions
12349,19,73,1757.55,4,4,4,444,12,High Value,Champions
12352,36,95,1545.41,3,4,4,344,11,High Value,Champions
12356,23,59,2811.43,4,4,5,445,13,High Value,Champions
...,...,...,...,...,...,...,...,...,...,...
18260,173,140,2595.00,2,5,5,255,12,High Value,Champions
18263,23,62,1211.08,4,4,4,444,12,High Value,Champions
18272,3,170,3064.78,5,5,5,555,15,High Value,Champions
18283,4,756,2094.88,5,5,5,555,15,High Value,Champions


In [54]:
# Box plot of Recency for the 'Champions' customers.
fig = pg.Figure()
# add_trace is a method and must be called. Writing `fig.add_trace = (...)`
# only rebinds the attribute on this object and leaves the figure with no traces.
fig.add_trace(pg.Box(y=vip_segment['Recency'], name='Recency'))
# Optional extra traces, left disabled:
#fig.add_trace(pg.Box(y=vip_segment['Frequency'],name='Frequency'))
#fig.add_trace(pg.Box(y=vip_segment['Monetary'], name='Monetary'))
fig.show()

In [50]:
# Pairwise correlation between the R, F and M scores of the 'Champions' customers.
# The score columns hold Categorical values (assigned from pd.qcut(...).values),
# which .corr() does not treat as numeric, so cast them to integers first.
score_columns = ['R_Score', 'F_Score', 'M_Score']
correlation_matrix = vip_segment[score_columns].astype(int).corr()

In [None]:
# Heatmap of the score correlation matrix. plotly.graph_objects is imported in
# this notebook under the alias `pg`, so the trace class is pg.Heatmap.
fig_heatmap = pg.Figure(
    data=pg.Heatmap(
        z=correlation_matrix.values,
        x=correlation_matrix.columns.tolist(),
        y=correlation_matrix.index.tolist(),
        colorscale='RdBu',
        # Correlations lie in [-1, 1]; pin the colour range so 0 sits mid-scale.
        zmin=-1,
        zmax=1,
    )
)
fig_heatmap.show()
