In [3]:
#RFM Analysis is used to understand and segment customers based on their buying behaviour
#importing essential library
import pandas as pd
import plotly.express as px#operates on a variety of types of data and produces easy-to-style figure
import plotly.io as pio#return a copy of figure where all styling properties have been moved into figure's template
import plotly.graph_objects as go#contains an automatically-genrated hierachy of python classes which represent non-leaf nodes.
pio.templates.default = "plotly_white"

data = pd.read_csv(r"C:\Users\Diksha\Downloads\rfm_data.csv")
print(data.head())

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   

   Location  
0     Tokyo  
1    London  
2  New York  
3    London  
4     Paris  


In [4]:
from datetime import datetime

# Convert 'PurchaseDate' to datetime
data['PurchaseDate'] = pd.to_datetime(data['PurchaseDate'])
#print(data['PurchaseDate'])

# Calculate Recency

data['Recency'] = (datetime.now().date() - data['PurchaseDate'].dt.date)
datediff=[i.days for i in data['Recency']]
#print(datediff)
# Calculate Frequency
frequency_data = data.groupby('CustomerID')['OrderID'].count().reset_index()
frequency_data.rename(columns={'OrderID': 'Frequency'}, inplace=True)
data = data.merge(frequency_data, on='CustomerID', how='left')

#Calculate Monetary Value
monetary_data = data.groupby('CustomerID')['TransactionAmount'].sum().reset_index()
monetary_data.rename(columns={'TransactionAmount': 'MonetaryValue'}, inplace=True)
data = data.merge(monetary_data, on='CustomerID', how='left')

In [5]:
print(data.head())

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   

   Location            Recency  Frequency  MonetaryValue  
0     Tokyo  134 days, 0:00:00          1         943.31  
1    London  134 days, 0:00:00          1         463.70  
2  New York  134 days, 0:00:00          1          80.28  
3    London  134 days, 0:00:00          1         221.29  
4     Paris  134 days, 0:00:00          1         739.56  


In [11]:
# Define scoring criteria for each RFM value
recency_scores = [6,5, 4, 3, 2, 1]  # Higher score for lower recency (more recent)
frequency_scores = [1, 2, 3, 4, 5,6]  # Higher score for higher frequency
monetary_scores = [1, 2, 3, 4, 5,6]  # Higher score for higher monetary value
print(data['Recency'])
# Calculate RFM scores

data['RecencyScore'] = pd.cut(datediff, bins=6, labels=recency_scores)
data['FrequencyScore'] = pd.cut(data['Frequency'], bins=6, labels=frequency_scores)
data['MonetaryScore'] = pd.cut(data['MonetaryValue'], bins=6,labels=monetary_scores)
#pd.cut() function to divide recency, frequency, and monetary values into bins. We define 6 bins for each value and assign the 
#corresponding scores to each bin.

0      134 days, 0:00:00
1      134 days, 0:00:00
2      134 days, 0:00:00
3      134 days, 0:00:00
4      134 days, 0:00:00
             ...        
995     74 days, 0:00:00
996     74 days, 0:00:00
997     74 days, 0:00:00
998     74 days, 0:00:00
999     74 days, 0:00:00
Name: Recency, Length: 1000, dtype: object


In [12]:
data['RecencyScore']

0      1
1      1
2      1
3      1
4      1
      ..
995    6
996    6
997    6
998    6
999    6
Name: RecencyScore, Length: 1000, dtype: category
Categories (6, int64): [6 < 5 < 4 < 3 < 2 < 1]

In [13]:
# Convert RFM scores to numeric type
data['RecencyScore'] = data['RecencyScore'].astype(int)
data['FrequencyScore'] = data['FrequencyScore'].astype(int)
data['MonetaryScore'] = data['MonetaryScore'].astype(int)

In [14]:
# Calculate RFM score by combining the individual scores
data['RFM_Score'] = data['RecencyScore'] + data['FrequencyScore'] + data['MonetaryScore']

# Create RFM segments based on the RFM score
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']
data['Value Segment'] = pd.qcut(data['RFM_Score'], q=3, labels=segment_labels)
#after calculating RFM score, we created RFM segments based on the scores
#qcut() divides data so that the number of elements in each bin is as equal as possible where q is the number of bins

In [17]:
print(data.head(10))

   CustomerID PurchaseDate  TransactionAmount ProductInformation  OrderID  \
0        8814   2023-04-11             943.31          Product C   890075   
1        2188   2023-04-11             463.70          Product A   176819   
2        4608   2023-04-11              80.28          Product A   340062   
3        2559   2023-04-11             221.29          Product A   239145   
4        9482   2023-04-11             739.56          Product A   194545   
5        8483   2023-04-11             375.23          Product C   691194   
6        8317   2023-04-11             272.56          Product B   826847   
7        6911   2023-04-11             433.33          Product C   963918   
8        8993   2023-04-12              16.55          Product D   112426   
9        3519   2023-04-12             464.63          Product C   139726   

   Location            Recency  Frequency  MonetaryValue  RecencyScore  \
0     Tokyo  134 days, 0:00:00          1         943.31             1   
1   

In [None]:
#let's create and analyse RFM value segment represents the categorization of customers based on their RFM scores into groups
#such as “low value”, “medium value”, and “high value”. These segments are determined by dividing RFM scores into distinct 
#ranges or groups, allowing for a more granular analysis of overall customer RFM characteristics. The RFM value segment helps us
#understand the relative value of customers in terms of recency, frequency, and monetary aspects.

In [25]:
# RFM Segment Distribution
segment_counts = data['Value Segment'].value_counts().reset_index()
#.value_counts() powerful way to count the number of occurance of each value in each group,but the resulting DataFrame may not 
#be in desired format.So .reset_index() allows us to reformat yhe DataFrame by resetting the index and creating a new column 
#with the counts
segment_counts.columns = ['Value Segment', 'Count']

pastel_colors = px.colors.qualitative.Pastel#qualitative is used when info is not numerical or it emphasizes the relative size 
#of each of categories being measured by using vertical or horizontal bars OR to find themes and patterns.
#Pastel(library) help us colorize strings in our terminal,having predefined styles:-info:green.comment:
# Create the bar chart
fig_segment_dist = px.bar(segment_counts, x='Value Segment', y='Count', 
                          color='Value Segment', color_discrete_sequence=pastel_colors,
                          title='RFM Value Segment Distribution')

# Update the layout
fig_segment_dist.update_layout(xaxis_title='RFM Value Segment',
                              yaxis_title='Count',
                              showlegend=False)#to hide legend in plot

# Show the figure
fig_segment_dist.show()

In [None]:
#Now let’s create and analyze RFM Customer Segments that are broader classifications based on the RFM scores. These segments, 
#such as “Champions”, “Potential Loyalists”, and “Can’t Lose” provide a more strategic perspective on customer behaviour and 
#characteristics in terms of recency, frequency, and monetary aspects

In [19]:
# Create a new column for RFM Customer Segments
data['RFM Customer Segments'] = ''

# Assign RFM segments based on the RFM score
data.loc[data['RFM_Score'] >= 9, 'RFM Customer Segments'] = 'Champions'
data.loc[(data['RFM_Score'] >= 6) & (data['RFM_Score'] < 9), 'RFM Customer Segments'] = 'Potential Loyalists'
data.loc[(data['RFM_Score'] >= 5) & (data['RFM_Score'] < 6), 'RFM Customer Segments'] = 'At Risk Customers'
data.loc[(data['RFM_Score'] >= 4) & (data['RFM_Score'] < 5), 'RFM Customer Segments'] = "Can't Lose"
data.loc[(data['RFM_Score'] >= 3) & (data['RFM_Score'] < 4), 'RFM Customer Segments'] = "Lost"

# Print the updated data with RFM segments
print(data[['CustomerID', 'RFM Customer Segments']])
#In the above code, we are assigning RFM segments to customers based on their RFM scores and then creating a new column called 
#“RFM Customer Segments” in the data.

     CustomerID RFM Customer Segments
0          8814     At Risk Customers
1          2188            Can't Lose
2          4608                  Lost
3          2559                  Lost
4          9482            Can't Lose
..          ...                   ...
995        2970             Champions
996        6669             Champions
997        8836             Champions
998        1440             Champions
999        4759             Champions

[1000 rows x 2 columns]


In [20]:
#Now let’s analyze the distribution of customers across different RFM customer segments within each value segment:
segment_product_counts = data.groupby(['Value Segment', 'RFM Customer Segments']).size().reset_index(name='Count')

segment_product_counts = segment_product_counts.sort_values('Count', ascending=False)

fig_treemap_segment_product = px.treemap(segment_product_counts, 
                                         path=['Value Segment', 'RFM Customer Segments'], 
                                         values='Count',
                                         color='Value Segment', color_discrete_sequence=px.colors.qualitative.Pastel,
                                         title='RFM Customer Segments by Value')
fig_treemap_segment_product.show()

#This is an interactive visualization, so you will get more insights about each segment after clicking on the segment.

In [26]:
#Now let’s analyze the distribution of RFM values within the Champions segment:
# Filter the data to include only the customers in the Champions segment
champions_segment = data[data['RFM Customer Segments'] == 'Champions']

fig = go.Figure()
fig.add_trace(go.Box(y=champions_segment['RecencyScore'], name='Recency'))
fig.add_trace(go.Box(y=champions_segment['FrequencyScore'], name='Frequency'))
fig.add_trace(go.Box(y=champions_segment['MonetaryScore'], name='Monetary'))

fig.update_layout(title='Distribution of RFM Values within Champions Segment',
                  yaxis_title='RFM Value',
                  showlegend=True)

fig.show()
#Also,this is an interactive visualization, so you will get more insights about each segment after clicking on the segment.

In [27]:
#Now let’s analyze the correlation of the recency, frequency, and monetary scores within the champions segment:

correlation_matrix = champions_segment[['RecencyScore', 'FrequencyScore', 'MonetaryScore']].corr()

# Visualize the correlation matrix using a heatmap
fig_heatmap = go.Figure(data=go.Heatmap(
                   z=correlation_matrix.values,
                   x=correlation_matrix.columns,
                   y=correlation_matrix.columns,
                   colorscale='RdBu',
                   colorbar=dict(title='Correlation')))

fig_heatmap.update_layout(title='Correlation Matrix of RFM Values within Champions Segment')

fig_heatmap.show()

In [32]:
#Now let’s have a look at the number of customers in all the segments:
import plotly.colors

pastel_colors = plotly.colors.qualitative.Pastel

segment_counts = data['RFM Customer Segments'].value_counts()

# Create a bar chart to compare segment counts
fig = go.Figure(data=[go.Bar(x=segment_counts.index, y=segment_counts.values,
                            marker=dict(color=pastel_colors))])

#marker used to simply enhance the visual of line size of a plot.
#dict  

# Set the color of the Champions segment as a different color
champions_color = 'rgb(158, 202, 225)'
fig.update_traces(marker_color=[champions_color if segment == 'Champions' else pastel_colors[i]
                                for i, segment in enumerate(segment_counts.index)],
                  marker_line_color='rgb(8, 48, 107)',
                  marker_line_width=1.5, opacity=0.8)

# Update the layout
fig.update_layout(title='Comparison of RFM Segments',
                  xaxis_title='RFM Segments',
                  yaxis_title='Number of Customers',
                  showlegend=False)

fig.show()

In [30]:
#Now let’s have a look at the recency, frequency, and monetary scores of all the segments:

# Calculate the average Recency, Frequency, and Monetary scores for each segment
segment_scores = data.groupby('RFM Customer Segments')[['RecencyScore', 'FrequencyScore', 'MonetaryScore']].mean().reset_index()

# Create a grouped bar chart to compare segment scores
fig = go.Figure()

# Add bars for Recency score
fig.add_trace(go.Bar(
    x=segment_scores['RFM Customer Segments'],
    y=segment_scores['RecencyScore'],
    name='Recency Score',
    marker_color='rgb(158,202,225)'
))

# Add bars for Frequency score
fig.add_trace(go.Bar(
    x=segment_scores['RFM Customer Segments'],
    y=segment_scores['FrequencyScore'],
    name='Frequency Score',
    marker_color='rgb(94,158,217)'
))

# Add bars for Monetary score
fig.add_trace(go.Bar(
    x=segment_scores['RFM Customer Segments'],
    y=segment_scores['MonetaryScore'],
    name='Monetary Score',
    marker_color='rgb(32,102,148)'
))

# Update the layout
fig.update_layout(
    title='Comparison of RFM Segments based on Recency, Frequency, and Monetary Scores',
    xaxis_title='RFM Segments',
    yaxis_title='Score',
    barmode='group',#
    showlegend=True
)

fig.show()