In [16]:
from datetime import datetime
import pandas as pd
import plotly.express as px
import plotly.io as pio
import plotly.graph_objects as go
from plotly.offline import init_notebook_mode, iplot
import warnings
# Suppress every Python warning for the rest of the session; note that this
# also hides deprecation / FutureWarning notices raised by pandas and plotly.
warnings.filterwarnings('ignore')
# Use the dark theme as the default template for all Plotly figures below.
pio.templates.default = 'plotly_dark'
# Enable Plotly's offline notebook rendering.
# NOTE(review): connected=True presumably loads plotly.js from the CDN instead
# of embedding it in the notebook - confirm against the Plotly docs.
init_notebook_mode(connected=True)

In [2]:
# Load the transaction-level data set and preview its first three rows.
df = pd.read_csv("data/rfm_data.csv")
display(df.head(n=3))

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location
0,8814,2023-04-11,943.31,Product C,890075,Tokyo
1,2188,2023-04-11,463.7,Product A,176819,London
2,4608,2023-04-11,80.28,Product A,340062,New York


In [3]:
# Parse the purchase date strings into proper datetime values.
df['PurchaseDate'] = pd.to_datetime(df['PurchaseDate'])

# Every remaining object-typed column is stored as a pandas categorical.
object_columns = [name for name in df.columns if df[name].dtype == 'O']
for name in object_columns:
    df[name] = df[name].astype('category')

In [4]:
# Recency: whole days between today's date (at midnight) and each purchase.
# The value therefore depends on the day the notebook is executed.
snapshot_date = pd.Timestamp(datetime.now().date())
df['Recency'] = (snapshot_date - df['PurchaseDate']).dt.days

# Frequency: number of orders per customer, attached to every transaction row.
frequency_data = (
    df.groupby('CustomerID')['OrderID']
    .count()
    .reset_index(name='Frequency')
)
df = df.merge(frequency_data, on='CustomerID', how='left')

# Monetary value: total amount spent per customer, attached to every row.
monetary_data = (
    df.groupby('CustomerID')['TransactionAmount']
    .sum()
    .reset_index(name='MonetaryValue')
)
df = df.merge(monetary_data, on='CustomerID', how='left')

In [5]:
# Define scoring criteria for each RFM value
recency_scores = [5, 4, 3, 2, 1]  # Higher score for lower recency (more recent)
frequency_scores = [1, 2, 3, 4, 5]  # Higher score for higher frequency
monetary_scores = [1, 2, 3, 4, 5]  # Higher score for higher monetary value

# Calculate RFM scores
df['RecencyScore'] = pd.cut(df['Recency'], bins=5, labels=recency_scores)
df['FrequencyScore'] = pd.cut(df['Frequency'], bins=5, labels=frequency_scores)
df['MonetaryValueScore'] = pd.cut(df['MonetaryValue'], bins=5, labels=monetary_scores)

In [6]:
# pd.cut produced categorical scores; turn them into plain integers so they
# can be added together.
score_columns = ['RecencyScore', 'FrequencyScore', 'MonetaryValueScore']
df = df.astype(dict.fromkeys(score_columns, 'int'))

In [7]:
# Calculate RFM score by combining the individual scores
df['RFM_Score'] = df['RecencyScore'] + df['FrequencyScore'] + df['MonetaryValueScore']

# Create RFM segments based on the RFM score
segment_labels = ['Low-Value', 'Mid-Value', 'High-Value']
df['Value Segment'] = pd.qcut(df['RFM_Score'], q=3, labels=segment_labels)

In [8]:
display(df.tail(2))

Unnamed: 0,CustomerID,PurchaseDate,TransactionAmount,ProductInformation,OrderID,Location,Recency,Frequency,MonetaryValue,RecencyScore,FrequencyScore,MonetaryValueScore,RFM_Score,Value Segment
998,1440,2023-06-10,729.94,Product B,559753,Paris,124,1,729.94,5,1,2,8,High-Value
999,4759,2023-06-10,804.28,Product D,467544,New York,124,1,804.28,5,1,2,8,High-Value


In [18]:
# Number of rows in each value segment, as a two-column frame.
segments = (
    df['Value Segment']
    .value_counts()
    .rename_axis('SegmentValue')
    .reset_index(name='Count')
)

pastel_colors = px.colors.qualitative.Pastel1

# Bar chart with one pastel-coloured bar per value segment.
fig_segments = px.bar(
    segments,
    x='SegmentValue',
    y='Count',
    color='SegmentValue',
    color_discrete_sequence=pastel_colors,
    title='RFM Score Segment Distribution',
)
fig_segments.update_layout(
    xaxis_title='RFM Score Segment',
    yaxis_title='Count',
    showlegend=False,
)
fig_segments.show()

In [12]:
# Customer Segments
df['RFMCustomerSegment'] = ''

df.loc[df['RFM_Score']  >= 9, 'RFMCustomerSegment'] = 'Champions'
df.loc[(df['RFM_Score']  >= 6) & (df['RFM_Score'] < 9), 'RFMCustomerSegment'] = 'Potential_loyalists'
df.loc[(df['RFM_Score']  >= 5) & (df['RFM_Score'] < 6), 'RFMCustomerSegment'] = 'At_risk_customers'
df.loc[(df['RFM_Score']  >= 4) & (df['RFM_Score'] < 5), 'RFMCustomerSegment'] = 'Cant_lose'
df.loc[(df['RFM_Score']  >= 3) & (df['RFM_Score'] < 4), 'RFMCustomerSegment'] = 'Lost'

display(df[['CustomerID', 'RFM_Score', 'RFMCustomerSegment']].head(3))

Unnamed: 0,CustomerID,RFM_Score,RFMCustomerSegment
0,8814,4,Cant_lose
1,2188,3,Lost
2,4608,3,Lost


In [17]:
# Customer distribution
product_segments = df.groupby(['Value Segment', 'RFMCustomerSegment']).size().reset_index(name='Count')
product_segments = product_segments.sort_values('Count', ascending=False)

fig_treemap_segment = px.treemap(product_segments, 
                                         path=['Value Segment', 'RFMCustomerSegment'], 
                                         values='Count',
                                         color='Value Segment', color_discrete_sequence=px.colors.qualitative.Pastel1,
                                         title='RFM Customer Segments by Value')
fig_treemap_segment.show()

In [20]:
# Filter the data to include only the customers in the Champions segment
champions_segment = df[df['RFMCustomerSegment'] == 'Champions']

fig = go.Figure()
fig.add_trace(go.Box(y=champions_segment['RecencyScore'], name='Recency'))
fig.add_trace(go.Box(y=champions_segment['FrequencyScore'], name='Frequency'))
fig.add_trace(go.Box(y=champions_segment['MonetaryValueScore'], name='Monetary value'))

fig.update_layout(title='Distribution of RFM Values within Champions Segment',
                  yaxis_title='RFM Value',
                  showlegend=True)

fig.show()

In [27]:
# Pairwise correlation between the three RFM scores of the Champions rows.
correlation_matrix = champions_segment[['RecencyScore', 'FrequencyScore', 'MonetaryValueScore']].corr()

# Visualize the correlation matrix using a heatmap.
# zmin / zmax pin the diverging 'RdBu' scale to the full correlation range
# [-1, 1]. Without them the colour range is fitted to the observed values, so
# zero correlation does not land on the neutral midpoint of the scale and the
# colours are not comparable between runs or segments.
fig_heatmap = go.Figure(data=go.Heatmap(
                   z=correlation_matrix.values,
                   x=correlation_matrix.columns,
                   y=correlation_matrix.columns,
                   colorscale='RdBu',
                   zmin=-1,
                   zmax=1,
                   colorbar=dict(title='Correlation')))

fig_heatmap.update_layout(title='Correlation Matrix of RFM Values within Champions Segment')

fig_heatmap.show()

In [29]:
# Number of rows in each RFM customer segment, largest first.
segment_counts = df['RFMCustomerSegment'].value_counts()

# Champions get a dedicated colour; every other bar takes the pastel colour
# at its position in the count ordering.
champions_color = 'rgb(158, 202, 225)'
bar_colors = [
    champions_color if segment == 'Champions' else pastel_colors[i]
    for i, segment in enumerate(segment_counts.index)
]

# Bar chart comparing the segment counts, with outlined, slightly
# transparent bars.
segment_bars = go.Bar(
    x=segment_counts.index,
    y=segment_counts.values,
    marker=dict(
        color=bar_colors,
        line=dict(color='rgb(8, 48, 107)', width=1.5),
    ),
    opacity=0.8,
)
fig = go.Figure(data=[segment_bars])

# Titles and axis labels; a legend is not needed for a single trace.
fig.update_layout(
    title='Comparison of RFM Segments',
    xaxis_title='RFM Segments',
    yaxis_title='Number of Customers',
    showlegend=False,
)

fig.show()

In [33]:
# Calculate the average Recency, Frequency, and Monetary scores for each segment
segment_scores = df.groupby('RFMCustomerSegment')[['RecencyScore', 'FrequencyScore', 'MonetaryValueScore']].mean().reset_index()
# Create a grouped bar chart to compare segment scores
fig = go.Figure()

# Add bars for Recency score
fig.add_trace(go.Bar(
    x=segment_scores['RFMCustomerSegment'],
    y=segment_scores['RecencyScore'],
    name='Recency Score',
    marker_color='rgb(158,202,225)'
))

# Add bars for Frequency score
fig.add_trace(go.Bar(
    x=segment_scores['RFMCustomerSegment'],
    y=segment_scores['FrequencyScore'],
    name='Frequency Score',
    marker_color='rgb(94,158,217)'
))

# Add bars for Monetary score
fig.add_trace(go.Bar(
    x=segment_scores['RFMCustomerSegment'],
    y=segment_scores['MonetaryValueScore'],
    name='Monetary Value Score',
    marker_color='rgb(32,102,148)'
))

# Update the layout
fig.update_layout(
    title='Comparison of RFM Segments based on Recency, Frequency, and Monetary Value Scores',
    xaxis_title='RFM Segments',
    yaxis_title='Score',
    barmode='group',
    showlegend=True
)

fig.show()