### Customer Segmentation

In [3]:
# Core
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

# Statistics
from scipy.stats import norm, probplot

# Machine Learning
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

In [4]:
rfm_table = pd.read_csv('../data/rfm/rfm_table.csv' )
rfm_table.head(2)

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,12346,325,1,77183.6
1,12347,1,7,4310.0


In [5]:
quantil = rfm_table.quantile(q=[0.25, 0.50, 0.75]).to_dict()

# Define scoring functions
def RecencyPoints(y, rfm, q):
    if y <= q[rfm][0.25]:
        return 4
    elif y <= q[rfm][0.50]:
        return 3
    elif y <= q[rfm][0.75]:
        return 2
    else:
        return 1

def Freq_MonetaryPoints(y, rfm, q):
    if y <= q[rfm][0.25]:
        return 1
    elif y <= q[rfm][0.50]:
        return 2
    elif y <= q[rfm][0.75]:
        return 3
    else:
        return 4

In [6]:
rfm_segment = rfm_table.copy()

rfm_segment['Recency_Quartile'] = rfm_segment['Recency'].apply(RecencyPoints, args=('Recency', quantil))
rfm_segment['Frequency_Quartile'] = rfm_segment['Frequency'].apply(Freq_MonetaryPoints, args=('Frequency', quantil))
rfm_segment['Monetary_Quartile'] = rfm_segment['Monetary'].apply(Freq_MonetaryPoints, args=('Monetary', quantil))

rfm_segment['RFMPoints'] = (
    rfm_segment['Recency_Quartile'].astype(str) +
    rfm_segment['Frequency_Quartile'].astype(str) +
    rfm_segment['Monetary_Quartile'].astype(str)
)

customer_dict = {
    'Best Customers': '444',
    'Loyal Customers': '344',
    'Big Spender': '334',
    'Almost Lost': '244',
    'Lost Customers': '144',
    'Recent Customers': '443',
    'Lost Cheap Customers': '122'
}
dict_segment = {v: k for k, v in customer_dict.items()}

rfm_segment['Segment'] = rfm_segment['RFMPoints'].map(lambda x: dict_segment.get(x, 'Others'))
rfm_segment.head()


Unnamed: 0,CustomerID,Recency,Frequency,Monetary,Recency_Quartile,Frequency_Quartile,Monetary_Quartile,RFMPoints,Segment
0,12346,325,1,77183.6,1,1,4,114,Others
1,12347,1,7,4310.0,4,4,4,444,Best Customers
2,12348,74,4,1797.24,2,3,4,234,Others
3,12349,18,1,1757.55,3,1,4,314,Others
4,12350,309,1,334.4,1,1,2,112,Others


In [7]:
def rfm_distribution_plot(rfm_table):
    measures = ['Recency', 'Frequency', 'Monetary']
    fig = make_subplots(rows=3, cols=1, shared_xaxes=False,
                        subplot_titles=[f"{m} Distribution" for m in measures])

    for i, measure in enumerate(measures):
        data = rfm_table[measure]
        mu, sigma = norm.fit(data)
        x = np.linspace(data.min(), data.max(), 100)
        pdf = norm.pdf(x, mu, sigma)

        # Histogram
        fig.add_trace(go.Histogram(
            x=data, histnorm='probability density', nbinsx=30,
            name=f"{measure} Histogram", marker_color='royalblue', opacity=0.6
        ), row=i+1, col=1)

        # Fitted curve
        fig.add_trace(go.Scatter(
            x=x, y=pdf, mode='lines', name=f"{measure} Fit",
            line=dict(color='crimson', width=2)
        ), row=i+1, col=1)

    fig.update_layout(height=1000, width=800, title_text="RFM Distributions (Interactive)",
                      showlegend=False, template='plotly_white')
    fig.show()

rfm_distribution_plot(rfm_table)


In [None]:
scaled_rfm = rfm_table.copy()
scaled_rfm['Recency'] += abs(rfm_table['Recency'].min()) + 1
scaled_rfm['Monetary'] += abs(rfm_table['Monetary'].min()) + 1

# Log transform and standardization
log_df = np.log(scaled_rfm)
scaler = StandardScaler()
normal_ = pd.DataFrame(scaler.fit_transform(log_df), columns=rfm_table.columns, index=rfm_table.index)

normal_.head()

Unnamed: 0,CustomerID,Recency,Frequency,Monetary
0,-1.833344,1.409894,-1.04861,3.731854
1,-1.832631,-2.146498,1.111836,1.41716
2,-1.831918,0.383971,0.490522,0.716423
3,-1.831204,-0.574674,-1.04861,0.698549
4,-1.830491,1.374758,-1.04861,-0.623983


In [13]:
fig = px.scatter(
    normal_,
    x='Recency',
    y='Monetary',
    color='Frequency',
    title='RFM Normalized Data Distribution',
    template='plotly_white',
    labels={
        'Recency': 'Normalized Recency',
        'Monetary': 'Normalized Monetary',
        'Frequency': 'Normalized Frequency'
    }
)
fig.update_traces(marker=dict(size=8, opacity=0.6))
fig.show()

In [11]:
wcss_silhouette = []
for i in range(2, 12):
    kmean = KMeans(n_clusters=i, random_state=0, init='k-means++')
    preds = kmean.fit_predict(normal_)
    silhouette = silhouette_score(normal_, preds)
    wcss_silhouette.append(silhouette)

fig = px.line(
    x=list(range(2, 12)), y=wcss_silhouette,
    title="Silhouette Coefficient vs Number of Clusters",
    labels={'x': 'Number of Clusters', 'y': 'Silhouette Score'},
    template='plotly_white',
    markers=True
)
fig.update_traces(marker=dict(size=10, color='royalblue'))
fig.show()


In [None]:
kme = KMeans(n_clusters=4, random_state=1, init='k-means++')
rfm_segment['Cluster'] = kme.fit_predict(normal_)
rfm_segment['Cluster'].value_counts()


Cluster
2    1257
3    1219
0    1099
1     763
Name: count, dtype: int64

In [14]:
# Pairwise scatter plots of normalized RFM features
fig = px.scatter_matrix(
    normal_.assign(Cluster=rfm_segment['Cluster']),
    dimensions=['Recency', 'Frequency', 'Monetary'],
    color='Cluster',
    title="Cluster Scatter Matrix (RFM)",
    template='plotly_white'
)
fig.update_traces(diagonal_visible=False, marker=dict(size=5, opacity=0.7))
fig.show()


In [15]:
normal_melt = pd.melt(
    normal_.assign(Cluster=rfm_segment['Cluster']).reset_index(),
    id_vars=['CustomerID', 'Cluster'],
    value_vars=['Recency', 'Frequency', 'Monetary'],
    var_name='Metric', value_name='Value'
)

fig = px.line(
    normal_melt, x='Metric', y='Value', color='Cluster',
    title="Snake Plot of RFM Clusters",
    markers=True, template='plotly_white'
)
fig.show()
