In [None]:
import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import plotly.graph_objects as go


In [None]:

# --- Step 1: Create Realistic Sample Data ---
# Stand-in for a real data load: synthesize a transaction log so the rest of
# the notebook has something to segment.
print("Step 1: Simulating customer data...")
np.random.seed(42)  # legacy global seed so every run draws the same sample

num_customers = 500
num_transactions = 2000
PERIOD_START = dt.datetime(2025, 1, 1)  # first possible transaction date
PERIOD_DAYS = 270                       # day offsets are drawn from [0, PERIOD_DAYS)
TRANSACTION_COLUMNS = ['CustomerID', 'TransactionDate', 'Amount']

customer_ids = np.arange(1, num_customers + 1)
product_prices = np.random.uniform(10, 200, size=50)

# One [customer, date, amount] row per transaction. List elements are
# evaluated left to right, so the random draws happen in the order
# customer -> day offset -> price for every row.
data = [
    [
        np.random.choice(customer_ids),
        PERIOD_START + dt.timedelta(days=np.random.randint(0, PERIOD_DAYS)),
        np.random.choice(product_prices),
    ]
    for _ in range(num_transactions)
]

df = pd.DataFrame(data, columns=TRANSACTION_COLUMNS)
print("Sample data created successfully.")
print(df.head())



Step 1: Simulating customer data...
Sample data created successfully.
   CustomerID TransactionDate      Amount
0         191      2025-08-06  182.770876
1         162      2025-07-21  163.595496
2         270      2025-08-03   93.628974
3         213      2025-07-27   59.168197
4         338      2025-02-22   79.608750


In [None]:
# --- Step 2: Calculate Recency, Frequency, and Monetary (RFM) ---
print("\nStep 2: Calculating RFM values...")
# Reference point for recency: one day after the latest transaction in `df`.
snapshot_date = df['TransactionDate'].max() + dt.timedelta(days=1)

# One row per customer, columns named directly via named aggregation:
#   Recency       - whole days between the customer's last purchase and the snapshot
#   Frequency     - number of transaction rows for the customer
#   MonetaryValue - sum of the customer's transaction amounts
rfm_df = df.groupby('CustomerID').agg(
    Recency=('TransactionDate', lambda dates: (snapshot_date - dates.max()).days),
    Frequency=('CustomerID', 'count'),
    MonetaryValue=('Amount', 'sum'),
)

print("RFM values calculated.")
print(rfm_df.head())





Step 2: Calculating RFM values...
RFM values calculated.
            Recency  Frequency  MonetaryValue
CustomerID                                   
1               227          1     193.470086
2                17          5     517.317434
3                55          5     442.374591
4                50          5     538.244286
5                10          8     483.964322


In [None]:
# --- Step 3: Scale the Data and Apply K-Means Clustering ---
print("\nStep 3: Applying K-Means clustering...")
# Scale only the three RFM features. `rfm_df` gains a 'Cluster' column below
# (and a string 'Segment' column in Step 4), so scaling the whole frame would
# feed stale labels back into the model, or fail on the string column, if
# this cell is run again.
rfm_features = ['Recency', 'Frequency', 'MonetaryValue']

# K-Means is distance-based, so the features are standardized first.
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_df[rfm_features])

# The cluster count is fixed at 4 for simplicity; an elbow analysis could be
# used to choose it instead.
num_clusters = 4
kmeans = KMeans(n_clusters=num_clusters, init='k-means++', n_init=10, max_iter=300, random_state=42)
rfm_df['Cluster'] = kmeans.fit_predict(rfm_scaled)
print(f"Customers segmented into {num_clusters} clusters.")





Step 3: Applying K-Means clustering...
Customers segmented into 4 clusters.


In [None]:
# --- Step 4: Analyze and Interpret the Clusters ---
print("\nStep 4: Analyzing cluster characteristics...")
# Average RFM values per cluster, rounded to one decimal for display.
cluster_analysis = (
    rfm_df.groupby('Cluster')[['Recency', 'Frequency', 'MonetaryValue']]
    .mean()
    .round(1)
)

print(cluster_analysis)

# The integer ids K-Means assigns carry no meaning, so clusters are ranked by
# mean MonetaryValue (highest first) and named by rank rather than by id.
cluster_analysis_sorted = cluster_analysis.sort_values(by='MonetaryValue', ascending=False)

# Names for the ranked clusters, best first.
segment_labels = [
    '01 - Champions (High Value)',
    '02 - Loyal Customers',
    '03 - At-Risk / Lapsing',
    '04 - Lost / Low Value',
]

# Build the id -> name map from however many clusters are present. Fewer
# clusters than names no longer raises IndexError, and any extra clusters get
# a generic numbered name instead of being left as NaN in 'Segment'.
cluster_map = {
    cluster_id: (
        segment_labels[rank]
        if rank < len(segment_labels)
        else f'{rank + 1:02d} - Segment {rank + 1}'
    )
    for rank, cluster_id in enumerate(cluster_analysis_sorted.index)
}

rfm_df['Segment'] = rfm_df['Cluster'].map(cluster_map)
print("\nCustomer segments assigned.")
print(rfm_df.head())





Step 4: Analyzing cluster characteristics...
         Recency  Frequency  MonetaryValue
Cluster                                   
0          163.8        1.9          197.1
1           50.6        4.6          438.0
2           29.8        7.0          702.2
3           49.3        2.7          223.9

Customer segments assigned.
            Recency  Frequency  MonetaryValue  Cluster  \
CustomerID                                               
1               227          1     193.470086        0   
2                17          5     517.317434        1   
3                55          5     442.374591        1   
4                50          5     538.244286        1   
5                10          8     483.964322        2   

                                Segment  
CustomerID                               
1                 04 - Lost / Low Value  
2                  02 - Loyal Customers  
3                  02 - Loyal Customers  
4                  02 - Loyal Customers  
5       

In [None]:
# --- Step 5: Visualize the Customer Segments ---
print("\nStep 5: Generating 3D plot for visualization...")

# Split the RFM table by segment, in order of first appearance in `rfm_df`.
members_by_segment = {
    label: rfm_df[rfm_df['Segment'] == label]
    for label in rfm_df['Segment'].unique()
}

# One marker trace per segment, so each segment gets its own legend entry.
segment_traces = [
    go.Scatter3d(
        x=members['Recency'],
        y=members['Frequency'],
        z=members['MonetaryValue'],
        mode='markers',
        marker=dict(size=5),
        name=label,
    )
    for label, members in members_by_segment.items()
]

fig = go.Figure(data=segment_traces)

# Axis titles for the 3D scene; margins leave room only for the title on top.
scene_axis_titles = dict(
    xaxis_title='Recency (Days)',
    yaxis_title='Frequency (Purchases)',
    zaxis_title='Monetary Value ($)',
)
fig.update_layout(
    title='Customer Segments based on RFM (3D Scatter Plot)',
    scene=scene_axis_titles,
    margin=dict(l=0, r=0, b=0, t=40),
)

fig.show()


Step 5: Generating 3D plot for visualization...
